From 20fabb84bff26f8563309733e6cf7fe6f6bd8b5d Mon Sep 17 00:00:00 2001 From: kisfenyo Date: Tue, 24 Feb 2026 18:45:25 +0100 Subject: [PATCH] v0.7.0: kiskegyed.hu parser, dual measurements, site links as URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New parser for kiskegyed.hu: ingredients (with groups, dual measurements), instructions (ol > li > div), tags (section.tags) - Dual measurement handling: "3 ek (70 g)" extracts alternate measurement to comment field - Cross-site linking: kiskegyed→sobors links are followed to get full recipe (mirrors existing sobors→kiskegyed support) - Supported sites now shown as clickable URLs in the import page - supported_sites() returns dicts with name and url Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 10 +++ README.md | 15 ++++ app/scraper.py | 169 +++++++++++++++++++++++++++++++++++++- app/templates/import.html | 4 +- 4 files changed, 193 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8248b0c..363e713 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## v0.7.0 (2026-02-24) + +### Added +- Kiskegyed.hu parser: ingredients (with groups, dual measurements), instructions, tags +- Cross-site recipe linking: kiskegyed→sobors and sobors→kiskegyed links are followed automatically +- Dual measurement support: parenthesized alternate measurements (e.g. 
"3 ek (70 g)") extracted to comment field + +### Changed +- Supported sites list now shows clickable URLs instead of plain text + ## v0.6.1 (2026-02-24) ### Added diff --git a/README.md b/README.md index a2ddc6b..2f1fa0b 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Docker container for importing recipes from Hungarian websites into [Mealie](htt | streetkitchen.hu | Yes (with groups) | Yes (ol/ul/paragraph) | Yes | Yes (from JSON-LD categories) | | nosalty.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes | | sobors.hu | Yes (with groups) | Yes (with section headers, follows linked recipes) | Yes | Yes | +| kiskegyed.hu | Yes (with groups, dual measurements) | Yes (follows sobors.hu links) | Yes | Yes | | *Other sites* | Fallback (schema.org JSON-LD) | Fallback (schema.org JSON-LD) | Yes (og:image) | Fallback (schema.org keywords) | ### Mindmegette.hu Parser @@ -96,6 +97,20 @@ Extracts data from the sobors.hu recipe pages: - **Article-style ingredient fallback**: Pages without the structured `div.hozzavalok-container` are parsed from article-body `h4` + `ul > li` plain text - **Tags**: `div.cikk-cimkek > ul.cikk-cimkek-list > li > a` (skips generic "Receptek" category) +### Kiskegyed.hu Parser + +Extracts data from kiskegyed.hu recipe pages: + +- **Title**: `h2` element (with ` - Kiskegyed` suffix stripped) +- **Description**: `section#leadText > p` +- **Image**: `og:image` meta tag +- **Ingredients**: `div.recipe_ingredients` → `ul.list > li` items; group headers from `

` or `

` elements +- **Ingredient groups**: `

<p>Name:</p>` or `<p>A ...hez</p>

` format +- **Dual measurements**: "3 ek (70 g) búzafinomliszt" → qty: 3, unit: ek, food: búzafinomliszt, extra: 70 g +- **Instructions**: `div.recipe_preparation > ol > li > div` +- **Cross-site links**: Pages linking to sobors.hu are followed to get the full recipe +- **Tags**: `section.tags > a > span` (# prefix stripped, "recept" filtered) + ### Generic Fallback Parser For unsupported sites, attempts extraction via: diff --git a/app/scraper.py b/app/scraper.py index ce6c9e8..a61cc54 100644 --- a/app/scraper.py +++ b/app/scraper.py @@ -70,9 +70,16 @@ def scrape(url: str) -> dict: return result -def supported_sites() -> list[str]: - """Return list of supported site hostname substrings.""" - return [s for s, _ in _PARSERS] +def supported_sites() -> list[dict]: + """Return list of supported sites with name and URL.""" + _SITE_URLS = { + "mindmegette": "https://www.mindmegette.hu", + "streetkitchen": "https://streetkitchen.hu", + "nosalty": "https://www.nosalty.hu", + "sobors": "https://sobors.hu", + "kiskegyed": "https://www.kiskegyed.hu", + } + return [{"name": s + ".hu", "url": _SITE_URLS.get(s, "#")} for s, _ in _PARSERS] # --------------------------------------------------------------------------- @@ -545,6 +552,162 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict: } +# --------------------------------------------------------------------------- +# kiskegyed.hu +# --------------------------------------------------------------------------- + + +@_register("kiskegyed") +def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict: + # Title: h2 inside the detail section + title = "" + h2 = soup.find("h2") + if h2: + title = h2.get_text(strip=True) + if not title: + title = _og(soup, "og:title") or _text(soup.find("title")) + if title: + title = re.sub(r"\s*[-–|]\s*Kiskegyed.*$", "", title, flags=re.IGNORECASE).strip() + + # Description: section#leadText > p + description = "" + lead = soup.find("section", id="leadText") + if lead: + p = lead.find("p") + 
if p: + description = p.get_text(strip=True) + if not description: + description = _og(soup, "og:description") or "" + + image_url = _og(soup, "og:image") + + # --- Ingredients --- + # Container: div.recipe_ingredients + # Groups:

<p>Name:</p> or <p>A ...hez</p>

+ # Items: ul.list > li (plain text with optional links) + ingredients = [] + ing_container = soup.find("div", class_="recipe_ingredients") + if ing_container: + for el in ing_container.find_all(["p", "ul"]): + if el.name == "p": + group_text = el.get_text(strip=True).rstrip(":") + # Skip the "Hozzávalók" header and serving info + if not group_text or group_text.lower().startswith("hozzávalók"): + continue + # Skip serving info like "4 személyre" + if re.match(r"^\d+\s+személyre$", group_text): + continue + ingredients.append({"group": group_text}) + elif el.name == "ul" and "list" in (el.get("class") or []): + for li in el.find_all("li"): + line = li.get_text(strip=True) + if not line: + continue + qty, unit, food, extra = _parse_kiskegyed_ingredient(line) + ingredients.append({ + "quantity": qty, + "unit": unit, + "food": food, + "extra": extra, + }) + + # --- Instructions --- + # Container: div.recipe_preparation > ol > li > div + instructions = [] + linked_url = None + prep_container = soup.find("div", class_="recipe_preparation") + if prep_container: + # Check for cross-link to another recipe site (e.g. 
sobors.hu) + for a in prep_container.find_all("a", href=True): + href = a["href"] + if href.startswith("http") and "kiskegyed.hu" not in href: + # Check if it points to a supported recipe site + linked_host = _host(href) + if any(s in linked_host for s, _ in _PARSERS if s != "kiskegyed"): + linked_url = href + break + + ol = prep_container.find("ol") + if ol: + for li in ol.find_all("li", recursive=False): + div = li.find("div") + txt = div.get_text(strip=True) if div else li.get_text(strip=True) + if txt: + instructions.append(txt) + + # If instructions are empty or just a redirect, follow the linked recipe + if linked_url and len(instructions) <= 2: + try: + linked_data = scrape(linked_url) + if linked_data.get("instructions"): + instructions = linked_data["instructions"] + if not ingredients and linked_data.get("ingredients"): + ingredients = linked_data["ingredients"] + except Exception: + pass + + # --- Tags --- + # Container: section.tags > a > span (text starts with #) + tags = [] + tag_section = soup.find("section", class_="tags") + if tag_section: + skip = {"recept", "receptek"} + for a in tag_section.find_all("a"): + span = a.find("span") + tag_text = span.get_text(strip=True) if span else a.get_text(strip=True) + tag_text = tag_text.lstrip("#").strip() + if tag_text and tag_text.lower() not in skip: + tags.append(tag_text) + + return { + "title": title or "Ismeretlen recept", + "description": description, + "image_url": image_url, + "ingredients": ingredients, + "instructions": instructions, + "tags": tags, + "original_url": url, + } + + +def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]: + """Parse a kiskegyed.hu ingredient line. + + Handles dual measurements like '3 ek (70 g) búzafinomliszt (BL 55)' + → qty='3', unit='ek', food='búzafinomliszt', extra='70 g; BL 55' + """ + extras = [] + + # Try: qty unit (alt_measurement) food... 
+ m = re.match( + r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+\(([^)]+)\)\s+(.+)$", line + ) + if m: + qty = m.group(1).strip() + unit = m.group(2).strip() + extras.append(m.group(3).strip()) + food_raw = m.group(4).strip() + # Extract trailing parenthesized note from food + fm = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food_raw) + if fm: + food_raw = fm.group(1).strip() + extras.append(fm.group(2).strip()) + return (qty, unit, food_raw, "; ".join(extras)) + + # Try: qty unit food... + m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line) + if m2: + return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "") + + # Try: qty food (e.g. "2 tojás") + m3 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line) + if m3: + return (m3.group(1).strip(), "", m3.group(2).strip(), "") + + # No quantity (e.g. "ízlés szerint só") + return ("", "", line, "") + + def _parse_sobors_article_ingredients(container, ingredients: list): """Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text).""" for el in container.find_all(["h4", "ul"]): diff --git a/app/templates/import.html b/app/templates/import.html index 26fd81c..8de5c92 100644 --- a/app/templates/import.html +++ b/app/templates/import.html @@ -326,7 +326,7 @@

- Támogatott oldalak: {{ supported_sites | join(', ') }} + egyéb (schema.org) + Támogatott oldalak: {% for s in supported_sites %}{{ s.name }}{% if not loop.last %}, {% endif %}{% endfor %} + egyéb (schema.org)