diff --git a/CHANGELOG.md b/CHANGELOG.md index 591e13c..ca6c0d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## v0.6.1 (2026-02-24) + +### Added +- Sobors.hu linked recipe support: when instructions link to another site (e.g. kiskegyed.hu), the scraper follows the link and imports the real recipe content +- Article-style ingredient fallback for sobors.hu pages without structured ingredient containers + +### Changed +- Favicon updated to white logo variant (logo_notext_white.svg) + ## v0.6.0 (2026-02-24) ### Added diff --git a/README.md b/README.md index 0068508..a2ddc6b 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Docker container for importing recipes from Hungarian websites into [Mealie](htt | mindmegette.hu | Yes | Yes | Yes | Yes | | streetkitchen.hu | Yes (with groups) | Yes (ol/ul/paragraph) | Yes | Yes (from JSON-LD categories) | | nosalty.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes | -| sobors.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes | +| sobors.hu | Yes (with groups) | Yes (with section headers, follows linked recipes) | Yes | Yes | | *Other sites* | Fallback (schema.org JSON-LD) | Fallback (schema.org JSON-LD) | Yes (og:image) | Fallback (schema.org keywords) | ### Mindmegette.hu Parser @@ -92,6 +92,8 @@ Extracts data from the sobors.hu recipe pages: - **Ingredients**: `div.hozzavalok-container` → `section` elements with `ul > li`, each containing `span.mennyiseg` (qty), `span.mertekegyseg` (unit), `span.hozzavalo` (food) - **Ingredient groups**: `section > h4` headers (e.g., "A szószhoz:", "A húsgolyókhoz:") - **Instructions**: `div.recept_leiras` → `

<p>` tags, with `<h3>

` section headers +- **Linked recipes**: Some pages link to another site (e.g. kiskegyed.hu) instead of showing full instructions. The parser detects external links in the instruction area and follows them to scrape the real recipe content. +- **Article-style ingredient fallback**: Pages without the structured `div.hozzavalok-container` are parsed from article-body `h4` + `ul > li` plain text - **Tags**: `div.cikk-cimkek > ul.cikk-cimkek-list > li > a` (skips generic "Receptek" category) ### Generic Fallback Parser diff --git a/app/scraper.py b/app/scraper.py index 27aeb2a..93578ae 100644 --- a/app/scraper.py +++ b/app/scraper.py @@ -439,7 +439,7 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict: image_url = _og(soup, "og:image") # --- Ingredients --- - # Container: div.hozzavalok-container + # Container: div.hozzavalok-container (structured recipe pages) # Groups: section > h4 (group header), section > ul > li # Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo ingredients = [] @@ -467,12 +467,28 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict: "extra": "", }) + # Fallback: article-style ingredients (h4 group headers + ul > li plain text) + # Some sobors.hu pages (especially linked recipes) use this simpler format. + if not ingredients: + article = soup.find("div", class_="cikk-torzs") or soup.find("article") + if article: + _parse_sobors_article_ingredients(article, ingredients) + # --- Instructions --- # Container: div.recept_leiras.recept_he-elkeszites # Content:

<p> tags for steps, <h3>Section</h3>

for section headers instructions = [] + linked_url = None inst_container = soup.find("div", class_="recept_leiras") if inst_container: + # Check for external link (linked recipe pattern — e.g. "click here for + # full recipe on kiskegyed.hu") + for a in inst_container.find_all("a", href=True): + href = a["href"] + if href.startswith("http") and "sobors.hu" not in href: + linked_url = href + break + for el in inst_container.find_all(["h3", "p"]): if el.name == "h3": header = el.get_text(strip=True) @@ -485,6 +501,18 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict: txt = re.sub(r"^\d+\.\s+", "", txt) instructions.append(txt) + # If instructions just contain a redirect to another site, try to follow + # the link and scrape the real recipe from there. + if linked_url and len(instructions) <= 2: + try: + linked_data = scrape(linked_url) + if linked_data.get("instructions"): + instructions = linked_data["instructions"] + if not ingredients and linked_data.get("ingredients"): + ingredients = linked_data["ingredients"] + except Exception: + pass # keep whatever we scraped from sobors.hu + # --- Tags --- # Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a # Skip the generic "Receptek" category tag and "Olvasói receptek" tag @@ -510,6 +538,42 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict: } +def _parse_sobors_article_ingredients(container, ingredients: list): + """Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text).""" + for el in container.find_all(["h4", "ul"]): + if el.name == "h4": + group_name = el.get_text(strip=True).rstrip(":") + if group_name and not group_name.lower().startswith("hozzávalók"): + ingredients.append({"group": group_name}) + elif el.name == "ul": + # Only consider lists that follow an h4 or are inside the ingredient context + prev = el.find_previous_sibling() + if prev and prev.name == "h4": + for li in el.find_all("li"): + line = li.get_text(strip=True) + if not line: + continue + qty, 
unit, food = _parse_ingredient_line(line) + ingredients.append({ + "quantity": qty, + "unit": unit, + "food": food, + "extra": "", + }) + + +def _parse_ingredient_line(line: str) -> tuple[str, str, str]: + """Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food).""" + m = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s*(\S+)\s+(.+)$", line) + if m: + return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip()) + # Just quantity + food (e.g. "2 tojás") + m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line) + if m2: + return (m2.group(1).strip(), "", m2.group(2).strip()) + return ("", "", line) + + def _split_qty_unit(raw: str) -> tuple[str, str]: """Split a merged quantity+unit string like '200g' into ('200', 'g').""" raw = raw.strip() diff --git a/app/templates/base.html b/app/templates/base.html index c0f75e8..9e9e61f 100644 --- a/app/templates/base.html +++ b/app/templates/base.html @@ -7,7 +7,7 @@ - +