v0.6.1: follow linked recipes on sobors.hu, white favicon

- Sobors.hu parser: detect external links in instructions and follow them to scrape the real recipe content (e.g. recipes linked on kiskegyed.hu)
- Article-style ingredient fallback for sobors.hu pages without structured ingredient containers (h4 + ul > li plain text)
- Favicon changed to logo_notext_white.svg

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 18:18:54 +01:00
parent 45534391f0
commit baa63a43b2
4 changed files with 78 additions and 3 deletions
@@ -439,7 +439,7 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
-    # Container: div.hozzavalok-container
+    # Container: div.hozzavalok-container (structured recipe pages)
    # Groups: section > h4 (group header), section > ul > li
    # Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
    ingredients = []
@@ -467,12 +467,28 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
                    "extra": "",
                })

+    # Fallback: article-style ingredients (h4 group headers + ul > li plain text)
+    # Some sobors.hu pages (especially linked recipes) use this simpler format.
+    if not ingredients:
+        article = soup.find("div", class_="cikk-torzs") or soup.find("article")
+        if article:
+            _parse_sobors_article_ingredients(article, ingredients)
+
    # --- Instructions ---
    # Container: div.recept_leiras.recept_he-elkeszites
    # Content: <p> tags for steps, <h3><strong>Section</strong></h3> for section headers
    instructions = []
+    linked_url = None
    inst_container = soup.find("div", class_="recept_leiras")
    if inst_container:
+        # Check for external link (linked recipe pattern — e.g. "click here for
+        # full recipe on kiskegyed.hu")
+        for a in inst_container.find_all("a", href=True):
+            href = a["href"]
+            if href.startswith("http") and "sobors.hu" not in href:
+                linked_url = href
+                break
+
        for el in inst_container.find_all(["h3", "p"]):
            if el.name == "h3":
                header = el.get_text(strip=True)
@@ -485,6 +501,18 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
                    txt = re.sub(r"^\d+\.\s+", "", txt)
                    instructions.append(txt)

+    # If instructions just contain a redirect to another site, try to follow
+    # the link and scrape the real recipe from there.
+    if linked_url and len(instructions) <= 2:
+        try:
+            linked_data = scrape(linked_url)
+            if linked_data.get("instructions"):
+                instructions = linked_data["instructions"]
+            if not ingredients and linked_data.get("ingredients"):
+                ingredients = linked_data["ingredients"]
+        except Exception:
+            pass  # keep whatever we scraped from sobors.hu
+
    # --- Tags ---
    # Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
    # Skip the generic "Receptek" category tag and "Olvasói receptek" tag
@@ -510,6 +538,42 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
    }


+def _parse_sobors_article_ingredients(container, ingredients: list):
+    """Append article-style sobors.hu ingredients to *ingredients* in place.
+
+    Article pages have no structured ingredient container; ingredients are
+    plain-text ``li`` items of a ``ul`` whose previous sibling is an ``h4``.
+
+    Args:
+        container: Parsed element to search; it must provide ``find_all``.
+        ingredients: List that receives the results. A ``{"group": name}``
+            entry is added before the items of a named group, followed by
+            one ``quantity``/``unit``/``food``/``extra`` dict per item.
+
+    Returns:
+        None. The list is mutated in place.
+    """
+    for ul in container.find_all("ul"):
+        # A list only counts as ingredients when the element right before it
+        # is an h4; every other list in the container is ignored.
+        heading = ul.find_previous_sibling()
+        if heading is None or heading.name != "h4":
+            continue
+
+        lines = [li.get_text(strip=True) for li in ul.find_all("li")]
+        lines = [line for line in lines if line]
+        if not lines:
+            continue
+
+        # Emit the group header only together with its items. Adding one for
+        # every h4 in the container would leave header-only entries behind
+        # and make the list look non-empty to an ``if not ingredients`` check.
+        group_name = heading.get_text(strip=True).rstrip(":")
+        # The generic "Hozzávalók" ("ingredients") heading is not a group name.
+        if group_name and not group_name.lower().startswith("hozzávalók"):
+            ingredients.append({"group": group_name})
+
+        for line in lines:
+            qty, unit, food = _parse_ingredient_line(line)
+            ingredients.append({
+                "quantity": qty,
+                "unit": unit,
+                "food": food,
+                "extra": "",
+            })
+
+
+def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
+    """Split a plain-text ingredient line into ``(qty, unit, food)``.
+
+    Examples: ``'2 dl tejföl'`` -> ``('2', 'dl', 'tejföl')``,
+    ``'2 tojás'`` -> ``('2', '', 'tojás')``,
+    ``'½ dl tej'`` -> ``('½', 'dl', 'tej')``.
+
+    Args:
+        line: One ingredient as free text.
+
+    Returns:
+        A ``(qty, unit, food)`` tuple of strings. Parts that cannot be
+        identified are empty; a line without a leading quantity is returned
+        unchanged as ``food``.
+
+    Note:
+        When at least two words follow the quantity, the first one is always
+        taken as the unit, so ``'2 nagy tojás'`` yields unit ``'nagy'``.
+    """
+    # A quantity starts with a digit or a vulgar fraction (so "½ dl" is
+    # recognised) and may continue with separators, fractions or a range dash.
+    qty_pattern = r"[0-9½¼¾][0-9.,/½¼¾-]*"
+
+    # Quantity + unit + food; the unit may be glued to the number ("200g").
+    m = re.match(rf"^({qty_pattern})\s*(\S+)\s+(.+)$", line)
+    if m:
+        return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
+    # Just quantity + food (e.g. "2 tojás")
+    m2 = re.match(rf"^({qty_pattern})\s+(.+)$", line)
+    if m2:
+        return (m2.group(1).strip(), "", m2.group(2).strip())
+    return ("", "", line)
+
+
 def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g')."""
    raw = raw.strip()