v0.8.0: gastrohobbi.hu parser, fix ingredient fraction parsing

Add a gastrohobbi.hu parser (WPBakery page-builder layout): ingredients with groups, instructions with embedded lists, tags taken from the JSON-LD articleSection, and preparation-time extraction.

Fix the ingredient line parser: regex backtracking no longer splits fractions such as "1/2", en-dash/em-dash ranges are normalized to a hyphen, and a leading Unicode fraction (½ ¼ ¾) is now recognized as the start of a quantity in all parsers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 19:17:13 +01:00
parent ba5dae2caa
commit 0ec9ce0c6d
3 changed files with 197 additions and 5 deletions
@@ -78,6 +78,7 @@ def supported_sites() -> list[dict]:
        "nosalty": "https://www.nosalty.hu",
        "sobors": "https://sobors.hu",
        "kiskegyed": "https://www.kiskegyed.hu",
+        "gastrohobbi": "https://gastrohobbi.hu",
    }
    return [{"name": s + ".hu", "url": _SITE_URLS.get(s, "#")} for s, _ in _PARSERS]

@@ -682,7 +683,7 @@ def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
    # Try: qty unit (alt_measurement) food...
    # Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
    m = re.match(
-        r"^([0-9][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line
+        r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line
    )
    if m:
        qty = m.group(1).strip()
@@ -697,12 +698,12 @@ def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
        return (qty, unit, food_raw, "; ".join(extras))

    # Try: qty unit food...
-    m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
+    m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
    if m2:
        return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")

    # Try: qty food (e.g. "2 tojás")
-    m3 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
+    m3 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
    if m3:
        return (m3.group(1).strip(), "", m3.group(2).strip(), "")

@@ -710,6 +711,168 @@ def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
    return ("", "", line, "")


+# ---------------------------------------------------------------------------
+# gastrohobbi.hu
+# ---------------------------------------------------------------------------
+
+
+@_register("gastrohobbi")
+def _parse_gastrohobbi(soup: BeautifulSoup, url: str) -> dict:
+    """Extract a recipe from a gastrohobbi.hu page (WPBakery page-builder markup).

+    Args:
+        soup: Parsed HTML of the recipe page. The <em> label inside the
+            "Elkészítési idő" heading is removed from this tree as a side effect.
+        url: Address of the page; returned unchanged as ``original_url``.

+    Returns:
+        A dict with the keys ``title``, ``description``, ``image_url``,
+        ``ingredients``, ``instructions``, ``tags`` and ``original_url``.
+    """
+    # Title: h1.mpcth-post-title > span, otherwise og:title, otherwise <title>.
+    title = ""
+    title_el = soup.select_one("h1.mpcth-post-title span.mpcth-color-main-border")
+    if title_el:
+        title = title_el.get_text(strip=True)
+    if not title:
+        title = _og(soup, "og:title") or _text(soup.find("title"))
+    if title:
+        # Drop a trailing "- GastroHobbi ..." style site suffix.
+        title = re.sub(r"\s*[-–|]\s*GastroHobbi.*$", "", title, flags=re.IGNORECASE).strip()

+    # Description: first <p> in the first wpb_text_column, otherwise og:description.
+    description = ""
+    first_text_col = soup.select_one("div.wpb-content-wrapper div.wpb_text_column div.wpb_wrapper")
+    if first_text_col:
+        p = first_text_col.find("p")
+        if p:
+            description = p.get_text(strip=True)
+    if not description:
+        description = _og(soup, "og:description") or ""

+    image_url = _og(soup, "og:image")

+    # Ingredients and instructions are appended to these lists by the helpers.
+    ingredients = []
+    _gastrohobbi_parse_ingredients(soup, ingredients)
+    instructions = []
+    _gastrohobbi_parse_instructions(soup, instructions)

+    # Prep time: value of the first h3 containing "Elkészítési idő".
+    prep_time = ""
+    for h3 in soup.find_all("h3"):
+        if "elkészítési idő" not in h3.get_text(strip=True).lower():
+            continue
+        # The label normally sits in an <em><strong> wrapper with the value after
+        # it, so removing the <em> leaves only the value.
+        em = h3.find("em")
+        if em:
+            em.decompose()
+        # If the label (or just its colon) is still in the remaining text, strip it
+        # so the description below does not read "Elkészítési idő: Elkészítési idő: …".
+        prep_time = re.sub(
+            r"^\s*(?:elkészítési idő)?\s*:?\s*", "", h3.get_text(strip=True),
+            flags=re.IGNORECASE,
+        ).strip()
+        break

+    # Tags: JSON-LD Article.articleSection, minus generic site-wide sections.
+    tags = []
+    skip_tags = {"receptjeink", "receptek"}
+    for script in soup.find_all("script", type="application/ld+json"):
+        try:
+            data = json.loads(script.string or "")
+        except (json.JSONDecodeError, TypeError):
+            continue
+        graph = data.get("@graph", [data]) if isinstance(data, dict) else data
+        if not isinstance(graph, list):
+            continue
+        for item in graph:
+            if not isinstance(item, dict):
+                continue
+            types = item.get("@type")
+            # "@type" may be a single string or a list of type names.
+            if types != "Article" and not (isinstance(types, list) and "Article" in types):
+                continue
+            sections = item.get("articleSection", [])
+            # articleSection may be a single string rather than a list.
+            if isinstance(sections, str):
+                sections = [sections]
+            if isinstance(sections, list):
+                # Non-string entries are skipped instead of discarding every tag.
+                tags = [
+                    s.strip() for s in sections
+                    if isinstance(s, str) and s.strip()
+                    and s.strip().lower() not in skip_tags
+                ]
+            break

+    # Append prep time to description if available
+    if prep_time:
+        if description:
+            description += f" (Elkészítési idő: {prep_time})"
+        else:
+            description = f"Elkészítési idő: {prep_time}"

+    return {
+        "title": title or "Ismeretlen recept",
+        "description": description,
+        "image_url": image_url,
+        "ingredients": ingredients,
+        "instructions": instructions,
+        "tags": tags,
+        "original_url": url,
+    }
+
+
+def _gastrohobbi_parse_ingredients(soup: BeautifulSoup, ingredients: list):
+    """Append gastrohobbi.hu ingredient entries to *ingredients* in place.
+
+    Scanning starts at the first <h3> whose text contains "Hozzávalók" and runs
+    over the elements following it at the same level. A further <h3> becomes a
+    ``{"group": ...}`` entry, each direct <li> of a <ul> becomes a
+    quantity/unit/food/extra entry, and an <h3> containing "Elkészítés" ends
+    the scan. Nothing is appended when no "Hozzávalók" heading exists.
+    """
+    start = next(
+        (h for h in soup.find_all("h3")
+         if "hozzávalók" in h.get_text(strip=True).lower()),
+        None,
+    )
+    if start is None:
+        return
+    for node in start.find_next_siblings():
+        content = node.get_text(strip=True)
+        # Elements without any text never act as a group header or a terminator.
+        if not content:
+            continue
+        if node.name == "h3":
+            if "elkészítés" in content.lower():
+                break
+            # Any other h3 names an ingredient group, e.g. "A csipetkéhez:".
+            label = content.rstrip(":")
+            if label:
+                ingredients.append({"group": label})
+        elif node.name == "ul":
+            for item in node.find_all("li", recursive=False):
+                para = item.find("p")
+                # Use the first <p> inside the item when there is one.
+                source = item if para is None else para
+                raw = source.get_text(strip=True)
+                if not raw:
+                    continue
+                qty, unit, food = _parse_ingredient_line(raw)
+                ingredients.append(
+                    {"quantity": qty, "unit": unit, "food": food, "extra": ""}
+                )
+
+
+def _gastrohobbi_parse_instructions(soup: BeautifulSoup, instructions: list):
+    """Append gastrohobbi.hu preparation steps to *instructions* in place.
+
+    Scanning starts at the first <h3> whose text begins with "Elkészítés" and
+    does not contain "idő", and stops at the next <h3> sibling. Each non-empty
+    <p> becomes one step; the items of a <ul> are added as bullet lines.
+    Nothing is appended when no such heading exists.
+    """
+    start = None
+    for heading in soup.find_all("h3"):
+        lowered = heading.get_text(strip=True).lower()
+        if lowered.startswith("elkészítés") and "idő" not in lowered:
+            start = heading
+            break
+    if start is None:
+        return
+    for node in start.find_next_siblings():
+        kind = node.name
+        # Any following h3 (prep time or another section) ends the instructions.
+        if kind == "h3":
+            break
+        if kind == "p":
+            step = node.get_text(strip=True)
+            if step and step != "\xa0":
+                instructions.append(step)
+        elif kind == "ul":
+            # Items that only wrap a nested <ul> are skipped; the nested items
+            # themselves are still visited by the recursive search.
+            bullets = (
+                li.get_text(strip=True)
+                for li in node.find_all("li")
+                if li.find("ul") is None
+            )
+            instructions.extend(f"  • {bullet}" for bullet in bullets if bullet)
+
+
 def _parse_sobors_article_ingredients(container, ingredients: list):
    """Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text)."""
    for el in container.find_all(["h4", "ul"]):
@@ -736,11 +899,14 @@ def _parse_sobors_article_ingredients(container, ingredients: list):

 def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
-    """Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
+    """Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food).
+
+    The quantity must start with a digit or one of ½ ¼ ¾. A line with only two
+    tokens ("2 tojás") yields an empty unit. A line that does not start with a
+    quantity is returned whole as the food, with empty quantity and unit.
+    """
-    m = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s*(\S+)\s+(.+)$", line)
+    # Collapse a dash-separated numeric range ("10 – 15", "10 - 15") to "10-15" so
+    # it is captured as one quantity token. Only a dash standing between two
+    # quantity characters is rewritten; a dash inside the food text
+    # ("só – ízlés szerint") is left untouched.
+    line = re.sub(r"(?<=[0-9½¼¾])\s*[-–—]\s*(?=[0-9½¼¾])", "-", line)
+    # qty unit food (e.g. "2 dl tejföl", "½ tk őrölt kömény")
+    m = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
    if m:
        return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
    # Just quantity + food (e.g. "2 tojás")
-    m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
+    m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
    if m2:
        return (m2.group(1).strip(), "", m2.group(2).strip())
    return ("", "", line)