Add streetkitchen.hu parser with ingredient groups and multiple instruction formats

Handles three instruction layouts: ordered-list (ol) steps, unordered-list (ul) steps, and paragraph-style text. Parses merged quantity+unit strings (e.g. "200g" → qty=200, unit=g). Avoids duplicate ingredients by targeting the specific grid container, because the page renders the ingredient list twice (mobile + desktop). Tags are extracted from the JSON-LD recipeCategory field.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 16:02:39 +01:00
parent 0370fb462b
commit bef4b9978d
1 changed files with 141 additions and 1 deletions
@@ -4,6 +4,7 @@ Each supported site has a parser registered via _PARSERS.
 Unsupported sites fall back to generic schema.org / og-tag extraction.
 """

+import json
 import re
 import requests
 from bs4 import BeautifulSoup
@@ -160,6 +161,146 @@ def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    }


+# ---------------------------------------------------------------------------
+# streetkitchen.hu
+# ---------------------------------------------------------------------------
+
+
+@_register("streetkitchen")
+def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
+    """Parse a streetkitchen.hu recipe page into a recipe dict.
+
+    Returns title, description, image_url, ingredients, instructions, tags
+    and original_url. ``ingredients`` mixes ``{"group": name}`` headers with
+    quantity/unit/food/extra rows; missing sections come back as empty lists.
+    """
+    def _clean(node) -> str:
+        # Plain get_text() keeps the author's spacing, so words split across
+        # inline tags are not glued together as with get_text(strip=True).
+        return " ".join(node.get_text().split())
+
+    title = _og(soup, "og:title") or _text(soup.find("title"))
+    if title:
+        title = re.sub(r"\s*\|\s*Street Kitchen$", "", title).strip()
+
+    description = _og(soup, "og:description") or ""
+    image_url = _og(soup, "og:image")
+
+    # --- Ingredients ---
+    # Find the main ingredient grid (grid-cols-1 lg:grid-cols-2).
+    # The page renders ingredients twice (mobile + desktop); we pick the
+    # specific grid to avoid duplicates.
+    ingredients = []
+    ing_grid = None
+    for g in soup.select("div.grid"):
+        cls = " ".join(g.get("class", []))
+        if "grid-cols-1" in cls and "lg:grid-cols-2" in cls:
+            ing_grid = g
+            break
+
+    if ing_grid:
+        # Walk top-level divs — each may contain an h5 group header + rows
+        for section in ing_grid.find_all("div", recursive=False):
+            h5 = section.find("h5")
+            if h5:
+                group_name = h5.get_text(strip=True)
+                if group_name:
+                    ingredients.append({"group": group_name})
+
+            for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
+                inner = row.select_one("div.flex.items-center.gap-2")
+                if not inner:
+                    continue
+                bold = inner.find("div", class_="font-bold")
+                food = bold.get_text(strip=True) if bold else ""
+                if not food:
+                    continue
+
+                # First non-bold div is quantity+unit merged (e.g. "200g", "1fej")
+                qty_raw = extra = ""
+                for d in inner.find_all("div", recursive=False):
+                    # "is", not "==": bs4's == matches any tag with equal markup.
+                    if d is bold:
+                        continue
+                    txt = d.get_text(strip=True)
+                    if txt.startswith("(") and txt.endswith(")"):
+                        extra = txt.strip("() ")
+                    elif not qty_raw:
+                        qty_raw = txt
+
+                # Split "200g" → qty="200", unit="g"
+                qty, unit = _split_qty_unit(qty_raw)
+                ingredients.append({
+                    "quantity": qty,
+                    "unit": unit,
+                    "food": food,
+                    "extra": extra,
+                })
+
+    # --- Instructions ---
+    instructions = []
+    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
+            or soup.select_one(".recipe-preparation"))
+    if prep:
+        # An <ol> wins over a <ul>; bare paragraphs are the last resort.
+        step_list = prep.find("ol") or prep.find("ul")
+        if step_list:
+            nodes = step_list.find_all("li", recursive=False)
+        else:
+            # Paragraph-style: <p> blocks, sometimes with <strong> headers
+            nodes = prep.find_all("p")
+        instructions = [t for t in map(_clean, nodes) if t]
+
+    # If still nothing, try the description wrapper
+    if not instructions:
+        desc_article = soup.find("article", id="Streetk_content_description_wrapper")
+        if desc_article:
+            instructions = [t for t in map(_clean, desc_article.find_all("p")) if t]
+
+    # --- Tags ---
+    # Prefer recipeCategory from JSON-LD (comma-separated string or list); stop
+    # at the first Recipe node that yields tags so later scripts can't clobber it.
+    tags = []
+    for script in soup.find_all("script", type="application/ld+json"):
+        try:
+            data = json.loads(script.string or "")
+        except (json.JSONDecodeError, TypeError):
+            continue
+        graph = data.get("@graph", [data]) if isinstance(data, dict) else data
+        for item in (graph if isinstance(graph, list) else ()):
+            kind = item.get("@type") if isinstance(item, dict) else None
+            # JSON-LD allows @type to be one string or a list of types.
+            if kind == "Recipe" or (isinstance(kind, list) and "Recipe" in kind):
+                cat = item.get("recipeCategory", "")
+                cat = cat.split(",") if isinstance(cat, str) else cat
+                if isinstance(cat, list):
+                    tags = [str(t).strip() for t in cat if str(t).strip()]
+                break
+        if tags:
+            break
+
+    return {
+        "title": title or "Ismeretlen recept",
+        "description": description,
+        "image_url": image_url,
+        "ingredients": ingredients,
+        "instructions": instructions,
+        "tags": tags,
+        "original_url": url,
+    }
+
+
+def _split_qty_unit(raw: str) -> tuple[str, str]:
+    """Split merged quantity+unit text like '200g' or '1½ dl' into (qty, unit).
+
+    Text with no leading number yields ("", text); blank input yields ("", "").
+    """
+    raw = raw.strip()
+    # Quantity: digits or vulgar fractions, with spaces . , / - – (ranges) inside.
+    m = re.match(r"^([0-9¼½¾⅓⅔⅛⅜⅝⅞][0-9¼½¾⅓⅔⅛⅜⅝⅞ .,/–-]*)(.*)$", raw)
+    return (m.group(1).strip(), m.group(2).strip()) if m else ("", raw)
+
+
 # ---------------------------------------------------------------------------
 # Generic fallback (og-tags + schema.org microdata)
 # ---------------------------------------------------------------------------
@@ -177,7 +318,6 @@ def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    # Try schema.org JSON-LD
    for script in soup.find_all("script", type="application/ld+json"):
        try:
-            import json
            data = json.loads(script.string or "")
            if isinstance(data, list):
                data = data[0]