# ---------------------------------------------------------------------------
# streetkitchen.hu
# ---------------------------------------------------------------------------
# NOTE: this section relies on the module-level ``import json`` / ``import re``
# of app/scraper.py and on the scraper helpers ``_register``, ``_og`` and
# ``_text`` that are defined elsewhere in that module.

# Leading quantity (ASCII digits or vulgar fractions such as "½", optionally
# continued with spaces and the separators . , / -) followed by the unit.
# DOTALL lets the unit part span an embedded newline instead of making the
# whole match fail.
_QTY_UNIT_RE = re.compile(
    r"([0-9\u00bc-\u00be\u2150-\u215e][0-9\u00bc-\u00be\u2150-\u215e .,/-]*)(.*)",
    re.DOTALL,
)


@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Parse a streetkitchen.hu recipe page.

    Args:
        soup: Parsed HTML of the recipe page.
        url: URL the page was fetched from; returned as ``original_url``.

    Returns:
        A dict with the keys ``title``, ``description``, ``image_url``,
        ``ingredients``, ``instructions``, ``tags`` and ``original_url``.
        ``ingredients`` mixes ``{"group": name}`` header entries with
        ``{"quantity", "unit", "food", "extra"}`` entries, in page order.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Strip first so trailing whitespace cannot hide the site suffix
        # from the end-anchored pattern.
        title = re.sub(r"\s*\|\s*Street Kitchen$", "", title.strip()).strip()

    return {
        "title": title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": _streetkitchen_ingredients(soup),
        "instructions": _streetkitchen_instructions(soup),
        "tags": _streetkitchen_tags(soup),
        "original_url": url,
    }


def _streetkitchen_text(node) -> str:
    """Return the text of *node* with whitespace runs collapsed to one space.

    ``get_text(strip=True)`` strips every text fragment and joins the
    fragments with an empty separator, so ``Add <b>salt</b> and stir`` comes
    out as ``Addsaltand stir``.  Normalising the raw text instead keeps the
    word spacing that is present in the markup.
    """
    return " ".join(node.get_text().split())


def _streetkitchen_ingredients(soup: BeautifulSoup) -> list[dict]:
    """Collect ingredient groups and rows from the main ingredient grid."""
    # The page renders ingredients twice (mobile + desktop); only the grid
    # carrying both "grid-cols-1" and "lg:grid-cols-2" is read so every
    # ingredient is reported once.
    grid = None
    for candidate in soup.select("div.grid"):
        classes = " ".join(candidate.get("class", []))
        if "grid-cols-1" in classes and "lg:grid-cols-2" in classes:
            grid = candidate
            break
    if grid is None:
        return []

    ingredients = []
    # Each top-level div may hold an <h5> group header followed by rows.
    for section in grid.find_all("div", recursive=False):
        header = section.find("h5")
        if header is not None:
            group_name = _streetkitchen_text(header)
            if group_name:
                ingredients.append({"group": group_name})

        for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
            inner = row.select_one("div.flex.items-center.gap-2")
            if inner is None:
                continue
            bold = inner.find("div", class_="font-bold")
            food = _streetkitchen_text(bold) if bold is not None else ""
            if not food:
                continue

            qty_raw = ""
            extra = ""
            for cell in inner.find_all("div", recursive=False):
                # Identity check: ``==`` on bs4 tags compares markup, which
                # is not what "skip the food-name cell" means.
                if cell is bold:
                    continue
                txt = cell.get_text(strip=True)
                if txt.startswith("(") and txt.endswith(")"):
                    # Parenthesised cell is a free-text remark.
                    extra = txt.strip("() ")
                elif not qty_raw:
                    # First remaining cell is quantity+unit merged
                    # (e.g. "200g", "1fej").
                    qty_raw = txt

            qty, unit = _split_qty_unit(qty_raw)
            ingredients.append({
                "quantity": qty,
                "unit": unit,
                "food": food,
                "extra": extra,
            })
    return ingredients


def _streetkitchen_instructions(soup: BeautifulSoup) -> list[str]:
    """Extract preparation steps from the ol, ul or paragraph layout."""
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))

    steps = []
    if prep is not None:
        # An ordered list wins over an unordered one; without any list the
        # steps are plain <p> blocks (sometimes mixed with headers).
        step_list = prep.find("ol") or prep.find("ul")
        if step_list is not None:
            nodes = step_list.find_all("li", recursive=False)
        else:
            nodes = prep.find_all("p")
        steps = [txt for txt in map(_streetkitchen_text, nodes) if txt]

    if not steps:
        # Last resort: paragraphs of the description wrapper.
        desc_article = soup.find(
            "article", id="Streetk_content_description_wrapper"
        )
        if desc_article is not None:
            paragraphs = desc_article.find_all("p")
            steps = [txt for txt in map(_streetkitchen_text, paragraphs) if txt]
    return steps


def _streetkitchen_tags(soup: BeautifulSoup) -> list[str]:
    """Return the Recipe ``recipeCategory`` values found in JSON-LD.

    The first Recipe node that yields at least one non-empty category wins;
    later ``<script>`` blocks can no longer overwrite it.  Malformed JSON-LD
    is skipped.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue

        nodes = data.get("@graph", [data]) if isinstance(data, dict) else data
        if isinstance(nodes, dict):
            # "@graph" holding a single node object.
            nodes = [nodes]
        if not isinstance(nodes, list):
            continue

        for node in nodes:
            if not isinstance(node, dict):
                continue
            node_type = node.get("@type")
            # "@type" may be a single string or a list of types.
            is_recipe = node_type == "Recipe" or (
                isinstance(node_type, list) and "Recipe" in node_type
            )
            if not is_recipe:
                continue

            category = node.get("recipeCategory", "")
            if isinstance(category, str):
                parts = category.split(",")
            elif isinstance(category, list):
                parts = [str(part) for part in category]
            else:
                parts = []
            tags = [part.strip() for part in parts if part.strip()]
            if tags:
                return tags
    return []


def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g').

    Args:
        raw: Quantity text as scraped, e.g. ``"200g"``, ``"1 1/2 dl"``,
            ``"1½ dl"`` or ``"csipet"``.

    Returns:
        ``(quantity, unit)``.  Text that does not start with a digit or a
        vulgar fraction is returned whole as the unit with an empty
        quantity; blank input gives ``("", "")``.
    """
    raw = raw.strip()
    if not raw:
        return ("", "")
    m = _QTY_UNIT_RE.match(raw)
    if m:
        return (m.group(1).strip(), m.group(2).strip())
    return ("", raw)


# NOTE: the same patch also removes the function-local ``import json`` from
# ``_parse_generic`` (json is imported at module level instead).  Only a few
# context lines of that function are visible, so it is not reproduced here.