Add streetkitchen.hu parser with ingredient groups and multiple instruction formats
Handles three instruction layouts: ol steps, ul steps, and paragraph-style. Parses merged qty+unit strings (e.g. "200g" → qty=200, unit=g). Deduplicates ingredients by targeting the specific grid container. Extracts tags from the JSON-LD recipeCategory field.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+141
-1
@@ -4,6 +4,7 @@ Each supported site has a parser registered via _PARSERS.
|
||||
Unsupported sites fall back to generic schema.org / og-tag extraction.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -160,6 +161,146 @@ def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# streetkitchen.hu
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Parse a streetkitchen.hu recipe page into a recipe dict.

    Args:
        soup: Parsed HTML of the recipe page.
        url: URL of the page; returned unchanged as ``original_url``.

    Returns:
        A dict with the keys ``title``, ``description``, ``image_url``,
        ``ingredients``, ``instructions``, ``tags`` and ``original_url``.
        ``ingredients`` mixes ``{"group": name}`` header entries with
        ``{"quantity", "unit", "food", "extra"}`` rows, in page order.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Remove the trailing " | Street Kitchen" site suffix from the title.
        title = re.sub(r"\s*\|\s*Street Kitchen$", "", title).strip()

    return {
        "title": title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": _sk_ingredients(soup),
        "instructions": _sk_instructions(soup),
        "tags": _sk_tags(soup),
        "original_url": url,
    }


def _sk_text(node) -> str:
    """Return the text of *node* with whitespace runs collapsed to one space.

    ``get_text(strip=True)`` strips every text fragment separately and then
    joins the fragments with no separator, so
    ``<p><strong>Krém:</strong> keverd</p>`` would become ``"Krém:keverd"``.
    Collapsing the unstripped text keeps the spacing present in the markup.
    """
    return " ".join(node.get_text().split())


def _sk_ingredients(soup: BeautifulSoup) -> list:
    """Collect ingredient group headers and rows from the ingredient grid."""
    # The page renders ingredients twice (mobile + desktop); only the first
    # grid carrying both column classes is used, to avoid duplicates.
    grid = None
    for candidate in soup.select("div.grid"):
        classes = " ".join(candidate.get("class", []))
        if "grid-cols-1" in classes and "lg:grid-cols-2" in classes:
            grid = candidate
            break
    if grid is None:
        return []

    ingredients = []
    # Walk top-level divs: each may hold an h5 group header plus rows.
    for section in grid.find_all("div", recursive=False):
        header = section.find("h5")
        group_name = _sk_text(header) if header is not None else ""
        if group_name:
            ingredients.append({"group": group_name})

        for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
            inner = row.select_one("div.flex.items-center.gap-2")
            if inner is None:
                continue
            name_div = inner.find("div", class_="font-bold")
            food = _sk_text(name_div) if name_div is not None else ""
            if not food:
                continue

            # The first non-name div holds quantity+unit (e.g. "200g",
            # "1fej"); a div wrapped in parentheses is a free-text note.
            qty_raw = ""
            extra = ""
            for cell in inner.find_all("div", recursive=False):
                # Identity check: bs4 ``==`` compares tags structurally and
                # could skip a different div that merely looks the same.
                if cell is name_div:
                    continue
                txt = _sk_text(cell)
                if txt.startswith("(") and txt.endswith(")"):
                    extra = txt.strip("() ")
                elif not qty_raw:
                    qty_raw = txt

            qty, unit = _split_qty_unit(qty_raw)
            ingredients.append({
                "quantity": qty,
                "unit": unit,
                "food": food,
                "extra": extra,
            })
    return ingredients


def _sk_instructions(soup: BeautifulSoup) -> list:
    """Collect preparation steps from ol, ul or paragraph-style layouts."""
    steps = []
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))
    if prep is not None:
        # An ordered list wins over an unordered one; with neither present
        # the steps are taken from <p> blocks.
        listing = prep.find("ol") or prep.find("ul")
        if listing is not None:
            nodes = listing.find_all("li", recursive=False)
        else:
            nodes = prep.find_all("p")
        steps = [txt for txt in map(_sk_text, nodes) if txt]

    # If still nothing, fall back to the paragraphs of the description wrapper.
    if not steps:
        desc_article = soup.find(
            "article", id="Streetk_content_description_wrapper"
        )
        if desc_article is not None:
            steps = [
                txt for txt in map(_sk_text, desc_article.find_all("p")) if txt
            ]
    return steps


def _sk_tags(soup: BeautifulSoup) -> list:
    """Return tags from the first JSON-LD Recipe with a ``recipeCategory``.

    ``recipeCategory`` may be a comma-separated string or a list.  Scanning
    stops at the first Recipe node that yields tags, so later ``<script>``
    blocks cannot overwrite them.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue

        items = data.get("@graph", [data]) if isinstance(data, dict) else data
        if isinstance(items, dict):
            items = [items]
        if not isinstance(items, list):
            continue

        for item in items:
            if not isinstance(item, dict):
                continue
            # "@type" may be a single string or a list of type names.
            node_type = item.get("@type")
            is_recipe = node_type == "Recipe" or (
                isinstance(node_type, list) and "Recipe" in node_type
            )
            if not is_recipe:
                continue

            cat = item.get("recipeCategory", "")
            if isinstance(cat, str):
                tags = [t.strip() for t in cat.split(",") if t.strip()]
            elif isinstance(cat, list):
                tags = [str(t).strip() for t in cat if str(t).strip()]
            else:
                tags = []
            if tags:
                return tags
    return []
|
||||
|
||||
|
||||
def _split_qty_unit(raw: str) -> tuple[str, str]:
|
||||
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
|
||||
raw = raw.strip()
|
||||
if not raw:
|
||||
return ("", "")
|
||||
m = re.match(r"^([0-9][0-9 .,/-]*)(.*)$", raw)
|
||||
if m:
|
||||
return (m.group(1).strip(), m.group(2).strip())
|
||||
return ("", raw)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Generic fallback (og-tags + schema.org microdata)
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -177,7 +318,6 @@ def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
|
||||
# Try schema.org JSON-LD
|
||||
for script in soup.find_all("script", type="application/ld+json"):
|
||||
try:
|
||||
import json
|
||||
data = json.loads(script.string or "")
|
||||
if isinstance(data, list):
|
||||
data = data[0]
|
||||
|
||||
Reference in New Issue
Block a user