"""Recipe scraper — parses Hungarian recipe sites into a structured dict. Each supported site has a parser registered via _PARSERS. Unsupported sites fall back to generic schema.org / og-tag extraction. """ import json import re import requests from bs4 import BeautifulSoup _HEADERS = { "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)", "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5", } # Maps a substring of the hostname to a parser function. # Order matters: first match wins. _PARSERS: list[tuple[str, "callable"]] = [] def _register(host_substring: str): """Decorator: register a parser for URLs whose hostname contains *host_substring*.""" def decorator(fn): _PARSERS.append((host_substring, fn)) return fn return decorator # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def scrape(url: str) -> dict: """Fetch *url* and return a recipe dict. Returns:: { "title": str, "description": str, "image_url": str | None, "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...], "instructions": [str, ...], "tags": [str, ...], "original_url": str, } Raises ValueError on unsupported sites or parse failures. 
""" resp = requests.get(url, headers=_HEADERS, timeout=30) resp.raise_for_status() resp.encoding = resp.apparent_encoding or "utf-8" soup = BeautifulSoup(resp.text, "lxml") host = _host(url) for substring, parser in _PARSERS: if substring in host: return parser(soup, url) # Fallback: try generic schema.org / og-tag extraction return _parse_generic(soup, url) def supported_sites() -> list[str]: """Return list of supported site hostname substrings.""" return [s for s, _ in _PARSERS] # --------------------------------------------------------------------------- # mindmegette.hu # --------------------------------------------------------------------------- @_register("mindmegette") def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict: title = _og(soup, "og:title") or _text(soup.find("title")) # Strip " | Mindmegette.hu" suffix if title: title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip() description = _og(soup, "og:description") or "" image_url = _og(soup, "og:image") # --- Ingredients --- # Multiple div.ingredients containers may exist (one per group). 
# Group title: A habaráshoz: ingredients = [] for ing_container in soup.find_all("div", class_="ingredients"): # Check for a group title group_el = ing_container.find("strong", class_="ingredients-group") group_name = _text(group_el).rstrip(":").strip() if group_el else "" if group_name: ingredients.append({"group": group_name}) for row in ing_container.find_all("div", class_="ingredients-meta"): # Actual HTML: qty unit # name (extra) qty_el = row.find("strong") unit_el = None for sp in row.find_all("span"): if not sp.get("class"): unit_el = sp break name_el = row.find("a", class_="ingredients-link") extra_el = row.find("small") or row.find("span", class_="extra") qty = _text(qty_el) unit = _text(unit_el) food = _text(name_el) extra = _text(extra_el).strip("() ") if not food: # Fallback: grab whole row text food = row.get_text(separator=" ", strip=True) if food: ingredients.append({ "quantity": qty, "unit": unit, "food": food, "extra": extra, }) # --- Instructions --- instructions = [] wysiwyg = soup.find("mindmegette-wysiwyg-box") if wysiwyg: for li in wysiwyg.find_all("li"): txt = _text(li) if txt: instructions.append(txt) # Fallback: look for block-content divs if not instructions: for div in soup.find_all("div", class_="block-content"): ol = div.find("ol") if ol: for li in ol.find_all("li"): txt = _text(li) if txt: instructions.append(txt) # --- Tags --- tags = [] tag_wrapper = soup.select_one("div.desktop-wrapper") if tag_wrapper: for a in tag_wrapper.select("a.tag"): tag_text = a.get_text(strip=True) if tag_text: tags.append(tag_text) return { "title": title or "Ismeretlen recept", "description": description, "image_url": image_url, "ingredients": ingredients, "instructions": instructions, "tags": tags, "original_url": url, } # --------------------------------------------------------------------------- # streetkitchen.hu # --------------------------------------------------------------------------- @_register("streetkitchen") def _parse_streetkitchen(soup: 
BeautifulSoup, url: str) -> dict: title = _og(soup, "og:title") or _text(soup.find("title")) if title: title = re.sub(r"\s*\|\s*Street Kitchen$", "", title).strip() description = _og(soup, "og:description") or "" image_url = _og(soup, "og:image") # --- Ingredients --- # Find the main ingredient grid (grid-cols-1 lg:grid-cols-2). # The page renders ingredients twice (mobile + desktop); we pick the # specific grid to avoid duplicates. ingredients = [] ing_grid = None for g in soup.select("div.grid"): cls = " ".join(g.get("class", [])) if "grid-cols-1" in cls and "lg:grid-cols-2" in cls: ing_grid = g break if ing_grid: # Walk top-level divs — each may contain an h5 group header + rows for section in ing_grid.find_all("div", recursive=False): h5 = section.find("h5") if h5: group_name = h5.get_text(strip=True) if group_name: ingredients.append({"group": group_name}) for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"): inner = row.select_one("div.flex.items-center.gap-2") if not inner: continue divs = inner.find_all("div", recursive=False) bold = inner.find("div", class_="font-bold") food = bold.get_text(strip=True) if bold else "" if not food: continue # First non-bold div is quantity+unit merged (e.g. "200g", "1fej") qty_raw = "" extra = "" for d in divs: if d == bold: continue txt = d.get_text(strip=True) if txt.startswith("(") and txt.endswith(")"): extra = txt.strip("() ") elif not qty_raw: qty_raw = txt # Split "200g" → qty="200", unit="g" qty, unit = _split_qty_unit(qty_raw) # Extract parenthesised note from inside food name # e.g. 
"fehérborecet (ízlés szerint)" → food="fehérborecet", extra="ízlés szerint" if not extra: m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food) if m: food = m.group(1).strip() extra = m.group(2).strip() ingredients.append({ "quantity": qty, "unit": unit, "food": food, "extra": extra, }) # --- Instructions --- instructions = [] prep = (soup.find("div", id="Streetk_content_preparation_wrapper") or soup.select_one(".recipe-preparation")) if prep: ol = prep.find("ol") ul = prep.find("ul") if ol: for li in ol.find_all("li", recursive=False): txt = li.get_text(strip=True) if txt: instructions.append(txt) elif ul: for li in ul.find_all("li", recursive=False): txt = li.get_text(strip=True) if txt: instructions.append(txt) else: # Paragraph-style:
blocks, sometimes with headers
for p in prep.find_all("p"):
txt = p.get_text(strip=True)
if txt:
instructions.append(txt)
# If still nothing, try the description wrapper
if not instructions:
desc_article = soup.find("article", id="Streetk_content_description_wrapper")
if desc_article:
for p in desc_article.find_all("p"):
txt = p.get_text(strip=True)
if txt:
instructions.append(txt)
# --- Tags ---
tags = []
# Prefer recipeCategory from JSON-LD (comma-separated)
for script in soup.find_all("script", type="application/ld+json"):
try:
data = json.loads(script.string or "")
graph = data.get("@graph", [data]) if isinstance(data, dict) else data
for item in graph:
if isinstance(item, dict) and item.get("@type") == "Recipe":
cat = item.get("recipeCategory", "")
if isinstance(cat, str) and cat:
tags = [t.strip() for t in cat.split(",") if t.strip()]
elif isinstance(cat, list):
tags = [str(t).strip() for t in cat if str(t).strip()]
break
except (json.JSONDecodeError, TypeError, AttributeError):
continue
return {
"title": title or "Ismeretlen recept",
"description": description,
"image_url": image_url,
"ingredients": ingredients,
"instructions": instructions,
"tags": tags,
"original_url": url,
}
# ---------------------------------------------------------------------------
# nosalty.hu
# ---------------------------------------------------------------------------
@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Parse a nosalty.hu recipe page into the recipe dict.

    The title comes from og:title (falling back to the <title> element) with
    everything from the first "|" onwards removed. The description is built
    from the paragraphs of div#recipe-story. Ingredients, instructions and
    tags are read from their dedicated containers. *url* is only echoed as
    ``original_url``.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*\|.*$", "", title).strip()

    # Story as description (no dedicated description on nosalty)
    # Paragraphs are free-form prose that may contain inline tags;
    # get_text(strip=True) would glue the pieces together without spaces,
    # so take the raw text and collapse whitespace runs instead.
    description = ""
    story = soup.find("div", id="recipe-story")
    if story:
        paragraphs = [" ".join(p.get_text().split()) for p in story.find_all("p")]
        description = " ".join(text for text in paragraphs if text)

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Scoped to div#ingredients to avoid per-serving / nutrition duplicates.
    # Structure: h3.m-list__title = group header, ul.m-list__list = ingredient rows.
    ingredients = []
    ing_container = soup.find("div", id="ingredients")
    if ing_container:
        for el in ing_container.find_all(["h3", "ul"]):
            cls = el.get("class") or []
            if el.name == "h3" and "m-list__title" in cls:
                group_name = el.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})
            elif el.name == "ul" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    # The helper receives the shared list; it is expected to
                    # add the parsed row to it (defined elsewhere — confirm).
                    _parse_nosalty_ingredient(li, ingredients)

    # --- Instructions ---
    # Container: div#select inside div.p-recipe__directions.
    # h4.m-list__title = section header, ol.m-list__list = steps.
    instructions = []
    dir_container = soup.find("div", id="select")
    if dir_container:
        for el in dir_container.find_all(["h4", "ol"]):
            cls = el.get("class") or []
            if el.name == "h4" and "m-list__title" in cls:
                section_name = el.get_text(strip=True)
                if section_name:
                    # Section headers are kept inline as marker entries.
                    instructions.append(f"--- {section_name} ---")
            elif el.name == "ol" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    # Whitespace-normalised text keeps words separated
                    # across inline tags inside a step.
                    txt = " ".join(li.get_text().split())
                    if txt:
                        instructions.append(txt)

    # --- Tags ---
    # Scoped to div.p-recipe__attributeList to avoid site-wide SEO tags.
    tags = []
    attr_list = soup.find("div", class_="p-recipe__attributeList")
    if attr_list:
        for a in attr_list.find_all("a", class_="m-tags__tagItem"):
            tag_text = a.get_text(strip=True)
            if tag_text:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
def _parse_nosalty_ingredient(li, ingredients: list):
"""Parse a single nosalty ingredient