"""Recipe scraper — parses Hungarian recipe sites into a structured dict. Each supported site has a parser registered via _PARSERS. Unsupported sites fall back to generic schema.org / og-tag extraction. """ import json import re import requests from bs4 import BeautifulSoup _HEADERS = { "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)", "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5", } # Maps a substring of the hostname to a parser function. # Order matters: first match wins. _PARSERS: list[tuple[str, "callable"]] = [] def _register(host_substring: str): """Decorator: register a parser for URLs whose hostname contains *host_substring*.""" def decorator(fn): _PARSERS.append((host_substring, fn)) return fn return decorator # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def scrape(url: str) -> dict: """Fetch *url* and return a recipe dict. Returns:: { "title": str, "description": str, "image_url": str | None, "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...], "instructions": [str, ...], "tags": [str, ...], "original_url": str, } Raises ValueError on unsupported sites or parse failures. """ resp = requests.get(url, headers=_HEADERS, timeout=30) resp.raise_for_status() resp.encoding = resp.apparent_encoding or "utf-8" soup = BeautifulSoup(resp.text, "lxml") host = _host(url) result = None for substring, parser in _PARSERS: if substring in host: result = parser(soup, url) break if result is None: # Fallback: try generic schema.org / og-tag extraction result = _parse_generic(soup, url) # Post-process: extract parenthesized comments from food into extra _extract_ingredient_comments(result) # Strip trailing "recept*" from title (e.g. 
"receptje", "recept") title = result.get("title", "") if title: result["title"] = re.sub(r"\s+recept\w*$", "", title, flags=re.IGNORECASE).strip() return result def supported_sites() -> list[dict]: """Return list of supported sites with name and URL.""" _SITE_URLS = { "mindmegette": "https://www.mindmegette.hu", "streetkitchen": "https://streetkitchen.hu", "nosalty": "https://www.nosalty.hu", "sobors": "https://sobors.hu", "kiskegyed": "https://www.kiskegyed.hu", "gastrohobbi": "https://gastrohobbi.hu", } return [{"name": s + ".hu", "url": _SITE_URLS.get(s, "#")} for s, _ in _PARSERS] # --------------------------------------------------------------------------- # mindmegette.hu # --------------------------------------------------------------------------- @_register("mindmegette") def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict: # Prefer h1 (clean meal name) over og:title (often has "receptje" suffix) h1 = soup.find("h1") title = _text(h1) if h1 else "" if not title: title = _og(soup, "og:title") or _text(soup.find("title")) # Strip " | Mindmegette.hu" or " - Mindmegette.hu" suffix if title: title = re.sub(r"\s*[-–|]\s*Mindmegette\.hu$", "", title).strip() description = _og(soup, "og:description") or "" image_url = _og(soup, "og:image") # --- Ingredients --- # Format A (regular /recept/ pages): div.ingredients containers with structured rows # Format B (alt /alapetelek/ pages): h3 "Hozzávalók" →
blocks, sometimes with headers
for p in prep.find_all("p"):
txt = p.get_text(strip=True)
if txt:
instructions.append(txt)
# If still nothing, try the description wrapper
if not instructions:
desc_article = soup.find("article", id="Streetk_content_description_wrapper")
if desc_article:
for p in desc_article.find_all("p"):
txt = p.get_text(strip=True)
if txt:
instructions.append(txt)
# --- Tags ---
tags = []
# Prefer recipeCategory from JSON-LD (comma-separated)
for script in soup.find_all("script", type="application/ld+json"):
try:
data = json.loads(script.string or "")
graph = data.get("@graph", [data]) if isinstance(data, dict) else data
for item in graph:
if isinstance(item, dict) and item.get("@type") == "Recipe":
cat = item.get("recipeCategory", "")
if isinstance(cat, str) and cat:
tags = [t.strip() for t in cat.split(",") if t.strip()]
elif isinstance(cat, list):
tags = [str(t).strip() for t in cat if str(t).strip()]
break
except (json.JSONDecodeError, TypeError, AttributeError):
continue
return {
"title": title or "Ismeretlen recept",
"description": description,
"image_url": image_url,
"ingredients": ingredients,
"instructions": instructions,
"tags": tags,
"original_url": url,
}
# ---------------------------------------------------------------------------
# nosalty.hu
# ---------------------------------------------------------------------------
@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Build a recipe dict from a nosalty.hu recipe page.

    The title and image come from og: meta tags (the title falls back to
    the <title> element), the description is the joined paragraphs of
    div#recipe-story, and ingredients, instructions and tags are read from
    their dedicated containers.  A container that is missing from the page
    simply leaves the corresponding field empty.
    """
    # Title: cut everything from the first "|" onward.
    raw_title = _og(soup, "og:title") or _text(soup.find("title"))
    title = re.sub(r"\s*\|.*$", "", raw_title).strip() if raw_title else raw_title

    # Description: nosalty has no dedicated description, so use the story.
    description = ""
    story_box = soup.find("div", id="recipe-story")
    if story_box:
        story_texts = (p.get_text(strip=True) for p in story_box.find_all("p"))
        description = " ".join(chunk for chunk in story_texts if chunk)

    image_url = _og(soup, "og:image")

    # Ingredients: restricted to div#ingredients to avoid per-serving /
    # nutrition duplicates.  h3.m-list__title marks a group header,
    # ul.m-list__list holds the ingredient rows.
    ingredients = []
    ingredient_box = soup.find("div", id="ingredients")
    if ingredient_box:
        for node in ingredient_box.find_all(["h3", "ul"]):
            classes = node.get("class") or []
            if node.name == "h3":
                if "m-list__title" in classes:
                    heading = node.get_text(strip=True)
                    if heading:
                        ingredients.append({"group": heading})
            elif node.name == "ul" and "m-list__list" in classes:
                for row in node.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(row, ingredients)

    # Instructions: div#select; h4.m-list__title is a section header and
    # ol.m-list__list holds the steps.
    instructions = []
    steps_box = soup.find("div", id="select")
    if steps_box:
        for node in steps_box.find_all(["h4", "ol"]):
            classes = node.get("class") or []
            if node.name == "h4":
                if "m-list__title" in classes:
                    heading = node.get_text(strip=True)
                    if heading:
                        instructions.append(f"--- {heading} ---")
            elif node.name == "ol" and "m-list__list" in classes:
                step_texts = (
                    row.get_text(strip=True)
                    for row in node.find_all("li", class_="m-list__item")
                )
                instructions.extend(step for step in step_texts if step)

    # Tags: restricted to div.p-recipe__attributeList to avoid site-wide
    # SEO tags.
    tags = []
    tag_box = soup.find("div", class_="p-recipe__attributeList")
    if tag_box:
        labels = (
            link.get_text(strip=True)
            for link in tag_box.find_all("a", class_="m-tags__tagItem")
        )
        tags = [label for label in labels if label]

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
def _parse_nosalty_ingredient(li, ingredients: list):
"""Parse a single nosalty ingredient tags for steps, Name: A ...hez in the first wpb_text_column before the inner recipe row
description = ""
first_text_col = soup.select_one("div.wpb-content-wrapper div.wpb_text_column div.wpb_wrapper")
if first_text_col:
p = first_text_col.find("p")
if p:
description = p.get_text(strip=True)
if not description:
description = _og(soup, "og:description") or ""
image_url = _og(soup, "og:image")
# --- Ingredients ---
# Find h3 containing "Hozzávalók" then walk siblings for ul and group h3 elements
ingredients = []
_gastrohobbi_parse_ingredients(soup, ingredients)
# --- Instructions ---
# Find h3 containing "Elkészítés:" then collect following elements
instructions = []
prep_time = ""
_gastrohobbi_parse_instructions(soup, instructions)
# Extract prep time from h3 containing "Elkészítési idő:"
for h3 in soup.find_all("h3"):
text = h3.get_text(strip=True)
if "elkészítési idő" in text.lower():
# Text after the tag: "Elkészítési idő: 60 perc"
# The time part is outside the wrapper
em = h3.find("em")
if em:
em.decompose()
time_text = h3.get_text(strip=True).strip()
if time_text:
prep_time = time_text
break
# --- Tags ---
# From JSON-LD Article.articleSection
tags = []
skip_tags = {"receptjeink", "receptek"}
for script in soup.find_all("script", type="application/ld+json"):
try:
data = json.loads(script.string or "")
graph = data.get("@graph", [data]) if isinstance(data, dict) else data
for item in graph:
if isinstance(item, dict) and item.get("@type") == "Article":
sections = item.get("articleSection", [])
if isinstance(sections, list):
tags = [s.strip() for s in sections
if s.strip() and s.strip().lower() not in skip_tags]
break
except (json.JSONDecodeError, TypeError, AttributeError):
continue
# Append prep time to description if available
if prep_time:
if description:
description += f" (Elkészítési idő: {prep_time})"
else:
description = f"Elkészítési idő: {prep_time}"
return {
"title": title or "Ismeretlen recept",
"description": description,
"image_url": image_url,
"ingredients": ingredients,
"instructions": instructions,
"tags": tags,
"original_url": url,
}
def _gastrohobbi_parse_ingredients(soup: BeautifulSoup, ingredients: list):
    """Append gastrohobbi.hu ingredients (and group headers) to *ingredients*.

    Locates the first h3 whose text contains "hozzávalók", then walks its
    following siblings: an h3 containing "elkészítés" ends the walk, any
    other h3 becomes a ``{"group": ...}`` entry, and each direct <li> of a
    <ul> is parsed into a quantity/unit/food/extra dict.  Does nothing when
    no such header exists.
    """
    header = next(
        (
            candidate
            for candidate in soup.find_all("h3")
            if "hozzávalók" in candidate.get_text(strip=True).lower()
        ),
        None,
    )
    if not header:
        return

    for node in header.find_next_siblings():
        content = node.get_text(strip=True)
        if not content:
            continue

        if node.name == "h3":
            # The preparation section marks the end of the ingredient area.
            if "elkészítés" in content.lower():
                break
            # Any other h3 is a group header, e.g. "A csipetkéhez:".
            label = content.rstrip(":")
            if label:
                ingredients.append({"group": label})
        elif node.name == "ul":
            for item in node.find_all("li", recursive=False):
                # Prefer the text of an inner <p> when the <li> has one.
                paragraph = item.find("p")
                source = paragraph if paragraph else item
                line = source.get_text(strip=True)
                if not line:
                    continue
                qty, unit, food = _parse_ingredient_line(line)
                ingredients.append({
                    "quantity": qty, "unit": unit, "food": food, "extra": "",
                })
def _gastrohobbi_parse_instructions(soup: BeautifulSoup, instructions: list):
    """Append gastrohobbi.hu preparation steps to *instructions*.

    The section header is the first h3 whose text starts with "elkészítés"
    and does not contain "idő" (which would make it the prep-time header).
    Its following siblings are consumed until the next h3: each non-empty
    <p> becomes a step, and each leaf <li> of a <ul> becomes a bulleted
    step.  Does nothing when no such header exists.
    """
    header = None
    for candidate in soup.find_all("h3"):
        lowered = candidate.get_text(strip=True).lower()
        if lowered.startswith("elkészítés") and "idő" not in lowered:
            header = candidate
            break
    if not header:
        return

    for node in header.find_next_siblings():
        kind = node.name
        # Any further h3 (prep time or another section) ends the steps.
        if kind == "h3":
            break

        if kind == "p":
            content = node.get_text(strip=True)
            # Ignore empty / non-breaking-space-only paragraphs.
            if content and content != "\xa0":
                instructions.append(content)
        elif kind == "ul":
            # Embedded list inside the instructions (e.g. cooking time options).
            for item in node.find_all("li"):
                # A wrapper <li> holding a nested list is skipped; its
                # inner items are visited on their own.
                if item.find("ul"):
                    continue
                entry = item.get_text(strip=True)
                if entry:
                    instructions.append(f" • {entry}")
def _parse_sobors_article_ingredients(container, ingredients: list):
    """Append article-style sobors.hu ingredients to *ingredients*.

    Scans the h4 and ul elements under *container*.  An h4 becomes a
    ``{"group": ...}`` entry unless its text starts with "hozzávalók"; a ul
    is only read when its previous sibling is an h4, and each of its
    non-empty <li> texts is parsed into a quantity/unit/food/extra dict.
    """
    for node in container.find_all(["h4", "ul"]):
        if node.name == "h4":
            label = node.get_text(strip=True).rstrip(":")
            is_generic_heading = label.lower().startswith("hozzávalók")
            if label and not is_generic_heading:
                ingredients.append({"group": label})
            continue

        if node.name != "ul":
            continue

        # Only lists that directly follow an h4 count as ingredient lists.
        before = node.find_previous_sibling()
        if not (before and before.name == "h4"):
            continue

        for item in node.find_all("li"):
            line = item.get_text(strip=True)
            if line:
                qty, unit, food = _parse_ingredient_line(line)
                ingredients.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": "",
                })
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
    """Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food).

    The line must start with a quantity (ASCII digits and/or ½ ¼ ¾,
    optionally followed by ``. , / -`` characters) to be split:

    * ``"2 dl tejföl"``  -> ``("2", "dl", "tejföl")``
    * ``"2 tojás"``      -> ``("2", "", "tojás")``
    * anything else      -> ``("", "", line)``

    Numeric ranges written with a hyphen, en dash or em dash and optional
    surrounding spaces ("10 – 15", "10 - 15") are collapsed to "10-15"
    before matching.  Dashes that are not between two numbers are left
    untouched so free text such as "só – ízlés szerint" is not altered.
    """
    line = line.strip()

    # Collapse only dashes that sit between two numeric characters; a
    # blanket replacement would also rewrite dashes inside the food text.
    line = re.sub(r"(?<=[0-9½¼¾])\s*[-–—]\s*(?=[0-9½¼¾])", "-", line)

    # qty unit food (e.g. "2 dl tejföl", "½ tk őrölt kömény")
    m = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
    if m:
        return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())

    # Just quantity + food (e.g. "2 tojás")
    m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
    if m2:
        return (m2.group(1).strip(), "", m2.group(2).strip())

    return ("", "", line)
def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g').

    The quantity is the leading run of ASCII digits, the fractions ½ ¼ ¾,
    spaces and ``. , / -`` characters; whatever follows is the unit.  Both
    parts are returned stripped.  An empty or whitespace-only input gives
    ``("", "")``; input that does not start with a number gives
    ``("", raw)``.

    En/em dashes between two numbers ("1–2 dl") are normalised to a hyphen
    first so the whole range stays in the quantity.
    """
    raw = raw.strip()
    if not raw:
        return ("", "")

    # Keep numeric ranges intact: "1 – 2 dl" -> "1-2 dl".
    raw = re.sub(r"(?<=[0-9½¼¾])\s*[–—]\s*(?=[0-9½¼¾])", "-", raw)

    # Same numeric alphabet as _parse_ingredient_line, so "½ tk" and
    # "2½dl" split the same way a digit-only quantity does.
    m = re.match(r"^([0-9½¼¾][0-9½¼¾ .,/-]*)(.*)$", raw)
    if m:
        return (m.group(1).strip(), m.group(2).strip())
    return ("", raw)
# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org microdata)
# ---------------------------------------------------------------------------
def _generic_jsonld_nodes(data):
    """Yield every dict node of a parsed JSON-LD payload, including @graph members."""
    for node in data if isinstance(data, list) else [data]:
        if not isinstance(node, dict):
            continue
        yield node
        graph = node.get("@graph")
        if isinstance(graph, list):
            yield from (member for member in graph if isinstance(member, dict))


def _generic_jsonld_is_recipe(node: dict) -> bool:
    """Return True when the node's @type is "Recipe" or a list containing it."""
    kind = node.get("@type")
    return kind == "Recipe" or (isinstance(kind, list) and "Recipe" in kind)


def _generic_jsonld_steps(raw) -> list[str]:
    """Flatten a recipeInstructions value into a list of non-empty step strings."""
    if isinstance(raw, str):
        # A single text blob: one step per non-blank line.
        return [line.strip() for line in raw.splitlines() if line.strip()]
    if isinstance(raw, dict):
        raw = [raw]
    if not isinstance(raw, list):
        return []

    steps: list[str] = []
    for item in raw:
        if isinstance(item, str):
            if item.strip():
                steps.append(item.strip())
        elif isinstance(item, dict):
            nested = item.get("itemListElement")
            if nested:
                # Section wrapper: emit its name in the same "--- name ---"
                # form the site-specific parsers use, then its steps.
                name = item.get("name")
                if isinstance(name, str) and name.strip():
                    steps.append(f"--- {name.strip()} ---")
                steps.extend(_generic_jsonld_steps(nested))
            else:
                text = item.get("text")
                if isinstance(text, str) and text.strip():
                    steps.append(text.strip())
    return steps


def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Best-effort recipe extraction for sites without a dedicated parser.

    Title, description and image come from og: meta tags (the title falls
    back to <title>, then to "Ismeretlen recept").  Ingredients,
    instructions and tags come from the first schema.org ``Recipe`` node
    found in any ``application/ld+json`` script, whether it is the
    top-level object, an element of a top-level list, or a member of an
    ``@graph`` array.  Scripts that are not valid JSON are skipped.

    Ingredient lines are stored unparsed in ``food`` with empty
    ``quantity`` / ``unit`` / ``extra``.  Returns the same dict shape as the
    site-specific parsers; the list fields stay empty when no Recipe node
    is present.
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    ingredients: list[dict] = []
    instructions: list[str] = []
    tags: list[str] = []

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue

        recipe = next(
            (
                node
                for node in _generic_jsonld_nodes(data)
                if _generic_jsonld_is_recipe(node)
            ),
            None,
        )
        if recipe is None:
            continue

        # recipeIngredient may be a list or a single string; anything else
        # is ignored rather than iterated blindly.
        raw_ingredients = recipe.get("recipeIngredient") or []
        if isinstance(raw_ingredients, str):
            raw_ingredients = [raw_ingredients]
        elif not isinstance(raw_ingredients, list):
            raw_ingredients = []
        for line in raw_ingredients:
            if isinstance(line, str) and line.strip():
                ingredients.append({
                    "quantity": "", "unit": "", "food": line.strip(), "extra": "",
                })

        instructions.extend(_generic_jsonld_steps(recipe.get("recipeInstructions")))

        # Keywords: comma-separated string or list of values.
        kw = recipe.get("keywords", "")
        if isinstance(kw, str):
            tags = [k.strip() for k in kw.split(",") if k.strip()]
        elif isinstance(kw, list):
            tags = [str(k).strip() for k in kw if str(k).strip()]

        # Only the first Recipe node on the page is used.
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _extract_ingredient_comments(data: dict):
    """Split a trailing "(comment)" off each ingredient's food into its extra.

    Works in place on ``data["ingredients"]`` and returns nothing.  Entries
    carrying a "group" key are skipped, as are ingredients whose food is
    empty or whose extra is already filled.
    """
    for entry in data.get("ingredients", []):
        if "group" in entry:
            continue

        name = entry.get("food", "")
        if not name or entry.get("extra", ""):
            continue

        if hit := re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", name):
            base, comment = hit.groups()
            entry["food"] = base.strip()
            entry["extra"] = comment.strip()
def _host(url: str) -> str:
    """Return the hostname part of *url*, or "" when it has none."""
    import urllib.parse

    hostname = urllib.parse.urlparse(url).hostname
    return hostname if hostname else ""
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the content of the first <meta property=*prop*> tag.

    Gives None when no such tag exists or its content attribute is missing
    or empty.
    """
    meta = soup.find("meta", property=prop)
    if not meta:
        return None
    content = meta.get("content")
    return meta["content"] if content else None
def _text(el) -> str:
    """Return the whitespace-stripped text of *el*, or "" when *el* is None."""
    return "" if el is None else el.get_text(strip=True)
Section
for section headers
instructions = []
linked_url = None
inst_container = soup.find("div", class_="recept_leiras")
if inst_container:
# Check for external link (linked recipe pattern — e.g. "click here for
# full recipe on kiskegyed.hu")
for a in inst_container.find_all("a", href=True):
href = a["href"]
if href.startswith("http") and "sobors.hu" not in href:
linked_url = href
break
for el in inst_container.find_all(["h3", "p"]):
if el.name == "h3":
header = el.get_text(strip=True)
if header:
instructions.append(f"--- {header} ---")
elif el.name == "p":
txt = el.get_text(strip=True)
if txt:
# Strip leading numbering like "1. " from reader recipes
txt = re.sub(r"^\d+\.\s+", "", txt)
instructions.append(txt)
# If instructions just contain a redirect to another site, try to follow
# the link and scrape the real recipe from there.
if linked_url and len(instructions) <= 2:
try:
linked_data = scrape(linked_url)
if linked_data.get("instructions"):
instructions = linked_data["instructions"]
if not ingredients and linked_data.get("ingredients"):
ingredients = linked_data["ingredients"]
except Exception:
pass # keep whatever we scraped from sobors.hu
# --- Tags ---
# Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
# Skip the generic "Receptek" category tag and "Olvasói receptek" tag
tags = []
tag_container = soup.find("div", class_="cikk-cimkek")
if tag_container:
tag_list = tag_container.find("ul", class_="cikk-cimkek-list")
if tag_list:
skip = {"receptek", "olvasói receptek"}
for a in tag_list.find_all("a"):
tag_text = a.get_text(strip=True)
if tag_text and tag_text.lower() not in skip:
tags.append(tag_text)
return {
"title": title or "Ismeretlen recept",
"description": description,
"image_url": image_url,
"ingredients": ingredients,
"instructions": instructions,
"tags": tags,
"original_url": url,
}
# ---------------------------------------------------------------------------
# kiskegyed.hu
# ---------------------------------------------------------------------------
@_register("kiskegyed")
def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
# Title: h2 inside the detail section
title = ""
h2 = soup.find("h2")
if h2:
title = h2.get_text(strip=True)
if not title:
title = _og(soup, "og:title") or _text(soup.find("title"))
if title:
title = re.sub(r"\s*[-–|]\s*Kiskegyed.*$", "", title, flags=re.IGNORECASE).strip()
# Description: section#leadText > p
description = ""
lead = soup.find("section", id="leadText")
if lead:
p = lead.find("p")
if p:
description = p.get_text(strip=True)
if not description:
description = _og(soup, "og:description") or ""
image_url = _og(soup, "og:image")
# --- Ingredients ---
# Container: div.recipe_ingredients
# Groups: