# Commit e922822286
# Mindmegette regular pages: use the h1 element (clean meal name like
# "Sajtkrémes csirkés leves") instead of og:title (which has a "receptje"
# suffix). Also add global post-processing to strip a trailing
# recept/receptje/receptek from titles across all parsers.
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 1058 lines, 38 KiB, Python
"""Recipe scraper — parses Hungarian recipe sites into a structured dict.

Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""
|
||
|
||
import json
import re
import threading

import requests
from bs4 import BeautifulSoup
|
||
|
||
# HTTP headers sent with every page fetch: identify the scraper and ask for
# Hungarian content first.
_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}

# Maps a substring of the hostname to a parser function.
# Order matters: first match wins.
# Each parser is called as ``parser(soup, url)`` and returns the recipe dict.
_PARSERS: list[tuple[str, "Callable[[BeautifulSoup, str], dict]"]] = []
|
||
|
||
|
||
def _register(host_substring: str):
    """Build a decorator that files a parser under *host_substring*.

    The decorated function is appended to ``_PARSERS`` as a
    ``(host_substring, function)`` pair and handed back unchanged, so the
    registration order is the order of the decorated definitions.
    """

    def _remember(parser):
        entry = (host_substring, parser)
        _PARSERS.append(entry)
        return parser

    return _remember
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Public API
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
# URLs whose scrape() call is still running on the current thread.  Some
# parsers call scrape() again to follow a link to another recipe site; this
# lets scrape() refuse link cycles (site A -> site B -> site A ...).
_ACTIVE_SCRAPES = threading.local()

# Upper bound on nested scrape() calls (the top-level call counts as one).
_MAX_LINK_DEPTH = 3


def scrape(url: str) -> dict:
    """Fetch *url* and return a recipe dict.

    The first parser in ``_PARSERS`` whose key is a substring of the URL's
    hostname handles the page; any other site goes through the generic
    schema.org / og-tag fallback.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    Raises:
        requests.RequestException: the page could not be fetched or the
            server answered with an error status.
        ValueError: *url* is already being scraped further up the call chain
            on this thread, or linked recipes are nested too deeply.
    """
    active = getattr(_ACTIVE_SCRAPES, "urls", None)
    if active is None:
        active = _ACTIVE_SCRAPES.urls = []
    if url in active or len(active) >= _MAX_LINK_DEPTH:
        raise ValueError(
            f"Refusing to follow recipe link (cycle or nesting too deep): {url}"
        )

    active.append(url)
    try:
        resp = requests.get(url, headers=_HEADERS, timeout=30)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")

        host = _host(url)
        result = None
        for substring, parser in _PARSERS:
            if substring in host:
                result = parser(soup, url)
                break

        if result is None:
            # Fallback: try generic schema.org / og-tag extraction
            result = _parse_generic(soup, url)

        # Post-process: extract parenthesized comments from food into extra
        _extract_ingredient_comments(result)

        # Strip trailing "recept*" from title (e.g. "receptje", "recept").
        # The placeholder "Ismeretlen recept" must survive untouched,
        # otherwise it would be cut down to "Ismeretlen".
        title = result.get("title", "")
        if title and title != "Ismeretlen recept":
            stripped = re.sub(
                r"\s+recept\w*$", "", title, flags=re.IGNORECASE
            ).strip()
            result["title"] = stripped or title

        return result
    finally:
        active.pop()
|
||
|
||
|
||
def supported_sites() -> list[dict]:
    """List every registered parser as a ``{"name", "url"}`` dict.

    ``name`` is the registration key with ``.hu`` appended; ``url`` is the
    site's home page, or ``"#"`` when the key has no known home page.
    """
    home_pages = {
        "mindmegette": "https://www.mindmegette.hu",
        "streetkitchen": "https://streetkitchen.hu",
        "nosalty": "https://www.nosalty.hu",
        "sobors": "https://sobors.hu",
        "kiskegyed": "https://www.kiskegyed.hu",
        "gastrohobbi": "https://gastrohobbi.hu",
    }
    sites = []
    for key, _parser in _PARSERS:
        sites.append({"name": key + ".hu", "url": home_pages.get(key, "#")})
    return sites
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# mindmegette.hu
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Parse a mindmegette.hu recipe page into the common recipe dict.

    Two page layouts are handled:

    * Format A (regular /recept/ pages): ``div.ingredients`` containers with
      structured rows; steps in ``<ol>`` lists.
    * Format B (alt /alapetelek/ pages): h3 "Hozzávalók" followed by a
      ``<ul>`` and h3 "Elkészítés" followed by an ``<ol>``, both inside the
      ``mindmegette-wysiwyg-box`` element.

    Args:
        soup: parsed page.
        url: address the page was fetched from; stored as ``original_url``.

    Returns:
        Dict with the keys title, description, image_url, ingredients,
        instructions, tags and original_url.
    """

    def line_text(el) -> str:
        # get_text(strip=True) glues the fragments around inline tags
        # together ("2 dl " + "<a>tejföl</a>" -> "2 dltejföl"); keep the
        # document's own spacing and collapse whitespace runs instead.
        return " ".join(el.get_text().split())

    def list_steps(ol) -> list[str]:
        return [step for step in map(line_text, ol.find_all("li")) if step]

    def find_heading(box, needle: str):
        # First h3 inside *box* whose lower-cased text contains *needle*.
        for h3 in box.find_all("h3"):
            if needle in h3.get_text(strip=True).lower():
                return h3
        return None

    # Prefer h1 (clean meal name) over og:title (often has "receptje" suffix)
    h1 = soup.find("h1")
    title = _text(h1) if h1 else ""
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    # Strip " | Mindmegette.hu" or " - Mindmegette.hu" suffix
    if title:
        title = re.sub(r"\s*[-–|]\s*Mindmegette\.hu$", "", title).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    ingredients = []
    wysiwyg = soup.find("mindmegette-wysiwyg-box")

    for ing_container in soup.find_all("div", class_="ingredients"):
        group_el = ing_container.find("strong", class_="ingredients-group")
        group_name = _text(group_el).rstrip(":").strip() if group_el else ""
        if group_name:
            ingredients.append({"group": group_name})

        for row in ing_container.find_all("div", class_="ingredients-meta"):
            # The unit is taken from the first <span> that carries no class.
            unit_el = next(
                (sp for sp in row.find_all("span") if not sp.get("class")),
                None,
            )
            extra_el = row.find("small") or row.find("span", class_="extra")

            food = _text(row.find("a", class_="ingredients-link"))
            if not food:
                # No ingredient link: fall back to the whole row text.
                food = row.get_text(separator=" ", strip=True)

            if food:
                ingredients.append({
                    "quantity": _text(row.find("strong")),
                    "unit": _text(unit_el),
                    "food": food,
                    "extra": _text(extra_el).strip("() "),
                })

    # Fallback: h3 "Hozzávalók" → <ul> inside wysiwyg box (alt page format)
    if not ingredients and wysiwyg:
        hozz_h3 = find_heading(wysiwyg, "hozzávalók")
        if hozz_h3:
            # Use h2 just before "Hozzávalók" as the meal name (e.g. "Salsa Roja")
            prev_h2 = hozz_h3.find_previous_sibling("h2")
            if prev_h2:
                meal_name = prev_h2.get_text(strip=True)
                if meal_name:
                    title = meal_name
            ul = hozz_h3.find_next_sibling("ul")
            if ul:
                for li in ul.find_all("li"):
                    line = line_text(li)
                    if not line:
                        continue
                    qty, unit, food = _parse_ingredient_line(line)
                    ingredients.append({
                        "quantity": qty, "unit": unit, "food": food, "extra": "",
                    })

    # --- Instructions ---
    instructions = []
    if wysiwyg:
        # Look for h3 "Elkészítés" → <ol> (alt format)
        elk_h3 = find_heading(wysiwyg, "elkészítés")
        if elk_h3:
            ol = elk_h3.find_next_sibling("ol")
            if ol:
                instructions.extend(list_steps(ol))
        # Regular format: instructions in block-content <ol> (no h3 header)
        if not instructions:
            for ol in wysiwyg.find_all("ol"):
                instructions.extend(list_steps(ol))

    # Fallback: look for block-content divs outside wysiwyg
    if not instructions:
        for div in soup.find_all("div", class_="block-content"):
            ol = div.find("ol")
            if ol:
                instructions.extend(list_steps(ol))

    # --- Tags ---
    tags = []
    tag_wrapper = soup.select_one("div.desktop-wrapper")
    if tag_wrapper:
        for a in tag_wrapper.select("a.tag"):
            tag_text = a.get_text(strip=True)
            if tag_text:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# streetkitchen.hu
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Parse a streetkitchen.hu recipe page into the common recipe dict.

    Ingredients come from the ``grid-cols-1 lg:grid-cols-2`` grid, steps from
    the preparation wrapper (``<ol>``, else ``<ul>``, else ``<p>`` blocks,
    else the description article), tags from the JSON-LD ``recipeCategory``.

    Args:
        soup: parsed page.
        url: address the page was fetched from; stored as ``original_url``.

    Returns:
        Dict with the keys title, description, image_url, ingredients,
        instructions, tags and original_url.
    """

    def line_text(el) -> str:
        # get_text(strip=True) glues the fragments around inline tags
        # together; keep the document's spacing and collapse whitespace.
        return " ".join(el.get_text().split())

    def collect(elements) -> list[str]:
        return [txt for txt in map(line_text, elements) if txt]

    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*\|\s*Street Kitchen$", "", title).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Find the main ingredient grid (grid-cols-1 lg:grid-cols-2).
    # The page renders ingredients twice (mobile + desktop); we pick the
    # specific grid to avoid duplicates.
    ingredients = []
    ing_grid = None
    for g in soup.select("div.grid"):
        cls = " ".join(g.get("class", []))
        if "grid-cols-1" in cls and "lg:grid-cols-2" in cls:
            ing_grid = g
            break

    if ing_grid:
        # Walk top-level divs — each may contain an h5 group header + rows
        for section in ing_grid.find_all("div", recursive=False):
            h5 = section.find("h5")
            if h5:
                group_name = h5.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})

            for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
                inner = row.select_one("div.flex.items-center.gap-2")
                if not inner:
                    continue
                bold = inner.find("div", class_="font-bold")
                food = bold.get_text(strip=True) if bold else ""
                if not food:
                    continue

                # First non-bold div is quantity+unit merged (e.g. "200g", "1fej")
                qty_raw = ""
                extra = ""
                for d in inner.find_all("div", recursive=False):
                    # Identity, not ==: Tag equality compares structure.
                    if d is bold:
                        continue
                    txt = d.get_text(strip=True)
                    if txt.startswith("(") and txt.endswith(")"):
                        extra = txt.strip("() ")
                    elif not qty_raw:
                        qty_raw = txt

                # Split "200g" → qty="200", unit="g"
                qty, unit = _split_qty_unit(qty_raw)

                # Extract parenthesised note from inside food name
                # e.g. "fehérborecet (ízlés szerint)" → food="fehérborecet", extra="ízlés szerint"
                if not extra:
                    m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
                    if m:
                        food = m.group(1).strip()
                        extra = m.group(2).strip()

                ingredients.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": extra,
                })

    # --- Instructions ---
    instructions = []
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))
    if prep:
        ol = prep.find("ol")
        ul = prep.find("ul")
        if ol:
            instructions = collect(ol.find_all("li", recursive=False))
        elif ul:
            instructions = collect(ul.find_all("li", recursive=False))
        else:
            # Paragraph-style: <p> blocks, sometimes with <strong> headers
            instructions = collect(prep.find_all("p"))

    # If still nothing, try the description wrapper
    if not instructions:
        desc_article = soup.find("article", id="Streetk_content_description_wrapper")
        if desc_article:
            instructions = collect(desc_article.find_all("p"))

    # --- Tags ---
    # Prefer recipeCategory from JSON-LD (comma-separated)
    tags = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
            graph = data.get("@graph", [data]) if isinstance(data, dict) else data
            for item in graph:
                if not isinstance(item, dict):
                    continue
                # "@type" may be a plain string or a list of type names.
                kind = item.get("@type")
                if kind != "Recipe" and not (
                    isinstance(kind, list) and "Recipe" in kind
                ):
                    continue
                cat = item.get("recipeCategory", "")
                if isinstance(cat, str) and cat:
                    tags = [t.strip() for t in cat.split(",") if t.strip()]
                elif isinstance(cat, list):
                    tags = [str(t).strip() for t in cat if str(t).strip()]
                break
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue
        if tags:
            # Do not let a later script overwrite categories already found.
            break

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# nosalty.hu
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Parse a nosalty.hu recipe page into the common recipe dict.

    Args:
        soup: parsed page.
        url: address the page was fetched from; stored as ``original_url``.

    Returns:
        Dict with the keys title, description, image_url, ingredients,
        instructions, tags and original_url.  Section headers inside the
        instructions are emitted as ``"--- name ---"`` entries.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Drop everything from the first "|" on (site name suffix).
        title = re.sub(r"\s*\|.*$", "", title).strip()

    # Story as description (no dedicated description on nosalty)
    description = ""
    story = soup.find("div", id="recipe-story")
    if story:
        paragraphs = [p.get_text(strip=True) for p in story.find_all("p")
                      if p.get_text(strip=True)]
        description = " ".join(paragraphs)

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Scoped to div#ingredients to avoid per-serving / nutrition duplicates.
    # Structure: h3.m-list__title = group header, ul.m-list__list = ingredient rows.
    ingredients = []
    ing_container = soup.find("div", id="ingredients")
    if ing_container:
        for el in ing_container.find_all(["h3", "ul"]):
            cls = el.get("class") or []
            if el.name == "h3" and "m-list__title" in cls:
                group_name = el.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})
            elif el.name == "ul" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(li, ingredients)

    # --- Instructions ---
    # Container: div#select inside div.p-recipe__directions.
    # h4.m-list__title = section header, ol.m-list__list = steps.
    instructions = []
    dir_container = soup.find("div", id="select")
    if dir_container:
        for el in dir_container.find_all(["h4", "ol"]):
            cls = el.get("class") or []
            if el.name == "h4" and "m-list__title" in cls:
                section_name = el.get_text(strip=True)
                if section_name:
                    instructions.append(f"--- {section_name} ---")
            elif el.name == "ol" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    # get_text(strip=True) would glue the words around
                    # inline tags together; collapse whitespace instead.
                    txt = " ".join(li.get_text().split())
                    if txt:
                        instructions.append(txt)

    # --- Tags ---
    # Scoped to div.p-recipe__attributeList to avoid site-wide SEO tags.
    tags = []
    attr_list = soup.find("div", class_="p-recipe__attributeList")
    if attr_list:
        for a in attr_list.find_all("a", class_="m-tags__tagItem"):
            tag_text = a.get_text(strip=True)
            if tag_text:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||
|
||
|
||
def _parse_nosalty_ingredient(li, ingredients: list):
    """Parse a single nosalty ingredient <li> into the ingredients list.

    The row is read from the first ``<div>`` inside *li*: the ``a.a-link``
    element is the food name, text before it is the quantity (with unit),
    text after it becomes the extra note.  Rows without that div, without
    the link, or with an empty food name are skipped.

    Args:
        li: the ``<li>`` element of one ingredient row.
        ingredients: list the parsed ingredient dict is appended to.
    """
    inner = li.find("div")
    if not inner:
        return

    food_el = inner.find("a", class_="a-link")
    if not food_el:
        return

    food = food_el.get_text(strip=True)
    if not food:
        return

    # Walk children of inner div in order.
    # Spans before the <a> link = quantity, spans after = extra/note.
    qty_parts = []
    extra_parts = []
    before_link = True
    for child in inner.children:
        if child is food_el:
            before_link = False
            continue
        if not hasattr(child, "get_text"):
            continue
        text = child.get_text(strip=True)
        if not text:
            continue
        if before_link:
            # Keep every fragment: amount and unit may sit in separate
            # nodes ("2" + "db"), and overwriting would lose the amount.
            qty_parts.append(text)
        else:
            extra_parts.append(text.strip("() "))

    extra = "; ".join(p for p in extra_parts if p)
    qty, unit = _split_qty_unit(" ".join(qty_parts))

    ingredients.append({
        "quantity": qty,
        "unit": unit,
        "food": food,
        "extra": extra,
    })
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# sobors.hu
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@_register("sobors")
def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
    """Parse a sobors.hu recipe page into the common recipe dict.

    When the instruction block holds at most two entries and links to another
    site, that link is scraped as well and its instructions (and, if none
    were found here, its ingredients) are used instead.

    Args:
        soup: parsed page.
        url: address the page was fetched from; stored as ``original_url``.

    Returns:
        Dict with the keys title, description, image_url, ingredients,
        instructions, tags and original_url.  Section headers inside the
        instructions are emitted as ``"--- name ---"`` entries.
    """
    # Title: h3.recept_nev
    title = ""
    title_el = soup.find("h3", class_="recept_nev")
    if title_el:
        title = title_el.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*SóBors.*$", "", title, flags=re.IGNORECASE).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Container: div.hozzavalok-container (structured recipe pages)
    # Groups: section > h4 (group header), section > ul > li
    # Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
    ingredients = []
    ing_container = soup.find("div", class_="hozzavalok-container")
    if ing_container:
        for section in ing_container.find_all("section"):
            h4 = section.find("h4")
            if h4:
                group_name = h4.get_text(strip=True).rstrip(":")
                if group_name:
                    ingredients.append({"group": group_name})
            for li in section.find_all("li"):
                food = _text(li.find("span", class_="hozzavalo"))
                if not food:
                    continue
                ingredients.append({
                    "quantity": _text(li.find("span", class_="mennyiseg")),
                    "unit": _text(li.find("span", class_="mertekegyseg")),
                    "food": food,
                    "extra": "",
                })

    # Fallback: article-style ingredients (h4 group headers + ul > li plain text)
    # Some sobors.hu pages (especially linked recipes) use this simpler format.
    if not ingredients:
        article = soup.find("div", class_="cikk-torzs") or soup.find("article")
        if article:
            _parse_sobors_article_ingredients(article, ingredients)

    # --- Instructions ---
    # Container: div.recept_leiras.recept_he-elkeszites
    # Content: <p> tags for steps, <h3><strong>Section</strong></h3> for section headers
    instructions = []
    linked_url = None
    inst_container = soup.find("div", class_="recept_leiras")
    if inst_container:
        # Check for external link (linked recipe pattern — e.g. "click here for
        # full recipe on kiskegyed.hu")
        for a in inst_container.find_all("a", href=True):
            href = a["href"]
            if href.startswith("http") and "sobors.hu" not in href:
                linked_url = href
                break

        for el in inst_container.find_all(["h3", "p"]):
            if el.name == "h3":
                header = el.get_text(strip=True)
                if header:
                    instructions.append(f"--- {header} ---")
            elif el.name == "p":
                # get_text(strip=True) would glue the words around inline
                # tags together; collapse whitespace instead.
                txt = " ".join(el.get_text().split())
                if txt:
                    # Strip leading numbering like "1. " from reader recipes
                    txt = re.sub(r"^\d+\.\s+", "", txt)
                    instructions.append(txt)

    # If instructions just contain a redirect to another site, try to follow
    # the link and scrape the real recipe from there.
    if linked_url and len(instructions) <= 2:
        try:
            linked_data = scrape(linked_url)
            if linked_data.get("instructions"):
                instructions = linked_data["instructions"]
            if not ingredients and linked_data.get("ingredients"):
                ingredients = linked_data["ingredients"]
        except Exception:
            # Deliberately best-effort: whatever goes wrong on the linked
            # page, keep whatever we scraped from sobors.hu.
            pass

    # --- Tags ---
    # Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
    # Skip the generic "Receptek" category tag and "Olvasói receptek" tag
    tags = []
    tag_container = soup.find("div", class_="cikk-cimkek")
    if tag_container:
        tag_list = tag_container.find("ul", class_="cikk-cimkek-list")
        if tag_list:
            skip = {"receptek", "olvasói receptek"}
            for a in tag_list.find_all("a"):
                tag_text = a.get_text(strip=True)
                if tag_text and tag_text.lower() not in skip:
                    tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# kiskegyed.hu
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@_register("kiskegyed")
def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
    """Parse a kiskegyed.hu recipe page into the common recipe dict.

    When the preparation block holds at most two steps and links to another
    supported recipe site, that link is scraped as well and its instructions
    (and, if none were found here, its ingredients) are used instead.

    Args:
        soup: parsed page.
        url: address the page was fetched from; stored as ``original_url``.

    Returns:
        Dict with the keys title, description, image_url, ingredients,
        instructions, tags and original_url.
    """
    # Title: the first h2 on the page, else og:title / <title>
    title = ""
    h2 = soup.find("h2")
    if h2:
        title = h2.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*Kiskegyed.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: section#leadText > p
    description = ""
    lead = soup.find("section", id="leadText")
    if lead:
        p = lead.find("p")
        if p:
            description = p.get_text(strip=True)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Container: div.recipe_ingredients
    # Groups: <p>Name:</p> or <p><em>A ...hez</em></p>
    # Items: ul.list > li (plain text with optional <a> links)
    ingredients = []
    ing_container = soup.find("div", class_="recipe_ingredients")
    if ing_container:
        for el in ing_container.find_all(["p", "ul"]):
            if el.name == "p":
                group_text = el.get_text(strip=True).rstrip(":")
                # Skip the "Hozzávalók" header and serving info
                if not group_text or group_text.lower().startswith("hozzávalók"):
                    continue
                # Skip serving info like "4 személyre"
                if re.match(r"^\d+\s+személyre$", group_text):
                    continue
                ingredients.append({"group": group_text})
            elif el.name == "ul" and "list" in (el.get("class") or []):
                for li in el.find_all("li"):
                    # Use separator to preserve spaces around <a> tags
                    line = re.sub(r"\s+", " ", li.get_text(" ")).strip()
                    if not line:
                        continue
                    qty, unit, food, extra = _parse_kiskegyed_ingredient(line)
                    ingredients.append({
                        "quantity": qty,
                        "unit": unit,
                        "food": food,
                        "extra": extra,
                    })

    # --- Instructions ---
    # Container: div.recipe_preparation > ol > li > div
    instructions = []
    linked_url = None
    prep_container = soup.find("div", class_="recipe_preparation")
    if prep_container:
        # Check for cross-link to another recipe site (e.g. sobors.hu)
        for a in prep_container.find_all("a", href=True):
            href = a["href"]
            if href.startswith("http") and "kiskegyed.hu" not in href:
                # Check if it points to a supported recipe site
                try:
                    linked_host = _host(href)
                except ValueError:
                    # Malformed href (e.g. broken IPv6 literal): not a
                    # usable recipe link, and no reason to fail the page.
                    continue
                if any(s in linked_host for s, _ in _PARSERS if s != "kiskegyed"):
                    linked_url = href
                    break

        ol = prep_container.find("ol")
        if ol:
            for li in ol.find_all("li", recursive=False):
                step_el = li.find("div") or li
                # get_text(strip=True) would glue the words around inline
                # tags together; collapse whitespace instead.
                txt = " ".join(step_el.get_text().split())
                if txt:
                    instructions.append(txt)

    # If instructions are empty or just a redirect, follow the linked recipe
    if linked_url and len(instructions) <= 2:
        try:
            linked_data = scrape(linked_url)
            if linked_data.get("instructions"):
                instructions = linked_data["instructions"]
            if not ingredients and linked_data.get("ingredients"):
                ingredients = linked_data["ingredients"]
        except Exception:
            # Deliberately best-effort: keep what this page provided.
            pass

    # --- Tags ---
    # Container: section.tags > a > span (text starts with #)
    tags = []
    tag_section = soup.find("section", class_="tags")
    if tag_section:
        skip = {"recept", "receptek"}
        for a in tag_section.find_all("a"):
            span = a.find("span")
            tag_text = span.get_text(strip=True) if span else a.get_text(strip=True)
            tag_text = tag_text.lstrip("#").strip()
            if tag_text and tag_text.lower() not in skip:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||
|
||
|
||
def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
    """Parse a kiskegyed.hu ingredient line into (qty, unit, food, extra).

    Handles dual measurements like '3 ek (70 g) búzafinomliszt (BL 55)'
    → qty='3', unit='ek', food='búzafinomliszt', extra='70 g; BL 55'

    Patterns are tried in this order: ``qty unit (alt) food``,
    ``qty unit food``, ``qty food``; a line that starts with no quantity is
    returned whole as the food.
    """
    # A quantity starts with a digit or vulgar fraction and may continue with
    # digits, separators, fractions and "-" (ranges).
    qty_re = r"([0-9½¼¾][0-9.,/½¼¾-]*)"

    # Normalize en-dash/em-dash numeric ranges ("10 – 15" → "10-15") so they
    # match the quantity pattern; dashes elsewhere are left alone.
    line = re.sub(r"(?<=[0-9½¼¾])\s*[–—]\s*(?=[0-9½¼¾])", "-", line)

    # Try: qty unit (alt_measurement) food...
    # Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
    m = re.match("^" + qty_re + r"\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line)
    if m:
        extras = [m.group(3).strip()]
        food_raw = m.group(4).strip()
        # Extract trailing parenthesized note from food
        fm = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food_raw)
        if fm:
            food_raw = fm.group(1).strip()
            extras.append(fm.group(2).strip())
        return (m.group(1).strip(), m.group(2).strip(), food_raw, "; ".join(extras))

    # Try: qty unit food...
    m2 = re.match("^" + qty_re + r"\s+(\S+)\s+(.+)$", line)
    if m2:
        return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")

    # Try: qty food (e.g. "2 tojás")
    m3 = re.match("^" + qty_re + r"\s+(.+)$", line)
    if m3:
        return (m3.group(1).strip(), "", m3.group(2).strip(), "")

    # No quantity (e.g. "ízlés szerint só")
    return ("", "", line, "")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# gastrohobbi.hu
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@_register("gastrohobbi")
def _parse_gastrohobbi(soup: BeautifulSoup, url: str) -> dict:
    """Parse a gastrohobbi.hu recipe page into the common recipe dict.

    The preparation time, when an h3 "Elkészítési idő" heading provides one,
    is appended to the description.

    Args:
        soup: parsed page; it is not modified.
        url: address the page was fetched from; stored as ``original_url``.

    Returns:
        Dict with the keys title, description, image_url, ingredients,
        instructions, tags and original_url.
    """
    # Title: h1.mpcth-post-title > span
    title = ""
    title_el = soup.select_one("h1.mpcth-post-title span.mpcth-color-main-border")
    if title_el:
        title = title_el.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*GastroHobbi.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: first <p> in the first wpb_text_column before the inner recipe row
    description = ""
    first_text_col = soup.select_one("div.wpb-content-wrapper div.wpb_text_column div.wpb_wrapper")
    if first_text_col:
        p = first_text_col.find("p")
        if p:
            description = p.get_text(strip=True)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Find h3 containing "Hozzávalók" then walk siblings for ul and group h3 elements
    ingredients = []
    _gastrohobbi_parse_ingredients(soup, ingredients)

    # --- Instructions ---
    # Find h3 containing "Elkészítés:" then collect following <p> elements
    instructions = []
    _gastrohobbi_parse_instructions(soup, instructions)

    # Extract prep time from h3 containing "Elkészítési idő:"
    prep_time = ""
    for h3 in soup.find_all("h3"):
        text = " ".join(h3.get_text().split())
        if "elkészítési idő" in text.lower():
            # Heading reads "Elkészítési idő: 60 perc"; cut the label off
            # textually.  This works whether or not the label is wrapped in
            # <em><strong> and leaves the soup untouched.
            time_text = re.sub(
                r"^.*?elkészítési idő\s*:?\s*", "", text, flags=re.IGNORECASE
            ).strip()
            if time_text:
                prep_time = time_text
            break

    # --- Tags ---
    # From JSON-LD Article.articleSection
    tags = []
    skip_tags = {"receptjeink", "receptek"}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
            graph = data.get("@graph", [data]) if isinstance(data, dict) else data
            for item in graph:
                if not isinstance(item, dict):
                    continue
                # "@type" may be a plain string or a list of type names.
                kind = item.get("@type")
                if kind != "Article" and not (
                    isinstance(kind, list) and "Article" in kind
                ):
                    continue
                sections = item.get("articleSection", [])
                if isinstance(sections, str):
                    # A single section may be given as a bare string.
                    sections = [sections]
                if isinstance(sections, list):
                    tags = [s.strip() for s in sections
                            if isinstance(s, str) and s.strip()
                            and s.strip().lower() not in skip_tags]
                break
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

    # Append prep time to description if available
    if prep_time:
        if description:
            description += f" (Elkészítési idő: {prep_time})"
        else:
            description = f"Elkészítési idő: {prep_time}"

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||
|
||
|
||
def _gastrohobbi_parse_ingredients(soup: "BeautifulSoup", ingredients: list):
    """Parse ingredients from gastrohobbi.hu — find Hozzávalók h3 then walk siblings.

    Walking stops at the first following h3 whose text contains
    "elkészítés"; any other h3 on the way is an ingredient group header and
    every ``<ul>`` contributes its direct ``<li>`` children as ingredient
    lines.  Nothing is appended when no "Hozzávalók" heading exists.

    Args:
        soup: parsed page.
        ingredients: list the group markers and ingredient dicts are
            appended to.
    """
    # Find the h3 that contains "Hozzávalók"
    header = None
    for h3 in soup.find_all("h3"):
        if "hozzávalók" in h3.get_text(strip=True).lower():
            header = h3
            break
    if not header:
        return

    # Walk siblings after the header within the same container
    for sib in header.find_next_siblings():
        tag = sib.name
        text = sib.get_text(strip=True)
        if not text:
            continue
        # Stop at the "Elkészítés" section
        if tag == "h3" and "elkészítés" in text.lower():
            break
        # Ingredient group header (plain h3 without em>strong, e.g. "A csipetkéhez:")
        if tag == "h3":
            group_name = text.rstrip(":")
            if group_name:
                ingredients.append({"group": group_name})
            continue
        # Ingredient list
        if tag == "ul":
            for li in sib.find_all("li", recursive=False):
                source = li.find("p") or li
                # get_text(strip=True) would glue the words around inline
                # tags together ("2 dl" + "tejföl"); collapse whitespace
                # instead so the line parser sees the real word boundaries.
                line = " ".join(source.get_text().split())
                if not line:
                    continue
                qty, unit, food = _parse_ingredient_line(line)
                ingredients.append({
                    "quantity": qty, "unit": unit, "food": food, "extra": "",
                })
|
||
|
||
|
||
def _gastrohobbi_parse_instructions(soup: "BeautifulSoup", instructions: list):
    """Parse instructions from gastrohobbi.hu — find Elkészítés h3 then collect <p> siblings.

    The heading is the first h3 whose text starts with "elkészítés" and does
    not contain "idő" (which would be the preparation-time heading).
    Following ``<p>`` siblings become steps, items of ``<ul>`` siblings
    become bullet entries, and the walk stops at the next h3.

    Args:
        soup: parsed page.
        instructions: list the step strings are appended to.
    """

    def line_text(el) -> str:
        # get_text(strip=True) glues the fragments around inline tags
        # together; keep the document's spacing and collapse whitespace
        # (this also reduces "&nbsp;"-only paragraphs to "").
        return " ".join(el.get_text().split())

    header = None
    for h3 in soup.find_all("h3"):
        text = h3.get_text(strip=True)
        if text.lower().startswith("elkészítés") and "idő" not in text.lower():
            header = h3
            break
    if not header:
        return

    for sib in header.find_next_siblings():
        tag = sib.name
        # Stop at prep time h3 or any other section header
        if tag == "h3":
            break
        if tag == "p":
            text = line_text(sib)
            # Skip empty / whitespace-only paragraphs
            if text:
                instructions.append(text)
        elif tag == "ul":
            # Embedded list in instructions (e.g. cooking time options)
            for li in sib.find_all("li"):
                # Skip wrapper li elements that contain nested lists
                if li.find("ul"):
                    continue
                li_text = line_text(li)
                if li_text:
                    instructions.append(f" • {li_text}")
|
||
|
||
|
||
def _parse_sobors_article_ingredients(container, ingredients: list):
    """Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text).

    Every h4 whose text does not start with "hozzávalók" becomes a group
    marker; a ``<ul>`` is read only when its previous sibling element is an
    h4.

    Args:
        container: element searched for ``<h4>`` and ``<ul>`` descendants.
        ingredients: list the group markers and ingredient dicts are
            appended to.
    """
    for el in container.find_all(["h4", "ul"]):
        if el.name == "h4":
            group_name = el.get_text(strip=True).rstrip(":")
            if group_name and not group_name.lower().startswith("hozzávalók"):
                ingredients.append({"group": group_name})
        elif el.name == "ul":
            # Only consider lists that follow an h4 or are inside the ingredient context
            prev = el.find_previous_sibling()
            if prev and prev.name == "h4":
                for li in el.find_all("li"):
                    # get_text(strip=True) would glue the words around
                    # inline tags together; collapse whitespace instead so
                    # the line parser sees the real word boundaries.
                    line = " ".join(li.get_text().split())
                    if not line:
                        continue
                    qty, unit, food = _parse_ingredient_line(line)
                    ingredients.append({
                        "quantity": qty,
                        "unit": unit,
                        "food": food,
                        "extra": "",
                    })
|
||
|
||
|
||
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
    """Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food).

    A line of only two words is read as quantity + food (unit empty), and a
    line that does not start with a digit or vulgar fraction is returned
    whole as the food.
    """
    line = line.strip()
    # Normalize en-dash/em-dash ranges: "10 – 15" → "10-15".  Only dashes
    # between two numbers are touched, so a dash inside the food name
    # ("olaj – extra szűz") survives.
    line = re.sub(r"(?<=[0-9½¼¾])\s*[–—]\s*(?=[0-9½¼¾])", "-", line)
    # qty unit food (e.g. "2 dl tejföl", "½ tk őrölt kömény")
    m = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
    if m:
        return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
    # Just quantity + food (e.g. "2 tojás")
    m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
    if m2:
        return (m2.group(1).strip(), "", m2.group(2).strip())
    return ("", "", line)
|
||
|
||
|
||
def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g').

    The quantity is the leading run of digits, vulgar fractions (½ ¼ ¾),
    spaces and the separators ``. , / -``; the rest is the unit.  A string
    that does not start with a digit or fraction is returned as
    ``("", raw)`` and an empty string as ``("", "")``.
    """
    raw = raw.strip()
    if not raw:
        return ("", "")
    # Normalize en-dash/em-dash numeric ranges ("10 – 15" → "10-15"), the
    # same way the plain-text ingredient line parser does.
    raw = re.sub(r"(?<=[0-9½¼¾])\s*[–—]\s*(?=[0-9½¼¾])", "-", raw)
    m = re.match(r"^([0-9½¼¾][0-9½¼¾ .,/-]*)(.*)$", raw)
    if m:
        return (m.group(1).strip(), m.group(2).strip())
    return ("", raw)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Generic fallback (og-tags + schema.org microdata)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _jsonld_nodes(data):
    """Yield every dict node of a decoded JSON-LD document (lists and @graph included)."""
    if isinstance(data, list):
        for entry in data:
            yield from _jsonld_nodes(entry)
    elif isinstance(data, dict):
        yield data
        graph = data.get("@graph")
        if isinstance(graph, list):
            yield from _jsonld_nodes(graph)


def _jsonld_steps(raw) -> list:
    """Flatten a schema.org recipeInstructions value into non-empty step strings."""
    if isinstance(raw, str):
        # A single string is one block of text, not a sequence of characters.
        raw = [raw]
    elif isinstance(raw, dict):
        raw = [raw]
    if not isinstance(raw, list):
        return []
    steps = []
    for item in raw:
        if isinstance(item, str):
            text = item.strip()
        elif isinstance(item, dict):
            nested = item.get("itemListElement")
            if isinstance(nested, (list, dict)):
                # HowToSection: the real steps are one level down.
                steps.extend(_jsonld_steps(nested))
                continue
            text = item.get("text") or ""
            text = text.strip() if isinstance(text, str) else ""
        else:
            text = ""
        if text:
            steps.append(text)
    return steps


def _parse_generic(soup: "BeautifulSoup", url: str) -> dict:
    """Extract a recipe from og-tags and schema.org JSON-LD on any site.

    Title, description and image come from the og-tags (title falls back to
    ``<title>``, then to "Ismeretlen recept").  Ingredients, instructions
    and tags come from the first JSON-LD node of type ``Recipe``; each
    ingredient line is stored whole in ``food``.  Pages without such a node
    yield empty lists.

    Args:
        soup: parsed page.
        url: address the page was fetched from; stored as ``original_url``.

    Returns:
        Dict with the keys title, description, image_url, ingredients,
        instructions, tags and original_url.
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    ingredients = []
    instructions = []
    tags = []

    # Try schema.org JSON-LD
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue

        recipe = None
        for node in _jsonld_nodes(data):
            # "@type" may be a plain string or a list of type names.
            kind = node.get("@type")
            if kind == "Recipe" or (isinstance(kind, list) and "Recipe" in kind):
                recipe = node
                break
        if recipe is None:
            continue

        lines = recipe.get("recipeIngredient") or []
        if isinstance(lines, str):
            lines = [lines]
        if isinstance(lines, list):
            for line in lines:
                if isinstance(line, str) and line.strip():
                    ingredients.append({
                        "quantity": "", "unit": "", "food": line.strip(), "extra": "",
                    })

        instructions.extend(_jsonld_steps(recipe.get("recipeInstructions")))

        # Extract keywords
        kw = recipe.get("keywords", "")
        if isinstance(kw, str):
            tags = [k.strip() for k in kw.split(",") if k.strip()]
        elif isinstance(kw, list):
            tags = [str(k).strip() for k in kw if str(k).strip()]
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _extract_ingredient_comments(data: dict):
    """Move trailing (comment) from food field to extra field for all ingredients.

    Works in place on ``data["ingredients"]``.  Group markers (dicts with a
    ``"group"`` key), ingredients that already have an ``extra`` value, and
    entries whose food is not a string are left untouched.
    """
    for ing in data.get("ingredients", []):
        if "group" in ing:
            continue
        food = ing.get("food", "")
        extra = ing.get("extra", "")
        # JSON-LD sourced ingredients are raw decoded values; anything that
        # is not text cannot carry a parenthesised comment.
        if not isinstance(food, str) or not food or extra:
            continue
        m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
        if m:
            ing["food"] = m.group(1).strip()
            ing["extra"] = m.group(2).strip()
|
||
|
||
|
||
def _host(url: str) -> str:
    """Return the lower-cased hostname of *url*, or "" when it has none.

    A URL that cannot be parsed at all (for example an unbalanced IPv6
    bracket in the network location) also yields "" instead of raising.
    """
    from urllib.parse import urlparse

    try:
        return urlparse(url).hostname or ""
    except ValueError:
        return ""
|
||
|
||
|
||
def _og(soup: BeautifulSoup, prop: str) -> str | None:
|
||
tag = soup.find("meta", property=prop)
|
||
if tag and tag.get("content"):
|
||
return tag["content"]
|
||
return None
|
||
|
||
|
||
def _text(el) -> str:
    """Return the visible text of *el* as one whitespace-normalised line.

    ``None`` yields "".  The text is read without ``strip=True`` because
    that option strips every fragment before joining them, which glues the
    words around inline tags together ("Sajtkrémes <em>csirkés</em> leves"
    would become "Sajtkrémescsirkésleves"); whitespace runs are collapsed to
    single spaces instead.
    """
    if el is None:
        return ""
    return " ".join(el.get_text().split())
|