Files
recipe-importer/app/scraper.py
T
admin 0ec9ce0c6d v0.8.0: gastrohobbi.hu parser, fix ingredient fraction parsing
Add gastrohobbi.hu parser (WPBakery page builder layout): ingredients
with groups, instructions with embedded lists, tags from JSON-LD
articleSection, prep time extraction.

Fix ingredient line parser: fractions like "1/2" no longer split due to
regex backtracking, en-dash ranges normalized, unicode fractions (½¼¾)
recognized as quantity start across all parsers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 19:17:13 +01:00

1013 lines
36 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""
import json
import re
import threading

import requests
from bs4 import BeautifulSoup
# HTTP headers passed to requests.get() by scrape() for every page fetch.
_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}
# Maps a substring of the hostname to a parser function.
# Order matters: first match wins.
# Entries are appended by the _register decorator and scanned in order by scrape().
_PARSERS: list[tuple[str, "callable"]] = []
def _register(host_substring: str):
    """Build a decorator that files a parser under *host_substring*.

    The decorated function is appended to ``_PARSERS`` as a
    ``(host_substring, function)`` pair and returned unchanged.
    """
    def _remember(parser):
        entry = (host_substring, parser)
        _PARSERS.append(entry)
        return parser

    return _remember
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
# Per-thread set of URLs whose scrape() call is still running.  Some site
# parsers call scrape() again to follow a cross-site link; without this guard
# two pages that link to each other would recurse (one HTTP request per level)
# until the interpreter's recursion limit is hit.
_IN_PROGRESS = threading.local()


def scrape(url: str) -> dict:
    """Fetch *url* and return a recipe dict.

    The page is handed to the first registered parser whose host substring
    occurs in the URL's hostname.  When no parser matches, the generic
    JSON-LD / og-tag extraction is used instead.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    ``ingredients`` may also contain ``{"group": str}`` marker entries that
    introduce a named ingredient group.

    Raises:
        ValueError: if *url* is already being scraped further up the current
            thread's call stack (a cross-site link cycle).
        requests.RequestException: network failures and HTTP error statuses
            from ``requests.get`` / ``raise_for_status`` propagate unchanged.
    """
    active = getattr(_IN_PROGRESS, "urls", None)
    if active is None:
        active = _IN_PROGRESS.urls = set()
    if url in active:
        raise ValueError(f"Circular recipe link: {url}")

    active.add(url)
    try:
        resp = requests.get(url, headers=_HEADERS, timeout=30)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")

        host = _host(url)
        result = None
        for substring, parser in _PARSERS:
            if substring in host:
                result = parser(soup, url)
                break
        if result is None:
            # Fallback: try generic schema.org / og-tag extraction
            result = _parse_generic(soup, url)

        # Post-process: extract parenthesized comments from food into extra
        _extract_ingredient_comments(result)
        return result
    finally:
        # Always release the URL so a later, independent call can scrape it.
        active.discard(url)
def supported_sites() -> list[dict]:
    """List every registered parser as a ``{"name": ..., "url": ...}`` dict.

    The name is the registered host substring with ``.hu`` appended; the URL
    is looked up in a fixed table and falls back to ``"#"`` when unknown.
    """
    homepages = dict(
        mindmegette="https://www.mindmegette.hu",
        streetkitchen="https://streetkitchen.hu",
        nosalty="https://www.nosalty.hu",
        sobors="https://sobors.hu",
        kiskegyed="https://www.kiskegyed.hu",
        gastrohobbi="https://gastrohobbi.hu",
    )
    sites = []
    for key, _parser in _PARSERS:
        sites.append({"name": key + ".hu", "url": homepages.get(key, "#")})
    return sites
# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------
@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed mindmegette.hu page."""
    # Title from og:title (or <title>), minus the " | Mindmegette.hu" suffix.
    page_title = _og(soup, "og:title") or _text(soup.find("title"))
    if page_title:
        page_title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", page_title).strip()

    # --- Ingredients ---
    # One div.ingredients per group; an optional
    # <strong class="ingredients-group"> carries the group title.
    items = []
    for box in soup.find_all("div", class_="ingredients"):
        heading = box.find("strong", class_="ingredients-group")
        heading_text = _text(heading).rstrip(":").strip() if heading else ""
        if heading_text:
            items.append({"group": heading_text})

        for line in box.find_all("div", class_="ingredients-meta"):
            # Row layout: <strong>qty</strong> <span>unit</span>
            #             <a class="ingredients-link">name</a> <small>(extra)</small>
            # The unit is the first <span> that carries no class attribute.
            classless_span = next(
                (span for span in line.find_all("span") if not span.get("class")),
                None,
            )
            note_el = line.find("small") or line.find("span", class_="extra")

            name = _text(line.find("a", class_="ingredients-link"))
            if not name:
                # No link element: use the whole row text as the food name.
                name = line.get_text(separator=" ", strip=True)
            if not name:
                continue

            items.append({
                "quantity": _text(line.find("strong")),
                "unit": _text(classless_span),
                "food": name,
                "extra": _text(note_el).strip("() "),
            })

    # --- Instructions ---
    steps = []
    editor_box = soup.find("mindmegette-wysiwyg-box")
    if editor_box:
        steps = [t for t in map(_text, editor_box.find_all("li")) if t]
    if not steps:
        # Fallback: ordered lists inside block-content divs.
        for block in soup.find_all("div", class_="block-content"):
            ordered = block.find("ol")
            if ordered:
                steps.extend(t for t in map(_text, ordered.find_all("li")) if t)

    # --- Tags ---
    labels = []
    tag_box = soup.select_one("div.desktop-wrapper")
    if tag_box:
        candidates = (a.get_text(strip=True) for a in tag_box.select("a.tag"))
        labels = [label for label in candidates if label]

    return {
        "title": page_title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": items,
        "instructions": steps,
        "tags": labels,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# streetkitchen.hu
# ---------------------------------------------------------------------------
@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed streetkitchen.hu page.

    Ingredients come from the ``grid-cols-1 lg:grid-cols-2`` grid,
    instructions from the preparation wrapper (list items or paragraphs,
    falling back to the description article), and tags from the JSON-LD
    ``Recipe.recipeCategory`` value.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*\|\s*Street Kitchen$", "", title).strip()
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # The page renders ingredients twice (mobile + desktop); picking one
    # specific grid avoids duplicates.
    ingredients = []
    ing_grid = None
    for g in soup.select("div.grid"):
        cls = " ".join(g.get("class", []))
        if "grid-cols-1" in cls and "lg:grid-cols-2" in cls:
            ing_grid = g
            break

    if ing_grid:
        # Walk top-level divs — each may contain an h5 group header + rows
        for section in ing_grid.find_all("div", recursive=False):
            h5 = section.find("h5")
            if h5:
                group_name = h5.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})

            for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
                inner = row.select_one("div.flex.items-center.gap-2")
                if not inner:
                    continue
                divs = inner.find_all("div", recursive=False)
                bold = inner.find("div", class_="font-bold")
                food = bold.get_text(strip=True) if bold else ""
                if not food:
                    continue

                # First non-bold div is quantity+unit merged (e.g. "200g", "1fej")
                qty_raw = ""
                extra = ""
                for d in divs:
                    # Identity, not ==: Tag equality is structural, so a
                    # different div with the same markup would compare equal.
                    if d is bold:
                        continue
                    txt = d.get_text(strip=True)
                    if txt.startswith("(") and txt.endswith(")"):
                        extra = txt.strip("() ")
                    elif not qty_raw:
                        qty_raw = txt

                # Split "200g" → qty="200", unit="g"
                qty, unit = _split_qty_unit(qty_raw)

                # Extract parenthesised note from inside food name
                # e.g. "fehérborecet (ízlés szerint)" → food="fehérborecet", extra="ízlés szerint"
                if not extra:
                    m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
                    if m:
                        food = m.group(1).strip()
                        extra = m.group(2).strip()

                ingredients.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": extra,
                })

    # --- Instructions ---
    instructions = []
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))
    if prep:
        # An ordered list wins over an unordered one; both are read the same way.
        step_list = prep.find("ol") or prep.find("ul")
        if step_list:
            for li in step_list.find_all("li", recursive=False):
                txt = li.get_text(strip=True)
                if txt:
                    instructions.append(txt)
        else:
            # Paragraph-style: <p> blocks, sometimes with <strong> headers
            for p in prep.find_all("p"):
                txt = p.get_text(strip=True)
                if txt:
                    instructions.append(txt)

    # If still nothing, try the description wrapper
    if not instructions:
        desc_article = soup.find("article", id="Streetk_content_description_wrapper")
        if desc_article:
            for p in desc_article.find_all("p"):
                txt = p.get_text(strip=True)
                if txt:
                    instructions.append(txt)

    # --- Tags ---
    # Prefer recipeCategory from JSON-LD (comma-separated string or list)
    tags = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue
        graph = data.get("@graph", [data]) if isinstance(data, dict) else data
        if not isinstance(graph, list):
            continue
        for item in graph:
            if not isinstance(item, dict):
                continue
            # JSON-LD allows @type to be a single string or a list of types.
            types = item.get("@type")
            if types != "Recipe" and not (isinstance(types, list) and "Recipe" in types):
                continue
            cat = item.get("recipeCategory", "")
            if isinstance(cat, str) and cat:
                tags = [t.strip() for t in cat.split(",") if t.strip()]
            elif isinstance(cat, list):
                tags = [str(t).strip() for t in cat if str(t).strip()]
            break

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# nosalty.hu
# ---------------------------------------------------------------------------
@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed nosalty.hu page."""
    heading = _og(soup, "og:title") or _text(soup.find("title"))
    if heading:
        # Drop everything from the first "|" onwards.
        heading = re.sub(r"\s*\|.*$", "", heading).strip()

    # The recipe "story" paragraphs serve as the description.
    summary = ""
    story_box = soup.find("div", id="recipe-story")
    if story_box:
        paragraph_texts = (p.get_text(strip=True) for p in story_box.find_all("p"))
        summary = " ".join(t for t in paragraph_texts if t)

    picture = _og(soup, "og:image")

    # --- Ingredients ---
    # Only div#ingredients is read, to avoid per-serving / nutrition duplicates.
    # h3.m-list__title = group header, ul.m-list__list = ingredient rows.
    items = []
    ing_box = soup.find("div", id="ingredients")
    for node in (ing_box.find_all(["h3", "ul"]) if ing_box else []):
        classes = node.get("class") or []
        if "m-list__title" in classes and node.name == "h3":
            label = node.get_text(strip=True)
            if label:
                items.append({"group": label})
        elif "m-list__list" in classes and node.name == "ul":
            for row in node.find_all("li", class_="m-list__item"):
                _parse_nosalty_ingredient(row, items)

    # --- Instructions ---
    # div#select holds h4.m-list__title section headers and
    # ol.m-list__list step lists.
    steps = []
    steps_box = soup.find("div", id="select")
    for node in (steps_box.find_all(["h4", "ol"]) if steps_box else []):
        classes = node.get("class") or []
        if "m-list__title" in classes and node.name == "h4":
            caption = node.get_text(strip=True)
            if caption:
                steps.append(f"--- {caption} ---")
        elif "m-list__list" in classes and node.name == "ol":
            step_texts = (
                li.get_text(strip=True)
                for li in node.find_all("li", class_="m-list__item")
            )
            steps.extend(t for t in step_texts if t)

    # --- Tags ---
    # Only div.p-recipe__attributeList is read, to avoid site-wide SEO tags.
    labels = []
    attr_box = soup.find("div", class_="p-recipe__attributeList")
    if attr_box:
        link_texts = (
            a.get_text(strip=True)
            for a in attr_box.find_all("a", class_="m-tags__tagItem")
        )
        labels = [t for t in link_texts if t]

    return {
        "title": heading or "Ismeretlen recept",
        "description": summary,
        "image_url": picture,
        "ingredients": items,
        "instructions": steps,
        "tags": labels,
        "original_url": url,
    }
def _parse_nosalty_ingredient(li, ingredients: list):
    """Append the ingredient described by one nosalty ``<li>`` to *ingredients*.

    Nothing is appended when the row has no inner ``<div>``, no
    ``a.a-link`` element, or an empty food name.
    """
    wrapper = li.find("div")
    link = wrapper.find("a", class_="a-link") if wrapper else None
    name = link.get_text(strip=True) if link else ""
    if not name:
        return

    # Direct children before the food link carry the quantity (the last
    # non-empty one wins); children after it are notes.
    amount_text = ""
    notes = []
    seen_link = False
    for node in wrapper.children:
        if node is link:
            seen_link = True
        elif hasattr(node, "get_text"):
            piece = node.get_text(strip=True)
            if not piece:
                continue
            if seen_link:
                note = piece.strip("() ")
                if note:
                    notes.append(note)
            else:
                amount_text = piece

    qty, unit = _split_qty_unit(amount_text)
    ingredients.append({
        "quantity": qty,
        "unit": unit,
        "food": name,
        "extra": "; ".join(notes),
    })
# ---------------------------------------------------------------------------
# sobors.hu
# ---------------------------------------------------------------------------
@_register("sobors")
def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed sobors.hu page.

    When the instruction block is short (at most two entries) and contains a
    link leaving sobors.hu, that link is scraped and its instructions (and,
    if none were found locally, its ingredients) are used instead.
    """
    # Title: h3.recept_nev, else og:title / <title>; site suffix removed.
    heading_el = soup.find("h3", class_="recept_nev")
    name = heading_el.get_text(strip=True) if heading_el else ""
    if not name:
        name = _og(soup, "og:title") or _text(soup.find("title"))
    if name:
        name = re.sub(r"\s*[-|]\s*SóBors.*$", "", name, flags=re.IGNORECASE).strip()

    summary = _og(soup, "og:description") or ""
    picture = _og(soup, "og:image")

    # --- Ingredients ---
    # Structured pages: div.hozzavalok-container > section, each with an
    # optional h4 group header and li rows holding span.mennyiseg,
    # span.mertekegyseg and span.hozzavalo.
    items = []
    structured = soup.find("div", class_="hozzavalok-container")
    if structured:
        for part in structured.find_all("section"):
            caption = part.find("h4")
            if caption:
                label = caption.get_text(strip=True).rstrip(":")
                if label:
                    items.append({"group": label})
            for row in part.find_all("li"):
                food = _text(row.find("span", class_="hozzavalo"))
                if not food:
                    continue
                items.append({
                    "quantity": _text(row.find("span", class_="mennyiseg")),
                    "unit": _text(row.find("span", class_="mertekegyseg")),
                    "food": food,
                    "extra": "",
                })

    # Fallback: article-style pages (h4 group headers + plain-text ul > li).
    if not items:
        body = soup.find("div", class_="cikk-torzs") or soup.find("article")
        if body:
            _parse_sobors_article_ingredients(body, items)

    # --- Instructions ---
    # div.recept_leiras: <p> = step, <h3> = section header.
    steps = []
    follow_url = None
    steps_box = soup.find("div", class_="recept_leiras")
    if steps_box:
        # First absolute link that leaves sobors.hu (linked-recipe pattern).
        follow_url = next(
            (
                a["href"]
                for a in steps_box.find_all("a", href=True)
                if a["href"].startswith("http") and "sobors.hu" not in a["href"]
            ),
            None,
        )
        for node in steps_box.find_all(["h3", "p"]):
            content = node.get_text(strip=True)
            if not content:
                continue
            if node.name == "h3":
                steps.append(f"--- {content} ---")
            elif node.name == "p":
                # Strip leading numbering like "1. " from reader recipes
                steps.append(re.sub(r"^\d+\.\s+", "", content))

    # A short instruction block with an outbound link is treated as a
    # redirect: scrape the target and take over its content.
    if follow_url and len(steps) <= 2:
        try:
            remote = scrape(follow_url)
            if remote.get("instructions"):
                steps = remote["instructions"]
            if not items and remote.get("ingredients"):
                items = remote["ingredients"]
        except Exception:
            pass  # keep whatever we scraped from sobors.hu

    # --- Tags ---
    # div.cikk-cimkek > ul.cikk-cimkek-list > li > a, minus generic tags.
    labels = []
    tag_box = soup.find("div", class_="cikk-cimkek")
    tag_list = tag_box.find("ul", class_="cikk-cimkek-list") if tag_box else None
    if tag_list:
        ignored = {"receptek", "olvasói receptek"}
        for anchor in tag_list.find_all("a"):
            label = anchor.get_text(strip=True)
            if label and label.lower() not in ignored:
                labels.append(label)

    return {
        "title": name or "Ismeretlen recept",
        "description": summary,
        "image_url": picture,
        "ingredients": items,
        "instructions": steps,
        "tags": labels,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# kiskegyed.hu
# ---------------------------------------------------------------------------
@_register("kiskegyed")
def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed kiskegyed.hu page.

    When the preparation block is short (at most two steps) and links to
    another registered recipe site, that link is scraped and its
    instructions (and, if none were found locally, its ingredients) are used.
    """
    # Title: first <h2>, else og:title / <title>; site suffix removed.
    first_h2 = soup.find("h2")
    name = first_h2.get_text(strip=True) if first_h2 else ""
    if not name:
        name = _og(soup, "og:title") or _text(soup.find("title"))
    if name:
        name = re.sub(r"\s*[-|]\s*Kiskegyed.*$", "", name, flags=re.IGNORECASE).strip()

    # Description: section#leadText > p, else og:description.
    lead_box = soup.find("section", id="leadText")
    lead_p = lead_box.find("p") if lead_box else None
    summary = lead_p.get_text(strip=True) if lead_p else ""
    if not summary:
        summary = _og(soup, "og:description") or ""

    picture = _og(soup, "og:image")

    # --- Ingredients ---
    # div.recipe_ingredients: <p> = group header, ul.list > li = plain-text row.
    items = []
    ing_box = soup.find("div", class_="recipe_ingredients")
    for node in (ing_box.find_all(["p", "ul"]) if ing_box else []):
        if node.name == "p":
            label = node.get_text(strip=True).rstrip(":")
            # Skip empty paragraphs and the "Hozzávalók" header
            if not label or label.lower().startswith("hozzávalók"):
                continue
            # Skip serving info like "4 személyre"
            if re.match(r"^\d+\s+személyre$", label):
                continue
            items.append({"group": label})
        elif node.name == "ul" and "list" in (node.get("class") or []):
            for li in node.find_all("li"):
                # A space separator keeps words apart around <a> tags.
                row = re.sub(r"\s+", " ", li.get_text(" ")).strip()
                if not row:
                    continue
                qty, unit, food, extra = _parse_kiskegyed_ingredient(row)
                items.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": extra,
                })

    # --- Instructions ---
    # div.recipe_preparation > ol > li > div
    steps = []
    follow_url = None
    prep_box = soup.find("div", class_="recipe_preparation")
    if prep_box:
        # Look for a cross-link to another registered recipe site.
        other_sites = [key for key, _parser in _PARSERS if key != "kiskegyed"]
        for anchor in prep_box.find_all("a", href=True):
            target = anchor["href"]
            if not target.startswith("http") or "kiskegyed.hu" in target:
                continue
            target_host = _host(target)
            if any(key in target_host for key in other_sites):
                follow_url = target
                break

        ordered = prep_box.find("ol")
        if ordered:
            for li in ordered.find_all("li", recursive=False):
                holder = li.find("div")
                content = (holder if holder else li).get_text(strip=True)
                if content:
                    steps.append(content)

    # A short step list with a cross-link is treated as a redirect.
    if follow_url and len(steps) <= 2:
        try:
            remote = scrape(follow_url)
            if remote.get("instructions"):
                steps = remote["instructions"]
            if not items and remote.get("ingredients"):
                items = remote["ingredients"]
        except Exception:
            pass

    # --- Tags ---
    # section.tags > a > span (text starts with #), minus generic tags.
    labels = []
    tag_box = soup.find("section", class_="tags")
    if tag_box:
        ignored = {"recept", "receptek"}
        for anchor in tag_box.find_all("a"):
            inner = anchor.find("span")
            raw = (inner if inner else anchor).get_text(strip=True)
            label = raw.lstrip("#").strip()
            if label and label.lower() not in ignored:
                labels.append(label)

    return {
        "title": name or "Ismeretlen recept",
        "description": summary,
        "image_url": picture,
        "ingredients": items,
        "instructions": steps,
        "tags": labels,
        "original_url": url,
    }
def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
"""Parse a kiskegyed.hu ingredient line.
Handles dual measurements like '3 ek (70 g) búzafinomliszt (BL 55)'
→ qty='3', unit='ek', food='búzafinomliszt', extra='70 g; BL 55'
"""
extras = []
# Try: qty unit (alt_measurement) food...
# Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
m = re.match(
r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line
)
if m:
qty = m.group(1).strip()
unit = m.group(2).strip()
extras.append(m.group(3).strip())
food_raw = m.group(4).strip()
# Extract trailing parenthesized note from food
fm = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food_raw)
if fm:
food_raw = fm.group(1).strip()
extras.append(fm.group(2).strip())
return (qty, unit, food_raw, "; ".join(extras))
# Try: qty unit food...
m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
if m2:
return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")
# Try: qty food (e.g. "2 tojás")
m3 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
if m3:
return (m3.group(1).strip(), "", m3.group(2).strip(), "")
# No quantity (e.g. "ízlés szerint só")
return ("", "", line, "")
# ---------------------------------------------------------------------------
# gastrohobbi.hu
# ---------------------------------------------------------------------------
@_register("gastrohobbi")
def _parse_gastrohobbi(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed gastrohobbi.hu page.

    Ingredients and instructions are collected by the
    ``_gastrohobbi_parse_*`` helpers; tags come from the JSON-LD
    ``Article.articleSection`` value; a preparation time found in an
    "Elkészítési idő" heading is appended to the description.
    """
    # Title: h1.mpcth-post-title > span, else og:title / <title>.
    title = ""
    title_el = soup.select_one("h1.mpcth-post-title span.mpcth-color-main-border")
    if title_el:
        title = title_el.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-|]\s*GastroHobbi.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: first <p> of the first wpb_text_column, else og:description.
    description = ""
    first_text_col = soup.select_one("div.wpb-content-wrapper div.wpb_text_column div.wpb_wrapper")
    if first_text_col:
        p = first_text_col.find("p")
        if p:
            description = p.get_text(strip=True)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    ingredients = []
    _gastrohobbi_parse_ingredients(soup, ingredients)

    # --- Instructions ---
    instructions = []
    _gastrohobbi_parse_instructions(soup, instructions)

    # --- Prep time ---
    # Taken from the first h3 mentioning "Elkészítési idő".  This runs after
    # the helpers above because it removes the <em> label from the tree.
    prep_time = ""
    for h3 in soup.find_all("h3"):
        if "elkészítési idő" not in h3.get_text(strip=True).lower():
            continue
        em = h3.find("em")
        if em:
            em.decompose()
        remainder = h3.get_text(strip=True)
        # When the label is not wrapped in <em> it is still part of the text;
        # strip it (and any stray colon) so it is not repeated below.
        prep_time = re.sub(
            r"^(?:elkészítési\s+idő)?[\s:]*", "", remainder, flags=re.IGNORECASE
        ).strip()
        break

    # --- Tags ---
    # From JSON-LD Article.articleSection (list, or comma-separated string).
    tags = []
    skip_tags = {"receptjeink", "receptek"}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue
        graph = data.get("@graph", [data]) if isinstance(data, dict) else data
        if not isinstance(graph, list):
            continue
        for item in graph:
            if not isinstance(item, dict):
                continue
            # JSON-LD allows @type to be a single string or a list of types.
            types = item.get("@type")
            if types != "Article" and not (isinstance(types, list) and "Article" in types):
                continue
            sections = item.get("articleSection", [])
            if isinstance(sections, str):
                sections = sections.split(",")
            if isinstance(sections, list):
                cleaned = (str(s).strip() for s in sections)
                tags = [s for s in cleaned if s and s.lower() not in skip_tags]
            break

    # Append prep time to description if available
    if prep_time:
        if description:
            description += f" (Elkészítési idő: {prep_time})"
        else:
            description = f"Elkészítési idő: {prep_time}"

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
def _gastrohobbi_parse_ingredients(soup: BeautifulSoup, ingredients: list):
    """Collect gastrohobbi.hu ingredients into *ingredients*.

    Starts at the first h3 whose text contains "hozzávalók" and walks its
    following siblings until an h3 containing "elkészítés" is reached.
    """
    start = next(
        (h for h in soup.find_all("h3") if "hozzávalók" in h.get_text(strip=True).lower()),
        None,
    )
    if not start:
        return

    for node in start.find_next_siblings():
        content = node.get_text(strip=True)
        if not content:
            continue

        if node.name == "h3":
            # The preparation heading ends the ingredient section.
            if "elkészítés" in content.lower():
                break
            # Any other h3 is a group header, e.g. "A csipetkéhez:".
            label = content.rstrip(":")
            if label:
                ingredients.append({"group": label})
        elif node.name == "ul":
            for li in node.find_all("li", recursive=False):
                para = li.find("p")
                row = (para if para else li).get_text(strip=True)
                if not row:
                    continue
                qty, unit, food = _parse_ingredient_line(row)
                ingredients.append({
                    "quantity": qty, "unit": unit, "food": food, "extra": "",
                })
def _gastrohobbi_parse_instructions(soup: BeautifulSoup, instructions: list):
    """Collect gastrohobbi.hu preparation steps into *instructions*.

    Starts at the first h3 whose text begins with "elkészítés" and does not
    contain "idő", then reads the following sibling <p> and <ul> elements
    until the next h3.
    """
    start = None
    for heading in soup.find_all("h3"):
        lowered = heading.get_text(strip=True).lower()
        if lowered.startswith("elkészítés") and "idő" not in lowered:
            start = heading
            break
    if not start:
        return

    for node in start.find_next_siblings():
        kind = node.name
        if kind == "h3":
            # Any following heading (prep time or another section) ends the steps.
            break
        if kind == "p":
            content = node.get_text(strip=True)
            # Skip empty / whitespace-only paragraphs
            if content and content != "\xa0":
                instructions.append(content)
        elif kind == "ul":
            # Embedded list (e.g. cooking time options); wrapper items that
            # only hold a nested list are skipped so leaves are not repeated.
            leaves = (li for li in node.find_all("li") if not li.find("ul"))
            leaf_texts = (li.get_text(strip=True) for li in leaves)
            instructions.extend(t for t in leaf_texts if t)
def _parse_sobors_article_ingredients(container, ingredients: list):
    """Collect article-style sobors.hu ingredients into *ingredients*.

    Every h4 (except one starting with "hozzávalók") becomes a group marker;
    a ul is read only when the element directly before it is an h4.
    """
    for node in container.find_all(["h4", "ul"]):
        if node.name == "h4":
            label = node.get_text(strip=True).rstrip(":")
            if label and not label.lower().startswith("hozzávalók"):
                ingredients.append({"group": label})
            continue
        if node.name != "ul":
            continue

        # Only lists that immediately follow an h4 count as ingredient lists.
        before = node.find_previous_sibling()
        if not before or before.name != "h4":
            continue

        for li in node.find_all("li"):
            row = li.get_text(strip=True)
            if not row:
                continue
            qty, unit, food = _parse_ingredient_line(row)
            ingredients.append({
                "quantity": qty,
                "unit": unit,
                "food": food,
                "extra": "",
            })
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
"""Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
# Normalize en-dash/em-dash ranges: "10 15" → "10-15"
line = re.sub(r"\s*[–—]\s*", "-", line)
# qty unit food (e.g. "2 dl tejföl", "½ tk őrölt kömény")
m = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
if m:
return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
# Just quantity + food (e.g. "2 tojás")
m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
if m2:
return (m2.group(1).strip(), "", m2.group(2).strip())
return ("", "", line)
def _split_qty_unit(raw: str) -> tuple[str, str]:
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
raw = raw.strip()
if not raw:
return ("", "")
m = re.match(r"^([0-9][0-9 .,/-]*)(.*)$", raw)
if m:
return (m.group(1).strip(), m.group(2).strip())
return ("", raw)
# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org microdata)
# ---------------------------------------------------------------------------
def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Best-effort recipe extraction for sites without a dedicated parser.

    Title, description and image come from og-tags (title falls back to
    <title>).  Ingredients, instructions and tags come from the first
    schema.org ``Recipe`` object found in any JSON-LD script; when none is
    found those three lists stay empty.
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")
    ingredients = []
    instructions = []
    tags = []

    # Try schema.org JSON-LD
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue
        recipe = _find_jsonld_recipe(data)
        if recipe is None:
            continue

        raw_ingredients = recipe.get("recipeIngredient", [])
        if isinstance(raw_ingredients, str):
            # A bare string is one ingredient, not a sequence of characters.
            raw_ingredients = [raw_ingredients]
        if isinstance(raw_ingredients, list):
            for line in raw_ingredients:
                food = str(line).strip()
                if food:
                    ingredients.append({
                        "quantity": "", "unit": "", "food": food, "extra": "",
                    })

        instructions = _jsonld_instruction_texts(recipe.get("recipeInstructions", []))

        # Extract keywords
        kw = recipe.get("keywords", "")
        if isinstance(kw, str):
            tags = [k.strip() for k in kw.split(",") if k.strip()]
        elif isinstance(kw, list):
            tags = [str(k).strip() for k in kw if str(k).strip()]
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


def _find_jsonld_recipe(data):
    """Return the first dict typed ``Recipe`` inside decoded JSON-LD *data*, or None."""
    if isinstance(data, list):
        for item in data:
            found = _find_jsonld_recipe(item)
            if found is not None:
                return found
        return None
    if not isinstance(data, dict):
        return None
    # @type may be a single string or a list of types.
    types = data.get("@type")
    if types == "Recipe" or (isinstance(types, list) and "Recipe" in types):
        return data
    if "@graph" in data:
        return _find_jsonld_recipe(data["@graph"])
    return None


def _jsonld_instruction_texts(raw) -> list:
    """Flatten a JSON-LD ``recipeInstructions`` value into non-empty step strings."""
    if isinstance(raw, str):
        text = raw.strip()
        return [text] if text else []
    if isinstance(raw, list):
        steps = []
        for item in raw:
            steps.extend(_jsonld_instruction_texts(item))
        return steps
    if isinstance(raw, dict):
        # A section groups its steps under itemListElement; a step has "text".
        if "itemListElement" in raw:
            return _jsonld_instruction_texts(raw["itemListElement"])
        text = raw.get("text", "")
        return _jsonld_instruction_texts(text) if isinstance(text, str) else []
    return []
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _extract_ingredient_comments(data: dict):
"""Move trailing (comment) from food field to extra field for all ingredients."""
for ing in data.get("ingredients", []):
if "group" in ing:
continue
food = ing.get("food", "")
extra = ing.get("extra", "")
if food and not extra:
m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
if m:
ing["food"] = m.group(1).strip()
ing["extra"] = m.group(2).strip()
def _host(url: str) -> str:
from urllib.parse import urlparse
return urlparse(url).hostname or ""
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the non-empty ``content`` of the <meta property=prop> tag, else None."""
    meta = soup.find("meta", property=prop)
    if not meta:
        return None
    content = meta.get("content")
    return meta["content"] if content else None
def _text(el) -> str:
if el is None:
return ""
return el.get_text(strip=True)