Files
recipe-importer/app/scraper.py
T
admin 2408984421 Extract ingredient comments from food field, add import-to-both button
- Global post-processing in scrape() extracts trailing (comment) from
  ingredient food names into the extra/comment field. Works for all parsers.
- Added "Importálás mindkettőbe" button on single import page when both
  Mealie and Tandoor are configured.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 18:27:58 +01:00

682 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""
import json
import re
import requests
from bs4 import BeautifulSoup
# HTTP headers passed to requests.get() by scrape().
_HEADERS = {
    # Identifies this importer to the remote site.
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    # Prefer Hungarian content, then generic Hungarian, then English.
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}
# Maps a substring of the hostname to a parser function.
# Order matters: first match wins.
# Entries are appended by the _register() decorator as parsers are defined.
_PARSERS: list[tuple[str, "callable"]] = []
def _register(host_substring: str):
    """Return a decorator that records a parser under *host_substring*.

    The decorated function is appended to ``_PARSERS`` as a
    ``(host_substring, function)`` pair and handed back unchanged, so it
    stays callable under its own name.
    """
    def _remember(parser):
        entry = (host_substring, parser)
        _PARSERS.append(entry)
        return parser

    return _remember
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def scrape(url: str) -> dict:
    """Download *url* and convert the page into a recipe dict.

    The first entry of ``_PARSERS`` whose host substring occurs in the URL's
    hostname parses the page. When no entry matches, or the parser returns
    ``None``, the generic schema.org / og-tag extractor is used instead.
    Finally, trailing "(comment)" parts of ingredient food names are moved
    into the ``extra`` field.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    The site parsers may additionally place ``{"group": str}`` marker entries
    in "ingredients" to label ingredient groups.

    HTTP error statuses surface as exceptions through ``raise_for_status()``.
    Unknown hosts do not raise here; they go through the generic fallback.
    """
    response = requests.get(url, headers=_HEADERS, timeout=30)
    response.raise_for_status()
    # Trust the detected encoding over the declared one; default to UTF-8.
    response.encoding = response.apparent_encoding or "utf-8"
    soup = BeautifulSoup(response.text, "lxml")

    hostname = _host(url)
    # First registered parser whose substring appears in the hostname wins.
    site_parser = next(
        (fn for needle, fn in _PARSERS if needle in hostname),
        None,
    )
    recipe = site_parser(soup, url) if site_parser is not None else None
    if recipe is None:
        # Fallback: try generic schema.org / og-tag extraction
        recipe = _parse_generic(soup, url)

    # Post-process: extract parenthesized comments from food into extra
    _extract_ingredient_comments(recipe)
    return recipe
def supported_sites() -> list[str]:
    """List the hostname substrings that have a registered parser, in registration order."""
    substrings = []
    for entry in _PARSERS:
        substrings.append(entry[0])
    return substrings
# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------
@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Parse a mindmegette.hu recipe page into the common recipe dict.

    Title, description and image come from og: meta tags (title falls back
    to ``<title>``). Ingredients are read from every ``div.ingredients``
    container, instructions from the ``mindmegette-wysiwyg-box`` element
    (falling back to ``<ol>`` lists inside ``div.block-content``), and tags
    from ``a.tag`` links inside ``div.desktop-wrapper``.
    """
    page_title = _og(soup, "og:title") or _text(soup.find("title"))
    if page_title:
        # Strip " | Mindmegette.hu" suffix
        page_title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", page_title).strip()

    # --- Ingredients ---
    # Multiple div.ingredients containers may exist (one per group).
    # Group title: <strong class="ingredients-group">A habaráshoz:</strong>
    items = []
    for container in soup.find_all("div", class_="ingredients"):
        # _text() yields "" for a missing heading, so no separate None check.
        heading = _text(container.find("strong", class_="ingredients-group"))
        heading = heading.rstrip(":").strip()
        if heading:
            items.append({"group": heading})

        for row in container.find_all("div", class_="ingredients-meta"):
            # Row markup: <strong>qty</strong> <span>unit</span>
            # <a class="ingredients-link">name</a> <small>(extra)</small>
            # The unit is the first <span> that carries no class attribute.
            unit_span = next(
                (span for span in row.find_all("span") if not span.get("class")),
                None,
            )
            note_el = row.find("small") or row.find("span", class_="extra")

            name = _text(row.find("a", class_="ingredients-link"))
            if not name:
                # Fallback: grab whole row text
                name = row.get_text(separator=" ", strip=True)
            if not name:
                continue

            items.append({
                "quantity": _text(row.find("strong")),
                "unit": _text(unit_span),
                "food": name,
                "extra": _text(note_el).strip("() "),
            })

    # --- Instructions ---
    steps = []
    rich_box = soup.find("mindmegette-wysiwyg-box")
    if rich_box:
        steps = [text for text in map(_text, rich_box.find_all("li")) if text]
    if not steps:
        # Fallback: look for block-content divs
        for block in soup.find_all("div", class_="block-content"):
            ordered = block.find("ol")
            if ordered:
                steps.extend(
                    text for text in map(_text, ordered.find_all("li")) if text
                )

    # --- Tags ---
    labels = []
    wrapper = soup.select_one("div.desktop-wrapper")
    if wrapper:
        candidates = (link.get_text(strip=True) for link in wrapper.select("a.tag"))
        labels = [label for label in candidates if label]

    return {
        "title": page_title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": items,
        "instructions": steps,
        "tags": labels,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# streetkitchen.hu
# ---------------------------------------------------------------------------
@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Parse a streetkitchen.hu recipe page into the common recipe dict.

    Title, description and image come from og: meta tags. Ingredients are
    read from the first ``div.grid`` whose class string mentions both
    ``grid-cols-1`` and ``lg:grid-cols-2``; instructions from the preparation
    wrapper (falling back to the description article); tags from
    ``recipeCategory`` of a JSON-LD ``Recipe`` object.
    """
    recipe_title = _og(soup, "og:title") or _text(soup.find("title"))
    if recipe_title:
        recipe_title = re.sub(r"\s*\|\s*Street Kitchen$", "", recipe_title).strip()

    # --- Ingredients ---
    # The page renders ingredients twice (mobile + desktop); only the first
    # grid matching both class fragments is used to avoid duplicates.
    def _class_string(tag):
        return " ".join(tag.get("class", []))

    grid = next(
        (
            candidate
            for candidate in soup.select("div.grid")
            if "grid-cols-1" in _class_string(candidate)
            and "lg:grid-cols-2" in _class_string(candidate)
        ),
        None,
    )

    items = []
    # Walk top-level divs — each may contain an h5 group header + rows
    sections = grid.find_all("div", recursive=False) if grid else []
    for section in sections:
        header = section.find("h5")
        if header:
            heading = header.get_text(strip=True)
            if heading:
                items.append({"group": heading})

        for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
            cell = row.select_one("div.flex.items-center.gap-2")
            if not cell:
                continue
            name_div = cell.find("div", class_="font-bold")
            name = name_div.get_text(strip=True) if name_div else ""
            if not name:
                continue

            # Among the remaining direct child divs: a fully parenthesised
            # text is the note, the first other non-empty text is the merged
            # quantity+unit (e.g. "200g", "1fej").
            amount = ""
            note = ""
            for part in cell.find_all("div", recursive=False):
                if part == name_div:
                    continue
                text = part.get_text(strip=True)
                if text.startswith("(") and text.endswith(")"):
                    note = text.strip("() ")
                elif not amount:
                    amount = text

            # Split "200g" → qty="200", unit="g"
            quantity, unit = _split_qty_unit(amount)

            # Extract parenthesised note from inside food name
            # e.g. "fehérborecet (ízlés szerint)" → food="fehérborecet", extra="ízlés szerint"
            if not note:
                trailing = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", name)
                if trailing:
                    name = trailing.group(1).strip()
                    note = trailing.group(2).strip()

            items.append({
                "quantity": quantity,
                "unit": unit,
                "food": name,
                "extra": note,
            })

    # --- Instructions ---
    steps = []
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))
    if prep:
        # An <ol> wins over a <ul>; without either, fall back to paragraph
        # style (<p> blocks, sometimes with <strong> headers).
        step_list = prep.find("ol") or prep.find("ul")
        if step_list:
            nodes = step_list.find_all("li", recursive=False)
        else:
            nodes = prep.find_all("p")
        steps = [text for text in (n.get_text(strip=True) for n in nodes) if text]

    # If still nothing, try the description wrapper
    if not steps:
        desc_article = soup.find("article", id="Streetk_content_description_wrapper")
        if desc_article:
            paragraphs = (p.get_text(strip=True) for p in desc_article.find_all("p"))
            steps = [text for text in paragraphs if text]

    # --- Tags ---
    # Prefer recipeCategory from JSON-LD (comma-separated)
    labels = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(script.string or "")
            candidates = (
                payload.get("@graph", [payload])
                if isinstance(payload, dict)
                else payload
            )
            # Only the first Recipe object of each script block is consulted.
            recipe_node = next(
                (
                    node
                    for node in candidates
                    if isinstance(node, dict) and node.get("@type") == "Recipe"
                ),
                None,
            )
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue
        if recipe_node is None:
            continue
        category = recipe_node.get("recipeCategory", "")
        if isinstance(category, str) and category:
            labels = [part.strip() for part in category.split(",") if part.strip()]
        elif isinstance(category, list):
            labels = [str(part).strip() for part in category if str(part).strip()]

    return {
        "title": recipe_title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": items,
        "instructions": steps,
        "tags": labels,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# nosalty.hu
# ---------------------------------------------------------------------------
@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Parse a nosalty.hu recipe page into the common recipe dict.

    The description is the joined paragraph text of ``div#recipe-story``.
    Ingredients come from ``div#ingredients``, instructions from
    ``div#select`` and tags from ``div.p-recipe__attributeList``.
    """
    recipe_title = _og(soup, "og:title") or _text(soup.find("title"))
    if recipe_title:
        # Keep only the part before the first "|".
        recipe_title = re.sub(r"\s*\|.*$", "", recipe_title).strip()

    # Story as description (no dedicated description on nosalty)
    story_text = ""
    story_box = soup.find("div", id="recipe-story")
    if story_box:
        pieces = (p.get_text(strip=True) for p in story_box.find_all("p"))
        story_text = " ".join(piece for piece in pieces if piece)

    # --- Ingredients ---
    # Scoped to div#ingredients to avoid per-serving / nutrition duplicates.
    # Structure: h3.m-list__title = group header, ul.m-list__list = ingredient rows.
    items = []
    ing_box = soup.find("div", id="ingredients")
    if ing_box:
        for node in ing_box.find_all(["h3", "ul"]):
            classes = node.get("class") or []
            if node.name == "h3" and "m-list__title" in classes:
                heading = node.get_text(strip=True)
                if heading:
                    items.append({"group": heading})
            elif node.name == "ul" and "m-list__list" in classes:
                for entry in node.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(entry, items)

    # --- Instructions ---
    # Container: div#select inside div.p-recipe__directions.
    # h4.m-list__title = section header, ol.m-list__list = steps.
    steps = []
    directions = soup.find("div", id="select")
    if directions:
        for node in directions.find_all(["h4", "ol"]):
            classes = node.get("class") or []
            if node.name == "h4" and "m-list__title" in classes:
                section_title = node.get_text(strip=True)
                if section_title:
                    steps.append(f"--- {section_title} ---")
            elif node.name == "ol" and "m-list__list" in classes:
                step_texts = (
                    entry.get_text(strip=True)
                    for entry in node.find_all("li", class_="m-list__item")
                )
                steps.extend(text for text in step_texts if text)

    # --- Tags ---
    # Scoped to div.p-recipe__attributeList to avoid site-wide SEO tags.
    labels = []
    attributes = soup.find("div", class_="p-recipe__attributeList")
    if attributes:
        link_texts = (
            link.get_text(strip=True)
            for link in attributes.find_all("a", class_="m-tags__tagItem")
        )
        labels = [label for label in link_texts if label]

    return {
        "title": recipe_title or "Ismeretlen recept",
        "description": story_text,
        "image_url": _og(soup, "og:image"),
        "ingredients": items,
        "instructions": steps,
        "tags": labels,
        "original_url": url,
    }
def _parse_nosalty_ingredient(li, ingredients: list):
    """Append the ingredient described by one nosalty ``<li>`` to *ingredients*.

    Nothing is appended when the item has no inner ``<div>``, no
    ``a.a-link`` element, or when that link has no text.
    """
    inner = li.find("div")
    if not inner:
        return
    link = inner.find("a", class_="a-link")
    if not link:
        return
    name = link.get_text(strip=True)
    if not name:
        return

    # Walk the direct children in document order: text found before the link
    # is the quantity, text found after it is extra/note material.
    leading = []
    trailing = []
    bucket = leading
    for node in inner.children:
        if node is link:
            bucket = trailing
            continue
        if not hasattr(node, "get_text"):
            continue
        snippet = node.get_text(strip=True)
        if snippet:
            bucket.append(snippet)

    # When several pieces precede the link, the last one is the quantity.
    qty_raw = leading[-1] if leading else ""
    notes = [snippet.strip("() ") for snippet in trailing]
    quantity, unit = _split_qty_unit(qty_raw)
    ingredients.append({
        "quantity": quantity,
        "unit": unit,
        "food": name,
        "extra": "; ".join(note for note in notes if note),
    })
# ---------------------------------------------------------------------------
# sobors.hu
# ---------------------------------------------------------------------------
@_register("sobors")
def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
    """Parse a sobors.hu recipe page into the common recipe dict.

    Ingredients are read from ``div.hozzavalok-container`` (with an
    article-style fallback), instructions from ``div.recept_leiras`` and tags
    from ``div.cikk-cimkek``. When the instructions hold at most two entries
    and link to a page outside sobors.hu, that page is scraped as well and
    its instructions (and, if none were found locally, its ingredients)
    replace the local ones; any failure while doing so is ignored.
    """
    # Title: h3.recept_nev, else og:title / <title>; site suffix removed.
    heading = soup.find("h3", class_="recept_nev")
    recipe_title = heading.get_text(strip=True) if heading else ""
    if not recipe_title:
        recipe_title = _og(soup, "og:title") or _text(soup.find("title"))
    if recipe_title:
        recipe_title = re.sub(
            r"\s*[-|]\s*SóBors.*$", "", recipe_title, flags=re.IGNORECASE
        ).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Container: div.hozzavalok-container (structured recipe pages)
    # Groups: section > h4 (group header), section > ul > li
    # Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
    items = []
    structured = soup.find("div", class_="hozzavalok-container")
    sections = structured.find_all("section") if structured else []
    for section in sections:
        group_header = section.find("h4")
        if group_header:
            label = group_header.get_text(strip=True).rstrip(":")
            if label:
                items.append({"group": label})
        for entry in section.find_all("li"):
            name = _text(entry.find("span", class_="hozzavalo"))
            if not name:
                continue
            items.append({
                "quantity": _text(entry.find("span", class_="mennyiseg")),
                "unit": _text(entry.find("span", class_="mertekegyseg")),
                "food": name,
                "extra": "",
            })

    # Fallback: article-style ingredients (h4 group headers + ul > li plain text)
    # Some sobors.hu pages (especially linked recipes) use this simpler format.
    if not items:
        article = soup.find("div", class_="cikk-torzs") or soup.find("article")
        if article:
            _parse_sobors_article_ingredients(article, items)

    # --- Instructions ---
    # Container: div.recept_leiras.recept_he-elkeszites
    # Content: <p> tags for steps, <h3><strong>Section</strong></h3> for section headers
    steps = []
    external_link = None
    body = soup.find("div", class_="recept_leiras")
    if body:
        # First absolute link that leaves sobors.hu (linked recipe pattern —
        # e.g. "click here for full recipe on kiskegyed.hu").
        external_link = next(
            (
                anchor["href"]
                for anchor in body.find_all("a", href=True)
                if anchor["href"].startswith("http")
                and "sobors.hu" not in anchor["href"]
            ),
            None,
        )
        for node in body.find_all(["h3", "p"]):
            content = node.get_text(strip=True)
            if not content:
                continue
            if node.name == "h3":
                steps.append(f"--- {content} ---")
            elif node.name == "p":
                # Strip leading numbering like "1. " from reader recipes
                steps.append(re.sub(r"^\d+\.\s+", "", content))

    # If instructions just contain a redirect to another site, try to follow
    # the link and scrape the real recipe from there.
    if external_link and len(steps) <= 2:
        try:
            linked = scrape(external_link)
            linked_steps = linked.get("instructions")
            if linked_steps:
                steps = linked_steps
            linked_items = linked.get("ingredients")
            if not items and linked_items:
                items = linked_items
        except Exception:
            pass  # keep whatever we scraped from sobors.hu

    # --- Tags ---
    # Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
    # Skip the generic "Receptek" category tag and "Olvasói receptek" tag
    labels = []
    tag_box = soup.find("div", class_="cikk-cimkek")
    tag_list = tag_box.find("ul", class_="cikk-cimkek-list") if tag_box else None
    if tag_list:
        ignored = {"receptek", "olvasói receptek"}
        for anchor in tag_list.find_all("a"):
            label = anchor.get_text(strip=True)
            if label and label.lower() not in ignored:
                labels.append(label)

    return {
        "title": recipe_title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": items,
        "instructions": steps,
        "tags": labels,
        "original_url": url,
    }
def _parse_sobors_article_ingredients(container, ingredients: list):
    """Collect article-style sobors.hu ingredients into *ingredients*.

    Every ``<h4>`` whose text does not start with "hozzávalók" becomes a
    ``{"group": ...}`` entry. A ``<ul>`` is read only when its previous
    sibling element is an ``<h4>``; each non-empty ``<li>`` text is split by
    ``_parse_ingredient_line`` and appended as an ingredient.
    """
    for node in container.find_all(["h4", "ul"]):
        if node.name == "h4":
            heading = node.get_text(strip=True).rstrip(":")
            if heading and not heading.lower().startswith("hozzávalók"):
                ingredients.append({"group": heading})
            continue
        if node.name != "ul":
            continue

        # Only consider lists that directly follow an h4.
        previous = node.find_previous_sibling()
        if not (previous and previous.name == "h4"):
            continue

        for entry in node.find_all("li"):
            line = entry.get_text(strip=True)
            if not line:
                continue
            quantity, unit, name = _parse_ingredient_line(line)
            ingredients.append({
                "quantity": quantity,
                "unit": unit,
                "food": name,
                "extra": "",
            })
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
    """Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food).

    Three shapes are recognised, tried in this order:

    * quantity + unit + food  ('2 dl tejföl', '200g liszt', '½ dl tej')
    * quantity + food         ('12 tojás')
    * anything else           -> ('', '', line) with the line left untouched

    The quantity may start with a digit or one of ½ ¼ ¾ and may continue
    with digits, those fractions and the separators ``. , / -``.
    """
    quantity = r"[0-9½¼¾][0-9.,/½¼¾-]*"
    # The unit must start with a letter. Allowing any non-space character let
    # the regex engine backtrack and split a multi-digit quantity in two, so
    # "12 tojás" was parsed as qty="1", unit="2". ½ ¼ ¾ are excluded
    # explicitly because Python's \w matches them.
    unit = r"[^\W\d_½¼¾]\S*"

    with_unit = re.match(rf"^({quantity})\s*({unit})\s+(.+)$", line)
    if with_unit:
        qty, unit_text, food = with_unit.groups()
        return (qty.strip(), unit_text.strip(), food.strip())

    # Just quantity + food (e.g. "2 tojás")
    without_unit = re.match(rf"^({quantity})\s+(.+)$", line)
    if without_unit:
        return (without_unit.group(1).strip(), "", without_unit.group(2).strip())

    return ("", "", line)
def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g').

    The quantity is the leading run that starts with a digit or one of
    ½ ¼ ¾ and continues with digits, those fractions, spaces and the
    separators ``. , / -``; everything after it is the unit. Both parts are
    returned stripped. Input without a leading quantity yields ``("", raw)``;
    empty or ``None`` input yields ``("", "")``.
    """
    raw = (raw or "").strip()
    if not raw:
        return ("", "")
    # ½ ¼ ¾ are accepted here just as in _parse_ingredient_line; without them
    # "½ dl" ended up entirely in the unit with an empty quantity.
    match = re.match(r"^([0-9½¼¾][0-9½¼¾ .,/-]*)(.*)$", raw)
    if match:
        return (match.group(1).strip(), match.group(2).strip())
    return ("", raw)
# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org microdata)
# ---------------------------------------------------------------------------
def _jsonld_find_recipe(node):
    """Return the first schema.org Recipe object inside parsed JSON-LD, or None.

    Looks through top-level lists and ``@graph`` containers, and accepts
    ``@type`` given either as a string or as a list of strings.
    """
    if isinstance(node, list):
        for entry in node:
            found = _jsonld_find_recipe(entry)
            if found is not None:
                return found
        return None
    if not isinstance(node, dict):
        return None
    kind = node.get("@type")
    kinds = kind if isinstance(kind, list) else [kind]
    if "Recipe" in kinds:
        return node
    return _jsonld_find_recipe(node.get("@graph"))


def _jsonld_instruction_steps(raw) -> list[str]:
    """Flatten a ``recipeInstructions`` value into a list of non-empty step strings.

    Accepts a plain string, a step object with a ``text`` field, a section
    object with ``itemListElement`` (its ``name`` becomes a "--- name ---"
    header, like the section headers emitted by the site parsers), or a list
    of any of these. Anything else contributes no steps.
    """
    if isinstance(raw, str):
        text = raw.strip()
        return [text] if text else []
    if isinstance(raw, list):
        steps = []
        for item in raw:
            steps.extend(_jsonld_instruction_steps(item))
        return steps
    if isinstance(raw, dict):
        nested = raw.get("itemListElement")
        if nested is not None:
            steps = _jsonld_instruction_steps(nested)
            name = raw.get("name")
            if steps and isinstance(name, str) and name.strip():
                steps.insert(0, f"--- {name.strip()} ---")
            return steps
        text = raw.get("text")
        return _jsonld_instruction_steps(text) if isinstance(text, str) else []
    return []


def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe from og: meta tags and schema.org JSON-LD.

    Title, description and image come from og: tags (title falls back to
    ``<title>`` and then to "Ismeretlen recept"). Ingredients, instructions
    and tags come from the first JSON-LD ``Recipe`` object found in any
    ``<script type="application/ld+json">`` block; they stay empty when the
    page has none. Ingredient lines are stored whole in ``food`` with empty
    quantity and unit.

    Script blocks that are not valid JSON or contain no Recipe are skipped
    rather than raising.
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")
    ingredients = []
    instructions = []
    tags = []

    # Try schema.org JSON-LD
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue
        recipe = _jsonld_find_recipe(data)
        if recipe is None:
            continue

        # recipeIngredient is normally a list; a lone string is treated as a
        # single ingredient instead of being iterated character by character.
        raw_ingredients = recipe.get("recipeIngredient") or []
        if isinstance(raw_ingredients, str):
            raw_ingredients = [raw_ingredients]
        if isinstance(raw_ingredients, list):
            for line in raw_ingredients:
                # Non-string entries are dropped: later post-processing runs
                # a regex over "food" and would fail on them.
                if isinstance(line, str) and line.strip():
                    ingredients.append({
                        "quantity": "", "unit": "", "food": line.strip(), "extra": "",
                    })

        instructions = _jsonld_instruction_steps(recipe.get("recipeInstructions"))

        # Extract keywords
        kw = recipe.get("keywords", "")
        if isinstance(kw, str):
            tags = [k.strip() for k in kw.split(",") if k.strip()]
        elif isinstance(kw, list):
            tags = [str(k).strip() for k in kw if str(k).strip()]
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _extract_ingredient_comments(data: dict):
    """Split a trailing "(comment)" off each ingredient's food name, in place.

    For every entry of ``data["ingredients"]`` that is not a group marker,
    has a non-empty ``food`` and an empty or missing ``extra``: when the food
    text ends in a parenthesised part, that part (without the parentheses)
    is stored in ``extra`` and removed from ``food``.
    """
    trailing_note = re.compile(r"^(.+?)\s*\(([^)]+)\)\s*$")
    for entry in data.get("ingredients", []):
        if "group" in entry:
            continue
        name = entry.get("food", "")
        if not name or entry.get("extra", ""):
            continue
        found = trailing_note.match(name)
        if found is None:
            continue
        base, note = found.groups()
        entry["food"] = base.strip()
        entry["extra"] = note.strip()
def _host(url: str) -> str:
    """Return the hostname component of *url*, or "" when it has none."""
    from urllib.parse import urlparse

    hostname = urlparse(url).hostname
    return hostname if hostname else ""
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the ``content`` of the ``<meta property=prop>`` tag.

    Yields ``None`` when the tag is missing or its content is empty.
    """
    meta = soup.find("meta", property=prop)
    if not meta:
        return None
    if not meta.get("content"):
        return None
    return meta["content"]
def _text(el) -> str:
    """Return the stripped text of *el*, or "" when *el* is ``None``."""
    return "" if el is None else el.get_text(strip=True)