Files
recipe-importer/app/scraper.py
T
admin a0bcb62588 v0.6.0: Sobors.hu parser, HTTP auth, recipe validation, UI polish
- New sobors.hu parser with ingredient groups and section headers
- Incomplete recipe warnings (missing ingredients/instructions)
- Optional HTTP Basic Auth (configurable on settings page)
- Brand text: "Recept" in white, "Importáló" in blue
- Larger logo (36px), favicon using logo_notext.svg

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 18:07:05 +01:00

597 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""
import json
import re
import requests
from bs4 import BeautifulSoup
# Request headers passed to requests.get() by scrape(): a descriptive
# User-Agent plus an Accept-Language that ranks Hungarian above English.
_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}
# Maps a substring of the hostname to a parser function.
# Order matters: first match wins.
# Entries are appended by the _register decorator, so the order of this list
# is the order in which the decorated parser functions are defined.
_PARSERS: list[tuple[str, "callable"]] = []
def _register(host_substring: str):
    """Build a decorator that files a parser under *host_substring*.

    Applying the returned decorator appends ``(host_substring, function)``
    to ``_PARSERS`` and hands the function back unchanged, so the parser
    stays callable under its own name.
    """
    def _add_parser(parser_fn):
        entry = (host_substring, parser_fn)
        _PARSERS.append(entry)
        return parser_fn

    return _add_parser
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def scrape(url: str) -> dict:
    """Download *url* and turn the page into a recipe dict.

    The first entry of ``_PARSERS`` whose host substring occurs in the URL's
    hostname parses the page. When none matches, the generic schema.org /
    og-tag extractor is used instead, so unknown sites do not raise here.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    The site-specific parsers may also put ``{"group": str}`` header entries
    into "ingredients".

    Errors from ``requests.get`` and from ``raise_for_status()`` (error HTTP
    status) propagate to the caller unchanged.
    """
    response = requests.get(url, headers=_HEADERS, timeout=30)
    response.raise_for_status()
    # Decode with the detected charset; fall back to UTF-8 if detection
    # yields nothing.
    response.encoding = response.apparent_encoding or "utf-8"
    soup = BeautifulSoup(response.text, "lxml")

    hostname = _host(url)
    # First registered match wins; the generic extractor is the default.
    parser = next(
        (fn for key, fn in _PARSERS if key in hostname),
        _parse_generic,
    )
    return parser(soup, url)
def supported_sites() -> list[str]:
    """List the hostname substrings that have a registered parser, in registration order."""
    return [entry[0] for entry in _PARSERS]
# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------
@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed mindmegette.hu recipe page.

    *url* is only echoed back as "original_url". "ingredients" mixes
    ``{"group": name}`` header entries with quantity/unit/food/extra rows.
    """
    page_title = _og(soup, "og:title") or _text(soup.find("title"))
    if page_title:
        # Drop the trailing " | Mindmegette.hu" site suffix.
        page_title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", page_title).strip()

    # --- Ingredients ---
    # There may be several div.ingredients containers, one per group; an
    # optional <strong class="ingredients-group"> inside carries the title,
    # e.g. "A habaráshoz:".
    ingredients = []
    for container in soup.find_all("div", class_="ingredients"):
        heading = container.find("strong", class_="ingredients-group")
        if heading:
            heading_text = _text(heading).rstrip(":").strip()
            if heading_text:
                ingredients.append({"group": heading_text})

        for row in container.find_all("div", class_="ingredients-meta"):
            # Row layout: <strong>qty</strong> <span>unit</span>
            #             <a class="ingredients-link">name</a> <small>(extra)</small>
            # The unit is the first <span> without a class attribute.
            unit_span = next(
                (span for span in row.find_all("span") if not span.get("class")),
                None,
            )
            note_el = row.find("small") or row.find("span", class_="extra")

            name = _text(row.find("a", class_="ingredients-link"))
            if not name:
                # No linked name: fall back to the text of the whole row.
                name = row.get_text(separator=" ", strip=True)
            if not name:
                continue

            ingredients.append({
                "quantity": _text(row.find("strong")),
                "unit": _text(unit_span),
                "food": name,
                "extra": _text(note_el).strip("() "),
            })

    # --- Instructions ---
    steps = []
    rich_box = soup.find("mindmegette-wysiwyg-box")
    if rich_box:
        steps = [text for text in map(_text, rich_box.find_all("li")) if text]
    if not steps:
        # Fallback: the first <ol> inside each div.block-content.
        for block in soup.find_all("div", class_="block-content"):
            ordered = block.find("ol")
            if not ordered:
                continue
            steps.extend(
                text for text in map(_text, ordered.find_all("li")) if text
            )

    # --- Tags ---
    tag_names = []
    wrapper = soup.select_one("div.desktop-wrapper")
    if wrapper:
        labels = (anchor.get_text(strip=True) for anchor in wrapper.select("a.tag"))
        tag_names = [label for label in labels if label]

    return {
        "title": page_title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": ingredients,
        "instructions": steps,
        "tags": tag_names,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# streetkitchen.hu
# ---------------------------------------------------------------------------
@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed streetkitchen.hu recipe page.

    Args:
        soup: Parsed page.
        url: Address the page came from; echoed back as "original_url".

    Returns:
        Dict with the keys "title", "description", "image_url",
        "ingredients", "instructions", "tags" and "original_url".
        "ingredients" mixes ``{"group": name}`` header entries with
        ``{"quantity", "unit", "food", "extra"}`` rows.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Drop the trailing " | Street Kitchen" site suffix.
        title = re.sub(r"\s*\|\s*Street Kitchen$", "", title).strip()
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Find the main ingredient grid (grid-cols-1 lg:grid-cols-2).
    # The page renders ingredients twice (mobile + desktop); we pick the
    # specific grid to avoid duplicates.
    ingredients = []
    ing_grid = None
    for g in soup.select("div.grid"):
        cls = " ".join(g.get("class", []))
        if "grid-cols-1" in cls and "lg:grid-cols-2" in cls:
            ing_grid = g
            break
    if ing_grid:
        # Walk top-level divs — each may contain an h5 group header + rows
        for section in ing_grid.find_all("div", recursive=False):
            h5 = section.find("h5")
            if h5:
                group_name = h5.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})
            for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
                inner = row.select_one("div.flex.items-center.gap-2")
                if not inner:
                    continue
                divs = inner.find_all("div", recursive=False)
                bold = inner.find("div", class_="font-bold")
                food = bold.get_text(strip=True) if bold else ""
                if not food:
                    continue
                # First non-bold div is quantity+unit merged (e.g. "200g", "1fej")
                qty_raw = ""
                extra = ""
                for d in divs:
                    # Identity, not ==: Tag equality compares structure, and
                    # only the food element itself should be skipped.
                    if d is bold:
                        continue
                    txt = d.get_text(strip=True)
                    if txt.startswith("(") and txt.endswith(")"):
                        extra = txt.strip("() ")
                    elif not qty_raw:
                        qty_raw = txt
                # Split "200g" → qty="200", unit="g"
                qty, unit = _split_qty_unit(qty_raw)
                # Extract parenthesised note from inside food name
                # e.g. "fehérborecet (ízlés szerint)" → food="fehérborecet", extra="ízlés szerint"
                if not extra:
                    m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
                    if m:
                        food = m.group(1).strip()
                        extra = m.group(2).strip()
                ingredients.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": extra,
                })

    # --- Instructions ---
    # Step text is taken as get_text() with whitespace collapsed.
    # get_text(strip=True) would strip every text fragment on its own and
    # join them without a separator, gluing words across inline tags
    # ("Add <b>salt</b> now" -> "Addsaltnow").
    instructions = []
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))
    if prep:
        step_list = prep.find("ol") or prep.find("ul")
        if step_list:
            step_els = step_list.find_all("li", recursive=False)
        else:
            # Paragraph-style: <p> blocks, sometimes with <strong> headers
            step_els = prep.find_all("p")
        for el in step_els:
            txt = " ".join(el.get_text().split())
            if txt:
                instructions.append(txt)
    # If still nothing, try the description wrapper
    if not instructions:
        desc_article = soup.find("article", id="Streetk_content_description_wrapper")
        if desc_article:
            for p in desc_article.find_all("p"):
                txt = " ".join(p.get_text().split())
                if txt:
                    instructions.append(txt)

    # --- Tags ---
    # Taken from recipeCategory of the first JSON-LD Recipe node that yields
    # any (a comma-separated string or a list).
    tags = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue
        graph = data.get("@graph", [data]) if isinstance(data, dict) else data
        if isinstance(graph, dict):
            graph = [graph]
        if not isinstance(graph, list):
            continue
        for item in graph:
            if not isinstance(item, dict):
                continue
            # "@type" may be a single string or a list of type names.
            node_type = item.get("@type")
            node_types = node_type if isinstance(node_type, list) else [node_type]
            if "Recipe" not in node_types:
                continue
            cat = item.get("recipeCategory", "")
            if isinstance(cat, str) and cat:
                tags = [t.strip() for t in cat.split(",") if t.strip()]
            elif isinstance(cat, list):
                tags = [str(t).strip() for t in cat if str(t).strip()]
            break
        if tags:
            break

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# nosalty.hu
# ---------------------------------------------------------------------------
@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed nosalty.hu recipe page.

    Args:
        soup: Parsed page.
        url: Address the page came from; echoed back as "original_url".

    Returns:
        Dict with the keys "title", "description", "image_url",
        "ingredients", "instructions", "tags" and "original_url".
        Section headers appear in "ingredients" as ``{"group": name}`` and
        in "instructions" as ``"--- name ---"`` strings.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Keep only what precedes the first "|" (site suffix).
        title = re.sub(r"\s*\|.*$", "", title).strip()

    # Story as description (no dedicated description on nosalty).
    # Text is taken as get_text() with whitespace collapsed:
    # get_text(strip=True) strips each fragment separately and joins them
    # with no separator, which glues words across inline tags such as links.
    description = ""
    story = soup.find("div", id="recipe-story")
    if story:
        paragraphs = [" ".join(p.get_text().split()) for p in story.find_all("p")]
        description = " ".join(text for text in paragraphs if text)
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Scoped to div#ingredients to avoid per-serving / nutrition duplicates.
    # Structure: h3.m-list__title = group header, ul.m-list__list = ingredient rows.
    ingredients = []
    ing_container = soup.find("div", id="ingredients")
    if ing_container:
        for el in ing_container.find_all(["h3", "ul"]):
            cls = el.get("class") or []
            if el.name == "h3" and "m-list__title" in cls:
                group_name = el.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})
            elif el.name == "ul" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(li, ingredients)

    # --- Instructions ---
    # Container: div#select inside div.p-recipe__directions.
    # h4.m-list__title = section header, ol.m-list__list = steps.
    instructions = []
    dir_container = soup.find("div", id="select")
    if dir_container:
        for el in dir_container.find_all(["h4", "ol"]):
            cls = el.get("class") or []
            if el.name == "h4" and "m-list__title" in cls:
                section_name = " ".join(el.get_text().split())
                if section_name:
                    instructions.append(f"--- {section_name} ---")
            elif el.name == "ol" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    # Whitespace-normalised so inline markup inside a step
                    # does not fuse neighbouring words.
                    txt = " ".join(li.get_text().split())
                    if txt:
                        instructions.append(txt)

    # --- Tags ---
    # Scoped to div.p-recipe__attributeList to avoid site-wide SEO tags.
    tags = []
    attr_list = soup.find("div", class_="p-recipe__attributeList")
    if attr_list:
        for a in attr_list.find_all("a", class_="m-tags__tagItem"):
            tag_text = a.get_text(strip=True)
            if tag_text:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
def _parse_nosalty_ingredient(li, ingredients: list):
    """Append the ingredient described by one nosalty <li> to *ingredients*.

    Nothing is appended when the item has no inner <div>, no
    <a class="a-link"> inside it, or the link text is empty.
    """
    wrapper = li.find("div")
    link = wrapper.find("a", class_="a-link") if wrapper else None
    name = link.get_text(strip=True) if link else ""
    if not name:
        return

    # Direct children of the wrapper are visited in document order: text
    # found before the link is the merged quantity+unit (the last non-empty
    # one wins), text found after it is collected as notes.
    amount_text = ""
    notes = []
    seen_link = False
    for node in wrapper.children:
        if node is link:
            seen_link = True
            continue
        if not hasattr(node, "get_text"):
            continue
        piece = node.get_text(strip=True)
        if not piece:
            continue
        if seen_link:
            notes.append(piece.strip("() "))
        else:
            amount_text = piece

    quantity, unit = _split_qty_unit(amount_text)
    ingredients.append({
        "quantity": quantity,
        "unit": unit,
        "food": name,
        # Notes that were nothing but brackets/spaces are dropped.
        "extra": "; ".join(note for note in notes if note),
    })
# ---------------------------------------------------------------------------
# sobors.hu
# ---------------------------------------------------------------------------
@_register("sobors")
def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed sobors.hu recipe page.

    Args:
        soup: Parsed page.
        url: Address the page came from; echoed back as "original_url".

    Returns:
        Dict with the keys "title", "description", "image_url",
        "ingredients", "instructions", "tags" and "original_url".
        Section headers appear in "ingredients" as ``{"group": name}`` and
        in "instructions" as ``"--- name ---"`` strings.
    """
    # Title: h3.recept_nev, falling back to og:title / <title>.
    title = ""
    title_el = soup.find("h3", class_="recept_nev")
    if title_el:
        title = title_el.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Drop a trailing "- SóBors ..." / "| SóBors ..." site suffix.
        title = re.sub(r"\s*[-|]\s*SóBors.*$", "", title, flags=re.IGNORECASE).strip()
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Container: div.hozzavalok-container
    # Groups: section > h4 (group header), section > ul > li
    # Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
    ingredients = []
    ing_container = soup.find("div", class_="hozzavalok-container")
    if ing_container:
        for section in ing_container.find_all("section"):
            h4 = section.find("h4")
            if h4:
                group_name = h4.get_text(strip=True).rstrip(":")
                if group_name:
                    ingredients.append({"group": group_name})
            for li in section.find_all("li"):
                qty_el = li.find("span", class_="mennyiseg")
                unit_el = li.find("span", class_="mertekegyseg")
                food_el = li.find("span", class_="hozzavalo")
                food = _text(food_el)
                if not food:
                    continue
                qty = _text(qty_el)
                unit = _text(unit_el)
                ingredients.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": "",
                })

    # --- Instructions ---
    # Container: div.recept_leiras.recept_he-elkeszites
    # Content: <p> tags for steps, <h3><strong>Section</strong></h3> for section headers
    instructions = []
    inst_container = soup.find("div", class_="recept_leiras")
    if inst_container:
        for el in inst_container.find_all(["h3", "p"]):
            # get_text() with whitespace collapsed, not get_text(strip=True):
            # the latter strips each fragment on its own and joins them with
            # no separator, gluing words across inline tags.
            txt = " ".join(el.get_text().split())
            if not txt:
                continue
            if el.name == "h3":
                instructions.append(f"--- {txt} ---")
            else:
                # Strip leading numbering like "1. " from reader recipes
                instructions.append(re.sub(r"^\d+\.\s+", "", txt))

    # --- Tags ---
    # Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
    # Skip the generic "Receptek" category tag and "Olvasói receptek" tag
    tags = []
    tag_container = soup.find("div", class_="cikk-cimkek")
    if tag_container:
        tag_list = tag_container.find("ul", class_="cikk-cimkek-list")
        if tag_list:
            skip = {"receptek", "olvasói receptek"}
            for a in tag_list.find_all("a"):
                tag_text = a.get_text(strip=True)
                if tag_text and tag_text.lower() not in skip:
                    tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
# Characters that may start a quantity: ASCII digits plus the Unicode vulgar
# fractions U+00BC-U+00BE (¼ ½ ¾) and U+2150-U+215E (⅐ … ⅞).
_QTY_CHARS = r"0-9\u00bc-\u00be\u2150-\u215e"
# Group 1: a quantity made of those characters plus space . , / - separators;
# group 2: everything after it. DOTALL lets the remainder span a line break.
_QTY_UNIT_RE = re.compile(
    rf"^([{_QTY_CHARS}][{_QTY_CHARS} .,/-]*)(.*)$",
    re.DOTALL,
)


def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g').

    The quantity is the leading run of digits, vulgar fractions and the
    separators `` .,/-`` (so "1-2", "1,5", "1/2" and "1½" all count); the
    rest, stripped, is the unit.

    Args:
        raw: Text such as "200g", "1 fej" or "½ kg". Empty or None is
            treated as "no quantity, no unit".

    Returns:
        ``(quantity, unit)``. When *raw* does not start with a digit or a
        fraction the whole text is returned as the unit with an empty
        quantity; blank input gives ``("", "")``.
    """
    raw = (raw or "").strip()
    if not raw:
        return ("", "")
    match = _QTY_UNIT_RE.match(raw)
    if match is None:
        return ("", raw)
    return (match.group(1).strip(), match.group(2).strip())
# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org microdata)
# ---------------------------------------------------------------------------
def _find_jsonld_recipe(data):
    """Return the first node typed "Recipe" in decoded JSON-LD *data*, or None.

    Looks through top-level arrays and "@graph" containers; "@type" may be a
    single string or a list of type names.
    """
    if isinstance(data, list):
        for entry in data:
            found = _find_jsonld_recipe(entry)
            if found is not None:
                return found
        return None
    if not isinstance(data, dict):
        return None
    node_type = data.get("@type")
    node_types = node_type if isinstance(node_type, list) else [node_type]
    if "Recipe" in node_types:
        return data
    return _find_jsonld_recipe(data.get("@graph"))


def _jsonld_instruction_texts(raw) -> list[str]:
    """Flatten a JSON-LD recipeInstructions value into a list of step strings.

    Accepts a plain string, a single step object, or a list mixing strings,
    step objects (their "text", else "name") and sections carrying
    "itemListElement". A named section is introduced by a "--- name ---"
    entry, the header form the site-specific parsers in this file use.
    """
    if isinstance(raw, (str, dict)):
        # A lone string must not be iterated character by character.
        raw = [raw]
    elif not isinstance(raw, list):
        return []
    steps = []
    for item in raw:
        if isinstance(item, str):
            text = item.strip()
            if text:
                steps.append(text)
        elif isinstance(item, dict):
            nested = item.get("itemListElement")
            if nested:
                name = item.get("name")
                if isinstance(name, str) and name.strip():
                    steps.append(f"--- {name.strip()} ---")
                steps.extend(_jsonld_instruction_texts(nested))
            else:
                text = item.get("text") or item.get("name") or ""
                if isinstance(text, str) and text.strip():
                    steps.append(text.strip())
    return steps


def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Best-effort extraction for sites without a dedicated parser.

    Title, description and image come from og: meta tags (title falls back
    to <title>, then to "Ismeretlen recept"). Ingredients, instructions and
    tags come from the first JSON-LD node typed "Recipe"; scripts that fail
    to decode are skipped. Ingredient lines are not split, so each one is
    stored whole in "food" with empty quantity/unit/extra.

    Args:
        soup: Parsed page.
        url: Address the page came from; echoed back as "original_url".

    Returns:
        Dict with the keys "title", "description", "image_url",
        "ingredients", "instructions", "tags" and "original_url". The three
        lists are empty when no Recipe node is found.
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")
    ingredients = []
    instructions = []
    tags = []

    # Try schema.org JSON-LD
    recipe = None
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue
        recipe = _find_jsonld_recipe(data)
        if recipe is not None:
            break

    if recipe is not None:
        raw_ingredients = recipe.get("recipeIngredient") or []
        if isinstance(raw_ingredients, str):
            # A lone string is one ingredient line, not a sequence of chars.
            raw_ingredients = [raw_ingredients]
        elif not isinstance(raw_ingredients, list):
            raw_ingredients = []
        for line in raw_ingredients:
            food = str(line).strip()
            if food:
                ingredients.append({
                    "quantity": "", "unit": "", "food": food, "extra": "",
                })

        instructions = _jsonld_instruction_texts(recipe.get("recipeInstructions"))

        # Extract keywords
        kw = recipe.get("keywords", "")
        if isinstance(kw, str):
            tags = [k.strip() for k in kw.split(",") if k.strip()]
        elif isinstance(kw, list):
            tags = [str(k).strip() for k in kw if str(k).strip()]

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _host(url: str) -> str:
    """Return the hostname component of *url*, or "" when it has none."""
    from urllib.parse import urlparse

    hostname = urlparse(url).hostname
    if not hostname:
        return ""
    return hostname
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the ``content`` of the <meta> tag whose ``property`` is *prop*.

    Gives None when there is no such tag or its content is missing or empty.
    """
    meta = soup.find("meta", property=prop)
    if not meta:
        return None
    content = meta.get("content")
    return content if content else None
def _text(el) -> str:
    """Return the text of *el* with whitespace normalised.

    Args:
        el: An element exposing ``get_text()``, or None.

    Returns:
        "" for None; otherwise the element's text with leading/trailing
        whitespace removed and every inner whitespace run collapsed to a
        single space.
    """
    if el is None:
        return ""
    # Not get_text(strip=True): that strips each text fragment separately and
    # joins them with no separator, so "Add <b>salt</b> now" would come out
    # as "Addsaltnow". Normalising the raw text keeps the page's own spacing.
    return " ".join(el.get_text().split())