Files
recipe-importer/app/scraper.py
T
admin 73a2319f5a Add nosalty.hu parser
Extracts ingredients (with groups), instructions (with section
headers), tags, and story-as-description from nosalty.hu recipe pages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 16:40:16 +01:00

502 lines
17 KiB
Python

"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""
import json
import re
import requests
from bs4 import BeautifulSoup
# HTTP headers sent with every page fetch: an identifying User-Agent and a
# preference for Hungarian-language content.
_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}

# Parser registry: (hostname substring, parser function) pairs.
# The list is searched front to back and the first substring that occurs in
# the hostname wins, so registration order is significant.
_PARSERS: list[tuple[str, "callable"]] = []
def _register(host_substring: str):
    """Build a decorator that adds a parser to the ``_PARSERS`` registry.

    The decorated function is stored as ``(host_substring, fn)`` and handed
    back unchanged, so it remains callable under its own name.
    """
    def _add_to_registry(parser_fn):
        entry = (host_substring, parser_fn)
        _PARSERS.append(entry)
        return parser_fn

    return _add_to_registry
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def scrape(url: str) -> dict:
    """Download *url* and parse the page into a recipe dict.

    The parser is the first ``_PARSERS`` entry whose substring occurs in the
    URL's hostname. When no entry matches, ``_parse_generic`` is used.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    Raises whatever ``requests`` raises for a failed fetch, including the
    error from ``raise_for_status()`` on an HTTP error response.

    NOTE(review): an unsupported site does not raise here; it is handed to
    the generic fallback parser.
    """
    response = requests.get(url, headers=_HEADERS, timeout=30)
    response.raise_for_status()
    # Decode with the detected (apparent) encoding, defaulting to UTF-8.
    response.encoding = response.apparent_encoding or "utf-8"
    soup = BeautifulSoup(response.text, "lxml")

    hostname = _host(url)
    # First registered substring found in the hostname wins; otherwise fall
    # back to generic schema.org / og-tag extraction.
    site_parser = next(
        (fn for needle, fn in _PARSERS if needle in hostname),
        _parse_generic,
    )
    return site_parser(soup, url)
def supported_sites() -> list[str]:
    """List the hostname substrings that have a registered parser, in registration order."""
    return [entry[0] for entry in _PARSERS]
# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------
@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Parse a mindmegette.hu recipe page into a recipe dict.

    The result has the keys "title", "description", "image_url",
    "ingredients", "instructions", "tags" and "original_url". A named
    ingredient group is emitted as a ``{"group": name}`` marker entry placed
    ahead of the ingredients of its container.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Drop the trailing " | Mindmegette.hu" site suffix.
        title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip()

    # --- Ingredients ---
    # There may be several div.ingredients containers, one per group. The
    # optional heading looks like
    # <strong class="ingredients-group">A habaráshoz:</strong>.
    ingredients = []
    for container in soup.find_all("div", class_="ingredients"):
        heading = container.find("strong", class_="ingredients-group")
        if heading:
            group_name = _text(heading).rstrip(":").strip()
            if group_name:
                ingredients.append({"group": group_name})

        for row in container.find_all("div", class_="ingredients-meta"):
            # Row markup: <strong>qty</strong> <span>unit</span>
            #             <a class="ingredients-link">name</a> <small>(extra)</small>
            # The unit is the first <span> that carries no class attribute.
            unit_span = next(
                (span for span in row.find_all("span") if not span.get("class")),
                None,
            )
            note_el = row.find("small") or row.find("span", class_="extra")

            food = _text(row.find("a", class_="ingredients-link"))
            if not food:
                # No linked name: use the text of the whole row instead.
                food = row.get_text(separator=" ", strip=True)
            if not food:
                continue

            ingredients.append({
                "quantity": _text(row.find("strong")),
                "unit": _text(unit_span),
                "food": food,
                "extra": _text(note_el).strip("() "),
            })

    # --- Instructions ---
    instructions = []
    wysiwyg_box = soup.find("mindmegette-wysiwyg-box")
    if wysiwyg_box:
        step_texts = (_text(item) for item in wysiwyg_box.find_all("li"))
        instructions = [step for step in step_texts if step]

    # Fallback: ordered lists inside div.block-content.
    if not instructions:
        for block_div in soup.find_all("div", class_="block-content"):
            ordered = block_div.find("ol")
            if not ordered:
                continue
            step_texts = (_text(item) for item in ordered.find_all("li"))
            instructions.extend(step for step in step_texts if step)

    # --- Tags ---
    tags = []
    tag_wrapper = soup.select_one("div.desktop-wrapper")
    if tag_wrapper:
        labels = (link.get_text(strip=True) for link in tag_wrapper.select("a.tag"))
        tags = [label for label in labels if label]

    return {
        "title": title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
# ---------------------------------------------------------------------------
# streetkitchen.hu
# ---------------------------------------------------------------------------
@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Parse a streetkitchen.hu recipe page into a recipe dict.

    The result has the keys "title", "description", "image_url",
    "ingredients", "instructions", "tags" and "original_url". A named
    ingredient group is emitted as a ``{"group": name}`` marker entry placed
    ahead of the ingredients of its section.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Drop the trailing " | Street Kitchen" site suffix.
        title = re.sub(r"\s*\|\s*Street Kitchen$", "", title).strip()
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Find the main ingredient grid (grid-cols-1 lg:grid-cols-2).
    # The page renders ingredients twice (mobile + desktop); we pick the
    # specific grid to avoid duplicates.
    ingredients = []
    ing_grid = None
    for grid in soup.select("div.grid"):
        cls = " ".join(grid.get("class", []))
        if "grid-cols-1" in cls and "lg:grid-cols-2" in cls:
            ing_grid = grid
            break
    if ing_grid:
        # Walk top-level divs — each may contain an h5 group header + rows.
        for section in ing_grid.find_all("div", recursive=False):
            h5 = section.find("h5")
            if h5:
                group_name = h5.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})
            for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
                entry = _streetkitchen_ingredient(row)
                if entry is not None:
                    ingredients.append(entry)

    # --- Instructions ---
    instructions = []
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))
    if prep:
        # Prefer an ordered list, then an unordered one; only direct <li>
        # children count as steps.
        step_list = prep.find("ol") or prep.find("ul")
        if step_list:
            step_nodes = step_list.find_all("li", recursive=False)
        else:
            # Paragraph-style: <p> blocks, sometimes with <strong> headers.
            step_nodes = prep.find_all("p")
        for node in step_nodes:
            txt = node.get_text(strip=True)
            if txt:
                instructions.append(txt)

    # If still nothing, try the description wrapper.
    if not instructions:
        desc_article = soup.find("article", id="Streetk_content_description_wrapper")
        if desc_article:
            for p in desc_article.find_all("p"):
                txt = p.get_text(strip=True)
                if txt:
                    instructions.append(txt)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": _streetkitchen_tags(soup),
        "original_url": url,
    }


def _streetkitchen_ingredient(row) -> dict | None:
    """Turn one streetkitchen ingredient row into an ingredient dict, or None if it has no name."""
    inner = row.select_one("div.flex.items-center.gap-2")
    if not inner:
        return None
    bold = inner.find("div", class_="font-bold")
    food = bold.get_text(strip=True) if bold else ""
    if not food:
        return None

    # First non-bold div is quantity+unit merged (e.g. "200g", "1fej");
    # a fully parenthesised div is a note.
    qty_raw = ""
    extra = ""
    for d in inner.find_all("div", recursive=False):
        # Identity, not ==: bs4 compares tags structurally with ==, so a
        # different div with equal markup would be skipped as well.
        if d is bold:
            continue
        txt = d.get_text(strip=True)
        if txt.startswith("(") and txt.endswith(")"):
            extra = txt.strip("() ")
        elif not qty_raw:
            qty_raw = txt

    # Split "200g" → qty="200", unit="g".
    qty, unit = _split_qty_unit(qty_raw)

    # Extract a parenthesised note from inside the food name, e.g.
    # "fehérborecet (ízlés szerint)" → food="fehérborecet", extra="ízlés szerint".
    if not extra:
        m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
        if m:
            food = m.group(1).strip()
            extra = m.group(2).strip()

    return {
        "quantity": qty,
        "unit": unit,
        "food": food,
        "extra": extra,
    }


def _streetkitchen_tags(soup: BeautifulSoup) -> list[str]:
    """Return tags from the first JSON-LD Recipe node with a non-empty recipeCategory.

    Scanning stops at the first node that yields tags, so a later script
    block cannot overwrite or clear them. Returns [] when nothing is found.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue
        nodes = data.get("@graph", [data]) if isinstance(data, dict) else data
        if isinstance(nodes, dict):
            nodes = [nodes]
        if not isinstance(nodes, list):
            continue
        for node in nodes:
            if not isinstance(node, dict):
                continue
            # "@type" may be a single string or a list of type names.
            node_type = node.get("@type")
            is_recipe = node_type == "Recipe" or (
                isinstance(node_type, list) and "Recipe" in node_type
            )
            if not is_recipe:
                continue
            category = node.get("recipeCategory", "")
            if isinstance(category, str):
                # A string category is comma-separated.
                found = [t.strip() for t in category.split(",") if t.strip()]
            elif isinstance(category, list):
                found = [str(t).strip() for t in category if str(t).strip()]
            else:
                found = []
            if found:
                return found
    return []
# ---------------------------------------------------------------------------
# nosalty.hu
# ---------------------------------------------------------------------------
@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Parse a nosalty.hu recipe page into a recipe dict.

    The result has the keys "title", "description", "image_url",
    "ingredients", "instructions", "tags" and "original_url". Ingredient
    group headers become ``{"group": name}`` marker entries, and instruction
    section headers become ``"--- name ---"`` lines in the instruction list.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Keep only the part before the first "|".
        title = re.sub(r"\s*\|.*$", "", title).strip()

    # The recipe "story" paragraphs serve as the description.
    description = ""
    story = soup.find("div", id="recipe-story")
    if story:
        story_texts = (p.get_text(strip=True) for p in story.find_all("p"))
        description = " ".join(text for text in story_texts if text)

    # --- Ingredients ---
    # Only div#ingredients is searched, to avoid per-serving / nutrition
    # duplicates. h3.m-list__title is a group header, ul.m-list__list holds
    # the ingredient rows.
    ingredients = []
    ingredient_box = soup.find("div", id="ingredients")
    if ingredient_box:
        for node in ingredient_box.find_all(["h3", "ul"]):
            classes = node.get("class") or []
            if node.name == "h3" and "m-list__title" in classes:
                heading = node.get_text(strip=True)
                if heading:
                    ingredients.append({"group": heading})
            elif node.name == "ul" and "m-list__list" in classes:
                for item in node.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(item, ingredients)

    # --- Instructions ---
    # Container is div#select. h4.m-list__title is a section header,
    # ol.m-list__list holds the steps.
    instructions = []
    directions_box = soup.find("div", id="select")
    if directions_box:
        for node in directions_box.find_all(["h4", "ol"]):
            classes = node.get("class") or []
            if node.name == "h4" and "m-list__title" in classes:
                heading = node.get_text(strip=True)
                if heading:
                    instructions.append(f"--- {heading} ---")
            elif node.name == "ol" and "m-list__list" in classes:
                step_texts = (
                    item.get_text(strip=True)
                    for item in node.find_all("li", class_="m-list__item")
                )
                instructions.extend(step for step in step_texts if step)

    # --- Tags ---
    tag_labels = (
        link.get_text(strip=True)
        for link in soup.find_all("a", class_="m-tags__tagItem")
    )
    tags = [label for label in tag_labels if label]

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": _og(soup, "og:image"),
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
def _parse_nosalty_ingredient(li, ingredients: list) -> None:
    """Parse one nosalty ingredient ``<li>`` and append the result to *ingredients*.

    Nothing is appended when the item has no inner ``<div>``, no
    ``a.a-link`` element, or an empty ingredient name.

    Args:
        li: the ``<li>`` element of one ingredient row.
        ingredients: list that receives a
            ``{"quantity", "unit", "food", "extra"}`` dict; mutated in place.
    """
    inner = li.find("div")
    if not inner:
        return
    food_el = inner.find("a", class_="a-link")
    if not food_el:
        return
    food = food_el.get_text(strip=True)
    if not food:
        return

    # Walk children of the inner div in order.
    # Elements before the <a> link = quantity, elements after = extra/note.
    qty_parts = []
    extra_parts = []
    before_link = True
    for child in inner.children:
        if child is food_el:
            before_link = False
            continue
        if not hasattr(child, "get_text"):
            continue
        text = child.get_text(strip=True)
        if not text:
            continue
        if before_link:
            # Collect every piece rather than keeping only the last one, so a
            # quantity and a unit in separate elements are both preserved.
            qty_parts.append(text)
        else:
            note = text.strip("() ")
            if note:
                extra_parts.append(note)

    qty, unit = _split_qty_unit(" ".join(qty_parts))
    ingredients.append({
        "quantity": qty,
        "unit": unit,
        "food": food,
        "extra": "; ".join(extra_parts),
    })
# Single-character fractions that can stand for (or be part of) a quantity.
_VULGAR_FRACTIONS = "¼½¾⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞"

# A quantity starts with a digit or a vulgar fraction and continues through
# digits, fractions, spaces and the separators . , / - ; the rest is the unit.
# DOTALL lets the unit part span embedded newlines.
_QTY_UNIT_RE = re.compile(
    rf"^([0-9{_VULGAR_FRACTIONS}][0-9{_VULGAR_FRACTIONS} .,/-]*)(.*)$",
    re.DOTALL,
)


def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g').

    Args:
        raw: text such as "200g", "1,5 dl", "1-2 db" or "½ kg".

    Returns:
        ``(quantity, unit)``, both stripped of surrounding whitespace.
        Blank input gives ``("", "")``. Text that does not start with a
        digit or a vulgar fraction is returned whole as the unit, with an
        empty quantity.
    """
    raw = raw.strip()
    if not raw:
        return ("", "")
    m = _QTY_UNIT_RE.match(raw)
    if m:
        return (m.group(1).strip(), m.group(2).strip())
    return ("", raw)
# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org microdata)
# ---------------------------------------------------------------------------
def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Best-effort parser based on Open Graph tags and schema.org JSON-LD.

    Title, description and image come from ``og:*`` meta tags, with the
    title falling back to ``<title>`` and then to a placeholder. Ingredients,
    instructions and tags come from the first ``Recipe`` node found in any
    ``application/ld+json`` script; they stay empty when there is none.

    Args:
        soup: parsed page.
        url: page URL, copied into "original_url".

    Returns:
        A dict with the keys "title", "description", "image_url",
        "ingredients", "instructions", "tags" and "original_url".
        Ingredient lines are not split: each goes into "food" with empty
        "quantity", "unit" and "extra".
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    ingredients = []
    instructions = []
    tags = []

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            # Empty or malformed script block: try the next one.
            continue
        recipe = next(_jsonld_recipe_nodes(data), None)
        if recipe is None:
            continue

        raw_ingredients = recipe.get("recipeIngredient") or []
        if isinstance(raw_ingredients, str):
            raw_ingredients = [raw_ingredients]
        if isinstance(raw_ingredients, list):
            for line in raw_ingredients:
                if isinstance(line, str) and line.strip():
                    ingredients.append({
                        "quantity": "", "unit": "", "food": line.strip(), "extra": "",
                    })

        instructions = _jsonld_instruction_steps(recipe.get("recipeInstructions"))

        # Keywords may be a comma-separated string or a list.
        kw = recipe.get("keywords", "")
        if isinstance(kw, str):
            tags = [k.strip() for k in kw.split(",") if k.strip()]
        elif isinstance(kw, list):
            tags = [str(k).strip() for k in kw if str(k).strip()]
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


def _jsonld_recipe_nodes(data):
    """Yield every dict in a decoded JSON-LD payload whose ``@type`` is or includes "Recipe".

    Handles a single object, a list of objects, and objects nested under
    ``@graph``.
    """
    if isinstance(data, list):
        for entry in data:
            yield from _jsonld_recipe_nodes(entry)
        return
    if not isinstance(data, dict):
        return
    node_type = data.get("@type")
    type_names = node_type if isinstance(node_type, list) else [node_type]
    if "Recipe" in type_names:
        yield data
    graph = data.get("@graph")
    if isinstance(graph, (list, dict)):
        yield from _jsonld_recipe_nodes(graph)


def _jsonld_instruction_steps(raw) -> list[str]:
    """Flatten a ``recipeInstructions`` value into a list of non-empty step strings.

    Accepts a plain string, a step object (its "text", else "name"), a
    section object holding steps under "itemListElement", or a list mixing
    any of these. Anything else yields no steps.
    """
    if isinstance(raw, str):
        step = raw.strip()
        return [step] if step else []
    if isinstance(raw, list):
        steps = []
        for item in raw:
            steps.extend(_jsonld_instruction_steps(item))
        return steps
    if isinstance(raw, dict):
        if "itemListElement" in raw:
            return _jsonld_instruction_steps(raw["itemListElement"])
        text = raw.get("text") or raw.get("name")
        return _jsonld_instruction_steps(text) if isinstance(text, str) else []
    return []
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _host(url: str) -> str:
    """Return the hostname component of *url*, or "" when it has none."""
    from urllib.parse import urlparse

    hostname = urlparse(url).hostname
    return hostname if hostname else ""
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the non-empty ``content`` of the first ``<meta property=prop>`` tag, else None."""
    meta = soup.find("meta", property=prop)
    if not meta:
        return None
    content = meta.get("content")
    return content if content else None
def _text(el) -> str:
    """Return the stripped text of *el*, or "" when *el* is None."""
    return "" if el is None else el.get_text(strip=True)