Files
recipe-importer/app/scraper.py
T
admin 0192de5177 fix: ingredient parsing — match actual HTML elements and add spaces
The scraper looked for span.quantity/span.unit/span.name which don't
exist. The real HTML uses <strong> for qty, plain <span> for unit,
<a class="ingredients-link"> for name, and <small> for extras like
"(darált)". Also add referenceId to Mealie ingredients (required field).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 08:11:28 +01:00

195 lines
6.2 KiB
Python

"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
Currently supported: mindmegette.hu
"""
import re
import requests
from bs4 import BeautifulSoup
# Default request headers passed to every page fetch made by scrape():
# a descriptive User-Agent plus an Accept-Language that asks for Hungarian
# content first and English as a lower-priority fallback.
_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def scrape(url: str) -> dict:
    """Download *url* and parse the page into a recipe dict.

    The page is requested with the module's default headers and a
    30 second timeout, decoded with the encoding detected from the body
    (UTF-8 when detection yields nothing) and parsed with the ``lxml``
    parser.  Hosts whose name contains ``"mindmegette"`` go through the
    site-specific parser; every other host falls back to the generic
    og-tag / schema.org extractor, so this function itself does not
    reject unsupported sites.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [str, ...],
            "instructions": [str, ...],
            "original_url": str,
        }

    Raises whatever ``requests.get`` raises for the request itself, and
    the error raised by ``raise_for_status()`` for an unsuccessful HTTP
    status.
    """
    response = requests.get(url, headers=_HEADERS, timeout=30)
    response.raise_for_status()
    # Override the header-derived encoding with the one detected from the
    # body before reading .text.
    response.encoding = response.apparent_encoding or "utf-8"
    document = BeautifulSoup(response.text, "lxml")

    # Site-specific parser when available, generic extraction otherwise.
    parser = _parse_mindmegette if "mindmegette" in _host(url) else _parse_generic
    return parser(document, url)
# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Build a recipe dict from a parsed mindmegette.hu recipe page.

    * Title: ``og:title`` (``<title>`` as fallback) with the trailing
      " | Mindmegette.hu" suffix removed; "Ismeretlen recept" when empty.
    * Description / image: ``og:description`` / ``og:image``.
    * Ingredients: one line per ``div.ingredients-meta`` row inside the
      first ``div.ingredients`` container.
    * Instructions: every non-empty ``<li>`` of the first
      ``<mindmegette-wysiwyg-box>``; if that yields nothing, the ``<li>``
      items of the first ``<ol>`` in each ``div.block-content``.

    *url* is only echoed back as ``original_url``.
    """
    # --- Title / description / image ---
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Strip " | Mindmegette.hu" suffix
        title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip()

    # --- Ingredients ---
    container = soup.find("div", class_="ingredients")
    rows = container.find_all("div", class_="ingredients-meta") if container else []
    candidate_lines = (_mindmegette_ingredient_line(row) for row in rows)
    ingredients = [line for line in candidate_lines if line]

    # --- Instructions ---
    steps = []
    box = soup.find("mindmegette-wysiwyg-box")
    if box:
        steps = [txt for txt in map(_text, box.find_all("li")) if txt]
    if not steps:
        # Fallback: look for ordered lists inside block-content divs
        for block in soup.find_all("div", class_="block-content"):
            ordered = block.find("ol")
            if ordered:
                steps.extend(
                    txt for txt in map(_text, ordered.find_all("li")) if txt
                )

    return {
        "title": title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": ingredients,
        "instructions": steps,
        "original_url": url,
    }


def _mindmegette_ingredient_line(row) -> str:
    """Return one ingredient line ("qty unit name (extra)") for *row*.

    Expected markup: ``<strong>qty</strong> <span>unit</span>
    <a class="ingredients-link">name</a>`` plus an optional
    ``<small>`` / ``<span class="extra">`` note.  Missing pieces are
    skipped; if none of them produce text, the whole row text (fragments
    joined with spaces) is returned instead, which may be "".
    """
    qty = _text(row.find("strong"))
    # Unit: first <span> that carries no class attribute (classed spans
    # are things like checkboxes, not the unit).
    bare_span = next(
        (span for span in row.find_all("span") if not span.get("class")), None
    )
    unit = _text(bare_span)
    name = _text(row.find("a", class_="ingredients-link"))
    extra = _text(row.find("small") or row.find("span", class_="extra"))
    if extra and not extra.startswith("("):
        # Wrap in parens if not already
        extra = f"({extra})"

    line = " ".join(piece for piece in (qty, unit, name, extra) if piece)
    return line or row.get_text(separator=" ", strip=True)
# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org microdata)
# ---------------------------------------------------------------------------
def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Best-effort recipe extraction for pages without a dedicated parser.

    Title, description and image come from Open Graph meta tags (the
    title falls back to ``<title>`` and then to "Ismeretlen recept").
    Ingredients and instructions are taken from the first schema.org
    ``Recipe`` object found in any ``application/ld+json`` script; both
    lists stay empty when no such object exists.

    Returns a dict with the keys ``title``, ``description``,
    ``image_url``, ``ingredients``, ``instructions`` and ``original_url``
    (*url* echoed back).  Malformed JSON-LD is skipped, never raised.
    """
    import json  # local import: only this parser needs it

    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")
    ingredients = []
    instructions = []

    # Try schema.org JSON-LD
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            # Empty or malformed payload: try the next script tag.
            continue
        recipe = _jsonld_find_recipe(data)
        if recipe is None:
            continue
        ingredients = _jsonld_strings(recipe.get("recipeIngredient"))
        instructions = _jsonld_steps(recipe.get("recipeInstructions"))
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "original_url": url,
    }


def _jsonld_find_recipe(node):
    """Return the first schema.org Recipe dict inside *node*, or None.

    Handles a bare object, a top-level array (including an empty one),
    an ``@graph`` container, and ``@type`` given as a string or a list.
    """
    if isinstance(node, list):
        for child in node:
            found = _jsonld_find_recipe(child)
            if found is not None:
                return found
        return None
    if not isinstance(node, dict):
        return None
    kinds = node.get("@type")
    if isinstance(kinds, str):
        kinds = [kinds]
    if isinstance(kinds, list) and "Recipe" in kinds:
        return node
    # Not a Recipe itself: look inside an @graph container if present.
    return _jsonld_find_recipe(node.get("@graph"))


def _jsonld_strings(value):
    """Normalise a text-or-list-of-text JSON-LD value to non-empty strings."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [item.strip() for item in value if isinstance(item, str) and item.strip()]


def _jsonld_steps(raw):
    """Flatten a ``recipeInstructions`` value into a list of step strings.

    Accepts a plain string, a step object (``text``, else ``name``), a
    section object carrying ``itemListElement``, or any nesting of those
    in lists.  Blank steps and unsupported value types are dropped.
    """
    if isinstance(raw, str):
        # A bare string is ONE step; iterating it would yield characters.
        step = raw.strip()
        return [step] if step else []
    if isinstance(raw, list):
        steps = []
        for item in raw:
            steps.extend(_jsonld_steps(item))
        return steps
    if isinstance(raw, dict):
        if "itemListElement" in raw:
            return _jsonld_steps(raw["itemListElement"])
        text = raw.get("text") or raw.get("name")
        return _jsonld_steps(text) if isinstance(text, str) else []
    return []
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _host(url: str) -> str:
    """Return the hostname component of *url*, or "" if it has none."""
    from urllib.parse import urlparse

    hostname = urlparse(url).hostname
    return hostname if hostname else ""
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the ``content`` of the first ``<meta property=prop>`` tag.

    Gives ``None`` when the tag is missing or its ``content`` attribute
    is absent or empty.
    """
    meta = soup.find("meta", property=prop)
    content = meta.get("content") if meta else None
    return content or None
def _text(el) -> str:
    """Return the text of element *el* with whitespace normalised.

    *el* is an element exposing ``get_text()`` or ``None``; ``None``
    yields "".  Leading/trailing whitespace is removed and every internal
    whitespace run collapses to a single space.
    """
    if el is None:
        return ""
    # get_text(strip=True) strips each text fragment and joins them with
    # an empty separator, gluing words across inline tags
    # ("Add <b>salt</b> and" -> "Addsaltand").  Take the raw text and
    # collapse whitespace so the spacing the author wrote survives.
    return " ".join(el.get_text().split())