bbd0889471
- Scraper extracts tags from mindmegette.hu (<a class="tag">) and schema.org keywords - Tag editor UI with removable chips, search/autocomplete for existing tags, custom add - Mealie: auto-create tags via POST /api/organizers/tags, include in recipe PATCH - Tandoor: include keywords in recipe POST (auto-created by name) - New GET /tags endpoint returns existing tags from both services for search Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
237 lines
7.6 KiB
Python
237 lines
7.6 KiB
Python
"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
|
|
|
|
Each supported site has a parser added to _PARSERS by the @_register decorator.
Unsupported sites fall back to generic schema.org / og-tag extraction.
|
|
"""
|
|
|
|
import json
import re
from collections.abc import Callable

import requests
from bs4 import BeautifulSoup
|
|
|
|
_HEADERS = {
|
|
"User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
|
|
"Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
|
|
}
|
|
|
|
# Maps a substring of the hostname to a parser function.
|
|
# Order matters: first match wins.
|
|
_PARSERS: list[tuple[str, "callable"]] = []
|
|
|
|
|
|
def _register(host_substring: str):
    """Build a decorator that files a parser under *host_substring*.

    The decorated function is appended to ``_PARSERS`` as a
    ``(host_substring, function)`` pair and handed back unchanged, so it
    can still be called directly.
    """
    def _add_to_registry(parser_fn):
        entry = (host_substring, parser_fn)
        _PARSERS.append(entry)
        return parser_fn

    return _add_to_registry
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def scrape(url: str) -> dict:
    """Fetch *url* and return the recipe found there as a dict.

    The page is handed to the first registered parser whose hostname
    substring occurs in the URL's host.  Sites without a dedicated parser
    do not raise: they go through the generic schema.org / og-tag
    extractor, which may come back with empty ingredient, instruction and
    tag lists.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    Site-specific parsers may additionally insert ``{"group": str}``
    marker entries into ``"ingredients"`` ahead of the ingredients that
    belong to a named group.

    Raises:
        requests.RequestException: if the request itself fails, or (as
            ``requests.HTTPError`` from ``raise_for_status()``) if the
            server answers with an error status.
    """
    resp = requests.get(url, headers=_HEADERS, timeout=30)
    resp.raise_for_status()
    # Decode using the encoding detected from the body instead of the one
    # requests derived from the headers; use UTF-8 if detection finds nothing.
    resp.encoding = resp.apparent_encoding or "utf-8"
    soup = BeautifulSoup(resp.text, "lxml")

    host = _host(url)
    # First registered substring found in the host wins; otherwise fall
    # back to generic schema.org / og-tag extraction.
    parser = next(
        (fn for substring, fn in _PARSERS if substring in host),
        _parse_generic,
    )
    return parser(soup, url)
|
|
|
|
|
|
def supported_sites() -> list[str]:
    """List the hostname substrings that have a dedicated parser.

    The substrings are returned in the order the parsers were registered.
    """
    substrings = []
    for host_substring, _parser in _PARSERS:
        substrings.append(host_substring)
    return substrings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# mindmegette.hu
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from a parsed mindmegette.hu recipe page.

    Title, description and image come from the og: meta tags (the title
    falls back to the <title> element).  Ingredients, instructions and
    tags are read from the page markup.  *url* is only echoed back as
    ``"original_url"``.
    """
    # --- Title / description / image ---
    heading = _og(soup, "og:title") or _text(soup.find("title"))
    if heading:
        # Strip " | Mindmegette.hu" suffix
        heading = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", heading).strip()

    summary = _og(soup, "og:description") or ""
    picture = _og(soup, "og:image")

    # --- Ingredients ---
    # Multiple div.ingredients containers may exist (one per group).
    # Group title: <strong class="ingredients-group">A habaráshoz:</strong>
    ingredients = []
    for box in soup.find_all("div", class_="ingredients"):
        group_tag = box.find("strong", class_="ingredients-group")
        if group_tag:
            group_label = _text(group_tag).rstrip(":").strip()
            if group_label:
                # A named group is announced by a marker entry placed
                # ahead of the ingredients that belong to it.
                ingredients.append({"group": group_label})

        for row in box.find_all("div", class_="ingredients-meta"):
            # Actual HTML: <strong>qty</strong> <span>unit</span>
            #              <a class="ingredients-link">name</a> <small>(extra)</small>
            # The unit is the first <span> that carries no class attribute.
            unit_tag = next(
                (span for span in row.find_all("span") if not span.get("class")),
                None,
            )
            extra_tag = row.find("small") or row.find("span", class_="extra")

            food = _text(row.find("a", class_="ingredients-link"))
            if not food:
                # Fallback: grab whole row text
                food = row.get_text(separator=" ", strip=True)
            if not food:
                continue

            ingredients.append({
                "quantity": _text(row.find("strong")),
                "unit": _text(unit_tag),
                "food": food,
                "extra": _text(extra_tag).strip("() "),
            })

    # --- Instructions ---
    instructions = []
    step_box = soup.find("mindmegette-wysiwyg-box")
    if step_box:
        instructions = [
            step for step in map(_text, step_box.find_all("li")) if step
        ]
    # Fallback: look for block-content divs
    if not instructions:
        for block in soup.find_all("div", class_="block-content"):
            ordered = block.find("ol")
            if not ordered:
                continue
            instructions.extend(
                step for step in map(_text, ordered.find_all("li")) if step
            )

    # --- Tags ---
    tags = []
    tag_box = soup.select_one("div.desktop-wrapper")
    if tag_box:
        labels = (link.get_text(strip=True) for link in tag_box.select("a.tag"))
        tags = [label for label in labels if label]

    return {
        "title": heading or "Ismeretlen recept",
        "description": summary,
        "image_url": picture,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Generic fallback (og-tags + schema.org microdata)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _jsonld_nodes(data):
    """Yield every JSON object reachable through top-level arrays and ``@graph``."""
    if isinstance(data, list):
        for entry in data:
            yield from _jsonld_nodes(entry)
    elif isinstance(data, dict):
        yield data
        yield from _jsonld_nodes(data.get("@graph"))


def _jsonld_is_recipe(node: dict) -> bool:
    """Tell whether *node*'s ``@type`` is, or includes, ``"Recipe"``."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Recipe" in node_type
    return node_type == "Recipe"


def _jsonld_strings(value) -> list[str]:
    """Normalise a string-or-list JSON value to non-empty, stripped strings."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [item.strip() for item in value if isinstance(item, str) and item.strip()]


def _jsonld_steps(raw) -> list[str]:
    """Flatten a ``recipeInstructions`` value into a list of step texts.

    Accepts a plain string, a step object with a ``text`` member, a section
    object with ``itemListElement``, or any nesting of lists of these.
    """
    if isinstance(raw, str):
        step = raw.strip()
        return [step] if step else []
    if isinstance(raw, list):
        steps = []
        for entry in raw:
            steps.extend(_jsonld_steps(entry))
        return steps
    if isinstance(raw, dict):
        if "itemListElement" in raw:
            return _jsonld_steps(raw["itemListElement"])
        return _jsonld_steps(raw.get("text"))
    return []


def _jsonld_keywords(raw) -> list[str]:
    """Turn a ``keywords`` value (comma-separated string or list) into tags."""
    if isinstance(raw, str):
        raw = raw.split(",")
    if not isinstance(raw, list):
        return []
    cleaned = (str(keyword).strip() for keyword in raw)
    return [keyword for keyword in cleaned if keyword]


def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe dict from any page using og: tags and schema.org JSON-LD.

    Title, description and image come from the og: meta tags (the title
    falls back to the <title> element, then to "Ismeretlen recept").
    Ingredients, instructions and tags come from the first JSON-LD node
    typed ``Recipe`` found in the page's ``application/ld+json`` scripts;
    when there is none, those three lists stay empty.  Ingredient lines
    are not split: the whole line is stored under ``"food"``.

    *url* is only echoed back as ``"original_url"``.
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    ingredients = []
    instructions = []
    tags = []

    # Try schema.org JSON-LD
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            # Empty or malformed script: try the next one.
            continue

        # The Recipe may be the document itself, any element of a
        # top-level array, or a member of an @graph container.
        recipe = next(
            (node for node in _jsonld_nodes(data) if _jsonld_is_recipe(node)),
            None,
        )
        if recipe is None:
            continue

        ingredients = [
            {"quantity": "", "unit": "", "food": line, "extra": ""}
            for line in _jsonld_strings(recipe.get("recipeIngredient"))
        ]
        instructions = _jsonld_steps(recipe.get("recipeInstructions"))
        tags = _jsonld_keywords(recipe.get("keywords"))
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _host(url: str) -> str:
|
|
from urllib.parse import urlparse
|
|
return urlparse(url).hostname or ""
|
|
|
|
|
|
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the ``content`` of the first <meta property=*prop*> tag.

    Gives None when there is no such tag or its ``content`` is missing
    or empty.
    """
    meta = soup.find("meta", property=prop)
    if not meta:
        return None
    content = meta.get("content")
    return content if content else None
|
|
|
|
|
|
def _text(el) -> str:
|
|
if el is None:
|
|
return ""
|
|
return el.get_text(strip=True)
|