fix: group title on first ingredient + multi-site parser registry

- Fix ingredient groups creating empty entries in Mealie: set title
  field on the first ingredient after the group marker instead
- Refactor scraper with @_register decorator for URL-based site dispatch
- Update README with structured ingredients, groups, MEALIE_INTERNAL_URL

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-24 08:51:14 +01:00
parent c235d5caa7
commit a27b322409
3 changed files with 52 additions and 22 deletions
+27 -6
View File
@@ -1,6 +1,7 @@
"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
Currently supported: mindmegette.hu
Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""
import re
@@ -12,6 +13,19 @@ _HEADERS = {
"Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}
# Registry of site parsers, stored as (hostname substring, parser function)
# pairs. Entries are appended by the @_register decorator.
# Order matters: lookup walks the list front to back and the first entry
# whose substring occurs in the host wins.
_PARSERS: list[tuple[str, "callable"]] = []
def _register(host_substring: str):
    """Build a decorator that records a parser in the ``_PARSERS`` registry.

    The decorated function is appended to ``_PARSERS`` paired with
    *host_substring* (the text that must occur in a URL's hostname for the
    parser to be chosen) and is handed back unchanged, so it stays callable
    under its own name.
    """
    def _add_parser(parser):
        # Appending keeps registration order, which the registry relies on.
        entry = (host_substring, parser)
        _PARSERS.append(entry)
        return parser

    return _add_parser
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
@@ -39,11 +53,17 @@ def scrape(url: str) -> dict:
soup = BeautifulSoup(resp.text, "lxml")
host = _host(url)
if "mindmegette" in host:
return _parse_mindmegette(soup, url)
else:
# Fallback: try generic schema.org / og-tag extraction
return _parse_generic(soup, url)
for substring, parser in _PARSERS:
if substring in host:
return parser(soup, url)
# Fallback: try generic schema.org / og-tag extraction
return _parse_generic(soup, url)
def supported_sites() -> list[str]:
    """List the hostname substrings that currently have a registered parser.

    One entry is returned per ``_PARSERS`` entry, in registry order.
    """
    substrings = [host_substring for host_substring, _parser in _PARSERS]
    return substrings
# ---------------------------------------------------------------------------
@@ -51,6 +71,7 @@ def scrape(url: str) -> dict:
# ---------------------------------------------------------------------------
@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
title = _og(soup, "og:title") or _text(soup.find("title"))
# Strip " | Mindmegette.hu" suffix