fix: group title on first ingredient + multi-site parser registry
- Fix ingredient groups creating empty entries in Mealie: set the title field on the first ingredient after the group marker instead
- Refactor scraper with @_register decorator for URL-based site dispatch
- Update README with structured ingredients, groups, MEALIE_INTERNAL_URL

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+11
-12
@@ -145,27 +145,26 @@ class MealieClient:
|
||||
|
||||
def _build_payload(self, recipe: dict) -> dict:
|
||||
ingredients = []
|
||||
pending_group = ""
|
||||
for item in recipe.get("ingredients", []):
|
||||
if isinstance(item, dict):
|
||||
# Group header marker
|
||||
# Group header marker — apply title to the next real ingredient
|
||||
if "group" in item and "food" not in item:
|
||||
ingredients.append({
|
||||
"referenceId": str(uuid.uuid4()),
|
||||
"title": item["group"],
|
||||
"note": "",
|
||||
"isFood": False,
|
||||
"disableAmount": True,
|
||||
})
|
||||
else:
|
||||
ingredients.append(self._build_ingredient(item))
|
||||
pending_group = item["group"]
|
||||
continue
|
||||
ing = self._build_ingredient(item)
|
||||
else:
|
||||
# Legacy: plain string
|
||||
ingredients.append({
|
||||
ing = {
|
||||
"referenceId": str(uuid.uuid4()),
|
||||
"note": str(item),
|
||||
"isFood": False,
|
||||
"disableAmount": True,
|
||||
})
|
||||
}
|
||||
if pending_group:
|
||||
ing["title"] = pending_group
|
||||
pending_group = ""
|
||||
ingredients.append(ing)
|
||||
|
||||
instructions = []
|
||||
for text in recipe.get("instructions", []):
|
||||
|
||||
+27
-6
@@ -1,6 +1,7 @@
|
||||
"""Recipe scraper — parses Hungarian recipe sites into a structured dict.
|
||||
|
||||
Currently supported: mindmegette.hu
|
||||
Each supported site has a parser registered via _PARSERS.
|
||||
Unsupported sites fall back to generic schema.org / og-tag extraction.
|
||||
"""
|
||||
|
||||
import re
|
||||
@@ -12,6 +13,19 @@ _HEADERS = {
|
||||
"Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
|
||||
}
|
||||
|
||||
# Maps a substring of the hostname to a parser function.
|
||||
# Order matters: first match wins.
|
||||
# Each entry is a (hostname substring, parser function) tuple.
# NOTE(review): the "callable" string annotation names the builtin predicate,
# not a type; a Callable type would be more precise — confirm before changing.
_PARSERS: list[tuple[str, "callable"]] = []
|
||||
|
||||
|
||||
def _register(host_substring: str):
    """Return a decorator that records a parser for matching hostnames.

    The decorated function is appended to ``_PARSERS`` paired with
    *host_substring* and is handed back unchanged, so it stays directly
    callable under its own name.
    """
    def _record(parser):
        entry = (host_substring, parser)
        _PARSERS.append(entry)
        return parser

    return _record
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -39,11 +53,17 @@ def scrape(url: str) -> dict:
|
||||
soup = BeautifulSoup(resp.text, "lxml")
|
||||
|
||||
host = _host(url)
|
||||
if "mindmegette" in host:
|
||||
return _parse_mindmegette(soup, url)
|
||||
else:
|
||||
# Fallback: try generic schema.org / og-tag extraction
|
||||
return _parse_generic(soup, url)
|
||||
for substring, parser in _PARSERS:
|
||||
if substring in host:
|
||||
return parser(soup, url)
|
||||
|
||||
# Fallback: try generic schema.org / og-tag extraction
|
||||
return _parse_generic(soup, url)
|
||||
|
||||
|
||||
def supported_sites() -> list[str]:
    """List the hostname substrings that have a registered parser.

    Substrings are returned in the order they appear in ``_PARSERS``.
    """
    sites = []
    for substring, _parser in _PARSERS:
        sites.append(substring)
    return sites
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -51,6 +71,7 @@ def scrape(url: str) -> dict:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@_register("mindmegette")
|
||||
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
|
||||
title = _og(soup, "og:title") or _text(soup.find("title"))
|
||||
# Strip " | Mindmegette.hu" suffix
|
||||
|
||||
Reference in New Issue
Block a user