v0.8.0: gastrohobbi.hu parser, fix ingredient fraction parsing
Add gastrohobbi.hu parser (WPBakery page builder layout): ingredients with groups, instructions with embedded lists, tags from JSON-LD articleSection, prep time extraction. Fix ingredient line parser: fractions like "1/2" no longer split due to regex backtracking, en-dash ranges normalized, unicode fractions (½¼¾) recognized as quantity start across all parsers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,17 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## v0.8.0 (2026-02-24)
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- GastroHobbi.hu parser: ingredients (with groups), instructions (with embedded lists), tags from JSON-LD articleSection
|
||||||
|
- Prep time extraction appended to description
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Ingredient line parser: fractions like "1/2" no longer split incorrectly due to regex backtracking
|
||||||
|
- En-dash ranges in ingredients (e.g. "10 – 15 dkg") now normalized to "10-15 dkg"
|
||||||
|
- Unicode fractions (½, ¼, ¾) now recognized as quantity start across all parsers
|
||||||
|
- Embedded lists in instructions (nested ul>li) no longer produce duplicate entries
|
||||||
|
|
||||||
## v0.7.0 (2026-02-24)
|
## v0.7.0 (2026-02-24)
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ Docker container for importing recipes from Hungarian websites into [Mealie](htt
|
|||||||
| nosalty.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes |
|
| nosalty.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes |
|
||||||
| sobors.hu | Yes (with groups) | Yes (with section headers, follows linked recipes) | Yes | Yes |
|
| sobors.hu | Yes (with groups) | Yes (with section headers, follows linked recipes) | Yes | Yes |
|
||||||
| kiskegyed.hu | Yes (with groups, dual measurements) | Yes (follows sobors.hu links) | Yes | Yes |
|
| kiskegyed.hu | Yes (with groups, dual measurements) | Yes (follows sobors.hu links) | Yes | Yes |
|
||||||
|
| gastrohobbi.hu | Yes (with groups) | Yes (with embedded lists) | Yes | Yes (from JSON-LD categories) |
|
||||||
| *Other sites* | Fallback (schema.org JSON-LD) | Fallback (schema.org JSON-LD) | Yes (og:image) | Fallback (schema.org keywords) |
|
| *Other sites* | Fallback (schema.org JSON-LD) | Fallback (schema.org JSON-LD) | Yes (og:image) | Fallback (schema.org keywords) |
|
||||||
|
|
||||||
### Mindmegette.hu Parser
|
### Mindmegette.hu Parser
|
||||||
@@ -111,6 +112,19 @@ Extracts data from kiskegyed.hu recipe pages:
|
|||||||
- **Cross-site links**: Pages linking to sobors.hu are followed to get the full recipe
|
- **Cross-site links**: Pages linking to sobors.hu are followed to get the full recipe
|
||||||
- **Tags**: `section.tags > a > span` (# prefix stripped, "recept" filtered)
|
- **Tags**: `section.tags > a > span` (# prefix stripped, "recept" filtered)
|
||||||
|
|
||||||
|
### GastroHobbi.hu Parser
|
||||||
|
|
||||||
|
Extracts data from gastrohobbi.hu recipe pages (WPBakery page builder layout):
|
||||||
|
|
||||||
|
- **Title**: `h1.mpcth-post-title > span.mpcth-color-main-border`
|
||||||
|
- **Description**: First `<p>` in the first `wpb_text_column` before the recipe columns; falls back to `og:description`
|
||||||
|
- **Image**: `og:image` meta tag
|
||||||
|
- **Ingredients**: Finds `h3` containing "Hozzávalók:", then walks sibling `<ul>` elements; items from `li > p` or `li` directly
|
||||||
|
- **Ingredient groups**: Plain `<h3>` elements between ingredient lists (e.g. "A csipetkéhez:")
|
||||||
|
- **Instructions**: `<p>` elements following the "Elkészítés:" `h3`; embedded `<ul>` items rendered as bullet points
|
||||||
|
- **Prep time**: Extracted from "Elkészítési idő:" `h3`, appended to description
|
||||||
|
- **Tags**: JSON-LD `Article.articleSection` array (site uses Article schema, not Recipe)
|
||||||
|
|
||||||
### Generic Fallback Parser
|
### Generic Fallback Parser
|
||||||
|
|
||||||
For unsupported sites, attempts extraction via:
|
For unsupported sites, attempts extraction via:
|
||||||
|
|||||||
+171
-5
@@ -78,6 +78,7 @@ def supported_sites() -> list[dict]:
|
|||||||
"nosalty": "https://www.nosalty.hu",
|
"nosalty": "https://www.nosalty.hu",
|
||||||
"sobors": "https://sobors.hu",
|
"sobors": "https://sobors.hu",
|
||||||
"kiskegyed": "https://www.kiskegyed.hu",
|
"kiskegyed": "https://www.kiskegyed.hu",
|
||||||
|
"gastrohobbi": "https://gastrohobbi.hu",
|
||||||
}
|
}
|
||||||
return [{"name": s + ".hu", "url": _SITE_URLS.get(s, "#")} for s, _ in _PARSERS]
|
return [{"name": s + ".hu", "url": _SITE_URLS.get(s, "#")} for s, _ in _PARSERS]
|
||||||
|
|
||||||
@@ -682,7 +683,7 @@ def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
|
|||||||
# Try: qty unit (alt_measurement) food...
|
# Try: qty unit (alt_measurement) food...
|
||||||
# Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
|
# Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
|
||||||
m = re.match(
|
m = re.match(
|
||||||
r"^([0-9][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line
|
r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line
|
||||||
)
|
)
|
||||||
if m:
|
if m:
|
||||||
qty = m.group(1).strip()
|
qty = m.group(1).strip()
|
||||||
@@ -697,12 +698,12 @@ def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
|
|||||||
return (qty, unit, food_raw, "; ".join(extras))
|
return (qty, unit, food_raw, "; ".join(extras))
|
||||||
|
|
||||||
# Try: qty unit food...
|
# Try: qty unit food...
|
||||||
m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
|
m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
|
||||||
if m2:
|
if m2:
|
||||||
return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")
|
return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")
|
||||||
|
|
||||||
# Try: qty food (e.g. "2 tojás")
|
# Try: qty food (e.g. "2 tojás")
|
||||||
m3 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
|
m3 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
|
||||||
if m3:
|
if m3:
|
||||||
return (m3.group(1).strip(), "", m3.group(2).strip(), "")
|
return (m3.group(1).strip(), "", m3.group(2).strip(), "")
|
||||||
|
|
||||||
@@ -710,6 +711,168 @@ def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
|
|||||||
return ("", "", line, "")
|
return ("", "", line, "")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# gastrohobbi.hu
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _gastrohobbi_prep_time(soup: BeautifulSoup) -> str:
    """Return the preparation time from the "Elkészítési idő:" h3, or "".

    The label normally sits inside an ``<em>`` wrapper with the time value
    after it. The wrapper is removed from the tree (this mutates *soup*),
    and any label that is still present (h3 without ``<em>``) is stripped so
    the caller never receives "Elkészítési idő: ..." twice.
    """
    for h3 in soup.find_all("h3"):
        if "elkészítési idő" not in h3.get_text(strip=True).lower():
            continue
        em = h3.find("em")
        if em:
            em.decompose()
        time_text = h3.get_text(strip=True)
        # Without an <em> wrapper the label is still part of the text.
        time_text = re.sub(
            r"^elkészítési\s+idő\s*:?\s*", "", time_text, flags=re.IGNORECASE
        ).strip()
        if time_text:
            return time_text
    return ""


def _gastrohobbi_article_tags(soup: BeautifulSoup) -> list[str]:
    """Return tags from the first JSON-LD Article node with usable sections.

    ``articleSection`` may be a list or a single string (treated as
    comma-separated — NOTE(review): confirm against real pages). Non-string
    entries are ignored instead of discarding the whole list.
    """
    skip_tags = {"receptjeink", "receptek"}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue

        graph = data.get("@graph", [data]) if isinstance(data, dict) else data
        if isinstance(graph, dict):
            graph = [graph]
        if not isinstance(graph, list):
            continue

        for item in graph:
            if not isinstance(item, dict):
                continue
            item_type = item.get("@type")
            types = item_type if isinstance(item_type, list) else [item_type]
            if "Article" not in types:
                continue
            sections = item.get("articleSection", [])
            if isinstance(sections, str):
                sections = sections.split(",")
            if not isinstance(sections, list):
                continue
            tags = [
                s.strip()
                for s in sections
                if isinstance(s, str)
                and s.strip()
                and s.strip().lower() not in skip_tags
            ]
            if tags:
                # Stop at the first Article that yields tags so a later
                # script block cannot overwrite them.
                return tags
    return []


@_register("gastrohobbi")
def _parse_gastrohobbi(soup: BeautifulSoup, url: str) -> dict:
    """Parse a gastrohobbi.hu recipe page (WPBakery page builder layout).

    Args:
        soup: Parsed HTML of the recipe page. The "Elkészítési idő" h3 may
            lose its ``<em>`` child as a side effect of prep-time extraction.
        url: Source URL, returned unchanged as ``original_url``.

    Returns:
        A dict with the keys ``title``, ``description``, ``image_url``,
        ``ingredients``, ``instructions``, ``tags`` and ``original_url``.
    """
    # Title: h1.mpcth-post-title > span, falling back to og:title / <title>
    title = ""
    title_el = soup.select_one("h1.mpcth-post-title span.mpcth-color-main-border")
    if title_el:
        title = title_el.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
        if title:
            # Fallback titles carry a " - GastroHobbi..." site suffix.
            title = re.sub(r"\s*[-–|]\s*GastroHobbi.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: first <p> in the first wpb_text_column, else og:description
    description = ""
    first_text_col = soup.select_one("div.wpb-content-wrapper div.wpb_text_column div.wpb_wrapper")
    if first_text_col:
        p = first_text_col.find("p")
        if p:
            description = p.get_text(strip=True)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # Ingredients and instructions are collected before the prep-time
    # extraction, which modifies the tree.
    ingredients = []
    _gastrohobbi_parse_ingredients(soup, ingredients)

    instructions = []
    _gastrohobbi_parse_instructions(soup, instructions)

    prep_time = _gastrohobbi_prep_time(soup)
    tags = _gastrohobbi_article_tags(soup)

    # Append prep time to description if available
    if prep_time:
        if description:
            description += f" (Elkészítési idő: {prep_time})"
        else:
            description = f"Elkészítési idő: {prep_time}"

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _gastrohobbi_parse_ingredients(soup: BeautifulSoup, ingredients: list):
    """Collect gastrohobbi.hu ingredients into *ingredients* (mutated in place).

    Starts at the first h3 whose text contains "hozzávalók" and walks its
    following siblings: other h3 elements become ``{"group": ...}`` entries,
    direct ``<li>`` children of ``<ul>`` elements become ingredient dicts.
    The walk ends at the h3 that opens the "Elkészítés" section. Nothing is
    appended when the starting h3 is missing.
    """
    start = next(
        (
            heading
            for heading in soup.find_all("h3")
            if "hozzávalók" in heading.get_text(strip=True).lower()
        ),
        None,
    )
    if start is None:
        return

    for node in start.find_next_siblings():
        content = node.get_text(strip=True)
        if not content:
            continue

        if node.name == "h3":
            # The instructions heading terminates the ingredient section.
            if "elkészítés" in content.lower():
                break
            # Any other h3 names an ingredient group, e.g. "A csipetkéhez:".
            label = content.rstrip(":")
            if label:
                ingredients.append({"group": label})
        elif node.name == "ul":
            for item in node.find_all("li", recursive=False):
                # Prefer the text of the first <p> inside the item, if any.
                para = item.find("p")
                source = para if para is not None else item
                raw = source.get_text(strip=True)
                if not raw:
                    continue
                qty, unit, food = _parse_ingredient_line(raw)
                ingredients.append({
                    "quantity": qty, "unit": unit, "food": food, "extra": "",
                })
|
||||||
|
|
||||||
|
|
||||||
|
def _gastrohobbi_parse_instructions(soup: BeautifulSoup, instructions: list):
    """Collect gastrohobbi.hu instruction steps into *instructions* (mutated in place).

    Starts at the first h3 whose text begins with "elkészítés" but does not
    mention "idő" (which would be the prep-time heading), then walks the
    following siblings up to the next h3. Non-empty ``<p>`` elements become
    steps; leaf ``<li>`` items of embedded ``<ul>`` lists become bullet lines.
    """

    def _opens_instructions(heading) -> bool:
        lowered = heading.get_text(strip=True).lower()
        return lowered.startswith("elkészítés") and "idő" not in lowered

    start = next(
        (heading for heading in soup.find_all("h3") if _opens_instructions(heading)),
        None,
    )
    if start is None:
        return

    for node in start.find_next_siblings():
        kind = node.name
        # Any h3 (prep time or another section) ends the instructions.
        if kind == "h3":
            break

        if kind == "p":
            paragraph = node.get_text(strip=True)
            # Ignore empty / whitespace-only paragraphs.
            if paragraph and paragraph != "\xa0":
                instructions.append(paragraph)
        elif kind == "ul":
            # Only leaf items are emitted; an li wrapping a nested list is
            # skipped so its children are not reported twice.
            leaves = (li for li in node.find_all("li") if not li.find("ul"))
            for leaf in leaves:
                bullet = leaf.get_text(strip=True)
                if bullet:
                    instructions.append(f" • {bullet}")
|
||||||
|
|
||||||
|
|
||||||
def _parse_sobors_article_ingredients(container, ingredients: list):
|
def _parse_sobors_article_ingredients(container, ingredients: list):
|
||||||
"""Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text)."""
|
"""Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text)."""
|
||||||
for el in container.find_all(["h4", "ul"]):
|
for el in container.find_all(["h4", "ul"]):
|
||||||
@@ -736,11 +899,14 @@ def _parse_sobors_article_ingredients(container, ingredients: list):
|
|||||||
|
|
||||||
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
|
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
|
||||||
"""Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
|
"""Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
|
||||||
m = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s*(\S+)\s+(.+)$", line)
|
# Normalize en-dash/em-dash ranges: "10 – 15" → "10-15"
|
||||||
|
line = re.sub(r"\s*[–—]\s*", "-", line)
|
||||||
|
# qty unit food (e.g. "2 dl tejföl", "½ tk őrölt kömény")
|
||||||
|
m = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
|
||||||
if m:
|
if m:
|
||||||
return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
|
return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
|
||||||
# Just quantity + food (e.g. "2 tojás")
|
# Just quantity + food (e.g. "2 tojás")
|
||||||
m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
|
m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
|
||||||
if m2:
|
if m2:
|
||||||
return (m2.group(1).strip(), "", m2.group(2).strip())
|
return (m2.group(1).strip(), "", m2.group(2).strip())
|
||||||
return ("", "", line)
|
return ("", "", line)
|
||||||
|
|||||||
Reference in New Issue
Block a user