diff --git a/CHANGELOG.md b/CHANGELOG.md index 363e713..2504b21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## v0.8.0 (2026-02-24) + +### Added +- GastroHobbi.hu parser: ingredients (with groups), instructions (with embedded lists), tags from JSON-LD articleSection +- Prep time extraction appended to description + +### Fixed +- Ingredient line parser: fractions like "1/2" no longer split incorrectly due to regex backtracking +- En-dash ranges in ingredients (e.g. "10 – 15 dkg") now normalized to "10-15 dkg" +- Unicode fractions (½, ¼, ¾) now recognized as quantity start across all parsers +- Embedded lists in instructions (nested ul>li) no longer produce duplicate entries + ## v0.7.0 (2026-02-24) ### Added diff --git a/README.md b/README.md index 2f1fa0b..0a81387 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ Docker container for importing recipes from Hungarian websites into [Mealie](htt | nosalty.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes | | sobors.hu | Yes (with groups) | Yes (with section headers, follows linked recipes) | Yes | Yes | | kiskegyed.hu | Yes (with groups, dual measurements) | Yes (follows sobors.hu links) | Yes | Yes | +| gastrohobbi.hu | Yes (with groups) | Yes (with embedded lists) | Yes | Yes (from JSON-LD categories) | | *Other sites* | Fallback (schema.org JSON-LD) | Fallback (schema.org JSON-LD) | Yes (og:image) | Fallback (schema.org keywords) | ### Mindmegette.hu Parser @@ -111,6 +112,19 @@ Extracts data from kiskegyed.hu recipe pages: - **Cross-site links**: Pages linking to sobors.hu are followed to get the full recipe - **Tags**: `section.tags > a > span` (# prefix stripped, "recept" filtered) +### GastroHobbi.hu Parser + +Extracts data from gastrohobbi.hu recipe pages (WPBakery page builder layout): + +- **Title**: `h1.mpcth-post-title > span.mpcth-color-main-border` +- **Description**: First `
` in the first `wpb_text_column` before the recipe columns; falls back to `og:description` +- **Image**: `og:image` meta tag +- **Ingredients**: Finds `h3` containing "Hozzávalók:", then walks sibling `
` elements following the "Elkészítés:" `h3`; embedded `
in the first wpb_text_column before the inner recipe row + description = "" + first_text_col = soup.select_one("div.wpb-content-wrapper div.wpb_text_column div.wpb_wrapper") + if first_text_col: + p = first_text_col.find("p") + if p: + description = p.get_text(strip=True) + if not description: + description = _og(soup, "og:description") or "" + + image_url = _og(soup, "og:image") + + # --- Ingredients --- + # Find h3 containing "Hozzávalók" then walk siblings for ul and group h3 elements + ingredients = [] + _gastrohobbi_parse_ingredients(soup, ingredients) + + # --- Instructions --- + # Find h3 containing "Elkészítés:" then collect following
elements
+ instructions = []
+ prep_time = ""
+ _gastrohobbi_parse_instructions(soup, instructions)
+
+ # Extract prep time from h3 containing "Elkészítési idő:"
+ for h3 in soup.find_all("h3"):
+ text = h3.get_text(strip=True)
+ if "elkészítési idő" in text.lower():
+ # Full heading text looks like "Elkészítési idő: 60 perc"; the label is wrapped in <em>.
+ # The time part sits outside the <em> wrapper, so removing <em> leaves only the time.
+ em = h3.find("em")
+ if em:
+ em.decompose()
+ time_text = h3.get_text(strip=True).strip()
+ if time_text:
+ prep_time = time_text
+ break
+
+ # --- Tags ---
+ # From JSON-LD Article.articleSection
+ tags = []
+ skip_tags = {"receptjeink", "receptek"}
+ for script in soup.find_all("script", type="application/ld+json"):
+ try:
+ data = json.loads(script.string or "")
+ graph = data.get("@graph", [data]) if isinstance(data, dict) else data
+ for item in graph:
+ if isinstance(item, dict) and item.get("@type") == "Article":
+ sections = item.get("articleSection", [])
+ if isinstance(sections, list):
+ tags = [s.strip() for s in sections
+ if s.strip() and s.strip().lower() not in skip_tags]
+ break
+ except (json.JSONDecodeError, TypeError, AttributeError):
+ continue
+
+ # Append prep time to description if available
+ if prep_time:
+ if description:
+ description += f" (Elkészítési idő: {prep_time})"
+ else:
+ description = f"Elkészítési idő: {prep_time}"
+
+ return {
+ "title": title or "Ismeretlen recept",
+ "description": description,
+ "image_url": image_url,
+ "ingredients": ingredients,
+ "instructions": instructions,
+ "tags": tags,
+ "original_url": url,
+ }
+
+
def _gastrohobbi_parse_ingredients(soup: BeautifulSoup, ingredients: list):
    """Collect gastrohobbi.hu ingredients into *ingredients* (mutated in place).

    Locates the first ``h3`` whose text contains "hozzávalók" and walks the
    siblings that follow it:

    * an ``h3`` containing "elkészítés" ends the walk;
    * any other ``h3`` becomes a ``{"group": name}`` entry (trailing colons
      removed);
    * each direct ``li`` of a ``ul`` is parsed with ``_parse_ingredient_line``
      and appended as a quantity/unit/food/extra dict.

    Nothing is appended when no matching header exists. Returns ``None``.
    """
    start = next(
        (
            heading
            for heading in soup.find_all("h3")
            if "hozzávalók" in heading.get_text(strip=True).lower()
        ),
        None,
    )
    if start is None:
        return

    for node in start.find_next_siblings():
        content = node.get_text(strip=True)
        if not content:
            # Empty spacer elements carry no information.
            continue

        if node.name == "h3":
            # The "Elkészítés" heading marks the end of the ingredient section.
            if "elkészítés" in content.lower():
                break
            # Any other heading names an ingredient group (e.g. "A csipetkéhez:").
            label = content.rstrip(":")
            if label:
                ingredients.append({"group": label})
            continue

        if node.name != "ul":
            continue

        # Only direct children: nested lists are not treated as ingredients.
        for item in node.find_all("li", recursive=False):
            para = item.find("p")
            source = para if para is not None else item
            raw = source.get_text(strip=True)
            if not raw:
                continue
            qty, unit, food = _parse_ingredient_line(raw)
            ingredients.append({
                "quantity": qty, "unit": unit, "food": food, "extra": "",
            })
+
+
def _gastrohobbi_parse_instructions(soup: BeautifulSoup, instructions: list):
    """Collect gastrohobbi.hu instruction steps into *instructions* (mutated in place).

    Starts at the first ``h3`` whose text begins with "elkészítés" and does not
    contain "idő" (so the prep-time heading is not mistaken for it), then walks
    the following siblings until the next ``h3``:

    * non-empty ``p`` elements become one step each;
    * ``li`` elements of a ``ul`` become bullet-prefixed steps; an ``li`` that
      itself contains a ``ul`` is skipped so nested items are not duplicated.

    Nothing is appended when no matching header exists. Returns ``None``.
    """
    def _is_instruction_heading(heading) -> bool:
        lowered = heading.get_text(strip=True).lower()
        return lowered.startswith("elkészítés") and "idő" not in lowered

    start = next(
        (h for h in soup.find_all("h3") if _is_instruction_heading(h)),
        None,
    )
    if start is None:
        return

    for node in start.find_next_siblings():
        kind = node.name
        if kind == "h3":
            # Prep-time heading or any other section header ends the steps.
            break
        if kind == "p":
            body = node.get_text(strip=True)
            # Ignore empty / non-breaking-space-only paragraphs.
            if body and body != "\xa0":
                instructions.append(body)
        elif kind == "ul":
            # Embedded list (e.g. cooking time options): leaf items only.
            leaf_texts = (
                li.get_text(strip=True)
                for li in node.find_all("li")
                if li.find("ul") is None
            )
            instructions.extend(f" • {entry}" for entry in leaf_texts if entry)
+
+
def _parse_sobors_article_ingredients(container, ingredients: list):
"""Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text)."""
for el in container.find_all(["h4", "ul"]):
@@ -736,11 +899,14 @@ def _parse_sobors_article_ingredients(container, ingredients: list):
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
"""Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
- m = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s*(\S+)\s+(.+)$", line)
+ # Normalize en-dash/em-dash ranges: "10 – 15" → "10-15"
+ line = re.sub(r"\s*[–—]\s*", "-", line)
+ # qty unit food (e.g. "2 dl tejföl", "½ tk őrölt kömény")
+ m = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
if m:
return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
# Just quantity + food (e.g. "2 tojás")
- m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
+ m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
if m2:
return (m2.group(1).strip(), "", m2.group(2).strip())
return ("", "", line)