v0.8.1: fix mindmegette alt page format (wysiwyg ingredients/instructions)

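Support alternative mindmegette.hu pages (e.g. /alapetelek/) where the
ingredients are in a <ul> after the h3 "Hozzávalók" and the instructions
are in an <ol> after the h3 "Elkészítés" inside the wysiwyg box, instead
of the structured div.ingredients containers. Title suffix stripping now
also handles the " - Mindmegette.hu" separator in addition to
" | Mindmegette.hu".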

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-24 20:30:47 +01:00
parent 31ea1d0bf2
commit 6f12cc06a4
2 changed files with 53 additions and 18 deletions
+6
View File
@@ -1,5 +1,11 @@
# Changelog
## v0.8.1 (2026-02-24)
### Fixed
- Mindmegette.hu: support alternative page format (e.g. `/alapetelek/` pages) where ingredients are in `<ul>` and instructions in `<ol>` inside the wysiwyg box, instead of structured `div.ingredients` containers
- Mindmegette.hu: title suffix stripping now handles both `|` and `-` separators
## v0.8.0 (2026-02-24)
### Added
+47 -18
View File
@@ -91,27 +91,26 @@ def supported_sites() -> list[dict]:
@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
title = _og(soup, "og:title") or _text(soup.find("title"))
# Strip " | Mindmegette.hu" suffix
# Strip " | Mindmegette.hu" or " - Mindmegette.hu" suffix
if title:
title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip()
title = re.sub(r"\s*[-|]\s*Mindmegette\.hu$", "", title).strip()
description = _og(soup, "og:description") or ""
image_url = _og(soup, "og:image")
# --- Ingredients ---
# Multiple div.ingredients containers may exist (one per group).
# Group title: <strong class="ingredients-group">A habaráshoz:</strong>
# Format A (regular /recept/ pages): div.ingredients containers with structured rows
# Format B (alt /alapetelek/ pages): h3 "Hozzávalók" → <ul><li> inside wysiwyg box
ingredients = []
wysiwyg = soup.find("mindmegette-wysiwyg-box")
for ing_container in soup.find_all("div", class_="ingredients"):
# Check for a group title
group_el = ing_container.find("strong", class_="ingredients-group")
group_name = _text(group_el).rstrip(":").strip() if group_el else ""
if group_name:
ingredients.append({"group": group_name})
for row in ing_container.find_all("div", class_="ingredients-meta"):
# Actual HTML: <strong>qty</strong> <span>unit</span>
# <a class="ingredients-link">name</a> <small>(extra)</small>
qty_el = row.find("strong")
unit_el = None
for sp in row.find_all("span"):
@@ -127,26 +126,56 @@ def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
extra = _text(extra_el).strip("() ")
if not food:
# Fallback: grab whole row text
food = row.get_text(separator=" ", strip=True)
if food:
ingredients.append({
"quantity": qty,
"unit": unit,
"food": food,
"extra": extra,
"quantity": qty, "unit": unit, "food": food, "extra": extra,
})
# Fallback: h3 "Hozzávalók" → <ul> inside wysiwyg box (alt page format)
if not ingredients and wysiwyg:
hozz_h3 = None
for h3 in wysiwyg.find_all("h3"):
if "hozzávalók" in h3.get_text(strip=True).lower():
hozz_h3 = h3
break
if hozz_h3:
ul = hozz_h3.find_next_sibling("ul")
if ul:
for li in ul.find_all("li"):
line = li.get_text(strip=True)
if not line:
continue
qty, unit, food = _parse_ingredient_line(line)
ingredients.append({
"quantity": qty, "unit": unit, "food": food, "extra": "",
})
# --- Instructions ---
instructions = []
wysiwyg = soup.find("mindmegette-wysiwyg-box")
if wysiwyg:
for li in wysiwyg.find_all("li"):
txt = _text(li)
if txt:
instructions.append(txt)
# Fallback: look for block-content divs
# Look for h3 "Elkészítés" → <ol> (alt format)
elk_h3 = None
for h3 in wysiwyg.find_all("h3"):
if "elkészítés" in h3.get_text(strip=True).lower():
elk_h3 = h3
break
if elk_h3:
ol = elk_h3.find_next_sibling("ol")
if ol:
for li in ol.find_all("li"):
txt = _text(li)
if txt:
instructions.append(txt)
# Regular format: instructions in block-content <ol> (no h3 header)
if not instructions:
for ol in wysiwyg.find_all("ol"):
for li in ol.find_all("li"):
txt = _text(li)
if txt:
instructions.append(txt)
# Fallback: look for block-content divs outside wysiwyg
if not instructions:
for div in soup.find_all("div", class_="block-content"):
ol = div.find("ol")