v0.8.1: fix mindmegette alt page format (wysiwyg ingredients/instructions)
Support alternative mindmegette.hu pages (e.g. /alapetelek/) where ingredients are in a <ul> after the h3 "Hozzávalók" and instructions are in an <ol> after the h3 "Elkészítés" inside the wysiwyg box, instead of structured div.ingredients containers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,11 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## v0.8.1 (2026-02-24)
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Mindmegette.hu: support alternative page format (e.g. `/alapetelek/` pages) where ingredients are in `<ul>` and instructions in `<ol>` inside the wysiwyg box, instead of structured `div.ingredients` containers
|
||||||
|
- Mindmegette.hu: title suffix stripping now handles `|`, `-`, and `–` (en dash) separators
|
||||||
|
|
||||||
## v0.8.0 (2026-02-24)
|
## v0.8.0 (2026-02-24)
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|||||||
+47
-18
@@ -91,27 +91,26 @@ def supported_sites() -> list[dict]:
|
|||||||
@_register("mindmegette")
|
@_register("mindmegette")
|
||||||
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
|
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
|
||||||
title = _og(soup, "og:title") or _text(soup.find("title"))
|
title = _og(soup, "og:title") or _text(soup.find("title"))
|
||||||
# Strip " | Mindmegette.hu" suffix
|
# Strip " | Mindmegette.hu" or " - Mindmegette.hu" suffix
|
||||||
if title:
|
if title:
|
||||||
title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip()
|
title = re.sub(r"\s*[-–|]\s*Mindmegette\.hu$", "", title).strip()
|
||||||
|
|
||||||
description = _og(soup, "og:description") or ""
|
description = _og(soup, "og:description") or ""
|
||||||
image_url = _og(soup, "og:image")
|
image_url = _og(soup, "og:image")
|
||||||
|
|
||||||
# --- Ingredients ---
|
# --- Ingredients ---
|
||||||
# Multiple div.ingredients containers may exist (one per group).
|
# Format A (regular /recept/ pages): div.ingredients containers with structured rows
|
||||||
# Group title: <strong class="ingredients-group">A habaráshoz:</strong>
|
# Format B (alt /alapetelek/ pages): h3 "Hozzávalók" → <ul><li> inside wysiwyg box
|
||||||
ingredients = []
|
ingredients = []
|
||||||
|
wysiwyg = soup.find("mindmegette-wysiwyg-box")
|
||||||
|
|
||||||
for ing_container in soup.find_all("div", class_="ingredients"):
|
for ing_container in soup.find_all("div", class_="ingredients"):
|
||||||
# Check for a group title
|
|
||||||
group_el = ing_container.find("strong", class_="ingredients-group")
|
group_el = ing_container.find("strong", class_="ingredients-group")
|
||||||
group_name = _text(group_el).rstrip(":").strip() if group_el else ""
|
group_name = _text(group_el).rstrip(":").strip() if group_el else ""
|
||||||
if group_name:
|
if group_name:
|
||||||
ingredients.append({"group": group_name})
|
ingredients.append({"group": group_name})
|
||||||
|
|
||||||
for row in ing_container.find_all("div", class_="ingredients-meta"):
|
for row in ing_container.find_all("div", class_="ingredients-meta"):
|
||||||
# Actual HTML: <strong>qty</strong> <span>unit</span>
|
|
||||||
# <a class="ingredients-link">name</a> <small>(extra)</small>
|
|
||||||
qty_el = row.find("strong")
|
qty_el = row.find("strong")
|
||||||
unit_el = None
|
unit_el = None
|
||||||
for sp in row.find_all("span"):
|
for sp in row.find_all("span"):
|
||||||
@@ -127,26 +126,56 @@ def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
|
|||||||
extra = _text(extra_el).strip("() ")
|
extra = _text(extra_el).strip("() ")
|
||||||
|
|
||||||
if not food:
|
if not food:
|
||||||
# Fallback: grab whole row text
|
|
||||||
food = row.get_text(separator=" ", strip=True)
|
food = row.get_text(separator=" ", strip=True)
|
||||||
|
|
||||||
if food:
|
if food:
|
||||||
ingredients.append({
|
ingredients.append({
|
||||||
"quantity": qty,
|
"quantity": qty, "unit": unit, "food": food, "extra": extra,
|
||||||
"unit": unit,
|
|
||||||
"food": food,
|
|
||||||
"extra": extra,
|
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Fallback: h3 "Hozzávalók" → <ul> inside wysiwyg box (alt page format)
|
||||||
|
if not ingredients and wysiwyg:
|
||||||
|
hozz_h3 = None
|
||||||
|
for h3 in wysiwyg.find_all("h3"):
|
||||||
|
if "hozzávalók" in h3.get_text(strip=True).lower():
|
||||||
|
hozz_h3 = h3
|
||||||
|
break
|
||||||
|
if hozz_h3:
|
||||||
|
ul = hozz_h3.find_next_sibling("ul")
|
||||||
|
if ul:
|
||||||
|
for li in ul.find_all("li"):
|
||||||
|
line = li.get_text(strip=True)
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
qty, unit, food = _parse_ingredient_line(line)
|
||||||
|
ingredients.append({
|
||||||
|
"quantity": qty, "unit": unit, "food": food, "extra": "",
|
||||||
|
})
|
||||||
|
|
||||||
# --- Instructions ---
|
# --- Instructions ---
|
||||||
instructions = []
|
instructions = []
|
||||||
wysiwyg = soup.find("mindmegette-wysiwyg-box")
|
|
||||||
if wysiwyg:
|
if wysiwyg:
|
||||||
for li in wysiwyg.find_all("li"):
|
# Look for h3 "Elkészítés" → <ol> (alt format)
|
||||||
txt = _text(li)
|
elk_h3 = None
|
||||||
if txt:
|
for h3 in wysiwyg.find_all("h3"):
|
||||||
instructions.append(txt)
|
if "elkészítés" in h3.get_text(strip=True).lower():
|
||||||
# Fallback: look for block-content divs
|
elk_h3 = h3
|
||||||
|
break
|
||||||
|
if elk_h3:
|
||||||
|
ol = elk_h3.find_next_sibling("ol")
|
||||||
|
if ol:
|
||||||
|
for li in ol.find_all("li"):
|
||||||
|
txt = _text(li)
|
||||||
|
if txt:
|
||||||
|
instructions.append(txt)
|
||||||
|
# Regular format: instructions in block-content <ol> (no h3 header)
|
||||||
|
if not instructions:
|
||||||
|
for ol in wysiwyg.find_all("ol"):
|
||||||
|
for li in ol.find_all("li"):
|
||||||
|
txt = _text(li)
|
||||||
|
if txt:
|
||||||
|
instructions.append(txt)
|
||||||
|
# Fallback: look for block-content divs outside wysiwyg
|
||||||
if not instructions:
|
if not instructions:
|
||||||
for div in soup.find_all("div", class_="block-content"):
|
for div in soup.find_all("div", class_="block-content"):
|
||||||
ol = div.find("ol")
|
ol = div.find("ol")
|
||||||
|
|||||||
Reference in New Issue
Block a user