v0.8.1: fix mindmegette alt page format (wysiwyg ingredients/instructions)
Support alternative mindmegette.hu pages (e.g. /alapetelek/) where the ingredients are in a <ul> after the h3 "Hozzávalók" and the instructions are in an <ol> after the h3 "Elkészítés", both inside the wysiwyg box, instead of in the structured div.ingredients containers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+47
-18
@@ -91,27 +91,26 @@ def supported_sites() -> list[dict]:
|
||||
@_register("mindmegette")
|
||||
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
|
||||
title = _og(soup, "og:title") or _text(soup.find("title"))
|
||||
# Strip " | Mindmegette.hu" suffix
|
||||
# Strip " | Mindmegette.hu" or " - Mindmegette.hu" suffix
|
||||
if title:
|
||||
title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip()
|
||||
title = re.sub(r"\s*[-–|]\s*Mindmegette\.hu$", "", title).strip()
|
||||
|
||||
description = _og(soup, "og:description") or ""
|
||||
image_url = _og(soup, "og:image")
|
||||
|
||||
# --- Ingredients ---
|
||||
# Multiple div.ingredients containers may exist (one per group).
|
||||
# Group title: <strong class="ingredients-group">A habaráshoz:</strong>
|
||||
# Format A (regular /recept/ pages): div.ingredients containers with structured rows
|
||||
# Format B (alt /alapetelek/ pages): h3 "Hozzávalók" → <ul><li> inside wysiwyg box
|
||||
ingredients = []
|
||||
wysiwyg = soup.find("mindmegette-wysiwyg-box")
|
||||
|
||||
for ing_container in soup.find_all("div", class_="ingredients"):
|
||||
# Check for a group title
|
||||
group_el = ing_container.find("strong", class_="ingredients-group")
|
||||
group_name = _text(group_el).rstrip(":").strip() if group_el else ""
|
||||
if group_name:
|
||||
ingredients.append({"group": group_name})
|
||||
|
||||
for row in ing_container.find_all("div", class_="ingredients-meta"):
|
||||
# Actual HTML: <strong>qty</strong> <span>unit</span>
|
||||
# <a class="ingredients-link">name</a> <small>(extra)</small>
|
||||
qty_el = row.find("strong")
|
||||
unit_el = None
|
||||
for sp in row.find_all("span"):
|
||||
@@ -127,26 +126,56 @@ def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
|
||||
extra = _text(extra_el).strip("() ")
|
||||
|
||||
if not food:
|
||||
# Fallback: grab whole row text
|
||||
food = row.get_text(separator=" ", strip=True)
|
||||
|
||||
if food:
|
||||
ingredients.append({
|
||||
"quantity": qty,
|
||||
"unit": unit,
|
||||
"food": food,
|
||||
"extra": extra,
|
||||
"quantity": qty, "unit": unit, "food": food, "extra": extra,
|
||||
})
|
||||
|
||||
# Fallback: h3 "Hozzávalók" → <ul> inside wysiwyg box (alt page format)
|
||||
if not ingredients and wysiwyg:
|
||||
hozz_h3 = None
|
||||
for h3 in wysiwyg.find_all("h3"):
|
||||
if "hozzávalók" in h3.get_text(strip=True).lower():
|
||||
hozz_h3 = h3
|
||||
break
|
||||
if hozz_h3:
|
||||
ul = hozz_h3.find_next_sibling("ul")
|
||||
if ul:
|
||||
for li in ul.find_all("li"):
|
||||
line = li.get_text(strip=True)
|
||||
if not line:
|
||||
continue
|
||||
qty, unit, food = _parse_ingredient_line(line)
|
||||
ingredients.append({
|
||||
"quantity": qty, "unit": unit, "food": food, "extra": "",
|
||||
})
|
||||
|
||||
# --- Instructions ---
|
||||
instructions = []
|
||||
wysiwyg = soup.find("mindmegette-wysiwyg-box")
|
||||
if wysiwyg:
|
||||
for li in wysiwyg.find_all("li"):
|
||||
txt = _text(li)
|
||||
if txt:
|
||||
instructions.append(txt)
|
||||
# Fallback: look for block-content divs
|
||||
# Look for h3 "Elkészítés" → <ol> (alt format)
|
||||
elk_h3 = None
|
||||
for h3 in wysiwyg.find_all("h3"):
|
||||
if "elkészítés" in h3.get_text(strip=True).lower():
|
||||
elk_h3 = h3
|
||||
break
|
||||
if elk_h3:
|
||||
ol = elk_h3.find_next_sibling("ol")
|
||||
if ol:
|
||||
for li in ol.find_all("li"):
|
||||
txt = _text(li)
|
||||
if txt:
|
||||
instructions.append(txt)
|
||||
# Regular format: instructions in block-content <ol> (no h3 header)
|
||||
if not instructions:
|
||||
for ol in wysiwyg.find_all("ol"):
|
||||
for li in ol.find_all("li"):
|
||||
txt = _text(li)
|
||||
if txt:
|
||||
instructions.append(txt)
|
||||
# Fallback: look for block-content divs outside wysiwyg
|
||||
if not instructions:
|
||||
for div in soup.find_all("div", class_="block-content"):
|
||||
ol = div.find("ol")
|
||||
|
||||
Reference in New Issue
Block a user