v0.7.0: kiskegyed.hu parser, dual measurements, site links as URLs
- New parser for kiskegyed.hu: ingredients (with groups, dual measurements), instructions (ol > li > div), tags (section.tags)
- Dual measurement handling: "3 ek (70 g)" extracts the alternate measurement into the comment field
- Cross-site linking: kiskegyed→sobors links are followed to get the full recipe (mirrors existing sobors→kiskegyed support)
- Supported sites are now shown as clickable URLs on the import page
- supported_sites() returns dicts with name and url

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+166
-3
@@ -70,9 +70,16 @@ def scrape(url: str) -> dict:
|
||||
return result
|
||||
|
||||
|
||||
def supported_sites() -> list[str]:
    """List the hostname substring under which each parser is registered."""
    names = []
    for site, _parser in _PARSERS:
        names.append(site)
    return names
|
||||
def supported_sites() -> list[dict]:
    """Describe every registered parser's site as a name/URL pair.

    Each entry has the shape ``{"name": "<key>.hu", "url": <homepage>}``,
    where ``<key>`` is the key the parser is registered under in
    ``_PARSERS``. A key without a known homepage gets ``"#"`` as its URL.
    """
    homepages = dict(
        mindmegette="https://www.mindmegette.hu",
        streetkitchen="https://streetkitchen.hu",
        nosalty="https://www.nosalty.hu",
        sobors="https://sobors.hu",
        kiskegyed="https://www.kiskegyed.hu",
    )
    sites = []
    for key, _parser in _PARSERS:
        sites.append({"name": key + ".hu", "url": homepages.get(key, "#")})
    return sites
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -545,6 +552,162 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# kiskegyed.hu
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _kiskegyed_text(el) -> str:
    """Return the element's text with whitespace runs collapsed to one space.

    ``get_text(strip=True)`` strips every text node *before* joining them, so
    ``2 ek <a>liszt</a>`` would come out as ``2 ekliszt``. Collapsing the raw
    text instead keeps the spaces that surround inline tags.
    """
    return " ".join(el.get_text().split())


@_register("kiskegyed")
def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
    """Parse a kiskegyed.hu recipe page into a recipe dict.

    Args:
        soup: Parsed HTML of the recipe page.
        url: URL the page was fetched from; returned as ``original_url``.

    Returns:
        A dict with the keys ``title``, ``description``, ``image_url``,
        ``ingredients``, ``instructions``, ``tags`` and ``original_url``.
        ``ingredients`` is a list whose items are either a group marker
        (``{"group": name}``) or an ingredient with ``quantity``, ``unit``,
        ``food`` and ``extra`` keys. ``instructions`` and ``tags`` are lists
        of strings.

    When the preparation section links to another supported recipe site and
    the page itself has at most two instruction steps, the linked recipe is
    scraped and its instructions (and its ingredients, if none were found
    here) are used instead. That lookup is best-effort: failures are logged
    and the locally parsed data is returned.
    """
    import logging

    # Title: first <h2> on the page, falling back to og:title / <title>.
    title = ""
    h2 = soup.find("h2")
    if h2:
        title = _kiskegyed_text(h2)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Drop a trailing " - Kiskegyed ..." style site suffix.
        title = re.sub(r"\s*[-–|]\s*Kiskegyed.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: section#leadText > p, falling back to og:description.
    description = ""
    lead = soup.find("section", id="leadText")
    if lead:
        p = lead.find("p")
        if p:
            description = _kiskegyed_text(p)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Container: div.recipe_ingredients
    # Groups: <p>Name:</p> or <p><em>A ...hez</em></p>
    # Items: ul.list > li (plain text with optional <a> links)
    ingredients = []
    ing_container = soup.find("div", class_="recipe_ingredients")
    if ing_container:
        for el in ing_container.find_all(["p", "ul"]):
            if el.name == "p":
                group_text = _kiskegyed_text(el).rstrip(":")
                # Skip the "Hozzávalók" header
                if not group_text or group_text.lower().startswith("hozzávalók"):
                    continue
                # Skip serving info like "4 személyre"
                if re.match(r"^\d+\s+személyre$", group_text):
                    continue
                ingredients.append({"group": group_text})
            elif el.name == "ul" and "list" in (el.get("class") or []):
                for li in el.find_all("li"):
                    line = _kiskegyed_text(li)
                    if not line:
                        continue
                    qty, unit, food, extra = _parse_kiskegyed_ingredient(line)
                    ingredients.append({
                        "quantity": qty,
                        "unit": unit,
                        "food": food,
                        "extra": extra,
                    })

    # --- Instructions ---
    # Container: div.recipe_preparation > ol > li > div
    instructions = []
    linked_url = None
    prep_container = soup.find("div", class_="recipe_preparation")
    if prep_container:
        # Check for cross-link to another recipe site (e.g. sobors.hu)
        for a in prep_container.find_all("a", href=True):
            href = a["href"]
            if not href.startswith("http"):
                continue
            linked_host = _host(href)
            # Decide on the host, not the whole URL: an external link may
            # legitimately mention kiskegyed.hu in its path or query string.
            if "kiskegyed.hu" in linked_host:
                continue
            if any(s in linked_host for s, _ in _PARSERS if s != "kiskegyed"):
                linked_url = href
                break

        ol = prep_container.find("ol")
        if ol:
            for li in ol.find_all("li", recursive=False):
                div = li.find("div")
                txt = _kiskegyed_text(div if div else li)
                if txt:
                    instructions.append(txt)

    # If instructions are empty or just a redirect, follow the linked recipe
    if linked_url and len(instructions) <= 2:
        # NOTE(review): if the linked page's parser follows a link back to
        # kiskegyed.hu, this could recurse — confirm scrape() guards against it.
        try:
            linked_data = scrape(linked_url)
        except Exception:
            # Best-effort: keep whatever was parsed locally, but leave a trace.
            logging.getLogger(__name__).warning(
                "kiskegyed: could not follow linked recipe %s", linked_url, exc_info=True
            )
        else:
            if linked_data.get("instructions"):
                instructions = linked_data["instructions"]
            if not ingredients and linked_data.get("ingredients"):
                ingredients = linked_data["ingredients"]

    # --- Tags ---
    # Container: section.tags > a > span (text starts with #)
    tags = []
    tag_section = soup.find("section", class_="tags")
    if tag_section:
        skip = {"recept", "receptek"}
        for a in tag_section.find_all("a"):
            span = a.find("span")
            tag_text = _kiskegyed_text(span if span else a)
            tag_text = tag_text.lstrip("#").strip()
            if tag_text and tag_text.lower() not in skip:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||||
|
||||
|
||||
def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
|
||||
"""Parse a kiskegyed.hu ingredient line.
|
||||
|
||||
Handles dual measurements like '3 ek (70 g) búzafinomliszt (BL 55)'
|
||||
→ qty='3', unit='ek', food='búzafinomliszt', extra='70 g; BL 55'
|
||||
"""
|
||||
extras = []
|
||||
|
||||
# Try: qty unit (alt_measurement) food...
|
||||
m = re.match(
|
||||
r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+\(([^)]+)\)\s+(.+)$", line
|
||||
)
|
||||
if m:
|
||||
qty = m.group(1).strip()
|
||||
unit = m.group(2).strip()
|
||||
extras.append(m.group(3).strip())
|
||||
food_raw = m.group(4).strip()
|
||||
# Extract trailing parenthesized note from food
|
||||
fm = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food_raw)
|
||||
if fm:
|
||||
food_raw = fm.group(1).strip()
|
||||
extras.append(fm.group(2).strip())
|
||||
return (qty, unit, food_raw, "; ".join(extras))
|
||||
|
||||
# Try: qty unit food...
|
||||
m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
|
||||
if m2:
|
||||
return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")
|
||||
|
||||
# Try: qty food (e.g. "2 tojás")
|
||||
m3 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
|
||||
if m3:
|
||||
return (m3.group(1).strip(), "", m3.group(2).strip(), "")
|
||||
|
||||
# No quantity (e.g. "ízlés szerint só")
|
||||
return ("", "", line, "")
|
||||
|
||||
|
||||
def _parse_sobors_article_ingredients(container, ingredients: list):
|
||||
"""Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text)."""
|
||||
for el in container.find_all(["h4", "ul"]):
|
||||
|
||||
Reference in New Issue
Block a user