v0.7.0: kiskegyed.hu parser, dual measurements, site links as URLs

- New parser for kiskegyed.hu: ingredients (with groups, dual measurements),
  instructions (ol > li > div), tags (section.tags)
- Dual measurement handling: "3 ek (70 g)" extracts alternate measurement
  to comment field
- Cross-site linking: kiskegyed→sobors links are followed to get full recipe
  (mirrors existing sobors→kiskegyed support)
- Supported sites now shown as clickable URLs in the import page
- supported_sites() returns dicts with name and url

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-24 18:45:25 +01:00
parent 0912311357
commit 20fabb84bf
4 changed files with 193 additions and 5 deletions
+166 -3
View File
@@ -70,9 +70,16 @@ def scrape(url: str) -> dict:
return result
def supported_sites() -> list[str]:
"""Return list of supported site hostname substrings."""
return [s for s, _ in _PARSERS]
def supported_sites() -> list[dict]:
"""Return list of supported sites with name and URL."""
_SITE_URLS = {
"mindmegette": "https://www.mindmegette.hu",
"streetkitchen": "https://streetkitchen.hu",
"nosalty": "https://www.nosalty.hu",
"sobors": "https://sobors.hu",
"kiskegyed": "https://www.kiskegyed.hu",
}
return [{"name": s + ".hu", "url": _SITE_URLS.get(s, "#")} for s, _ in _PARSERS]
# ---------------------------------------------------------------------------
@@ -545,6 +552,162 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
}
# ---------------------------------------------------------------------------
# kiskegyed.hu
# ---------------------------------------------------------------------------
@_register("kiskegyed")
def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
# Title: h2 inside the detail section
title = ""
h2 = soup.find("h2")
if h2:
title = h2.get_text(strip=True)
if not title:
title = _og(soup, "og:title") or _text(soup.find("title"))
if title:
title = re.sub(r"\s*[-|]\s*Kiskegyed.*$", "", title, flags=re.IGNORECASE).strip()
# Description: section#leadText > p
description = ""
lead = soup.find("section", id="leadText")
if lead:
p = lead.find("p")
if p:
description = p.get_text(strip=True)
if not description:
description = _og(soup, "og:description") or ""
image_url = _og(soup, "og:image")
# --- Ingredients ---
# Container: div.recipe_ingredients
# Groups: <p>Name:</p> or <p><em>A ...hez</em></p>
# Items: ul.list > li (plain text with optional <a> links)
ingredients = []
ing_container = soup.find("div", class_="recipe_ingredients")
if ing_container:
for el in ing_container.find_all(["p", "ul"]):
if el.name == "p":
group_text = el.get_text(strip=True).rstrip(":")
# Skip the "Hozzávalók" header and serving info
if not group_text or group_text.lower().startswith("hozzávalók"):
continue
# Skip serving info like "4 személyre"
if re.match(r"^\d+\s+személyre$", group_text):
continue
ingredients.append({"group": group_text})
elif el.name == "ul" and "list" in (el.get("class") or []):
for li in el.find_all("li"):
line = li.get_text(strip=True)
if not line:
continue
qty, unit, food, extra = _parse_kiskegyed_ingredient(line)
ingredients.append({
"quantity": qty,
"unit": unit,
"food": food,
"extra": extra,
})
# --- Instructions ---
# Container: div.recipe_preparation > ol > li > div
instructions = []
linked_url = None
prep_container = soup.find("div", class_="recipe_preparation")
if prep_container:
# Check for cross-link to another recipe site (e.g. sobors.hu)
for a in prep_container.find_all("a", href=True):
href = a["href"]
if href.startswith("http") and "kiskegyed.hu" not in href:
# Check if it points to a supported recipe site
linked_host = _host(href)
if any(s in linked_host for s, _ in _PARSERS if s != "kiskegyed"):
linked_url = href
break
ol = prep_container.find("ol")
if ol:
for li in ol.find_all("li", recursive=False):
div = li.find("div")
txt = div.get_text(strip=True) if div else li.get_text(strip=True)
if txt:
instructions.append(txt)
# If instructions are empty or just a redirect, follow the linked recipe
if linked_url and len(instructions) <= 2:
try:
linked_data = scrape(linked_url)
if linked_data.get("instructions"):
instructions = linked_data["instructions"]
if not ingredients and linked_data.get("ingredients"):
ingredients = linked_data["ingredients"]
except Exception:
pass
# --- Tags ---
# Container: section.tags > a > span (text starts with #)
tags = []
tag_section = soup.find("section", class_="tags")
if tag_section:
skip = {"recept", "receptek"}
for a in tag_section.find_all("a"):
span = a.find("span")
tag_text = span.get_text(strip=True) if span else a.get_text(strip=True)
tag_text = tag_text.lstrip("#").strip()
if tag_text and tag_text.lower() not in skip:
tags.append(tag_text)
return {
"title": title or "Ismeretlen recept",
"description": description,
"image_url": image_url,
"ingredients": ingredients,
"instructions": instructions,
"tags": tags,
"original_url": url,
}
def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
"""Parse a kiskegyed.hu ingredient line.
Handles dual measurements like '3 ek (70 g) búzafinomliszt (BL 55)'
→ qty='3', unit='ek', food='búzafinomliszt', extra='70 g; BL 55'
"""
extras = []
# Try: qty unit (alt_measurement) food...
m = re.match(
r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+\(([^)]+)\)\s+(.+)$", line
)
if m:
qty = m.group(1).strip()
unit = m.group(2).strip()
extras.append(m.group(3).strip())
food_raw = m.group(4).strip()
# Extract trailing parenthesized note from food
fm = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food_raw)
if fm:
food_raw = fm.group(1).strip()
extras.append(fm.group(2).strip())
return (qty, unit, food_raw, "; ".join(extras))
# Try: qty unit food...
m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
if m2:
return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")
# Try: qty food (e.g. "2 tojás")
m3 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
if m3:
return (m3.group(1).strip(), "", m3.group(2).strip(), "")
# No quantity (e.g. "ízlés szerint só")
return ("", "", line, "")
def _parse_sobors_article_ingredients(container, ingredients: list):
"""Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text)."""
for el in container.find_all(["h4", "ul"]):
+2 -2
View File
@@ -326,7 +326,7 @@
<!-- Single import tab -->
<div id="tabSingle">
<p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.8rem;">
Támogatott oldalak: <span class="supported-sites">{{ supported_sites | join(', ') }}</span> + egyéb (schema.org)
Támogatott oldalak: <span class="supported-sites">{% for s in supported_sites %}<a href="{{ s.url }}" target="_blank" style="color:var(--accent-light);text-decoration:none;">{{ s.name }}</a>{% if not loop.last %}, {% endif %}{% endfor %}</span> + egyéb (schema.org)
</p>
<div class="flex">
<input type="url" id="recipeUrl" class="grow" style="margin-bottom:0"
@@ -341,7 +341,7 @@
<!-- Bulk import tab -->
<div id="tabBulk" style="display:none">
<p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.8rem;">
Támogatott oldalak: <span class="supported-sites">{{ supported_sites | join(', ') }}</span> + egyéb (schema.org)
Támogatott oldalak: <span class="supported-sites">{% for s in supported_sites %}<a href="{{ s.url }}" target="_blank" style="color:var(--accent-light);text-decoration:none;">{{ s.name }}</a>{% if not loop.last %}, {% endif %}{% endfor %}</span> + egyéb (schema.org)
</p>
<label for="bulkUrls">URL-ek (soronként egy)</label>