v0.6.1: follow linked recipes on sobors.hu, white favicon
- Sobors.hu parser: detect external links in instructions and follow them to scrape the real recipe content (e.g. recipes linked on kiskegyed.hu)
- Article-style ingredient fallback for sobors.hu pages without structured ingredient containers (h4 + ul > li plain text)
- Favicon changed to logo_notext_white.svg

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,14 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## v0.6.1 (2026-02-24)
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Sobors.hu linked recipe support: when instructions link to another site (e.g. kiskegyed.hu), the scraper follows the link and imports the real recipe content
|
||||||
|
- Article-style ingredient fallback for sobors.hu pages without structured ingredient containers
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Favicon updated to white logo variant (logo_notext_white.svg)
|
||||||
|
|
||||||
## v0.6.0 (2026-02-24)
|
## v0.6.0 (2026-02-24)
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ Docker container for importing recipes from Hungarian websites into [Mealie](htt
|
|||||||
| mindmegette.hu | Yes | Yes | Yes | Yes |
|
| mindmegette.hu | Yes | Yes | Yes | Yes |
|
||||||
| streetkitchen.hu | Yes (with groups) | Yes (ol/ul/paragraph) | Yes | Yes (from JSON-LD categories) |
|
| streetkitchen.hu | Yes (with groups) | Yes (ol/ul/paragraph) | Yes | Yes (from JSON-LD categories) |
|
||||||
| nosalty.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes |
|
| nosalty.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes |
|
||||||
| sobors.hu | Yes (with groups) | Yes (with section headers) | Yes | Yes |
|
| sobors.hu | Yes (with groups) | Yes (with section headers, follows linked recipes) | Yes | Yes |
|
||||||
| *Other sites* | Fallback (schema.org JSON-LD) | Fallback (schema.org JSON-LD) | Yes (og:image) | Fallback (schema.org keywords) |
|
| *Other sites* | Fallback (schema.org JSON-LD) | Fallback (schema.org JSON-LD) | Yes (og:image) | Fallback (schema.org keywords) |
|
||||||
|
|
||||||
### Mindmegette.hu Parser
|
### Mindmegette.hu Parser
|
||||||
@@ -92,6 +92,8 @@ Extracts data from the sobors.hu recipe pages:
|
|||||||
- **Ingredients**: `div.hozzavalok-container` → `section` elements with `ul > li`, each containing `span.mennyiseg` (qty), `span.mertekegyseg` (unit), `span.hozzavalo` (food)
|
- **Ingredients**: `div.hozzavalok-container` → `section` elements with `ul > li`, each containing `span.mennyiseg` (qty), `span.mertekegyseg` (unit), `span.hozzavalo` (food)
|
||||||
- **Ingredient groups**: `section > h4` headers (e.g., "A szószhoz:", "A húsgolyókhoz:")
|
- **Ingredient groups**: `section > h4` headers (e.g., "A szószhoz:", "A húsgolyókhoz:")
|
||||||
- **Instructions**: `div.recept_leiras` → `<p>` tags, with `<h3><strong>` section headers
|
- **Instructions**: `div.recept_leiras` → `<p>` tags, with `<h3><strong>` section headers
|
||||||
|
- **Linked recipes**: Some pages link to another site (e.g. kiskegyed.hu) instead of showing full instructions. The parser detects external links in the instruction area and follows them to scrape the real recipe content.
|
||||||
|
- **Article-style ingredient fallback**: Pages without the structured `div.hozzavalok-container` are parsed from article-body `h4` + `ul > li` plain text
|
||||||
- **Tags**: `div.cikk-cimkek > ul.cikk-cimkek-list > li > a` (skips generic "Receptek" category)
|
- **Tags**: `div.cikk-cimkek > ul.cikk-cimkek-list > li > a` (skips generic "Receptek" category)
|
||||||
|
|
||||||
### Generic Fallback Parser
|
### Generic Fallback Parser
|
||||||
|
|||||||
+65
-1
@@ -439,7 +439,7 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
|||||||
image_url = _og(soup, "og:image")
|
image_url = _og(soup, "og:image")
|
||||||
|
|
||||||
# --- Ingredients ---
|
# --- Ingredients ---
|
||||||
# Container: div.hozzavalok-container
|
# Container: div.hozzavalok-container (structured recipe pages)
|
||||||
# Groups: section > h4 (group header), section > ul > li
|
# Groups: section > h4 (group header), section > ul > li
|
||||||
# Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
|
# Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
|
||||||
ingredients = []
|
ingredients = []
|
||||||
@@ -467,12 +467,28 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
|||||||
"extra": "",
|
"extra": "",
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Fallback: article-style ingredients (h4 group headers + ul > li plain text)
|
||||||
|
# Some sobors.hu pages (especially linked recipes) use this simpler format.
|
||||||
|
if not ingredients:
|
||||||
|
article = soup.find("div", class_="cikk-torzs") or soup.find("article")
|
||||||
|
if article:
|
||||||
|
_parse_sobors_article_ingredients(article, ingredients)
|
||||||
|
|
||||||
# --- Instructions ---
|
# --- Instructions ---
|
||||||
# Container: div.recept_leiras.recept_he-elkeszites
|
# Container: div.recept_leiras.recept_he-elkeszites
|
||||||
# Content: <p> tags for steps, <h3><strong>Section</strong></h3> for section headers
|
# Content: <p> tags for steps, <h3><strong>Section</strong></h3> for section headers
|
||||||
instructions = []
|
instructions = []
|
||||||
|
linked_url = None
|
||||||
inst_container = soup.find("div", class_="recept_leiras")
|
inst_container = soup.find("div", class_="recept_leiras")
|
||||||
if inst_container:
|
if inst_container:
|
||||||
|
# Check for external link (linked recipe pattern — e.g. "click here for
|
||||||
|
# full recipe on kiskegyed.hu")
|
||||||
|
for a in inst_container.find_all("a", href=True):
|
||||||
|
href = a["href"]
|
||||||
|
if href.startswith("http") and "sobors.hu" not in href:
|
||||||
|
linked_url = href
|
||||||
|
break
|
||||||
|
|
||||||
for el in inst_container.find_all(["h3", "p"]):
|
for el in inst_container.find_all(["h3", "p"]):
|
||||||
if el.name == "h3":
|
if el.name == "h3":
|
||||||
header = el.get_text(strip=True)
|
header = el.get_text(strip=True)
|
||||||
@@ -485,6 +501,18 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
|||||||
txt = re.sub(r"^\d+\.\s+", "", txt)
|
txt = re.sub(r"^\d+\.\s+", "", txt)
|
||||||
instructions.append(txt)
|
instructions.append(txt)
|
||||||
|
|
||||||
|
# If instructions just contain a redirect to another site, try to follow
|
||||||
|
# the link and scrape the real recipe from there.
|
||||||
|
if linked_url and len(instructions) <= 2:
|
||||||
|
try:
|
||||||
|
linked_data = scrape(linked_url)
|
||||||
|
if linked_data.get("instructions"):
|
||||||
|
instructions = linked_data["instructions"]
|
||||||
|
if not ingredients and linked_data.get("ingredients"):
|
||||||
|
ingredients = linked_data["ingredients"]
|
||||||
|
except Exception:
|
||||||
|
pass # keep whatever we scraped from sobors.hu
|
||||||
|
|
||||||
# --- Tags ---
|
# --- Tags ---
|
||||||
# Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
|
# Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
|
||||||
# Skip the generic "Receptek" category tag and "Olvasói receptek" tag
|
# Skip the generic "Receptek" category tag and "Olvasói receptek" tag
|
||||||
@@ -510,6 +538,42 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_sobors_article_ingredients(container, ingredients: list):
    """Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text).

    Walks every ``ul`` under *container* whose immediately preceding sibling
    tag is an ``h4`` and appends the result to *ingredients* in place:

    * one ``{"group": name}`` entry for the heading, unless the heading is
      empty or starts with "hozzávalók" (the generic "Ingredients" title);
    * one ``{"quantity", "unit", "food", "extra"}`` dict per non-empty ``li``.

    A group header is only emitted together with at least one ingredient
    row, so unrelated ``h4`` sub-headings elsewhere in the article do not
    show up as empty ingredient groups.

    Args:
        container: Parsed element to search (a BeautifulSoup tag is what the
            caller passes); must support ``find_all``.
        ingredients: Output list, extended in place.

    Returns:
        None. Results are delivered through *ingredients*.
    """
    for ul in container.find_all("ul"):
        # Only lists that directly follow an h4 are treated as ingredient lists.
        heading = ul.find_previous_sibling()
        if heading is None or heading.name != "h4":
            continue

        rows = []
        for li in ul.find_all("li"):
            # get_text(strip=True) would glue adjacent text nodes together
            # ("2 dl <a>tejföl</a>" -> "2 dltejföl"); collapse whitespace on
            # the raw text instead so word boundaries survive inline markup.
            line = " ".join(li.get_text().split())
            if not line:
                continue
            qty, unit, food = _parse_ingredient_line(line)
            rows.append({
                "quantity": qty,
                "unit": unit,
                "food": food,
                "extra": "",
            })

        if not rows:
            # Heading followed by an empty list: nothing to group.
            continue

        group_name = " ".join(heading.get_text().split()).rstrip(":").rstrip()
        if group_name and not group_name.lower().startswith("hozzávalók"):
            ingredients.append({"group": group_name})
        ingredients.extend(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
|
||||||
|
"""Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
|
||||||
|
m = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s*(\S+)\s+(.+)$", line)
|
||||||
|
if m:
|
||||||
|
return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
|
||||||
|
# Just quantity + food (e.g. "2 tojás")
|
||||||
|
m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
|
||||||
|
if m2:
|
||||||
|
return (m2.group(1).strip(), "", m2.group(2).strip())
|
||||||
|
return ("", "", line)
|
||||||
|
|
||||||
|
|
||||||
def _split_qty_unit(raw: str) -> tuple[str, str]:
|
def _split_qty_unit(raw: str) -> tuple[str, str]:
|
||||||
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
|
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
|
||||||
raw = raw.strip()
|
raw = raw.strip()
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
<link rel="preconnect" href="https://fonts.googleapis.com">
|
<link rel="preconnect" href="https://fonts.googleapis.com">
|
||||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||||
<link href="https://fonts.googleapis.com/css2?family=Plus+Jakarta+Sans:wght@400;500;600;700&display=swap" rel="stylesheet">
|
<link href="https://fonts.googleapis.com/css2?family=Plus+Jakarta+Sans:wght@400;500;600;700&display=swap" rel="stylesheet">
|
||||||
<link rel="icon" type="image/svg+xml" href="/assets/logo_notext.svg">
|
<link rel="icon" type="image/svg+xml" href="/assets/logo_notext_white.svg">
|
||||||
<style>
|
<style>
|
||||||
:root {
|
:root {
|
||||||
--bg: #0d1117;
|
--bg: #0d1117;
|
||||||
|
|||||||
Reference in New Issue
Block a user