v0.6.1: follow linked recipes on sobors.hu, white favicon

- Sobors.hu parser: detect external links in instructions and follow them
  to scrape real recipe content (e.g. kiskegyed.hu linked recipes)
- Article-style ingredient fallback for sobors.hu pages without structured
  ingredient containers (h4 + ul > li plain text)
- Favicon changed to logo_notext_white.svg

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-24 18:18:54 +01:00
parent 45534391f0
commit baa63a43b2
4 changed files with 78 additions and 3 deletions
+65 -1
View File
@@ -439,7 +439,7 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
image_url = _og(soup, "og:image")
# --- Ingredients ---
# Container: div.hozzavalok-container
# Container: div.hozzavalok-container (structured recipe pages)
# Groups: section > h4 (group header), section > ul > li
# Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
ingredients = []
@@ -467,12 +467,28 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
"extra": "",
})
# Fallback: article-style ingredients (h4 group headers + ul > li plain text)
# Some sobors.hu pages (especially linked recipes) use this simpler format.
if not ingredients:
article = soup.find("div", class_="cikk-torzs") or soup.find("article")
if article:
_parse_sobors_article_ingredients(article, ingredients)
# --- Instructions ---
# Container: div.recept_leiras.recept_he-elkeszites
# Content: <p> tags for steps, <h3><strong>Section</strong></h3> for section headers
instructions = []
linked_url = None
inst_container = soup.find("div", class_="recept_leiras")
if inst_container:
# Check for external link (linked recipe pattern — e.g. "click here for
# full recipe on kiskegyed.hu")
for a in inst_container.find_all("a", href=True):
href = a["href"]
if href.startswith("http") and "sobors.hu" not in href:
linked_url = href
break
for el in inst_container.find_all(["h3", "p"]):
if el.name == "h3":
header = el.get_text(strip=True)
@@ -485,6 +501,18 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
txt = re.sub(r"^\d+\.\s+", "", txt)
instructions.append(txt)
# If instructions just contain a redirect to another site, try to follow
# the link and scrape the real recipe from there.
if linked_url and len(instructions) <= 2:
try:
linked_data = scrape(linked_url)
if linked_data.get("instructions"):
instructions = linked_data["instructions"]
if not ingredients and linked_data.get("ingredients"):
ingredients = linked_data["ingredients"]
except Exception:
pass # keep whatever we scraped from sobors.hu
# --- Tags ---
# Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
# Skip the generic "Receptek" category tag and "Olvasói receptek" tag
@@ -510,6 +538,42 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
}
def _parse_sobors_article_ingredients(container, ingredients: list):
    """Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text).

    Walks every ``h4`` inside *container* and, when the element directly
    following it is a ``ul``, parses each non-empty ``li`` of that list with
    ``_parse_ingredient_line``.  Results are appended to *ingredients* in
    place; nothing is returned.

    A ``{"group": name}`` entry is emitted before the items of a list unless
    the header text is empty or starts with "hozzávalók" (the generic
    "ingredients" title).  Headers that are not followed by a list, or whose
    list yields no items, contribute nothing, so unrelated article
    sub-headings do not leave stray group entries behind (which would also
    make the list non-empty and hide the fact that no ingredient was found).

    Args:
        container: Parsed element to search (must offer ``find_all``).
        ingredients: Output list, extended in place.
    """
    for header in container.find_all("h4"):
        # Only a list that is the very next element after the header counts
        # as that header's ingredient list.
        listing = header.find_next_sibling()
        if listing is None or listing.name != "ul":
            continue

        items = []
        for li in listing.find_all("li"):
            line = li.get_text(strip=True)
            if not line:
                continue
            qty, unit, food = _parse_ingredient_line(line)
            items.append({
                "quantity": qty,
                "unit": unit,
                "food": food,
                "extra": "",
            })
        if not items:
            # Header without any ingredient lines: not an ingredient group.
            continue

        group_name = header.get_text(strip=True).rstrip(":")
        if group_name and not group_name.lower().startswith("hozzávalók"):
            ingredients.append({"group": group_name})
        ingredients.extend(items)
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
"""Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
m = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s*(\S+)\s+(.+)$", line)
if m:
return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
# Just quantity + food (e.g. "2 tojás")
m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
if m2:
return (m2.group(1).strip(), "", m2.group(2).strip())
return ("", "", line)
def _split_qty_unit(raw: str) -> tuple[str, str]:
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
raw = raw.strip()
+1 -1
View File
@@ -7,7 +7,7 @@
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Plus+Jakarta+Sans:wght@400;500;600;700&display=swap" rel="stylesheet">
<link rel="icon" type="image/svg+xml" href="/assets/logo_notext.svg">
<link rel="icon" type="image/svg+xml" href="/assets/logo_notext_white.svg">
<style>
:root {
--bg: #0d1117;