v0.6.1: follow linked recipes on sobors.hu, white favicon
- Sobors.hu parser: detect external links in instructions and follow them to scrape the real recipe content (e.g. recipes linked on kiskegyed.hu)
- Article-style ingredient fallback for sobors.hu pages without structured ingredient containers (h4 + ul > li plain text)
- Favicon changed to logo_notext_white.svg

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+65
-1
@@ -439,7 +439,7 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
||||
image_url = _og(soup, "og:image")
|
||||
|
||||
# --- Ingredients ---
|
||||
# Container: div.hozzavalok-container
|
||||
# Container: div.hozzavalok-container (structured recipe pages)
|
||||
# Groups: section > h4 (group header), section > ul > li
|
||||
# Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
|
||||
ingredients = []
|
||||
@@ -467,12 +467,28 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
||||
"extra": "",
|
||||
})
|
||||
|
||||
# Fallback: article-style ingredients (h4 group headers + ul > li plain text)
|
||||
# Some sobors.hu pages (especially linked recipes) use this simpler format.
|
||||
if not ingredients:
|
||||
article = soup.find("div", class_="cikk-torzs") or soup.find("article")
|
||||
if article:
|
||||
_parse_sobors_article_ingredients(article, ingredients)
|
||||
|
||||
# --- Instructions ---
|
||||
# Container: div.recept_leiras.recept_he-elkeszites
|
||||
# Content: <p> tags for steps, <h3><strong>Section</strong></h3> for section headers
|
||||
instructions = []
|
||||
linked_url = None
|
||||
inst_container = soup.find("div", class_="recept_leiras")
|
||||
if inst_container:
|
||||
# Check for external link (linked recipe pattern — e.g. "click here for
|
||||
# full recipe on kiskegyed.hu")
|
||||
for a in inst_container.find_all("a", href=True):
|
||||
href = a["href"]
|
||||
if href.startswith("http") and "sobors.hu" not in href:
|
||||
linked_url = href
|
||||
break
|
||||
|
||||
for el in inst_container.find_all(["h3", "p"]):
|
||||
if el.name == "h3":
|
||||
header = el.get_text(strip=True)
|
||||
@@ -485,6 +501,18 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
||||
txt = re.sub(r"^\d+\.\s+", "", txt)
|
||||
instructions.append(txt)
|
||||
|
||||
# If instructions just contain a redirect to another site, try to follow
|
||||
# the link and scrape the real recipe from there.
|
||||
if linked_url and len(instructions) <= 2:
|
||||
try:
|
||||
linked_data = scrape(linked_url)
|
||||
if linked_data.get("instructions"):
|
||||
instructions = linked_data["instructions"]
|
||||
if not ingredients and linked_data.get("ingredients"):
|
||||
ingredients = linked_data["ingredients"]
|
||||
except Exception:
|
||||
pass # keep whatever we scraped from sobors.hu
|
||||
|
||||
# --- Tags ---
|
||||
# Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
|
||||
# Skip the generic "Receptek" category tag and "Olvasói receptek" tag
|
||||
@@ -510,6 +538,42 @@ def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _parse_sobors_article_ingredients(container, ingredients: list):
    """Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text).

    Only ``<ul>`` elements whose immediately preceding sibling element is an
    ``<h4>`` are treated as ingredient lists. The ``<h4>`` text (minus any
    trailing colons) becomes a ``{"group": ...}`` entry placed before the
    list's items, unless it is empty or starts with "hozzávalók".

    A group header is only emitted together with at least one parsed
    ingredient. Headings that are not followed by a usable list are ignored,
    so unrelated ``<h4>`` elements cannot leave ``ingredients`` non-empty
    without containing any real ingredient.

    Args:
        container: Element to search; it must provide ``find_all``
            (presumably a BeautifulSoup ``Tag`` -- confirm against the caller).
        ingredients: List that is extended in place with group-header dicts
            and ingredient dicts (``quantity``, ``unit``, ``food``, ``extra``).

    Returns:
        None. Results are appended to ``ingredients``.
    """
    for ul in container.find_all("ul"):
        # Only consider lists that directly follow an h4 heading.
        heading = ul.find_previous_sibling()
        if heading is None or heading.name != "h4":
            continue

        items = []
        for li in ul.find_all("li"):
            line = li.get_text(strip=True)
            if not line:
                continue
            qty, unit, food = _parse_ingredient_line(line)
            items.append({
                "quantity": qty,
                "unit": unit,
                "food": food,
                "extra": "",
            })

        # An empty list contributes nothing, not even its header.
        if not items:
            continue

        group_name = heading.get_text(strip=True).rstrip(":")
        # The generic "Hozzávalók" heading is not treated as a group name.
        if group_name and not group_name.lower().startswith("hozzávalók"):
            ingredients.append({"group": group_name})
        ingredients.extend(items)
|
||||
|
||||
|
||||
def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
|
||||
"""Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
|
||||
m = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s*(\S+)\s+(.+)$", line)
|
||||
if m:
|
||||
return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())
|
||||
# Just quantity + food (e.g. "2 tojás")
|
||||
m2 = re.match(r"^([0-9][0-9.,/½¼¾-]*)\s+(.+)$", line)
|
||||
if m2:
|
||||
return (m2.group(1).strip(), "", m2.group(2).strip())
|
||||
return ("", "", line)
|
||||
|
||||
|
||||
def _split_qty_unit(raw: str) -> tuple[str, str]:
|
||||
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
|
||||
raw = raw.strip()
|
||||
|
||||
Reference in New Issue
Block a user