Add nosalty.hu parser
Extracts ingredients (with groups), instructions (with section headers), tags, and story-as-description from nosalty.hu recipe pages. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+117
@@ -298,6 +298,123 @@ def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# nosalty.hu
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Parse a nosalty.hu recipe page into a recipe dict.

    Args:
        soup: Parsed HTML of the recipe page.
        url: Address of the page; returned unchanged as ``original_url``.

    Returns:
        A dict with the keys ``title``, ``description``, ``image_url``,
        ``ingredients``, ``instructions``, ``tags`` and ``original_url``.
        ``ingredients`` mixes ``{"group": name}`` header entries with the
        entries appended by ``_parse_nosalty_ingredient``; ``instructions``
        mixes ``"--- name ---"`` section markers with the step texts.
    """

    def _visible_text(node) -> str:
        # get_text(strip=True) strips every text node and glues the pieces
        # together with no separator, so "Add <b>salt</b> and stir" would
        # come out as "Addsaltand stir". Collapsing whitespace on the raw
        # text keeps the word boundaries that the markup contains.
        return " ".join(node.get_text().split())

    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Drop everything from the first "|" onwards
        # (presumably a site-name suffix; verify against real pages).
        title = re.sub(r"\s*\|.*$", "", title).strip()

    # Story as description (no dedicated description on nosalty)
    description = ""
    story = soup.find("div", id="recipe-story")
    if story:
        paragraphs = (_visible_text(p) for p in story.find_all("p"))
        description = " ".join(text for text in paragraphs if text)

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Scoped to div#ingredients to avoid per-serving / nutrition duplicates.
    # Structure: h3.m-list__title = group header, ul.m-list__list = ingredient rows.
    ingredients = []
    ing_container = soup.find("div", id="ingredients")
    if ing_container:
        # find_all returns matches in document order, so each group header
        # is emitted before the rows that follow it.
        for el in ing_container.find_all(["h3", "ul"]):
            cls = el.get("class") or []
            if el.name == "h3" and "m-list__title" in cls:
                group_name = _visible_text(el)
                if group_name:
                    ingredients.append({"group": group_name})
            elif el.name == "ul" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(li, ingredients)

    # --- Instructions ---
    # Container: div#select inside div.p-recipe__directions.
    # h4.m-list__title = section header, ol.m-list__list = steps.
    instructions = []
    dir_container = soup.find("div", id="select")
    if dir_container:
        for el in dir_container.find_all(["h4", "ol"]):
            cls = el.get("class") or []
            if el.name == "h4" and "m-list__title" in cls:
                section_name = _visible_text(el)
                if section_name:
                    instructions.append(f"--- {section_name} ---")
            elif el.name == "ol" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    txt = _visible_text(li)
                    if txt:
                        instructions.append(txt)

    # --- Tags ---
    tag_texts = (
        _visible_text(a) for a in soup.find_all("a", class_="m-tags__tagItem")
    )
    tags = [tag_text for tag_text in tag_texts if tag_text]

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_nosalty_ingredient(li, ingredients: list):
    """Append the ingredient described by one nosalty <li> to *ingredients*.

    The first <div> inside *li* is scanned for an ``a.a-link`` element whose
    text is the food name. Nothing is appended when the div, the link or the
    food name is missing.

    Args:
        li: The ingredient list item element.
        ingredients: Output list, mutated in place; receives one dict with
            the keys ``quantity``, ``unit``, ``food`` and ``extra``.
    """
    wrapper = li.find("div")
    link = wrapper.find("a", class_="a-link") if wrapper else None
    name = link.get_text(strip=True) if link else ""
    if not name:
        return

    # Direct children are visited in document order: text found before the
    # link is the quantity (the last non-empty piece wins), text found after
    # it is collected as a note.
    amount_text = ""
    notes = []
    seen_link = False
    for node in wrapper.children:
        if node is link:
            seen_link = True
            continue
        piece = node.get_text(strip=True) if hasattr(node, "get_text") else ""
        if not piece:
            continue
        if seen_link:
            # Notes lose surrounding parentheses and spaces.
            notes.append(piece.strip("() "))
        else:
            amount_text = piece

    amount, measure = _split_qty_unit(amount_text)
    entry = {
        "quantity": amount,
        "unit": measure,
        "food": name,
        "extra": "; ".join(filter(None, notes)),
    }
    ingredients.append(entry)
|
||||||
|
|
||||||
|
|
||||||
def _split_qty_unit(raw: str) -> tuple[str, str]:
|
def _split_qty_unit(raw: str) -> tuple[str, str]:
|
||||||
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
|
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
|
||||||
raw = raw.strip()
|
raw = raw.strip()
|
||||||
|
|||||||
Reference in New Issue
Block a user