Add nosalty.hu parser
Extracts ingredients (with groups), instructions (with section headers), tags, and story-as-description from nosalty.hu recipe pages. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+117
@@ -298,6 +298,123 @@ def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# nosalty.hu
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _nosalty_text(node) -> str:
    """Return the text of *node* with each whitespace run collapsed to one space.

    ``get_text(strip=True)`` strips every text fragment and then joins the
    fragments with no separator, so inline markup glues words together
    (``Add <b>salt</b> now`` -> ``Addsaltnow``). Normalising the raw text
    instead keeps exactly the word spacing that is present in the page.
    """
    return " ".join(node.get_text().split())


@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Parse a nosalty.hu recipe page into a recipe dict.

    Args:
        soup: Parsed HTML of the recipe page.
        url: Address the page was fetched from; returned as ``original_url``.

    Returns:
        A dict with the keys ``title``, ``description``, ``image_url``,
        ``ingredients``, ``instructions``, ``tags`` and ``original_url``.
        ``ingredients`` holds ``{"group": name}`` marker dicts for group
        headers, interleaved with the entries appended by
        ``_parse_nosalty_ingredient``. ``instructions`` is a list of step
        strings in which section headers appear as ``--- name ---``.
    """
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        # Drop everything from the first "|" onward (presumably the
        # site-name suffix of the page title).
        title = re.sub(r"\s*\|.*$", "", title).strip()

    # Story as description (no dedicated description on nosalty).
    description = ""
    story = soup.find("div", id="recipe-story")
    if story:
        paragraphs = (_nosalty_text(p) for p in story.find_all("p"))
        description = " ".join(text for text in paragraphs if text)

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Scoped to div#ingredients to avoid per-serving / nutrition duplicates.
    # Structure: h3.m-list__title = group header, ul.m-list__list = ingredient rows.
    ingredients = []
    ing_container = soup.find("div", id="ingredients")
    if ing_container:
        # find_all yields headers and lists in document order, so every group
        # marker lands directly before the rows that belong to it.
        for el in ing_container.find_all(["h3", "ul"]):
            cls = el.get("class") or []
            if el.name == "h3" and "m-list__title" in cls:
                group_name = _nosalty_text(el)
                if group_name:
                    ingredients.append({"group": group_name})
            elif el.name == "ul" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(li, ingredients)

    # --- Instructions ---
    # Container: div#select inside div.p-recipe__directions.
    # h4.m-list__title = section header, ol.m-list__list = steps.
    instructions = []
    dir_container = soup.find("div", id="select")
    if dir_container:
        for el in dir_container.find_all(["h4", "ol"]):
            cls = el.get("class") or []
            if el.name == "h4" and "m-list__title" in cls:
                section_name = _nosalty_text(el)
                if section_name:
                    instructions.append(f"--- {section_name} ---")
            elif el.name == "ol" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    txt = _nosalty_text(li)
                    if txt:
                        instructions.append(txt)

    # --- Tags ---
    tag_texts = (
        _nosalty_text(a) for a in soup.find_all("a", class_="m-tags__tagItem")
    )
    tags = [text for text in tag_texts if text]

    return {
        # Hungarian fallback title ("Unknown recipe") when none was found.
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }
|
||||
|
||||
|
||||
def _parse_nosalty_ingredient(li, ingredients: list):
    """Append the ingredient described by one nosalty ``<li>`` to *ingredients*.

    The row is read from the first ``<div>`` inside *li*. The text of its
    ``a.a-link`` element is the food name. Direct children of that div that
    come before the link supply the raw quantity string (the last non-empty
    one wins); direct children after it supply notes, which are stripped of
    surrounding parentheses/spaces and joined with ``"; "``.

    Nothing is appended when the div, the link, or the food name is missing.

    Args:
        li: The ingredient list item element.
        ingredients: List that is extended in place with a dict holding the
            keys ``quantity``, ``unit``, ``food`` and ``extra``.
    """
    wrapper = li.find("div")
    link = wrapper.find("a", class_="a-link") if wrapper else None
    name = link.get_text(strip=True) if link else ""
    if not name:
        return

    amount_text = ""
    notes = []
    link_seen = False
    for node in wrapper.children:
        if node is link:
            # Everything from here on is a note rather than a quantity.
            link_seen = True
        elif hasattr(node, "get_text"):
            piece = node.get_text(strip=True)
            if not piece:
                continue
            if link_seen:
                notes.append(piece.strip("() "))
            else:
                amount_text = piece

    # Notes that consisted only of parentheses/spaces are empty by now.
    note = "; ".join(filter(None, notes))
    quantity, unit = _split_qty_unit(amount_text)

    ingredients.append({
        "quantity": quantity,
        "unit": unit,
        "food": name,
        "extra": note,
    })
|
||||
|
||||
|
||||
def _split_qty_unit(raw: str) -> tuple[str, str]:
|
||||
"""Split a merged quantity+unit string like '200g' into ('200', 'g')."""
|
||||
raw = raw.strip()
|
||||
|
||||
Reference in New Issue
Block a user