"""Recipe scraper — parses Hungarian recipe sites into a structured dict.

Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""

import json
import re
import threading

import requests
from bs4 import BeautifulSoup

_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}

# Maps a substring of the hostname to a parser function.
# Order matters: first match wins.
_PARSERS: list[tuple[str, "callable"]] = []

# Site parsers may call scrape() again to follow a cross-site link.  The URLs
# currently being scraped on this thread are tracked so that a link cycle
# (site A -> site B -> site A) or an overly long chain is cut off instead of
# triggering one HTTP request per recursion level.
_MAX_NESTED_SCRAPES = 3
_scrape_state = threading.local()


def _register(host_substring: str):
    """Decorator: register a parser for URLs whose hostname contains *host_substring*."""
    def decorator(fn):
        _PARSERS.append((host_substring, fn))
        return fn
    return decorator


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def scrape(url: str) -> dict:
    """Fetch *url* and return a recipe dict.

    The first registered parser whose host substring occurs in the URL's
    hostname is used; if none matches (or the parser returns ``None``) the
    generic schema.org / og-tag extraction is used instead.

    Returns::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    The "ingredients" list may also contain ``{"group": str}`` entries that
    act as headers for the ingredients following them.

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            non-2xx responses (via ``raise_for_status``).
        ValueError: only for nested calls made while following a linked
            recipe, when *url* is already being scraped on this thread or the
            nesting limit is reached.  A top-level call never hits this.
    """
    in_progress = getattr(_scrape_state, "urls", None)
    if in_progress is None:
        in_progress = _scrape_state.urls = []
    if url in in_progress or len(in_progress) >= _MAX_NESTED_SCRAPES:
        raise ValueError(f"Refusing to follow recipe link (cycle or too deep): {url}")

    in_progress.append(url)
    try:
        resp = requests.get(url, headers=_HEADERS, timeout=30)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")

        host = _host(url)
        result = None
        for substring, parser in _PARSERS:
            if substring in host:
                result = parser(soup, url)
                break

        if result is None:
            # Fallback: try generic schema.org / og-tag extraction
            result = _parse_generic(soup, url)

        # Post-process: extract parenthesized comments from food into extra
        _extract_ingredient_comments(result)
        return result
    finally:
        in_progress.pop()


def supported_sites() -> list[dict]:
    """Return list of supported sites with name and URL."""
    site_urls = {
        "mindmegette": "https://www.mindmegette.hu",
        "streetkitchen": "https://streetkitchen.hu",
        "nosalty": "https://www.nosalty.hu",
        "sobors": "https://sobors.hu",
        "kiskegyed": "https://www.kiskegyed.hu",
        "gastrohobbi": "https://gastrohobbi.hu",
    }
    return [{"name": s + ".hu", "url": site_urls.get(s, "#")} for s, _ in _PARSERS]


# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------

@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Parse a mindmegette.hu recipe page into the recipe dict returned by scrape()."""
    title = _og(soup, "og:title") or _text(soup.find("title"))
    # Strip " | Mindmegette.hu" suffix
    if title:
        title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Multiple div.ingredients containers may exist (one per group).
    # Group title: <strong class="ingredients-group">A habaráshoz:</strong>
    ingredients = []
    for ing_container in soup.find_all("div", class_="ingredients"):
        # Check for a group title
        group_el = ing_container.find("strong", class_="ingredients-group")
        group_name = _text(group_el).rstrip(":").strip() if group_el else ""
        if group_name:
            ingredients.append({"group": group_name})

        for row in ing_container.find_all("div", class_="ingredients-meta"):
            # Row markup targeted below:
            #   <strong>qty</strong> <span>unit</span>
            #   <a class="ingredients-link">name</a> <small>(extra)</small>
            qty_el = row.find("strong")
            # The unit is the first <span> without any class attribute.
            unit_el = next(
                (sp for sp in row.find_all("span") if not sp.get("class")),
                None,
            )
            name_el = row.find("a", class_="ingredients-link")
            extra_el = row.find("small") or row.find("span", class_="extra")

            qty = _text(qty_el)
            unit = _text(unit_el)
            food = _text(name_el)
            extra = _text(extra_el).strip("() ")

            if not food:
                # Fallback: grab whole row text
                food = row.get_text(separator=" ", strip=True)

            if food:
                ingredients.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": extra,
                })

    # --- Instructions ---
    instructions = []
    wysiwyg = soup.find("mindmegette-wysiwyg-box")
    if wysiwyg:
        for li in wysiwyg.find_all("li"):
            txt = _text(li)
            if txt:
                instructions.append(txt)

    # Fallback: look for block-content divs
    if not instructions:
        for div in soup.find_all("div", class_="block-content"):
            ol = div.find("ol")
            if ol:
                for li in ol.find_all("li"):
                    txt = _text(li)
                    if txt:
                        instructions.append(txt)

    # --- Tags ---
    tags = []
    tag_wrapper = soup.select_one("div.desktop-wrapper")
    if tag_wrapper:
        for a in tag_wrapper.select("a.tag"):
            tag_text = a.get_text(strip=True)
            if tag_text:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


# ---------------------------------------------------------------------------
# streetkitchen.hu
# ---------------------------------------------------------------------------

@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Parse a streetkitchen.hu recipe page into the recipe dict returned by scrape()."""
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*\|\s*Street Kitchen$", "", title).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Find the main ingredient grid (grid-cols-1 lg:grid-cols-2).
    # The page renders ingredients twice (mobile + desktop); we pick the
    # specific grid to avoid duplicates.
    ingredients = []
    ing_grid = None
    for g in soup.select("div.grid"):
        cls = " ".join(g.get("class", []))
        if "grid-cols-1" in cls and "lg:grid-cols-2" in cls:
            ing_grid = g
            break

    if ing_grid:
        # Walk top-level divs — each may contain an h5 group header + rows
        for section in ing_grid.find_all("div", recursive=False):
            h5 = section.find("h5")
            if h5:
                group_name = h5.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})

            for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
                inner = row.select_one("div.flex.items-center.gap-2")
                if not inner:
                    continue

                divs = inner.find_all("div", recursive=False)
                bold = inner.find("div", class_="font-bold")
                food = bold.get_text(strip=True) if bold else ""
                if not food:
                    continue

                # First non-bold div is quantity+unit merged (e.g. "200g", "1fej")
                qty_raw = ""
                extra = ""
                for d in divs:
                    # Identity, not ==: Tag equality is structural, so an
                    # unrelated div with identical markup must not be skipped.
                    if d is bold:
                        continue
                    txt = d.get_text(strip=True)
                    if txt.startswith("(") and txt.endswith(")"):
                        extra = txt.strip("() ")
                    elif not qty_raw:
                        qty_raw = txt

                # Split "200g" → qty="200", unit="g"
                qty, unit = _split_qty_unit(qty_raw)

                # Extract parenthesised note from inside food name
                # e.g. "fehérborecet (ízlés szerint)" → food="fehérborecet", extra="ízlés szerint"
                if not extra:
                    m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
                    if m:
                        food = m.group(1).strip()
                        extra = m.group(2).strip()

                ingredients.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": extra,
                })

    # --- Instructions ---
    instructions = []
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))
    if prep:
        # An ordered list wins over an unordered one; both are read the same way.
        step_list = prep.find("ol") or prep.find("ul")
        if step_list:
            for li in step_list.find_all("li", recursive=False):
                txt = li.get_text(strip=True)
                if txt:
                    instructions.append(txt)
        else:
            # Paragraph-style: <p> blocks, sometimes with headers
            for p in prep.find_all("p"):
                txt = p.get_text(strip=True)
                if txt:
                    instructions.append(txt)

    # If still nothing, try the description wrapper
    if not instructions:
        desc_article = soup.find("article", id="Streetk_content_description_wrapper")
        if desc_article:
            for p in desc_article.find_all("p"):
                txt = p.get_text(strip=True)
                if txt:
                    instructions.append(txt)

    # --- Tags ---
    tags = []
    # Prefer recipeCategory from JSON-LD (comma-separated)
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
            graph = data.get("@graph", [data]) if isinstance(data, dict) else data
            for item in graph:
                if isinstance(item, dict) and item.get("@type") == "Recipe":
                    cat = item.get("recipeCategory", "")
                    if isinstance(cat, str) and cat:
                        tags = [t.strip() for t in cat.split(",") if t.strip()]
                    elif isinstance(cat, list):
                        tags = [str(t).strip() for t in cat if str(t).strip()]
                    break
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


# ---------------------------------------------------------------------------
# nosalty.hu
# ---------------------------------------------------------------------------

@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Parse a nosalty.hu recipe page into the recipe dict returned by scrape()."""
    title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*\|.*$", "", title).strip()

    # Story as description (no dedicated description on nosalty)
    description = ""
    story = soup.find("div", id="recipe-story")
    if story:
        paragraphs = [p.get_text(strip=True) for p in story.find_all("p")
                      if p.get_text(strip=True)]
        description = " ".join(paragraphs)

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Scoped to div#ingredients to avoid per-serving / nutrition duplicates.
    # Structure: h3.m-list__title = group header, ul.m-list__list = ingredient rows.
    ingredients = []
    ing_container = soup.find("div", id="ingredients")
    if ing_container:
        for el in ing_container.find_all(["h3", "ul"]):
            cls = el.get("class") or []
            if el.name == "h3" and "m-list__title" in cls:
                group_name = el.get_text(strip=True)
                if group_name:
                    ingredients.append({"group": group_name})
            elif el.name == "ul" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(li, ingredients)

    # --- Instructions ---
    # Container: div#select inside div.p-recipe__directions.
    # h4.m-list__title = section header, ol.m-list__list = steps.
    instructions = []
    dir_container = soup.find("div", id="select")
    if dir_container:
        for el in dir_container.find_all(["h4", "ol"]):
            cls = el.get("class") or []
            if el.name == "h4" and "m-list__title" in cls:
                section_name = el.get_text(strip=True)
                if section_name:
                    instructions.append(f"--- {section_name} ---")
            elif el.name == "ol" and "m-list__list" in cls:
                for li in el.find_all("li", class_="m-list__item"):
                    txt = li.get_text(strip=True)
                    if txt:
                        instructions.append(txt)

    # --- Tags ---
    # Scoped to div.p-recipe__attributeList to avoid site-wide SEO tags.
    tags = []
    attr_list = soup.find("div", class_="p-recipe__attributeList")
    if attr_list:
        for a in attr_list.find_all("a", class_="m-tags__tagItem"):
            tag_text = a.get_text(strip=True)
            if tag_text:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


def _parse_nosalty_ingredient(li, ingredients: list):
    """Parse a single nosalty ingredient <li> and append it to *ingredients*.

    Nothing is appended when the item has no inner <div>, no a.a-link
    element, or an empty food name.
    """
    inner = li.find("div")
    if not inner:
        return

    food_el = inner.find("a", class_="a-link")
    if not food_el:
        return
    food = food_el.get_text(strip=True)
    if not food:
        return

    # Walk children of inner div in order.
    # Elements before the link = quantity, elements after = extra/note.
    qty_raw = ""
    extra_parts = []
    before_link = True
    for child in inner.children:
        if child is food_el:
            before_link = False
            continue
        if not hasattr(child, "get_text"):
            continue
        text = child.get_text(strip=True)
        if not text:
            continue
        if before_link:
            # The last non-empty element before the link wins.
            qty_raw = text
        else:
            extra_parts.append(text.strip("() "))

    extra = "; ".join(p for p in extra_parts if p)
    qty, unit = _split_qty_unit(qty_raw)

    ingredients.append({
        "quantity": qty,
        "unit": unit,
        "food": food,
        "extra": extra,
    })


# ---------------------------------------------------------------------------
# sobors.hu
# ---------------------------------------------------------------------------

@_register("sobors")
def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
    """Parse a sobors.hu recipe page into the recipe dict returned by scrape().

    When the page's instructions are little more than a link to another
    site, that link is scraped too (best effort) and its instructions — and
    its ingredients, if none were found here — are used instead.
    """
    # Title: h3.recept_nev
    title = ""
    title_el = soup.find("h3", class_="recept_nev")
    if title_el:
        title = title_el.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*SóBors.*$", "", title, flags=re.IGNORECASE).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Container: div.hozzavalok-container (structured recipe pages)
    # Groups: section > h4 (group header), section > ul > li
    # Each li > span > span.mennyiseg, span.mertekegyseg, span.hozzavalo
    ingredients = []
    ing_container = soup.find("div", class_="hozzavalok-container")
    if ing_container:
        for section in ing_container.find_all("section"):
            h4 = section.find("h4")
            if h4:
                group_name = h4.get_text(strip=True).rstrip(":")
                if group_name:
                    ingredients.append({"group": group_name})

            for li in section.find_all("li"):
                qty_el = li.find("span", class_="mennyiseg")
                unit_el = li.find("span", class_="mertekegyseg")
                food_el = li.find("span", class_="hozzavalo")

                food = _text(food_el)
                if not food:
                    continue

                ingredients.append({
                    "quantity": _text(qty_el),
                    "unit": _text(unit_el),
                    "food": food,
                    "extra": "",
                })

    # Fallback: article-style ingredients (h4 group headers + ul > li plain text)
    # Some sobors.hu pages (especially linked recipes) use this simpler format.
    if not ingredients:
        article = soup.find("div", class_="cikk-torzs") or soup.find("article")
        if article:
            _parse_sobors_article_ingredients(article, ingredients)

    # --- Instructions ---
    # Container: div.recept_leiras.recept_he-elkeszites
    # Content: <p> tags for steps, <h3> for section headers
    instructions = []
    linked_url = None
    inst_container = soup.find("div", class_="recept_leiras")
    if inst_container:
        # Check for external link (linked recipe pattern — e.g. "click here for
        # full recipe on kiskegyed.hu")
        for a in inst_container.find_all("a", href=True):
            href = a["href"]
            if href.startswith("http") and "sobors.hu" not in href:
                linked_url = href
                break

        for el in inst_container.find_all(["h3", "p"]):
            if el.name == "h3":
                header = el.get_text(strip=True)
                if header:
                    instructions.append(f"--- {header} ---")
            elif el.name == "p":
                txt = el.get_text(strip=True)
                if txt:
                    # Strip leading numbering like "1. " from reader recipes
                    txt = re.sub(r"^\d+\.\s+", "", txt)
                    instructions.append(txt)

    # If instructions just contain a redirect to another site, try to follow
    # the link and scrape the real recipe from there.
    if linked_url and len(instructions) <= 2:
        try:
            linked_data = scrape(linked_url)
            if linked_data.get("instructions"):
                instructions = linked_data["instructions"]
            if not ingredients and linked_data.get("ingredients"):
                ingredients = linked_data["ingredients"]
        except Exception:
            # Deliberately best effort: any failure while following the link
            # (network, parsing, link-cycle guard) keeps whatever we scraped
            # from sobors.hu.
            pass

    # --- Tags ---
    # Container: div.cikk-cimkek > ul.cikk-cimkek-list > li > a
    # Skip the generic "Receptek" category tag and "Olvasói receptek" tag
    tags = []
    tag_container = soup.find("div", class_="cikk-cimkek")
    if tag_container:
        tag_list = tag_container.find("ul", class_="cikk-cimkek-list")
        if tag_list:
            skip = {"receptek", "olvasói receptek"}
            for a in tag_list.find_all("a"):
                tag_text = a.get_text(strip=True)
                if tag_text and tag_text.lower() not in skip:
                    tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


# ---------------------------------------------------------------------------
# kiskegyed.hu
# ---------------------------------------------------------------------------

@_register("kiskegyed")
def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
    """Parse a kiskegyed.hu recipe page into the recipe dict returned by scrape().

    When the preparation section links to another supported recipe site and
    has at most two steps of its own, the linked recipe is scraped too (best
    effort) and its instructions — and its ingredients, if none were found
    here — are used instead.
    """
    # Title: h2 inside the detail section
    title = ""
    h2 = soup.find("h2")
    if h2:
        title = h2.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*Kiskegyed.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: section#leadText > p
    description = ""
    lead = soup.find("section", id="leadText")
    if lead:
        p = lead.find("p")
        if p:
            description = p.get_text(strip=True)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Container: div.recipe_ingredients
    # Groups: <p> elements (e.g. "Name:" or "A ...hez")
    # Items: ul.list > li (plain text with optional <a> links)
    ingredients = []
    ing_container = soup.find("div", class_="recipe_ingredients")
    if ing_container:
        for el in ing_container.find_all(["p", "ul"]):
            if el.name == "p":
                group_text = el.get_text(strip=True).rstrip(":")
                # Skip the "Hozzávalók" header and serving info
                if not group_text or group_text.lower().startswith("hozzávalók"):
                    continue
                # Skip serving info like "4 személyre"
                if re.match(r"^\d+\s+személyre$", group_text):
                    continue
                ingredients.append({"group": group_text})
            elif el.name == "ul" and "list" in (el.get("class") or []):
                for li in el.find_all("li"):
                    # Use separator to preserve spaces around <a> tags
                    line = re.sub(r"\s+", " ", li.get_text(" ")).strip()
                    if not line:
                        continue
                    qty, unit, food, extra = _parse_kiskegyed_ingredient(line)
                    ingredients.append({
                        "quantity": qty,
                        "unit": unit,
                        "food": food,
                        "extra": extra,
                    })

    # --- Instructions ---
    # Container: div.recipe_preparation > ol > li > div
    instructions = []
    linked_url = None
    prep_container = soup.find("div", class_="recipe_preparation")
    if prep_container:
        # Check for cross-link to another recipe site (e.g. sobors.hu)
        for a in prep_container.find_all("a", href=True):
            href = a["href"]
            if href.startswith("http") and "kiskegyed.hu" not in href:
                # Check if it points to a supported recipe site
                linked_host = _host(href)
                if any(s in linked_host for s, _ in _PARSERS if s != "kiskegyed"):
                    linked_url = href
                    break

        ol = prep_container.find("ol")
        if ol:
            for li in ol.find_all("li", recursive=False):
                div = li.find("div")
                txt = div.get_text(strip=True) if div else li.get_text(strip=True)
                if txt:
                    instructions.append(txt)

    # If instructions are empty or just a redirect, follow the linked recipe
    if linked_url and len(instructions) <= 2:
        try:
            linked_data = scrape(linked_url)
            if linked_data.get("instructions"):
                instructions = linked_data["instructions"]
            if not ingredients and linked_data.get("ingredients"):
                ingredients = linked_data["ingredients"]
        except Exception:
            # Deliberately best effort: any failure while following the link
            # keeps whatever was scraped from kiskegyed.hu itself.
            pass

    # --- Tags ---
    # Container: section.tags > a > span (text starts with #)
    tags = []
    tag_section = soup.find("section", class_="tags")
    if tag_section:
        skip = {"recept", "receptek"}
        for a in tag_section.find_all("a"):
            span = a.find("span")
            tag_text = span.get_text(strip=True) if span else a.get_text(strip=True)
            tag_text = tag_text.lstrip("#").strip()
            if tag_text and tag_text.lower() not in skip:
                tags.append(tag_text)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
    """Parse a kiskegyed.hu ingredient line into (qty, unit, food, extra).

    Handles dual measurements like
    '3 ek (70 g) búzafinomliszt (BL 55)'
    → qty='3', unit='ek', food='búzafinomliszt', extra='70 g; BL 55'

    Lines without a leading quantity are returned whole as the food.
    """
    extras = []

    # Try: qty unit (alt_measurement) food...
    # Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
    m = re.match(
        r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$",
        line,
    )
    if m:
        qty = m.group(1).strip()
        unit = m.group(2).strip()
        extras.append(m.group(3).strip())
        food_raw = m.group(4).strip()
        # Extract trailing parenthesized note from food
        fm = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food_raw)
        if fm:
            food_raw = fm.group(1).strip()
            extras.append(fm.group(2).strip())
        return (qty, unit, food_raw, "; ".join(extras))

    # Try: qty unit food...
    m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
    if m2:
        return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")

    # Try: qty food (e.g. "2 tojás")
    m3 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
    if m3:
        return (m3.group(1).strip(), "", m3.group(2).strip(), "")

    # No quantity (e.g. "ízlés szerint só")
    return ("", "", line, "")


# ---------------------------------------------------------------------------
# gastrohobbi.hu
# ---------------------------------------------------------------------------

@_register("gastrohobbi")
def _parse_gastrohobbi(soup: BeautifulSoup, url: str) -> dict:
    """Parse a gastrohobbi.hu recipe page into the recipe dict returned by scrape().

    The preparation time, when present, is appended to the description.
    Note: reading the preparation time removes the <em> label from that
    heading in *soup*.
    """
    # Title: h1.mpcth-post-title > span
    title = ""
    title_el = soup.select_one("h1.mpcth-post-title span.mpcth-color-main-border")
    if title_el:
        title = title_el.get_text(strip=True)
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*GastroHobbi.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: first <p> in the first wpb_text_column before the inner recipe row
    description = ""
    first_text_col = soup.select_one(
        "div.wpb-content-wrapper div.wpb_text_column div.wpb_wrapper"
    )
    if first_text_col:
        p = first_text_col.find("p")
        if p:
            description = p.get_text(strip=True)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # Find h3 containing "Hozzávalók" then walk siblings for ul and group h3 elements
    ingredients = []
    _gastrohobbi_parse_ingredients(soup, ingredients)

    # --- Instructions ---
    # Find h3 containing "Elkészítés:" then collect following <p> elements
    instructions = []
    prep_time = ""
    _gastrohobbi_parse_instructions(soup, instructions)

    # Extract prep time from h3 containing "Elkészítési idő:"
    for h3 in soup.find_all("h3"):
        text = h3.get_text(strip=True)
        if "elkészítési idő" in text.lower():
            # Text after the label: "Elkészítési idő: 60 perc"
            # The time part is outside the <em> wrapper
            em = h3.find("em")
            if em:
                em.decompose()
            time_text = h3.get_text(strip=True).strip()
            # If the label was not wrapped in <em> it is still in the text;
            # drop it so it is not repeated in the description below.
            time_text = re.sub(
                r"^elkészítési idő\s*:?\s*", "", time_text, flags=re.IGNORECASE
            )
            if time_text:
                prep_time = time_text
            break

    # --- Tags ---
    # From JSON-LD Article.articleSection
    tags = []
    skip_tags = {"receptjeink", "receptek"}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
            graph = data.get("@graph", [data]) if isinstance(data, dict) else data
            for item in graph:
                if isinstance(item, dict) and item.get("@type") == "Article":
                    sections = item.get("articleSection", [])
                    if isinstance(sections, str):
                        # A plain-text articleSection is a single section name.
                        sections = [sections]
                    if isinstance(sections, list):
                        tags = [s.strip() for s in sections
                                if s.strip() and s.strip().lower() not in skip_tags]
                    break
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

    # Append prep time to description if available
    if prep_time:
        if description:
            description += f" (Elkészítési idő: {prep_time})"
        else:
            description = f"Elkészítési idő: {prep_time}"

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


def _gastrohobbi_parse_ingredients(soup: BeautifulSoup, ingredients: list):
    """Parse ingredients from gastrohobbi.hu — find Hozzávalók h3 then walk siblings.

    Results are appended to *ingredients*; nothing is appended when no such
    heading exists.
    """
    # Find the h3 that contains "Hozzávalók"
    header = None
    for h3 in soup.find_all("h3"):
        if "hozzávalók" in h3.get_text(strip=True).lower():
            header = h3
            break
    if not header:
        return

    # Walk siblings after the header within the same container
    for sib in header.find_next_siblings():
        tag = sib.name
        text = sib.get_text(strip=True)
        if not text:
            continue

        # Stop at the "Elkészítés" section
        if tag == "h3" and "elkészítés" in text.lower():
            break

        # Ingredient group header (any other h3, e.g. "A csipetkéhez:")
        if tag == "h3":
            group_name = text.rstrip(":")
            if group_name:
                ingredients.append({"group": group_name})
            continue

        # Ingredient list
        if tag == "ul":
            for li in sib.find_all("li", recursive=False):
                p = li.find("p")
                line = p.get_text(strip=True) if p else li.get_text(strip=True)
                if not line:
                    continue
                qty, unit, food = _parse_ingredient_line(line)
                ingredients.append({
                    "quantity": qty,
                    "unit": unit,
                    "food": food,
                    "extra": "",
                })


def _gastrohobbi_parse_instructions(soup: BeautifulSoup, instructions: list):
    """Parse instructions from gastrohobbi.hu — find Elkészítés h3 then collect <p> siblings.

    Results are appended to *instructions*; items of embedded <ul> lists are
    added as bullet lines.
    """
    header = None
    for h3 in soup.find_all("h3"):
        text = h3.get_text(strip=True)
        # "Elkészítési idő" (prep time) also starts with "elkészítés"; skip it.
        if text.lower().startswith("elkészítés") and "idő" not in text.lower():
            header = h3
            break
    if not header:
        return

    for sib in header.find_next_siblings():
        tag = sib.name
        text = sib.get_text(strip=True)

        # Stop at prep time h3 or any other section header
        if tag == "h3":
            break

        if tag == "p":
            # Skip empty / whitespace-only paragraphs
            if not text or text == "\xa0":
                continue
            instructions.append(text)
        elif tag == "ul":
            # Embedded list in instructions (e.g. cooking time options)
            for li in sib.find_all("li"):
                # Skip wrapper li elements that contain nested lists
                if li.find("ul"):
                    continue
                li_text = li.get_text(strip=True)
                if li_text:
                    instructions.append(f" • {li_text}")


def _parse_sobors_article_ingredients(container, ingredients: list):
    """Parse article-style ingredients from sobors.hu (h4 headers + ul > li plain text).

    Results are appended to *ingredients*.
    """
    for el in container.find_all(["h4", "ul"]):
        if el.name == "h4":
            group_name = el.get_text(strip=True).rstrip(":")
            if group_name and not group_name.lower().startswith("hozzávalók"):
                ingredients.append({"group": group_name})
        elif el.name == "ul":
            # Only consider lists that directly follow an h4
            prev = el.find_previous_sibling()
            if prev and prev.name == "h4":
                for li in el.find_all("li"):
                    line = li.get_text(strip=True)
                    if not line:
                        continue
                    qty, unit, food = _parse_ingredient_line(line)
                    ingredients.append({
                        "quantity": qty,
                        "unit": unit,
                        "food": food,
                        "extra": "",
                    })


def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
    """Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food)."""
    # Normalize en-dash/em-dash ranges: "10 – 15" → "10-15"
    line = re.sub(r"\s*[–—]\s*", "-", line)

    # qty unit food (e.g. "2 dl tejföl", "½ tk őrölt kömény")
    m = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
    if m:
        return (m.group(1).strip(), m.group(2).strip(), m.group(3).strip())

    # Just quantity + food (e.g. "2 tojás")
    m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
    if m2:
        return (m2.group(1).strip(), "", m2.group(2).strip())

    return ("", "", line)


def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g').

    The quantity may start with or contain the fraction characters ½ ¼ ¾
    ('½dl' → ('½', 'dl')), matching the other ingredient parsers in this
    module.  A string that does not start with a quantity is returned whole
    as the unit.
    """
    raw = raw.strip()
    if not raw:
        return ("", "")
    m = re.match(r"^([0-9½¼¾][0-9 .,/½¼¾-]*)(.*)$", raw)
    if m:
        return (m.group(1).strip(), m.group(2).strip())
    return ("", raw)


# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org microdata)
# ---------------------------------------------------------------------------

def _jsonld_nodes(data):
    """Yield the dict nodes of a decoded JSON-LD document.

    Accepts a single object, an object with an "@graph" list, or a top-level
    list; anything else yields nothing.
    """
    if isinstance(data, dict):
        graph = data.get("@graph")
        candidates = graph if isinstance(graph, list) else [data]
    elif isinstance(data, list):
        candidates = data
    else:
        candidates = []
    for node in candidates:
        if isinstance(node, dict):
            yield node


def _jsonld_is_recipe(node: dict) -> bool:
    """Return True if *node*'s "@type" is "Recipe" or a list containing it."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Recipe" in node_type
    return node_type == "Recipe"


def _jsonld_instruction_texts(raw) -> list[str]:
    """Flatten a recipeInstructions value into a list of non-empty step texts.

    *raw* may be a string, a step object with a "text" key, a section object
    with an "itemListElement" list, or a list mixing any of these.
    """
    if isinstance(raw, (str, dict)):
        # A bare string must not be iterated character by character.
        raw = [raw]
    elif not isinstance(raw, list):
        return []

    texts = []
    for item in raw:
        if isinstance(item, str):
            text = item
        elif isinstance(item, dict):
            nested = item.get("itemListElement")
            if nested:
                texts.extend(_jsonld_instruction_texts(nested))
                continue
            text = item.get("text", "")
        else:
            continue
        text = text.strip() if isinstance(text, str) else ""
        if text:
            texts.append(text)
    return texts


def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Extract a recipe from og-tags and the first schema.org Recipe in JSON-LD.

    Title, description and image come from og-tags.  Ingredients,
    instructions and tags come from the first JSON-LD node typed "Recipe";
    they stay empty when the page has none.  Ingredient lines are stored
    whole in "food" with empty "quantity" and "unit".
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    ingredients = []
    instructions = []
    tags = []

    # Try schema.org JSON-LD
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue

        recipe = next(
            (node for node in _jsonld_nodes(data) if _jsonld_is_recipe(node)),
            None,
        )
        if recipe is None:
            continue

        raw_ingredients = recipe.get("recipeIngredient") or []
        if isinstance(raw_ingredients, str):
            raw_ingredients = [raw_ingredients]
        elif not isinstance(raw_ingredients, list):
            raw_ingredients = []
        for line in raw_ingredients:
            # Non-string entries cannot be used as a food name.
            if isinstance(line, str) and line.strip():
                ingredients.append({
                    "quantity": "",
                    "unit": "",
                    "food": line,
                    "extra": "",
                })

        instructions = _jsonld_instruction_texts(recipe.get("recipeInstructions"))

        # Extract keywords
        kw = recipe.get("keywords", "")
        if isinstance(kw, str):
            tags = [k.strip() for k in kw.split(",") if k.strip()]
        elif isinstance(kw, list):
            tags = [str(k).strip() for k in kw if str(k).strip()]
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _extract_ingredient_comments(data: dict):
    """Move trailing (comment) from food field to extra field for all ingredients.

    Modifies *data* in place.  Group-header entries, entries that already
    have a non-empty "extra", and entries whose "food" is not a non-empty
    string are left untouched.
    """
    for ing in data.get("ingredients") or []:
        if "group" in ing:
            continue
        food = ing.get("food", "")
        extra = ing.get("extra", "")
        # Guard the type: the generic parser copies values straight from
        # JSON, and re.match would raise on a non-string.
        if isinstance(food, str) and food and not extra:
            m = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
            if m:
                ing["food"] = m.group(1).strip()
                ing["extra"] = m.group(2).strip()


def _host(url: str) -> str:
    """Return the lower-cased hostname of *url*, or "" if it has none.

    Malformed URLs (e.g. an unbalanced IPv6 bracket) also give "": this is
    called on arbitrary links found in scraped pages, and one bad link must
    not abort the whole scrape.
    """
    from urllib.parse import urlparse

    try:
        return urlparse(url).hostname or ""
    except ValueError:
        return ""


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the content of the <meta property=*prop*> tag, or None if missing or empty."""
    tag = soup.find("meta", property=prop)
    if tag and tag.get("content"):
        return tag["content"]
    return None


def _text(el) -> str:
    """Return the stripped text of *el*, or "" when *el* is None."""
    if el is None:
        return ""
    return el.get_text(strip=True)