"""Recipe scraper — parses Hungarian recipe sites into a structured dict. Each supported site has a parser registered via _PARSERS. Unsupported sites fall back to generic schema.org / og-tag extraction. """ import re import requests from bs4 import BeautifulSoup _HEADERS = { "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)", "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5", } # Maps a substring of the hostname to a parser function. # Order matters: first match wins. _PARSERS: list[tuple[str, "callable"]] = [] def _register(host_substring: str): """Decorator: register a parser for URLs whose hostname contains *host_substring*.""" def decorator(fn): _PARSERS.append((host_substring, fn)) return fn return decorator # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def scrape(url: str) -> dict: """Fetch *url* and return a recipe dict. Returns:: { "title": str, "description": str, "image_url": str | None, "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...], "instructions": [str, ...], "tags": [str, ...], "original_url": str, } Raises ValueError on unsupported sites or parse failures. """ resp = requests.get(url, headers=_HEADERS, timeout=30) resp.raise_for_status() resp.encoding = resp.apparent_encoding or "utf-8" soup = BeautifulSoup(resp.text, "lxml") host = _host(url) for substring, parser in _PARSERS: if substring in host: return parser(soup, url) # Fallback: try generic schema.org / og-tag extraction return _parse_generic(soup, url) def supported_sites() -> list[str]: """Return list of supported site hostname substrings.""" return [s for s, _ in _PARSERS] # --------------------------------------------------------------------------- # mindmegette.hu # --------------------------------------------------------------------------- @_register("mindmegette") def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict: title = _og(soup, "og:title") or _text(soup.find("title")) # Strip " | Mindmegette.hu" suffix if title: title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip() description = _og(soup, "og:description") or "" image_url = _og(soup, "og:image") # --- Ingredients --- # Multiple div.ingredients containers may exist (one per group). # Group title: A habaráshoz: ingredients = [] for ing_container in soup.find_all("div", class_="ingredients"): # Check for a group title group_el = ing_container.find("strong", class_="ingredients-group") group_name = _text(group_el).rstrip(":").strip() if group_el else "" if group_name: ingredients.append({"group": group_name}) for row in ing_container.find_all("div", class_="ingredients-meta"): # Actual HTML: qty unit # name (extra) qty_el = row.find("strong") unit_el = None for sp in row.find_all("span"): if not sp.get("class"): unit_el = sp break name_el = row.find("a", class_="ingredients-link") extra_el = row.find("small") or row.find("span", class_="extra") qty = _text(qty_el) unit = _text(unit_el) food = _text(name_el) extra = _text(extra_el).strip("() ") if not food: # Fallback: grab whole row text food = row.get_text(separator=" ", strip=True) if food: ingredients.append({ "quantity": qty, "unit": unit, "food": food, "extra": extra, }) # --- Instructions --- instructions = [] wysiwyg = soup.find("mindmegette-wysiwyg-box") if wysiwyg: for li in wysiwyg.find_all("li"): txt = _text(li) if txt: instructions.append(txt) # Fallback: look for block-content divs if not instructions: for div in soup.find_all("div", class_="block-content"): ol = div.find("ol") if ol: for li in ol.find_all("li"): txt = _text(li) if txt: instructions.append(txt) # --- Tags --- tags = [] tag_wrapper = soup.select_one("div.desktop-wrapper") if tag_wrapper: for a in tag_wrapper.select("a.tag"): tag_text = a.get_text(strip=True) if tag_text: tags.append(tag_text) return { "title": title or "Ismeretlen recept", "description": description, "image_url": image_url, "ingredients": ingredients, "instructions": instructions, "tags": tags, "original_url": url, } # --------------------------------------------------------------------------- # Generic fallback (og-tags + schema.org microdata) # --------------------------------------------------------------------------- def _parse_generic(soup: BeautifulSoup, url: str) -> dict: title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept" description = _og(soup, "og:description") or "" image_url = _og(soup, "og:image") ingredients = [] instructions = [] tags = [] # Try schema.org JSON-LD for script in soup.find_all("script", type="application/ld+json"): try: import json data = json.loads(script.string or "") if isinstance(data, list): data = data[0] if data.get("@type") == "Recipe": for line in data.get("recipeIngredient", []): ingredients.append({ "quantity": "", "unit": "", "food": line, "extra": "", }) raw_instructions = data.get("recipeInstructions", []) for item in raw_instructions: if isinstance(item, str): instructions.append(item) elif isinstance(item, dict): instructions.append(item.get("text", "")) # Extract keywords kw = data.get("keywords", "") if isinstance(kw, str): tags = [k.strip() for k in kw.split(",") if k.strip()] elif isinstance(kw, list): tags = [str(k).strip() for k in kw if str(k).strip()] break except (json.JSONDecodeError, TypeError, AttributeError): continue return { "title": title, "description": description, "image_url": image_url, "ingredients": ingredients, "instructions": instructions, "tags": tags, "original_url": url, } # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _host(url: str) -> str: from urllib.parse import urlparse return urlparse(url).hostname or "" def _og(soup: BeautifulSoup, prop: str) -> str | None: tag = soup.find("meta", property=prop) if tag and tag.get("content"): return tag["content"] return None def _text(el) -> str: if el is None: return "" return el.get_text(strip=True)