"""Recipe scraper — parses Hungarian recipe sites into a structured dict. Currently supported: mindmegette.hu """ import re import requests from bs4 import BeautifulSoup _HEADERS = { "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)", "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5", } # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def scrape(url: str) -> dict: """Fetch *url* and return a recipe dict. Returns:: { "title": str, "description": str, "image_url": str | None, "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...], "instructions": [str, ...], "original_url": str, } Raises ValueError on unsupported sites or parse failures. """ resp = requests.get(url, headers=_HEADERS, timeout=30) resp.raise_for_status() resp.encoding = resp.apparent_encoding or "utf-8" soup = BeautifulSoup(resp.text, "lxml") host = _host(url) if "mindmegette" in host: return _parse_mindmegette(soup, url) else: # Fallback: try generic schema.org / og-tag extraction return _parse_generic(soup, url) # --------------------------------------------------------------------------- # mindmegette.hu # --------------------------------------------------------------------------- def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict: title = _og(soup, "og:title") or _text(soup.find("title")) # Strip " | Mindmegette.hu" suffix if title: title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", title).strip() description = _og(soup, "og:description") or "" image_url = _og(soup, "og:image") # --- Ingredients --- ingredients = [] ing_container = soup.find("div", class_="ingredients") if ing_container: for row in ing_container.find_all("div", class_="ingredients-meta"): # Actual HTML: qty unit # name (extra) qty_el = row.find("strong") unit_el = None for sp in row.find_all("span"): if not sp.get("class"): unit_el = sp break name_el = row.find("a", class_="ingredients-link") extra_el = row.find("small") or row.find("span", class_="extra") qty = _text(qty_el) unit = _text(unit_el) food = _text(name_el) extra = _text(extra_el).strip("() ") if not food: # Fallback: grab whole row text food = row.get_text(separator=" ", strip=True) if food: ingredients.append({ "quantity": qty, "unit": unit, "food": food, "extra": extra, }) # --- Instructions --- instructions = [] wysiwyg = soup.find("mindmegette-wysiwyg-box") if wysiwyg: for li in wysiwyg.find_all("li"): txt = _text(li) if txt: instructions.append(txt) # Fallback: look for block-content divs if not instructions: for div in soup.find_all("div", class_="block-content"): ol = div.find("ol") if ol: for li in ol.find_all("li"): txt = _text(li) if txt: instructions.append(txt) return { "title": title or "Ismeretlen recept", "description": description, "image_url": image_url, "ingredients": ingredients, "instructions": instructions, "original_url": url, } # --------------------------------------------------------------------------- # Generic fallback (og-tags + schema.org microdata) # --------------------------------------------------------------------------- def _parse_generic(soup: BeautifulSoup, url: str) -> dict: title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept" description = _og(soup, "og:description") or "" image_url = _og(soup, "og:image") ingredients = [] instructions = [] # Try schema.org JSON-LD for script in soup.find_all("script", type="application/ld+json"): try: import json data = json.loads(script.string or "") if isinstance(data, list): data = data[0] if data.get("@type") == "Recipe": for line in data.get("recipeIngredient", []): ingredients.append({ "quantity": "", "unit": "", "food": line, "extra": "", }) raw_instructions = data.get("recipeInstructions", []) for item in raw_instructions: if isinstance(item, str): instructions.append(item) elif isinstance(item, dict): instructions.append(item.get("text", "")) break except (json.JSONDecodeError, TypeError, AttributeError): continue return { "title": title, "description": description, "image_url": image_url, "ingredients": ingredients, "instructions": instructions, "original_url": url, } # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _host(url: str) -> str: from urllib.parse import urlparse return urlparse(url).hostname or "" def _og(soup: BeautifulSoup, prop: str) -> str | None: tag = soup.find("meta", property=prop) if tag and tag.get("content"): return tag["content"] return None def _text(el) -> str: if el is None: return "" return el.get_text(strip=True)