# recipe-importer/app/scraper.py

"""Recipe scraper — parses Hungarian recipe sites into a structured dict.

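Currently supported: mindmegette.hu (dedicated parser). Any other host falls
back to generic Open Graph / schema.org JSON-LD extraction.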
"""

import re
import requests
from bs4 import BeautifulSoup

# Headers sent with the page fetch in scrape(): they identify the importer
# and ask for Hungarian content first, with English as a lower-priority
# alternative.
_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}

# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def scrape(url: str) -> dict:
    """Download the page at *url* and turn it into a recipe dict.

    The result has this shape::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [str, ...],
            "instructions": [str, ...],
            "original_url": str,
        }

    Hosts whose name contains "mindmegette" are handled by the dedicated
    parser; every other host goes through the generic og-tag / JSON-LD
    extraction, so an unknown site yields a (possibly sparse) dict rather
    than an error.

    Raises ``requests.HTTPError`` when the server answers with an error
    status; other exceptions raised by ``requests.get`` propagate unchanged.
    """
    response = requests.get(url, headers=_HEADERS, timeout=30)
    response.raise_for_status()
    # Decode with the charset detected from the body, defaulting to UTF-8
    # when detection yields nothing.
    response.encoding = response.apparent_encoding or "utf-8"
    document = BeautifulSoup(response.text, "lxml")

    # Substring match on the hostname, so subdomains are routed to the
    # dedicated parser as well.
    parser = _parse_mindmegette if "mindmegette" in _host(url) else _parse_generic
    return parser(document, url)


# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------


def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Build the recipe dict from a parsed mindmegette.hu page.

    Title, description and image come from the Open Graph meta tags (the
    title falls back to ``<title>``); ingredients come from the
    ``div.ingredients`` container and instructions from the list items of
    the page's wysiwyg box or, failing that, of ``div.block-content``
    ordered lists. Missing sections produce empty lists, never an error.
    """
    page_title = _og(soup, "og:title") or _text(soup.find("title"))
    if page_title:
        # Drop the trailing " | Mindmegette.hu" site suffix.
        page_title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", page_title).strip()

    ingredient_box = soup.find("div", class_="ingredients")
    if ingredient_box:
        rows = ingredient_box.find_all("div", class_="ingredients-meta")
    else:
        rows = []
    ingredient_lines = [
        line for line in map(_mindmegette_ingredient_line, rows) if line
    ]

    return {
        "title": page_title or "Ismeretlen recept",
        "description": _og(soup, "og:description") or "",
        "image_url": _og(soup, "og:image"),
        "ingredients": ingredient_lines,
        "instructions": _mindmegette_instructions(soup),
        "original_url": url,
    }


def _mindmegette_ingredient_line(row) -> str:
    """Render one ``div.ingredients-meta`` row as "qty unit name (extra)".

    Expected markup: ``<strong>qty</strong> <span>unit</span>
    <a class="ingredients-link">name</a>`` plus an optional ``<small>`` or
    ``<span class="extra">`` note. Returns "" when the row has no text.
    """
    quantity = row.find("strong")
    # The unit is the first <span> carrying no class attribute; classed spans
    # (checkboxes, extras, ...) are something else.
    unit = next(
        (span for span in row.find_all("span") if not span.get("class")),
        None,
    )
    name = row.find("a", class_="ingredients-link")
    note = row.find("small") or row.find("span", class_="extra")

    pieces = [_text(quantity), _text(unit), _text(name)]

    note_text = _text(note)
    if note_text:
        # Notes are always shown parenthesised; add the parens if the page
        # did not supply them.
        if not note_text.startswith("("):
            note_text = f"({note_text})"
        pieces.append(note_text)

    line = " ".join(filter(None, pieces))
    # Nothing recognised: fall back to the row's whole text, with spaces
    # between the element texts.
    return line or row.get_text(separator=" ", strip=True)


def _mindmegette_instructions(soup) -> list:
    """Collect the non-empty instruction steps of a mindmegette.hu page."""

    def step_texts(container):
        return [text for text in map(_text, container.find_all("li")) if text]

    editor_box = soup.find("mindmegette-wysiwyg-box")
    steps = step_texts(editor_box) if editor_box else []
    if steps:
        return steps

    # Fallback: the first ordered list inside each block-content div.
    for block in soup.find_all("div", class_="block-content"):
        ordered = block.find("ol")
        if ordered:
            steps.extend(step_texts(ordered))
    return steps


# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org JSON-LD)
# ---------------------------------------------------------------------------


def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Best-effort recipe extraction for sites without a dedicated parser.

    Title, description and image come from the Open Graph meta tags (the
    title falls back to ``<title>``, then to a placeholder). Ingredients and
    instructions come from the first schema.org ``Recipe`` object found in
    any ``<script type="application/ld+json">`` block; the object may be the
    top-level value, an element of a top-level array, or a member of an
    ``@graph`` array. Pages without such an object yield empty lists.

    Malformed JSON-LD blocks are skipped, never raised.
    """
    # Function-scope import: the module does not import json at top level.
    import json

    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    ingredients = []
    instructions = []

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            # Empty or broken block: try the next one.
            continue
        recipe = _find_jsonld_recipe(data)
        if recipe is None:
            continue
        ingredients = _jsonld_strings(recipe.get("recipeIngredient"))
        instructions = _jsonld_instructions(recipe.get("recipeInstructions"))
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "original_url": url,
    }


def _find_jsonld_recipe(node):
    """Return the first dict typed as schema.org ``Recipe`` in *node*, or None.

    Searches arrays element by element and follows ``@graph``; ``@type`` may
    be a single string or a list of type names.
    """
    if isinstance(node, list):
        for item in node:
            found = _find_jsonld_recipe(item)
            if found is not None:
                return found
        return None
    if not isinstance(node, dict):
        return None
    node_type = node.get("@type")
    if node_type == "Recipe" or (
        isinstance(node_type, list) and "Recipe" in node_type
    ):
        return node
    return _find_jsonld_recipe(node.get("@graph"))


def _jsonld_strings(value) -> list:
    """Normalise a JSON-LD text-or-list value to a list of non-empty strings."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [
        item.strip() for item in value if isinstance(item, str) and item.strip()
    ]


def _jsonld_instructions(value) -> list:
    """Flatten a ``recipeInstructions`` value into a list of step texts.

    Accepts a single string, a single step object, or a list mixing strings,
    step objects (``text``) and section objects (``itemListElement``).
    Entries without usable text are dropped.
    """
    if isinstance(value, str):
        # A bare string is one block of text, not a sequence of characters.
        text = value.strip()
        return [text] if text else []
    if isinstance(value, dict):
        value = [value]
    if not isinstance(value, list):
        return []

    steps = []
    for item in value:
        if isinstance(item, str):
            text = item.strip()
            if text:
                steps.append(text)
        elif isinstance(item, dict):
            nested = item.get("itemListElement")
            if nested:
                # Section object: its steps live one level down.
                steps.extend(_jsonld_instructions(nested))
                continue
            text = item.get("text")
            if isinstance(text, str) and text.strip():
                steps.append(text.strip())
    return steps


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _host(url: str) -> str:
    from urllib.parse import urlparse
    return urlparse(url).hostname or ""


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the ``content`` of the ``<meta property=prop>`` tag.

    Gives None when the tag is absent or its ``content`` is missing or empty.
    """
    meta = soup.find("meta", property=prop)
    if not meta:
        return None
    return meta.get("content") or None


def _text(el) -> str:
    if el is None:
        return ""
    return el.get_text(strip=True)