# recipe-importer/app/scraper.py

"""Recipe scraper — parses Hungarian recipe sites into a structured dict.

Each supported site has a parser registered via _PARSERS.
Unsupported sites fall back to generic schema.org / og-tag extraction.
"""

import json
import re
import requests
from bs4 import BeautifulSoup

# HTTP headers sent with every page fetch in scrape(): they identify the
# importer and ask for Hungarian content first, with English as a fallback.
_HEADERS = {
    "User-Agent": "RecipeImporter/1.0 (Hungarian recipe scraper)",
    "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5",
}

# Registry of (hostname substring, parser function) pairs, filled at import
# time by the @_register decorator.
# Order matters: scrape() walks the list and the first matching substring wins.
_PARSERS: list[tuple[str, "callable"]] = []


def _register(host_substring: str):
    """Decorator: register a parser for URLs whose hostname contains *host_substring*."""
    def decorator(fn):
        _PARSERS.append((host_substring, fn))
        return fn
    return decorator


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def scrape(url: str) -> dict:
    """Download *url* and convert the page into a recipe dict.

    The result has this shape::

        {
            "title": str,
            "description": str,
            "image_url": str | None,
            "ingredients": [{"quantity": str, "unit": str, "food": str, "extra": str}, ...],
            "instructions": [str, ...],
            "tags": [str, ...],
            "original_url": str,
        }

    The site parsers in this module may also put ``{"group": name}`` marker
    entries into ``ingredients`` to label ingredient groups.

    The first ``_PARSERS`` entry whose substring occurs in the URL's host is
    used. When no entry matches, or the chosen parser returns ``None``, the
    generic extractor ``_parse_generic`` handles the page instead.

    HTTP error statuses propagate from ``raise_for_status()``.
    NOTE(review): earlier documentation promised ``ValueError`` on parse
    failures; no such raise is visible in this function, so presumably it
    comes from the parsing helpers - confirm.
    """
    response = requests.get(url, headers=_HEADERS, timeout=30)
    response.raise_for_status()
    # Decode with the detected charset, defaulting to UTF-8 when none is found.
    response.encoding = response.apparent_encoding or "utf-8"
    soup = BeautifulSoup(response.text, "lxml")

    host = _host(url)
    # First registered substring contained in the host wins.
    site_parser = next((fn for key, fn in _PARSERS if key in host), None)
    recipe = site_parser(soup, url) if site_parser is not None else None

    if recipe is None:
        # No site-specific result: use generic schema.org / og-tag extraction.
        recipe = _parse_generic(soup, url)

    # Post-process: move parenthesized comments from food into extra.
    _extract_ingredient_comments(recipe)
    return recipe


def supported_sites() -> list[dict]:
    """List every registered site as ``{"name": ..., "url": ...}``.

    The name is the registered host substring with ``.hu`` appended; the URL
    comes from a fixed table of home pages and is ``"#"`` for a substring
    that has no entry there. Output order follows ``_PARSERS``.
    """
    home_pages = {
        "mindmegette": "https://www.mindmegette.hu",
        "streetkitchen": "https://streetkitchen.hu",
        "nosalty": "https://www.nosalty.hu",
        "sobors": "https://sobors.hu",
        "kiskegyed": "https://www.kiskegyed.hu",
        "gastrohobbi": "https://gastrohobbi.hu",
    }
    sites = []
    for key, _parser in _PARSERS:
        sites.append({"name": f"{key}.hu", "url": home_pages.get(key, "#")})
    return sites


# ---------------------------------------------------------------------------
# mindmegette.hu
# ---------------------------------------------------------------------------


@_register("mindmegette")
def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
    """Parse a mindmegette.hu recipe page into the common recipe dict.

    Title, description and image come from og-tags (title falls back to
    ``<title>``); ingredients, instructions and tags are read from the page
    markup. *url* is only echoed back as ``original_url``.
    """
    page_title = _og(soup, "og:title") or _text(soup.find("title"))
    if page_title:
        # Drop the trailing " | Mindmegette.hu" site suffix.
        page_title = re.sub(r"\s*\|\s*Mindmegette\.hu$", "", page_title).strip()

    summary = _og(soup, "og:description") or ""
    picture = _og(soup, "og:image")

    # --- Ingredients ---
    # One div.ingredients per group; an optional
    # <strong class="ingredients-group"> carries the group title.
    ingredients = []
    for box in soup.find_all("div", class_="ingredients"):
        heading = box.find("strong", class_="ingredients-group")
        if heading:
            group_name = _text(heading).rstrip(":").strip()
            if group_name:
                ingredients.append({"group": group_name})

        for row in box.find_all("div", class_="ingredients-meta"):
            # Row markup: <strong>qty</strong> <span>unit</span>
            #             <a class="ingredients-link">name</a> <small>(extra)</small>
            # The unit is the first <span> that carries no class at all.
            unit_el = next(
                (span for span in row.find_all("span") if not span.get("class")),
                None,
            )
            note_el = row.find("small") or row.find("span", class_="extra")

            qty = _text(row.find("strong"))
            unit = _text(unit_el)
            food = _text(row.find("a", class_="ingredients-link"))
            extra = _text(note_el).strip("() ")

            if not food:
                # No name link: fall back to the text of the whole row.
                food = row.get_text(separator=" ", strip=True)
            if not food:
                continue

            ingredients.append({
                "quantity": qty,
                "unit": unit,
                "food": food,
                "extra": extra,
            })

    # --- Instructions ---
    steps = []
    rich_box = soup.find("mindmegette-wysiwyg-box")
    if rich_box:
        steps = [txt for txt in map(_text, rich_box.find_all("li")) if txt]
    if not steps:
        # Fallback: ordered lists inside block-content divs.
        for block in soup.find_all("div", class_="block-content"):
            ordered = block.find("ol")
            if ordered:
                steps.extend(
                    txt for txt in map(_text, ordered.find_all("li")) if txt
                )

    # --- Tags ---
    tags = []
    wrapper = soup.select_one("div.desktop-wrapper")
    if wrapper:
        labels = (a.get_text(strip=True) for a in wrapper.select("a.tag"))
        tags = [label for label in labels if label]

    return {
        "title": page_title or "Ismeretlen recept",
        "description": summary,
        "image_url": picture,
        "ingredients": ingredients,
        "instructions": steps,
        "tags": tags,
        "original_url": url,
    }


# ---------------------------------------------------------------------------
# streetkitchen.hu
# ---------------------------------------------------------------------------


@_register("streetkitchen")
def _parse_streetkitchen(soup: BeautifulSoup, url: str) -> dict:
    """Parse a streetkitchen.hu recipe page into the common recipe dict.

    Title, description and image come from og-tags (title falls back to
    ``<title>``). Ingredients are read from a single grid so the duplicated
    mobile/desktop rendering is not counted twice; tags come from the
    JSON-LD ``recipeCategory``. *url* is only echoed back as ``original_url``.
    """
    heading = _og(soup, "og:title") or _text(soup.find("title"))
    if heading:
        heading = re.sub(r"\s*\|\s*Street Kitchen$", "", heading).strip()

    summary = _og(soup, "og:description") or ""
    picture = _og(soup, "og:image")

    # --- Ingredients ---
    def _is_main_grid(node):
        # Substring test on the joined class string (grid-cols-1 lg:grid-cols-2).
        joined = " ".join(node.get("class", []))
        return "grid-cols-1" in joined and "lg:grid-cols-2" in joined

    main_grid = next(
        (grid for grid in soup.select("div.grid") if _is_main_grid(grid)), None
    )
    sections = main_grid.find_all("div", recursive=False) if main_grid else []

    ingredients = []
    # Each top-level div may hold an h5 group header followed by rows.
    for section in sections:
        group_header = section.find("h5")
        group_name = group_header.get_text(strip=True) if group_header else ""
        if group_name:
            ingredients.append({"group": group_name})

        for row in section.select("div.my-2.flex.items-center.gap-2.text-lg"):
            cell_box = row.select_one("div.flex.items-center.gap-2")
            if not cell_box:
                continue
            name_cell = cell_box.find("div", class_="font-bold")
            food = name_cell.get_text(strip=True) if name_cell else ""
            if not food:
                continue

            # Of the remaining cells, a fully parenthesised one is the note;
            # the first other one is quantity+unit merged (e.g. "200g").
            amount = ""
            extra = ""
            for cell in cell_box.find_all("div", recursive=False):
                if cell == name_cell:
                    continue
                cell_text = cell.get_text(strip=True)
                if cell_text.startswith("(") and cell_text.endswith(")"):
                    extra = cell_text.strip("() ")
                elif not amount:
                    amount = cell_text

            # "200g" -> qty="200", unit="g"
            qty, unit = _split_qty_unit(amount)

            if not extra:
                # Pull a trailing parenthesised note out of the food name,
                # e.g. "fehérborecet (ízlés szerint)".
                note = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food)
                if note:
                    food, extra = note.group(1).strip(), note.group(2).strip()

            ingredients.append({
                "quantity": qty,
                "unit": unit,
                "food": food,
                "extra": extra,
            })

    # --- Instructions ---
    steps = []
    prep = (soup.find("div", id="Streetk_content_preparation_wrapper")
            or soup.select_one(".recipe-preparation"))
    if prep:
        # Prefer an ordered list, then an unordered one, then bare paragraphs.
        listing = prep.find("ol") or prep.find("ul")
        if listing:
            nodes = listing.find_all("li", recursive=False)
        else:
            nodes = prep.find_all("p")
        steps = [t for t in (n.get_text(strip=True) for n in nodes) if t]

    if not steps:
        # Still nothing: use the paragraphs of the description wrapper.
        desc_article = soup.find("article", id="Streetk_content_description_wrapper")
        if desc_article:
            paragraphs = (p.get_text(strip=True) for p in desc_article.find_all("p"))
            steps = [t for t in paragraphs if t]

    # --- Tags ---
    # recipeCategory of the JSON-LD Recipe node (comma-separated string or list).
    tags = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(script.string or "")
            if isinstance(payload, dict):
                nodes = payload.get("@graph", [payload])
            else:
                nodes = payload
            for node in nodes:
                if not (isinstance(node, dict) and node.get("@type") == "Recipe"):
                    continue
                category = node.get("recipeCategory", "")
                if isinstance(category, str) and category:
                    pieces = (part.strip() for part in category.split(","))
                    tags = [piece for piece in pieces if piece]
                elif isinstance(category, list):
                    pieces = (str(part).strip() for part in category)
                    tags = [piece for piece in pieces if piece]
                break
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

    return {
        "title": heading or "Ismeretlen recept",
        "description": summary,
        "image_url": picture,
        "ingredients": ingredients,
        "instructions": steps,
        "tags": tags,
        "original_url": url,
    }


# ---------------------------------------------------------------------------
# nosalty.hu
# ---------------------------------------------------------------------------


@_register("nosalty")
def _parse_nosalty(soup: BeautifulSoup, url: str) -> dict:
    """Parse a nosalty.hu recipe page into the common recipe dict.

    The description is assembled from the paragraphs of ``div#recipe-story``;
    ingredients, instructions and tags are each read from their own scoped
    container. *url* is only echoed back as ``original_url``.
    """
    heading = _og(soup, "og:title") or _text(soup.find("title"))
    if heading:
        # Everything from the first "|" onwards is site branding.
        heading = re.sub(r"\s*\|.*$", "", heading).strip()

    # The recipe story doubles as the description.
    summary = ""
    story = soup.find("div", id="recipe-story")
    if story:
        texts = (p.get_text(strip=True) for p in story.find_all("p"))
        summary = " ".join(t for t in texts if t)

    picture = _og(soup, "og:image")

    # --- Ingredients ---
    # Only div#ingredients is scanned, to avoid per-serving / nutrition
    # duplicates. h3.m-list__title = group header, ul.m-list__list = rows.
    ingredients = []
    ing_box = soup.find("div", id="ingredients")
    if ing_box:
        for node in ing_box.find_all(["h3", "ul"]):
            classes = node.get("class") or []
            if node.name == "h3":
                if "m-list__title" in classes:
                    group_name = node.get_text(strip=True)
                    if group_name:
                        ingredients.append({"group": group_name})
            elif node.name == "ul" and "m-list__list" in classes:
                for item in node.find_all("li", class_="m-list__item"):
                    _parse_nosalty_ingredient(item, ingredients)

    # --- Instructions ---
    # div#select: h4.m-list__title = section header, ol.m-list__list = steps.
    steps = []
    dir_box = soup.find("div", id="select")
    if dir_box:
        for node in dir_box.find_all(["h4", "ol"]):
            classes = node.get("class") or []
            if node.name == "h4":
                if "m-list__title" in classes:
                    section_name = node.get_text(strip=True)
                    if section_name:
                        steps.append(f"--- {section_name} ---")
            elif node.name == "ol" and "m-list__list" in classes:
                item_texts = (
                    item.get_text(strip=True)
                    for item in node.find_all("li", class_="m-list__item")
                )
                steps.extend(t for t in item_texts if t)

    # --- Tags ---
    # Only div.p-recipe__attributeList, to avoid site-wide SEO tags.
    tags = []
    attr_box = soup.find("div", class_="p-recipe__attributeList")
    if attr_box:
        labels = (
            a.get_text(strip=True)
            for a in attr_box.find_all("a", class_="m-tags__tagItem")
        )
        tags = [label for label in labels if label]

    return {
        "title": heading or "Ismeretlen recept",
        "description": summary,
        "image_url": picture,
        "ingredients": ingredients,
        "instructions": steps,
        "tags": tags,
        "original_url": url,
    }


def _parse_nosalty_ingredient(li, ingredients: list):
    """Append the ingredient described by one nosalty ``<li>`` to *ingredients*.

    Nothing is appended when the item has no inner ``<div>``, no
    ``a.a-link`` food link, or an empty food name.
    """
    wrapper = li.find("div")
    link = wrapper.find("a", class_="a-link") if wrapper else None
    if not link:
        return

    food = link.get_text(strip=True)
    if not food:
        return

    # Direct children are visited in document order: text before the food
    # link is the quantity (the last non-empty one wins), text after it is
    # collected as notes.
    amount = ""
    notes = []
    seen_link = False
    for node in wrapper.children:
        if node is link:
            seen_link = True
            continue
        if not hasattr(node, "get_text"):
            continue
        text = node.get_text(strip=True)
        if not text:
            continue
        if seen_link:
            notes.append(text.strip("() "))
        else:
            amount = text

    extra = "; ".join(filter(None, notes))
    qty, unit = _split_qty_unit(amount)

    ingredients.append({
        "quantity": qty,
        "unit": unit,
        "food": food,
        "extra": extra,
    })


# ---------------------------------------------------------------------------
# sobors.hu
# ---------------------------------------------------------------------------


@_register("sobors")
def _parse_sobors(soup: BeautifulSoup, url: str) -> dict:
    """Parse a sobors.hu recipe page into the common recipe dict.

    Structured recipe pages are read from ``div.hozzavalok-container``;
    article-style pages fall back to ``_parse_sobors_article_ingredients``.
    When the instructions hold at most two entries and contain an external
    link, that link is scraped as well and its instructions (and, if none
    were found here, its ingredients) replace the local ones. *url* is only
    echoed back as ``original_url``.
    """
    # Title: h3.recept_nev, else og:title / <title>.
    name_el = soup.find("h3", class_="recept_nev")
    title = name_el.get_text(strip=True) if name_el else ""
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*SóBors.*$", "", title, flags=re.IGNORECASE).strip()

    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # div.hozzavalok-container > section: optional h4 group header, then li
    # rows with span.mennyiseg / span.mertekegyseg / span.hozzavalo.
    ingredients = []
    structured = soup.find("div", class_="hozzavalok-container")
    for section in (structured.find_all("section") if structured else []):
        group_el = section.find("h4")
        if group_el:
            group_name = group_el.get_text(strip=True).rstrip(":")
            if group_name:
                ingredients.append({"group": group_name})
        for item in section.find_all("li"):
            food = _text(item.find("span", class_="hozzavalo"))
            if not food:
                continue
            ingredients.append({
                "quantity": _text(item.find("span", class_="mennyiseg")),
                "unit": _text(item.find("span", class_="mertekegyseg")),
                "food": food,
                "extra": "",
            })

    if not ingredients:
        # Article-style pages: h4 group headers + plain-text ul > li.
        body = soup.find("div", class_="cikk-torzs") or soup.find("article")
        if body:
            _parse_sobors_article_ingredients(body, ingredients)

    # --- Instructions ---
    # div.recept_leiras: <p> = step, <h3> = section header.
    instructions = []
    linked_url = None
    steps_box = soup.find("div", class_="recept_leiras")
    if steps_box:
        # First absolute link that leaves sobors.hu ("full recipe on ..." pattern).
        linked_url = next(
            (
                a["href"]
                for a in steps_box.find_all("a", href=True)
                if a["href"].startswith("http") and "sobors.hu" not in a["href"]
            ),
            None,
        )

        for node in steps_box.find_all(["h3", "p"]):
            text = node.get_text(strip=True)
            if not text:
                continue
            if node.name == "h3":
                instructions.append(f"--- {text} ---")
            else:
                # Reader recipes number their steps ("1.   ..."); drop that.
                instructions.append(re.sub(r"^\d+\.\s+", "", text))

    # A very short instruction list plus an external link is treated as a
    # redirect: scrape the linked page for the real recipe.
    # NOTE(review): this goes back through scrape(); two pages that link to
    # each other could recurse - confirm whether a depth guard is needed.
    if linked_url and len(instructions) <= 2:
        try:
            linked = scrape(linked_url)
            if linked.get("instructions"):
                instructions = linked["instructions"]
            if not ingredients and linked.get("ingredients"):
                ingredients = linked["ingredients"]
        except Exception:
            pass  # best effort: keep whatever was scraped from sobors.hu

    # --- Tags ---
    # div.cikk-cimkek > ul.cikk-cimkek-list > li > a, minus generic categories.
    tags = []
    tag_box = soup.find("div", class_="cikk-cimkek")
    tag_list = tag_box.find("ul", class_="cikk-cimkek-list") if tag_box else None
    if tag_list:
        ignored = {"receptek", "olvasói receptek"}
        for anchor in tag_list.find_all("a"):
            label = anchor.get_text(strip=True)
            if label and label.lower() not in ignored:
                tags.append(label)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


# ---------------------------------------------------------------------------
# kiskegyed.hu
# ---------------------------------------------------------------------------


@_register("kiskegyed")
def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
    """Parse a kiskegyed.hu recipe page into the common recipe dict.

    When the preparation section holds at most two steps and links to
    another registered recipe site, that page is scraped too and its
    instructions (and, if none were found here, its ingredients) are used
    instead. *url* is only echoed back as ``original_url``.
    """
    # Title: first <h2>, else og:title / <title>.
    first_h2 = soup.find("h2")
    title = first_h2.get_text(strip=True) if first_h2 else ""
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*Kiskegyed.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: first <p> of section#leadText, else og:description.
    description = ""
    lead = soup.find("section", id="leadText")
    lead_paragraph = lead.find("p") if lead else None
    if lead_paragraph:
        description = lead_paragraph.get_text(strip=True)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    # div.recipe_ingredients: <p> = group header, ul.list > li = plain-text rows.
    ingredients = []
    ing_box = soup.find("div", class_="recipe_ingredients")
    if ing_box:
        for node in ing_box.find_all(["p", "ul"]):
            if node.name == "p":
                heading = node.get_text(strip=True).rstrip(":")
                # Not groups: empty text, the "Hozzávalók" title, and serving
                # info such as "4 személyre".
                is_title = not heading or heading.lower().startswith("hozzávalók")
                is_servings = re.match(r"^\d+\s+személyre$", heading)
                if not (is_title or is_servings):
                    ingredients.append({"group": heading})
            elif "list" in (node.get("class") or []):
                for item in node.find_all("li"):
                    # A space separator keeps words apart around <a> tags.
                    line = re.sub(r"\s+", " ", item.get_text(" ")).strip()
                    if not line:
                        continue
                    qty, unit, food, extra = _parse_kiskegyed_ingredient(line)
                    ingredients.append({
                        "quantity": qty,
                        "unit": unit,
                        "food": food,
                        "extra": extra,
                    })

    # --- Instructions ---
    # div.recipe_preparation > ol > li > div
    instructions = []
    linked_url = None
    prep_box = soup.find("div", class_="recipe_preparation")
    if prep_box:
        # Look for a cross-link to a different registered recipe site.
        other_sites = [key for key, _fn in _PARSERS if key != "kiskegyed"]
        for anchor in prep_box.find_all("a", href=True):
            target = anchor["href"]
            if not target.startswith("http") or "kiskegyed.hu" in target:
                continue
            target_host = _host(target)
            if any(key in target_host for key in other_sites):
                linked_url = target
                break

        ordered = prep_box.find("ol")
        if ordered:
            for item in ordered.find_all("li", recursive=False):
                # Step text sits in an inner <div> when there is one.
                source = item.find("div") or item
                step = source.get_text(strip=True)
                if step:
                    instructions.append(step)

    # Few or no local steps plus a cross-link: follow the linked recipe.
    # NOTE(review): this goes back through scrape(); two pages that link to
    # each other could recurse - confirm whether a depth guard is needed.
    if linked_url and len(instructions) <= 2:
        try:
            linked = scrape(linked_url)
            if linked.get("instructions"):
                instructions = linked["instructions"]
            if not ingredients and linked.get("ingredients"):
                ingredients = linked["ingredients"]
        except Exception:
            pass

    # --- Tags ---
    # section.tags > a > span, text prefixed with "#".
    tags = []
    tag_section = soup.find("section", class_="tags")
    if tag_section:
        ignored = {"recept", "receptek"}
        for anchor in tag_section.find_all("a"):
            holder = anchor.find("span") or anchor
            label = holder.get_text(strip=True).lstrip("#").strip()
            if label and label.lower() not in ignored:
                tags.append(label)

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
    """Parse a kiskegyed.hu ingredient line.

    Handles dual measurements like '3 ek (70 g) búzafinomliszt (BL 55)'
    → qty='3', unit='ek', food='búzafinomliszt', extra='70 g; BL 55'
    """
    extras = []

    # Try: qty unit (alt_measurement) food...
    # Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
    m = re.match(
        r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line
    )
    if m:
        qty = m.group(1).strip()
        unit = m.group(2).strip()
        extras.append(m.group(3).strip())
        food_raw = m.group(4).strip()
        # Extract trailing parenthesized note from food
        fm = re.match(r"^(.+?)\s*\(([^)]+)\)\s*$", food_raw)
        if fm:
            food_raw = fm.group(1).strip()
            extras.append(fm.group(2).strip())
        return (qty, unit, food_raw, "; ".join(extras))

    # Try: qty unit food...
    m2 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(\S+)\s+(.+)$", line)
    if m2:
        return (m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), "")

    # Try: qty food (e.g. "2 tojás")
    m3 = re.match(r"^([0-9½¼¾][0-9.,/½¼¾-]*)\s+(.+)$", line)
    if m3:
        return (m3.group(1).strip(), "", m3.group(2).strip(), "")

    # No quantity (e.g. "ízlés szerint só")
    return ("", "", line, "")


# ---------------------------------------------------------------------------
# gastrohobbi.hu
# ---------------------------------------------------------------------------


@_register("gastrohobbi")
def _parse_gastrohobbi(soup: BeautifulSoup, url: str) -> dict:
    """Parse a gastrohobbi.hu recipe page into the common recipe dict.

    Ingredients and instructions are delegated to the
    ``_gastrohobbi_parse_*`` helpers. A preparation time found in an
    "Elkészítési idő" heading is appended to the description. Reading that
    heading removes its ``<em>`` child from *soup*. *url* is only echoed
    back as ``original_url``.
    """
    # Title: h1.mpcth-post-title > span, else og:title / <title>.
    title_span = soup.select_one("h1.mpcth-post-title span.mpcth-color-main-border")
    title = title_span.get_text(strip=True) if title_span else ""
    if not title:
        title = _og(soup, "og:title") or _text(soup.find("title"))
    if title:
        title = re.sub(r"\s*[-–|]\s*GastroHobbi.*$", "", title, flags=re.IGNORECASE).strip()

    # Description: first <p> of the first text column, else og:description.
    description = ""
    text_column = soup.select_one("div.wpb-content-wrapper div.wpb_text_column div.wpb_wrapper")
    intro = text_column.find("p") if text_column else None
    if intro:
        description = intro.get_text(strip=True)
    if not description:
        description = _og(soup, "og:description") or ""

    image_url = _og(soup, "og:image")

    # --- Ingredients ---
    ingredients = []
    _gastrohobbi_parse_ingredients(soup, ingredients)

    # --- Instructions ---
    instructions = []
    _gastrohobbi_parse_instructions(soup, instructions)

    # --- Preparation time ---
    # Only the first h3 mentioning "Elkészítési idő" is considered. Its label
    # sits in an <em>; once that is removed the remaining text is the time.
    prep_time = ""
    for heading in soup.find_all("h3"):
        if "elkészítési idő" not in heading.get_text(strip=True).lower():
            continue
        label = heading.find("em")
        if label:
            label.decompose()
        remainder = heading.get_text(strip=True).strip()
        if remainder:
            prep_time = remainder
        break

    # --- Tags ---
    # articleSection of the JSON-LD Article node, minus generic categories.
    tags = []
    ignored = {"receptjeink", "receptek"}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(script.string or "")
            if isinstance(payload, dict):
                nodes = payload.get("@graph", [payload])
            else:
                nodes = payload
            for node in nodes:
                if not (isinstance(node, dict) and node.get("@type") == "Article"):
                    continue
                sections = node.get("articleSection", [])
                if isinstance(sections, list):
                    names = (section.strip() for section in sections)
                    tags = [
                        name for name in names
                        if name and name.lower() not in ignored
                    ]
                break
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue

    # Surface the preparation time through the description.
    if prep_time:
        if description:
            description = f"{description} (Elkészítési idő: {prep_time})"
        else:
            description = f"Elkészítési idő: {prep_time}"

    return {
        "title": title or "Ismeretlen recept",
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


def _gastrohobbi_parse_ingredients(soup: BeautifulSoup, ingredients: list):
    """Append gastrohobbi.hu ingredients (and group markers) to *ingredients*.

    Starts at the first ``<h3>`` whose text contains "Hozzávalók" and walks
    its following siblings until an ``<h3>`` containing "Elkészítés" is
    reached. Does nothing when no such starting header exists.
    """
    start = next(
        (
            heading for heading in soup.find_all("h3")
            if "hozzávalók" in heading.get_text(strip=True).lower()
        ),
        None,
    )
    if not start:
        return

    for node in start.find_next_siblings():
        text = node.get_text(strip=True)
        if not text:
            continue

        if node.name == "h3":
            # The preparation section ends the ingredient block.
            if "elkészítés" in text.lower():
                break
            # Any other h3 is a group header, e.g. "A csipetkéhez:".
            group_name = text.rstrip(":")
            if group_name:
                ingredients.append({"group": group_name})
            continue

        if node.name != "ul":
            continue

        for item in node.find_all("li", recursive=False):
            # The line text sits in an inner <p> when there is one.
            holder = item.find("p") or item
            line = holder.get_text(strip=True)
            if not line:
                continue
            qty, unit, food = _parse_ingredient_line(line)
            ingredients.append({
                "quantity": qty, "unit": unit, "food": food, "extra": "",
            })


def _gastrohobbi_parse_instructions(soup: BeautifulSoup, instructions: list):
    """Append gastrohobbi.hu preparation steps to *instructions*.

    Starts at the first ``<h3>`` whose text begins with "Elkészítés" but does
    not mention "idő" (so the preparation-time heading is not picked), then
    collects the following ``<p>`` and ``<ul>`` siblings up to the next
    ``<h3>``. Does nothing when no such header exists.
    """
    def _is_prep_header(heading):
        lowered = heading.get_text(strip=True).lower()
        return lowered.startswith("elkészítés") and "idő" not in lowered

    start = next((h for h in soup.find_all("h3") if _is_prep_header(h)), None)
    if not start:
        return

    for node in start.find_next_siblings():
        kind = node.name
        if kind == "h3":
            # Any further section header ends the steps.
            break
        if kind == "p":
            text = node.get_text(strip=True)
            # Empty or non-breaking-space-only paragraphs are skipped.
            if text in ("", "\xa0"):
                continue
            instructions.append(text)
        elif kind == "ul":
            # List embedded in the steps (e.g. cooking time options); items
            # that merely wrap a nested list are skipped.
            leaf_texts = (
                item.get_text(strip=True)
                for item in node.find_all("li")
                if not item.find("ul")
            )
            instructions.extend(f"  • {entry}" for entry in leaf_texts if entry)


def _parse_sobors_article_ingredients(container, ingredients: list):
    """Collect article-style sobors.hu ingredients from *container*.

    Walks every <h4> and <ul> below *container* in document order and appends
    to *ingredients* in place:

    * an <h4> becomes a ``{"group": name}`` entry (trailing colons removed),
      unless it is empty or starts with "hozzávalók";
    * a <ul> whose previous sibling is an <h4> contributes one ingredient
      dict per non-empty <li>, parsed by ``_parse_ingredient_line``.
    """
    for node in container.find_all(["h4", "ul"]):
        if node.name == "h4":
            heading = node.get_text(strip=True).rstrip(":")
            # The generic "Hozzávalók" title is not a group of its own.
            if heading and not heading.lower().startswith("hozzávalók"):
                ingredients.append({"group": heading})
            continue
        if node.name != "ul":
            continue

        # Only a list sitting directly after an <h4> counts as ingredients.
        before = node.find_previous_sibling()
        if not before or before.name != "h4":
            continue

        texts = (li.get_text(strip=True) for li in node.find_all("li"))
        for text in texts:
            if not text:
                continue
            qty, unit, food = _parse_ingredient_line(text)
            ingredients.append(dict(quantity=qty, unit=unit, food=food, extra=""))


def _parse_ingredient_line(line: str) -> tuple[str, str, str]:
    """Parse a plain-text ingredient line like '2 dl tejföl' into (qty, unit, food).

    A quantity starts with a digit or one of ½ ¼ ¾ and may continue with
    digits, those fractions, '.', ',', '/' or '-'.

    Returns:
        ``(qty, unit, food)``. ``unit`` is "" when only a quantity and a food
        are present ("2 tojás"); both ``qty`` and ``unit`` are "" when the
        line does not start with a quantity, in which case ``food`` is the
        whole line.
    """
    # Collapse en-/em-dash ranges ("10 – 15" → "10-15") so a range is read as
    # a single quantity. Only a dash standing between two numerals is touched:
    # a dash inside the food text ("olaj – a sütéshez") is prose and must not
    # be rewritten into a hyphen.
    line = re.sub(r"(?<=[0-9½¼¾])\s*[–—]\s*(?=[0-9½¼¾])", "-", line)

    quantity = r"([0-9½¼¾][0-9.,/½¼¾-]*)"

    # qty unit food (e.g. "2 dl tejföl", "½ tk őrölt kömény")
    with_unit = re.match(rf"^{quantity}\s+(\S+)\s+(.+)$", line)
    if with_unit:
        qty, unit, food = with_unit.groups()
        return (qty.strip(), unit.strip(), food.strip())

    # Just quantity + food (e.g. "2 tojás")
    without_unit = re.match(rf"^{quantity}\s+(.+)$", line)
    if without_unit:
        qty, food = without_unit.groups()
        return (qty.strip(), "", food.strip())

    return ("", "", line)


def _split_qty_unit(raw: str) -> tuple[str, str]:
    """Split a merged quantity+unit string like '200g' into ('200', 'g').

    The quantity is the leading run that starts with a digit or one of
    ½ ¼ ¾ and continues with digits, those fractions, spaces, '.', ',', '/'
    or '-'; the rest is the unit. Both parts are stripped.

    Returns:
        ``(qty, unit)``. ``("", "")`` for blank input, and ``("", raw)`` when
        the stripped input does not start with a quantity.
    """
    raw = raw.strip()
    if not raw:
        return ("", "")
    # ½ ¼ ¾ are accepted here as well, matching _parse_ingredient_line;
    # otherwise "½ dl" would come back with the fraction glued into the unit.
    m = re.match(r"^([0-9½¼¾][0-9½¼¾ .,/-]*)(.*)$", raw)
    if m:
        return (m.group(1).strip(), m.group(2).strip())
    return ("", raw)


# ---------------------------------------------------------------------------
# Generic fallback (og-tags + schema.org microdata)
# ---------------------------------------------------------------------------


def _generic_recipe_node(payload):
    """Return the first schema.org Recipe object in a decoded JSON-LD payload, or None."""
    pending = [payload]
    while pending:
        node = pending.pop(0)
        if isinstance(node, list):
            # A top-level array: any element may be the recipe, not only the first.
            pending.extend(node)
        elif isinstance(node, dict):
            kind = node.get("@type")
            if kind == "Recipe" or (isinstance(kind, list) and "Recipe" in kind):
                return node
            # Several objects may be bundled under one "@graph" array.
            graph = node.get("@graph")
            if isinstance(graph, list):
                pending.extend(graph)
    return None


def _generic_instruction_steps(raw) -> list[str]:
    """Flatten a recipeInstructions value into a list of non-empty step texts."""
    if isinstance(raw, str):
        step = raw.strip()
        return [step] if step else []
    if isinstance(raw, list):
        return [step for item in raw for step in _generic_instruction_steps(item)]
    if isinstance(raw, dict):
        # A section groups its steps under "itemListElement"; a step has "text".
        nested = raw.get("itemListElement")
        if nested:
            return _generic_instruction_steps(nested)
        return _generic_instruction_steps(raw.get("text"))
    return []


def _parse_generic(soup: BeautifulSoup, url: str) -> dict:
    """Build a recipe dict from og: meta tags and schema.org JSON-LD.

    Title, description and image come from the ``og:*`` meta tags (the title
    falls back to <title>, then to "Ismeretlen recept"). Ingredients,
    instructions and tags come from the first JSON-LD script that contains a
    Recipe object; scripts that are not valid JSON or hold no Recipe are
    skipped. Ingredient lines are not split: each is stored whole in "food"
    with empty quantity, unit and extra.

    Returns:
        A dict with the keys title, description, image_url, ingredients,
        instructions, tags and original_url. The three lists are empty when
        no Recipe object is found.
    """
    title = _og(soup, "og:title") or _text(soup.find("title")) or "Ismeretlen recept"
    description = _og(soup, "og:description") or ""
    image_url = _og(soup, "og:image")

    ingredients = []
    instructions = []
    tags = []

    # Try schema.org JSON-LD
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(script.string or "")
        except (json.JSONDecodeError, TypeError):
            continue

        recipe = _generic_recipe_node(payload)
        if recipe is None:
            continue

        raw_ingredients = recipe.get("recipeIngredient", [])
        if isinstance(raw_ingredients, str):
            raw_ingredients = [raw_ingredients]
        elif not isinstance(raw_ingredients, list):
            raw_ingredients = []
        for line in raw_ingredients:
            # Non-text and blank entries carry no ingredient.
            if isinstance(line, str) and line.strip():
                ingredients.append({
                    "quantity": "", "unit": "", "food": line.strip(), "extra": "",
                })

        # Accepts a plain string, a list of strings/steps, or nested sections;
        # iterating a bare string directly would yield one "step" per character.
        instructions = _generic_instruction_steps(recipe.get("recipeInstructions", []))

        # Extract keywords
        kw = recipe.get("keywords", "")
        if isinstance(kw, str):
            tags = [k.strip() for k in kw.split(",") if k.strip()]
        elif isinstance(kw, list):
            tags = [str(k).strip() for k in kw if str(k).strip()]
        break

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "ingredients": ingredients,
        "instructions": instructions,
        "tags": tags,
        "original_url": url,
    }


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _extract_ingredient_comments(data: dict):
    """Split a trailing "(comment)" off each ingredient's food into its extra.

    Works in place on ``data["ingredients"]`` and returns nothing. Group
    header entries, ingredients with an empty food, and ingredients that
    already have a non-empty extra are left untouched.
    """
    trailing_comment = re.compile(r"^(.+?)\s*\(([^)]+)\)\s*$")
    for entry in data.get("ingredients", []):
        if "group" in entry:
            continue
        name = entry.get("food", "")
        if not name or entry.get("extra", ""):
            continue
        found = trailing_comment.match(name)
        if found is None:
            continue
        base, comment = found.groups()
        entry["food"] = base.strip()
        entry["extra"] = comment.strip()


def _host(url: str) -> str:
    """Return the hostname part of *url*, or "" when it has none."""
    from urllib.parse import urlparse

    hostname = urlparse(url).hostname
    return hostname if hostname else ""


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the content of the first <meta property=prop> tag.

    Returns None when no such tag exists or its content is missing or empty.
    """
    meta = soup.find("meta", property=prop)
    if not meta or not meta.get("content"):
        return None
    return meta["content"]


def _text(el) -> str:
    """Return the stripped text of *el*, or "" when *el* is None."""
    return "" if el is None else el.get_text(strip=True)