v0.8.3: prefer h1 for mindmegette title, strip trailing "recept" globally

Mindmegette regular pages: use the h1 element (a clean meal name like "Sajtkrémes csirkés leves") instead of og:title (which often carries a "receptje" suffix), falling back to og:title and then <title> when the h1 is missing or empty. Also add global post-processing that strips a trailing "recept", "receptje", or "receptek" (case-insensitive) from titles across all parsers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 20:43:24 +01:00
parent 0d5728b732
commit e922822286
2 changed files with 17 additions and 1 deletions
@@ -67,6 +67,12 @@ def scrape(url: str) -> dict:

    # Post-process: extract parenthesized comments from food into extra
    _extract_ingredient_comments(result)
+
+    # Strip trailing "recept*" from title (e.g. "receptje", "recept")
+    title = result.get("title", "")
+    if title:
+        result["title"] = re.sub(r"\s+recept\w*$", "", title, flags=re.IGNORECASE).strip()
+
    return result


@@ -90,7 +96,11 @@ def supported_sites() -> list[dict]:

@_register("mindmegette")
 def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict:
-    title = _og(soup, "og:title") or _text(soup.find("title"))
+    # Prefer h1 (clean meal name) over og:title (often has "receptje" suffix)
+    h1 = soup.find("h1")
+    title = _text(h1) if h1 else ""
+    if not title:
+        title = _og(soup, "og:title") or _text(soup.find("title"))
    # Strip " | Mindmegette.hu" or " - Mindmegette.hu" suffix
    if title:
        title = re.sub(r"\s*[-–|]\s*Mindmegette\.hu$", "", title).strip()