From e922822286905f2f0b6fd3f86a11a8d621403d9a Mon Sep 17 00:00:00 2001 From: kisfenyo Date: Tue, 24 Feb 2026 20:43:24 +0100 Subject: [PATCH] v0.8.3: prefer h1 for mindmegette title, strip trailing "recept" globally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mindmegette regular pages: use h1 element (clean meal name like "Sajtkrémes csirkés leves") instead of og:title (which has "receptje" suffix). Also add global post-processing to strip trailing recept/ receptje/receptek from titles across all parsers. Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 6 ++++++ app/scraper.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0222c5..6eebd82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## v0.8.3 (2026-02-24) + +### Fixed +- Mindmegette.hu: prefer `<h1>` element for title (clean meal name) over og:title (which often has "receptje" suffix) +- Global: strip trailing "recept"/"receptje" etc. from recipe titles across all parsers + ## v0.8.2 (2026-02-24) ### Fixed diff --git a/app/scraper.py b/app/scraper.py index f0de071..045e7c9 100644 --- a/app/scraper.py +++ b/app/scraper.py @@ -67,6 +67,12 @@ def scrape(url: str) -> dict: # Post-process: extract parenthesized comments from food into extra _extract_ingredient_comments(result) + + # Strip trailing "recept*" from title (e.g. "receptje", "recept") + title = result.get("title", "") + if title: + result["title"] = re.sub(r"\s+recept\w*$", "", title, flags=re.IGNORECASE).strip() + return result @@ -90,7 +96,11 @@ def supported_sites() -> list[dict]: @_register("mindmegette") def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict: - title = _og(soup, "og:title") or _text(soup.find("title")) + # Prefer h1 (clean meal name) over og:title (often has "receptje" suffix) + h1 = soup.find("h1") + title = _text(h1) if h1 else "" + if not title: + title = _og(soup, "og:title") or _text(soup.find("title")) # Strip " | Mindmegette.hu" or " - Mindmegette.hu" suffix if title: title = re.sub(r"\s*[-–|]\s*Mindmegette\.hu$", "", title).strip()