From bab59753f3272dce74be0f21347b805413731f1c Mon Sep 17 00:00:00 2001
From: kisfenyo <nagyfenyvesi.viktor@gmail.com>
Date: Tue, 24 Feb 2026 18:47:41 +0100
Subject: [PATCH] Fix kiskegyed ingredient parsing: spaces around links,
 multi-word units

- Use get_text(" ") plus whitespace normalization instead of
  get_text(strip=True), which glued text nodes to adjacent <a> tag text
- Use non-greedy .+? instead of \S+ for the unit in the dual-measurement
  regex, so multi-word units like "kis fej" match

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 app/scraper.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/app/scraper.py b/app/scraper.py
index a61cc54..e705ce2 100644
--- a/app/scraper.py
+++ b/app/scraper.py
@@ -600,7 +600,8 @@ def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
                 ingredients.append({"group": group_text})
             elif el.name == "ul" and "list" in (el.get("class") or []):
                 for li in el.find_all("li"):
-                    line = li.get_text(strip=True)
+                    # Use separator to preserve spaces around <a> tags
+                    line = re.sub(r"\s+", " ", li.get_text(" ")).strip()
                     if not line:
                         continue
                     qty, unit, food, extra = _parse_kiskegyed_ingredient(line)
@@ -679,8 +680,9 @@ def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
     extras = []
 
     # Try: qty unit (alt_measurement) food...
+    # Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
     m = re.match(
-        r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+\(([^)]+)\)\s+(.+)$", line
+        r"^([0-9][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line
     )
     if m:
         qty = m.group(1).strip()