Fix kiskegyed ingredient parsing: spaces around links, multi-word units
- Use get_text(" ") so spaces between text nodes and <a> tag content in
ingredient lines are preserved, then collapse repeated whitespace
- Use a non-greedy .+? for the unit group in the dual-measurement regex so
that multi-word units like "kis fej" are matched
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+4
-2
@@ -600,7 +600,8 @@ def _parse_kiskegyed(soup: BeautifulSoup, url: str) -> dict:
|
|||||||
ingredients.append({"group": group_text})
|
ingredients.append({"group": group_text})
|
||||||
elif el.name == "ul" and "list" in (el.get("class") or []):
|
elif el.name == "ul" and "list" in (el.get("class") or []):
|
||||||
for li in el.find_all("li"):
|
for li in el.find_all("li"):
|
||||||
line = li.get_text(strip=True)
|
# Use separator to preserve spaces around <a> tags
|
||||||
|
line = re.sub(r"\s+", " ", li.get_text(" ")).strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
qty, unit, food, extra = _parse_kiskegyed_ingredient(line)
|
qty, unit, food, extra = _parse_kiskegyed_ingredient(line)
|
||||||
@@ -679,8 +680,9 @@ def _parse_kiskegyed_ingredient(line: str) -> tuple[str, str, str, str]:
|
|||||||
extras = []
|
extras = []
|
||||||
|
|
||||||
# Try: qty unit (alt_measurement) food...
|
# Try: qty unit (alt_measurement) food...
|
||||||
|
# Unit can be multi-word (e.g. "kis fej"), so use .+? (non-greedy)
|
||||||
m = re.match(
|
m = re.match(
|
||||||
r"^([0-9][0-9.,/½¼¾-]*)\s+(\S+)\s+\(([^)]+)\)\s+(.+)$", line
|
r"^([0-9][0-9.,/½¼¾-]*)\s+(.+?)\s+\(([^)]+)\)\s+(.+)$", line
|
||||||
)
|
)
|
||||||
if m:
|
if m:
|
||||||
qty = m.group(1).strip()
|
qty = m.group(1).strip()
|
||||||
|
|||||||
Reference in New Issue
Block a user