diff --git a/README.md b/README.md index 8bc33ee..078119a 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,8 @@ Extracts data from the Angular-rendered HTML: - **Title**: `og:title` meta tag, with ` | Mindmegette.hu` suffix stripped - **Description**: `og:description` meta tag - **Image**: `og:image` meta tag -- **Ingredients**: `div.ingredients` → `div.ingredients-meta` rows, each containing `span.quantity`, `span.unit`, `span.name`, `span.extra` +- **Ingredients**: `div.ingredients` → `div.ingredients-meta` rows, each containing four child elements: quantity (qty), unit, food, and extra (the element selectors are missing from this text — see `_parse_mindmegette` in `app/scraper.py`) +- **Ingredient groups**: Multiple `div.ingredients` containers; group title via a selector that is missing from this text (see `_parse_mindmegette` in `app/scraper.py`) - **Instructions**: `mindmegette-wysiwyg-box` → `ol > li` elements ### Generic Fallback Parser @@ -57,14 +58,22 @@ For unsupported sites, attempts extraction via: 1. Schema.org JSON-LD `@type: Recipe` blocks (`recipeIngredient`, `recipeInstructions`) 2. OpenGraph meta tags for title, description, image +### Adding a New Site Parser + +1. Create a parser function in `app/scraper.py` with the `@_register("hostname")` decorator +2. The function receives `(soup: BeautifulSoup, url: str)` and returns the standard recipe dict +3. The hostname substring is matched against the URL's hostname — first match wins; unmatched URLs use the generic fallback + ## Mealie API Integration The importer uses the Mealie REST API: 1. **POST** `/api/recipes` — create a stub recipe (returns slug) -2. **PATCH** `/api/recipes/{slug}` — populate ingredients, instructions, description, orgURL +2. **PATCH** `/api/recipes/{slug}` — populate structured ingredients (with unit/food IDs), instructions, description, orgURL 3. **PUT** `/api/recipes/{slug}/image` — upload the recipe image +**Structured ingredients**: The client resolves unit and food names to Mealie database IDs. Missing units/foods are created automatically via the API. Ingredient groups are supported via the `title` field on the first ingredient of each group. 
+ Authentication uses a long-lived API token (Bearer header), created in Mealie at *Profile → API Tokens*. ## Configuration @@ -83,7 +92,7 @@ All settings are persisted to `/data/config.json` (mounted as a Docker volume). ```yaml services: recipe-importer: - image: gitea.dooplex.hu/admin/recipe-importer:0.1.0 + image: gitea.dooplex.hu/admin/recipe-importer:0.1.7 container_name: recipe-importer restart: unless-stopped ports: @@ -104,6 +113,7 @@ volumes: | `SECRET_KEY` | `recipe-importer-dev-key` | Flask session secret | | `DATA_DIR` | `/data` | Persistent storage path | | `VERSION` | `dev` | Shown in the UI navbar | +| `MEALIE_INTERNAL_URL` | *(empty)* | Docker-internal Mealie URL (e.g. `http://mealie:9000`) to avoid Cloudflare hairpin | ## Building @@ -120,7 +130,7 @@ The UI is in Hungarian and uses a dark theme. The workflow is: 1. **Settings** (`/settings`) — Enter Mealie URL and API key, test connection 2. **Import** (`/import`) — Paste a recipe URL, click "Beolvasás" (Scrape) -3. **Review** — Edit the title, description, ingredients, instructions in the preview +3. **Review** — Edit structured ingredients (4-column: quantity, unit, food, note), add/remove ingredient groups, edit instructions 4. 
**Send** — Click "Importálás Mealie-be" to push to Mealie ## Tech Stack diff --git a/app/mealie.py b/app/mealie.py index 4ce0437..90094e7 100644 --- a/app/mealie.py +++ b/app/mealie.py @@ -145,27 +145,26 @@ class MealieClient: def _build_payload(self, recipe: dict) -> dict: ingredients = [] + pending_group = "" for item in recipe.get("ingredients", []): if isinstance(item, dict): - # Group header marker + # Group header marker — apply title to the next real ingredient if "group" in item and "food" not in item: - ingredients.append({ - "referenceId": str(uuid.uuid4()), - "title": item["group"], - "note": "", - "isFood": False, - "disableAmount": True, - }) - else: - ingredients.append(self._build_ingredient(item)) + pending_group = item["group"] + continue + ing = self._build_ingredient(item) else: # Legacy: plain string - ingredients.append({ + ing = { "referenceId": str(uuid.uuid4()), "note": str(item), "isFood": False, "disableAmount": True, - }) + } + if pending_group: + ing["title"] = pending_group + pending_group = "" + ingredients.append(ing) instructions = [] for text in recipe.get("instructions", []): diff --git a/app/scraper.py b/app/scraper.py index 795985f..898fe57 100644 --- a/app/scraper.py +++ b/app/scraper.py @@ -1,6 +1,7 @@ """Recipe scraper — parses Hungarian recipe sites into a structured dict. -Currently supported: mindmegette.hu +Each supported site has a parser registered via _PARSERS. +Unsupported sites fall back to generic schema.org / og-tag extraction. """ import re @@ -12,6 +13,19 @@ _HEADERS = { "Accept-Language": "hu-HU,hu;q=0.9,en;q=0.5", } +# Maps a substring of the hostname to a parser function. +# Order matters: first match wins. 
+_PARSERS: list[tuple[str, "callable"]] = [] + + +def _register(host_substring: str): + """Decorator: register a parser for URLs whose hostname contains *host_substring*.""" + def decorator(fn): + _PARSERS.append((host_substring, fn)) + return fn + return decorator + + # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- @@ -39,11 +53,17 @@ def scrape(url: str) -> dict: soup = BeautifulSoup(resp.text, "lxml") host = _host(url) - if "mindmegette" in host: - return _parse_mindmegette(soup, url) - else: - # Fallback: try generic schema.org / og-tag extraction - return _parse_generic(soup, url) + for substring, parser in _PARSERS: + if substring in host: + return parser(soup, url) + + # Fallback: try generic schema.org / og-tag extraction + return _parse_generic(soup, url) + + +def supported_sites() -> list[str]: + """Return list of supported site hostname substrings.""" + return [s for s, _ in _PARSERS] # --------------------------------------------------------------------------- @@ -51,6 +71,7 @@ def scrape(url: str) -> dict: # --------------------------------------------------------------------------- +@_register("mindmegette") def _parse_mindmegette(soup: BeautifulSoup, url: str) -> dict: title = _og(soup, "og:title") or _text(soup.find("title")) # Strip " | Mindmegette.hu" suffix