diff --git a/calibre-system/cwa.yaml b/calibre-system/cwa.yaml
index c6c69de..2260f78 100644
--- a/calibre-system/cwa.yaml
+++ b/calibre-system/cwa.yaml
@@ -7,6 +7,400 @@ kind: Namespace
 metadata:
   name: calibre-system
 ---
+# Custom Metadata Providers ConfigMap
+# Contains moly.hu provider for Hungarian book metadata
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: calibre-custom-metadata-providers
+  namespace: calibre-system
+  labels:
+    app.kubernetes.io/instance: calibre
+    app.kubernetes.io/name: calibre-web-automated
+data:
+  moly_hu.py: |
+    # -*- coding: utf-8 -*-
+    # Calibre-Web Automated - Moly.hu Metadata Provider
+    # Based on Calibre plugin by Hokutya
+    # Adapted for CWA
+    # SPDX-License-Identifier: GPL-3.0-or-later
+
+    import concurrent.futures
+    import re
+    import requests
+    from lxml.html import fromstring
+    from typing import List, Optional
+    from datetime import datetime
+    from urllib.parse import urljoin
+
+    from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
+    import cps.logger as logger
+
+    log = logger.create()
+
+
+    class Moly_hu(Metadata):
+        """Metadata provider that scrapes book data from moly.hu.
+
+        A search fetches the moly.hu result page, collects the book links
+        on it, then downloads the first MAX_RESULTS book pages in parallel
+        and parses each one into a MetaRecord.
+        """
+
+        __name__ = "Moly.hu"
+        __id__ = "moly_hu"
+
+        BASE_URL = "https://moly.hu"
+        BOOK_URL = BASE_URL + "/konyvek/"
+        SEARCH_URL = BASE_URL + "/kereses?utf8=%E2%9C%93&query="
+
+        # Book pages fetched per search, and how many are fetched in parallel.
+        MAX_RESULTS = 5
+        MAX_WORKERS = 3
+        # Seconds allowed per HTTP request / for all detail fetches together.
+        REQUEST_TIMEOUT = 15
+        DETAILS_TIMEOUT = 20
+
+        # Accept-Encoding is deliberately left to requests, which advertises
+        # only the encodings it can decode. Forcing "br" without a brotli
+        # decoder installed yields an unreadable response body.
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
+        }
+
+        # Shared by every search so HTTP connections are reused.
+        session = requests.Session()
+        session.headers.update(headers)
+
+        def search(
+            self, query: str, generic_cover: str = "", locale: str = "hu"
+        ) -> Optional[List[MetaRecord]]:
+            """Search moly.hu for books matching the query.
+
+            Args:
+                query: Free-text search string.
+                generic_cover: Cover URL used for books without a cover image.
+                locale: Accepted for interface compatibility; not used.
+
+            Returns:
+                Up to MAX_RESULTS records in moly.hu's result order. The list
+                is empty when the provider is inactive, nothing matched, or
+                the search failed.
+            """
+            if not self.active:
+                return []
+
+            try:
+                search_url = self.SEARCH_URL + requests.utils.quote(query)
+                log.info(f"Moly.hu searching: {search_url}")
+
+                response = self.session.get(search_url, timeout=self.REQUEST_TIMEOUT)
+                response.raise_for_status()
+
+                root = fromstring(response.text)
+                book_links = self._parse_search_results(root, query)
+                if not book_links:
+                    log.info(f"Moly.hu: No results found for '{query}'")
+                    return []
+
+                return self._fetch_details(book_links[:self.MAX_RESULTS], generic_cover)
+
+            except requests.exceptions.Timeout:
+                log.warning("Moly.hu search timed out")
+                return []
+            except requests.exceptions.HTTPError as e:
+                log.error(f"Moly.hu HTTP error: {e}")
+                return []
+            except Exception as e:
+                log.error_or_exception(f"Moly.hu search error: {e}")
+                return []
+
+        def _fetch_details(self, links: List[str], generic_cover: str = "") -> List[MetaRecord]:
+            """Fetch book pages in parallel; return records in the order of links.
+
+            Pages that fail to load or parse are skipped. When DETAILS_TIMEOUT
+            expires, the records collected so far are returned.
+            """
+            found = {}
+            with concurrent.futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                futures = {
+                    executor.submit(self._get_book_details, link, idx, generic_cover): idx
+                    for idx, link in enumerate(links)
+                }
+                try:
+                    for future in concurrent.futures.as_completed(
+                        futures, timeout=self.DETAILS_TIMEOUT
+                    ):
+                        try:
+                            record = future.result()
+                        except Exception as e:
+                            log.warning(f"Moly.hu worker error: {e}")
+                            continue
+                        if record:
+                            found[futures[future]] = record
+                except concurrent.futures.TimeoutError:
+                    log.warning("Moly.hu: timed out fetching book details")
+                    # Drop fetches that have not started yet; running ones
+                    # finish while the executor shuts down.
+                    for future in futures:
+                        future.cancel()
+
+            # as_completed() yields in completion order; restore search order.
+            return [found[idx] for idx in sorted(found)]
+
+        def _parse_search_results(self, root, query: str) -> List[str]:
+            """Return the unique, absolute book URLs found on a result page.
+
+            Args:
+                root: Parsed search result page.
+                query: Unused; kept so existing callers keep working.
+            """
+            book_urls = []
+            for href in root.xpath('//a[@class="book_selector"]/@href'):
+                if not href:
+                    continue
+                # Build the absolute URL first so duplicates are detected.
+                url = urljoin(self.BASE_URL, href)
+                if url not in book_urls:
+                    book_urls.append(url)
+
+            log.info(f"Moly.hu found {len(book_urls)} results")
+            return book_urls
+
+        def _get_book_details(
+            self, url: str, index: int, generic_cover: str = ""
+        ) -> Optional[MetaRecord]:
+            """Fetch one moly.hu book page and parse it into a MetaRecord.
+
+            Args:
+                url: Absolute URL of the book page.
+                index: Position of the book in the search results; not
+                    used by this method.
+                generic_cover: Cover URL used when the page has no cover.
+
+            Returns:
+                The parsed record, or None when the page cannot be fetched,
+                cannot be parsed, or has no title.
+            """
+            try:
+                response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
+                response.raise_for_status()
+
+                # Clean up HTML
+                # NOTE(review): both search strings are empty, so these calls
+                # change nothing as written - confirm what should be stripped.
+                raw = response.text
+                raw = raw.replace('', '').replace('', '')
+
+                root = fromstring(raw)
+
+                title = self._parse_title(root)
+                if not title:
+                    return None
+                authors = self._parse_authors(root)
+                moly_id = self._parse_moly_id(url)
+
+                match = MetaRecord(
+                    # MetaRecord requires an id; use the URL when the slug
+                    # cannot be extracted.
+                    id=moly_id or url,
+                    title=title,
+                    authors=authors if authors else [""],
+                    source=MetaSourceInfo(
+                        id=self.__id__,
+                        description="Moly.hu - Magyar könyves közösség",
+                        link=self.BASE_URL
+                    ),
+                    url=url,
+                    # Never store a None identifier.
+                    identifiers={"moly_hu": moly_id} if moly_id else {},
+                )
+
+                # Optional fields
+                match.description = self._parse_description(root)
+                match.publisher = self._parse_publisher(root)
+                match.publishedDate = self._parse_published_date(root)
+                match.rating = self._parse_rating(root)
+                match.tags = self._parse_tags(root)
+
+                # Keep the record's default cover when neither the page nor
+                # the caller supplies one.
+                cover = self._parse_cover(root) or generic_cover
+                if cover:
+                    match.cover = cover
+
+                # Series info
+                series_info = self._parse_series(root)
+                if series_info:
+                    match.series = series_info[0]
+                    try:
+                        match.series_index = int(series_info[1])
+                    except (ValueError, IndexError):
+                        match.series_index = 1
+
+                # ISBN
+                isbn = self._parse_isbn(root)
+                if isbn:
+                    match.identifiers["isbn"] = isbn
+
+                return match
+
+            except Exception as e:
+                log.warning(f"Moly.hu error fetching {url}: {e}")
+                return None
+
+        def _parse_moly_id(self, url: str) -> Optional[str]:
+            """Return the book slug that follows /konyvek/ in url, or None."""
+            m = re.search(r'/konyvek/([^/?#]+)', url)
+            return m.group(1) if m else None
+
+        def _parse_title(self, root) -> Optional[str]:
+            """Return the book title, or None when the page has none."""
+            title_node = root.xpath('//*[@id="content"]//*[@class="fn"]/text()')
+            if not title_node:
+                title_node = root.xpath('//*[@id="content"]//*[@class="item"]/text()')
+            if not title_node:
+                return None
+            # Remove zero-width spaces before trimming so none are left at the ends.
+            return title_node[0].replace('\u200b', '').strip() or None
+
+        def _parse_authors(self, root) -> List[str]:
+            """Return the author names; empty when none are listed."""
+            author_nodes = root.xpath('//*[@id="content"]//div[@class="authors"]/a/text()')
+            names = (str(author).strip() for author in author_nodes)
+            return [name for name in names if name]
+
+        def _parse_description(self, root) -> Optional[str]:
+            """Return the book description, or None when the page has none."""
+            # Tried in order; the first location that has text wins.
+            locations = (
+                '//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()',
+                '//*[@id="content"]//*[@class="text"]/p/text()',
+                '//*[@id="content"]//*[@class="text shrinkable"]/p/text()',
+            )
+            description_node = []
+            for location in locations:
+                description_node = root.xpath(location)
+                if description_node:
+                    break
+            if not description_node:
+                return None
+
+            desc = '\n'.join(description_node)
+            desc = desc.replace('\n\n', '\n').replace('\n \n', '\n')
+            # Remove the "Warning! Contains plot description." notice.
+            desc = desc.replace('Vigyázat! Cselekményleírást tartalmaz.\n', '')
+            return desc.strip()
+
+        def _parse_cover(self, root) -> Optional[str]:
+            """Return the absolute cover image URL, or None when there is none."""
+            # Prefer the link around the cover; fall back to the image itself.
+            for location in (
+                '(//*[@class="coverbox"]//a/@href)[1]',
+                '//*[@class="coverbox"]//img/@src',
+            ):
+                nodes = root.xpath(location)
+                if nodes:
+                    # urljoin() leaves already-absolute URLs untouched.
+                    return urljoin(self.BASE_URL, nodes[0])
+            return None
+
+        def _parse_publisher(self, root) -> Optional[str]:
+            """Return the publisher name, or None when it is not listed."""
+            publisher_node = root.xpath(
+                '//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()'
+            )
+            # When the first link is just "+", read the second div instead.
+            if publisher_node and publisher_node[0] == '+':
+                publisher_node = root.xpath(
+                    '//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()'
+                )
+
+            if publisher_node:
+                return publisher_node[0].strip()
+            return None
+
+        def _parse_published_date(self, root) -> Optional[str]:
+            """Return the first four-digit year in the publication info, or None."""
+            publication_node = root.xpath(
+                '//*[@id="content"]//*[@class="items"]/div/div[1]/text()'
+            )
+            if not publication_node:
+                publication_node = root.xpath(
+                    '//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
+                )
+
+            for value in publication_node:
+                m = re.search(r'(\d{4})', value)
+                if m:
+                    return m.group(1)
+            return None
+
+        def _parse_rating(self, root) -> int:
+            """Return the rating on a 0-5 scale; 0 when missing or unparsable."""
+            rating_node = root.xpath(
+                '//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()'
+            )
+            if not rating_node:
+                return 0
+            try:
+                # Trim whitespace before the "%" so values like "95% " parse.
+                percentage = float(rating_node[0].strip().rstrip('%').strip())
+                # Moly.hu uses percentage, convert to 0-5 scale
+                return max(0, min(5, round(percentage * 0.05)))
+            except (ValueError, OverflowError):
+                return 0
+
+        def _parse_tags(self, root) -> List[str]:
+            """Return genre tags (wrapped in brackets) followed by regular tags."""
+            genres = root.xpath('//*[@id="book_tags"]//*[@class="tag genre"]/text()')
+            regular = root.xpath('//*[@id="book_tags"]//*[@class="tag"]/text()')
+
+            tags = [f"[{str(t).strip()}]" for t in genres if str(t).strip()]
+            tags += [str(t).strip() for t in regular if str(t).strip()]
+            return tags
+
+        def _parse_series(self, root) -> Optional[List[str]]:
+            """Return [series name, index text], or None when there is no series.
+
+            The index text is "1" when the series text carries no number.
+            """
+            series_node = root.xpath('//*[@id="content"]//*[@class="action"]/text()')
+            if not series_node:
+                return None
+
+            # Trim whitespace first, otherwise the parentheses are not removed.
+            series_text = series_node[0].strip().strip('().').strip()
+            if not series_text:
+                return None
+
+            parts = series_text.rsplit(' ', 1)
+            if len(parts) == 2:
+                # Check if it's actually edition info, not series
+                if parts[1] == 'kiadás':
+                    return None
+                # Split off the last word only when it is a number;
+                # otherwise it belongs to the series name.
+                if parts[1][:1].isdigit():
+                    return [parts[0].strip(), parts[1]]
+            return [series_text, "1"]
+
+        def _parse_isbn(self, root) -> Optional[str]:
+            """Return the first ISBN-13 or ISBN-10 found, or None."""
+            # The ISBN is looked for in the second, then the third, info div.
+            for position in (2, 3):
+                isbn_nodes = root.xpath(
+                    f'//*[@id="content"]//*[@class="items"]/div/div[{position}]/text()'
+                )
+                for value in isbn_nodes:
+                    # An ISBN-10 may end in the check character "X".
+                    m = re.search(r'(\d{13}|\d{9}[\dXx])', value)
+                    if m:
+                        return m.group(1)
+            return None
+---
 # Calibre-Web-Automated Deployment
 apiVersion: apps/v1
 kind: Deployment
@@ -32,6 +426,12 @@ spec:
       annotations:
         # Version checker pattern - CWA uses semantic versioning
         match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$'
+        # Roll the Deployment when the providers ConfigMap changes (Stakater
+        # Reloader). Needed because subPath mounts never receive ConfigMap
+        # updates in a running pod.
+        # NOTE(review): Reloader documents this annotation on the Deployment's
+        # own metadata - confirm the installed version honours it here.
+        configmap.reloader.stakater.com/reload: "calibre-custom-metadata-providers"
     spec:
       containers:
         - name: calibre-web-automated
@@ -98,6 +498,11 @@ spec:
             # Calibre library - your existing library location
             - name: library
               mountPath: /calibre-library
+            # Custom metadata providers (moly.hu)
+            - name: custom-metadata-providers
+              mountPath: /app/calibre-web-automated/cps/metadata_provider/moly_hu.py
+              subPath: moly_hu.py
+              readOnly: true
       volumes:
         - name: config
           persistentVolumeClaim:
@@ -112,6 +517,10 @@ spec:
           hostPath:
             path: /mnt/4_hdd/data/calibre
             type: DirectoryOrCreate
+        # Custom metadata providers from ConfigMap
+        - name: custom-metadata-providers
+          configMap:
+            name: calibre-custom-metadata-providers
 ---
 # Calibre-Web-Automated Service
 apiVersion: v1