added moly provider as configmap

2026-01-25 20:28:27 +01:00
parent 54046d2bdf
commit af97c95ef1
1 changed files with 353 additions and 0 deletions
@@ -7,6 +7,348 @@ kind: Namespace
 metadata:
  name: calibre-system
 ---
 # Custom Metadata Providers ConfigMap
 # Contains moly.hu provider for Hungarian book metadata
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: calibre-custom-metadata-providers
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
 data:
  moly_hu.py: |
    # -*- coding: utf-8 -*-
    # Calibre-Web Automated - Moly.hu Metadata Provider
    # Based on Calibre plugin by Hokutya <mail@hokutya.com>
    # Adapted for CWA
    # SPDX-License-Identifier: GPL-3.0-or-later
    import concurrent.futures
    import re
    import requests
    from lxml.html import fromstring
    from typing import List, Optional
    from datetime import datetime
    from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
    import cps.logger as logger
    log = logger.create()
    class Moly_hu(Metadata):
        """Metadata provider that scrapes book information from moly.hu.

        ``search`` requests the site's search page, collects the book links
        found there, downloads up to ``MAX_RESULTS`` book pages concurrently
        and turns each page into a ``MetaRecord``. Every ``_parse_*`` helper
        extracts one field from a parsed book page and returns ``None`` (or an
        empty value) when the field cannot be found.
        """

        __name__ = "Moly.hu"
        __id__ = "moly_hu"
        BASE_URL = "https://moly.hu"
        BOOK_URL = BASE_URL + "/konyvek/"
        SEARCH_URL = BASE_URL + "/kereses?utf8=%E2%9C%93&query="

        # Limits used by search(): number of book pages fetched, number of
        # worker threads, per-request timeout and overall wait for the detail
        # fetches (both in seconds).
        MAX_RESULTS = 5
        MAX_WORKERS = 3
        REQUEST_TIMEOUT = 15
        DETAILS_TIMEOUT = 20

        # The digit look-arounds keep a year from matching inside a longer
        # number (e.g. an ISBN) and an ISBN from matching inside a longer run.
        _YEAR_RE = re.compile(r'(?<!\d)(\d{4})(?!\d)')
        _ISBN_RE = re.compile(r'(?<!\d)(\d{13}|\d{9}[\dXx])(?![\dXx])')
        _MOLY_ID_RE = re.compile(r'/konyvek/([^/?#]+)')
        _SERIES_INDEX_RE = re.compile(r'\d+(?:[.,]\d+)?')

        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
            # "br" is deliberately not advertised: requests can only decode
            # brotli when an optional package is installed, and an undecoded
            # body cannot be parsed.
            'Accept-Encoding': 'gzip, deflate',
        }
        session = requests.Session()
        session.headers.update(headers)

        def search(
            self, query: str, generic_cover: str = "", locale: str = "hu"
        ) -> Optional[List[MetaRecord]]:
            """Search moly.hu for books matching ``query``.

            Args:
                query: Free-text search string; it is URL-quoted before use.
                generic_cover: Cover URL used for records whose page has no
                    cover image. Ignored when empty.
                locale: Accepted for interface compatibility; not used.

            Returns:
                Records in the order of the site's search results. An empty
                list when the provider is inactive, nothing was found, or the
                search request failed. If waiting for the book pages times
                out, the records fetched so far are returned.
            """
            if not self.active:
                return []

            try:
                search_url = self.SEARCH_URL + requests.utils.quote(query)
                log.info(f"Moly.hu searching: {search_url}")
                response = self.session.get(search_url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                root = fromstring(response.text)
                book_links = self._parse_search_results(root, query)
            except requests.exceptions.Timeout:
                log.warning("Moly.hu search timed out")
                return []
            except requests.exceptions.HTTPError as e:
                log.error(f"Moly.hu HTTP error: {e}")
                return []
            except Exception as e:
                log.error_or_exception(f"Moly.hu search error: {e}")
                return []

            if not book_links:
                log.info(f"Moly.hu: No results found for '{query}'")
                return []

            # (search rank, record) pairs; futures finish in arbitrary order,
            # so the rank is kept to restore the site's ordering afterwards.
            ranked = []
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=self.MAX_WORKERS
            ) as executor:
                futures = {
                    executor.submit(self._get_book_details, link, idx): idx
                    for idx, link in enumerate(book_links[:self.MAX_RESULTS])
                }
                try:
                    for future in concurrent.futures.as_completed(
                        futures, timeout=self.DETAILS_TIMEOUT
                    ):
                        try:
                            result = future.result()
                        except Exception as e:
                            log.warning(f"Moly.hu worker error: {e}")
                            continue
                        if result:
                            ranked.append((futures[future], result))
                except concurrent.futures.TimeoutError:
                    # Keep what has already been fetched instead of
                    # discarding it; drop work that has not started yet.
                    log.warning("Moly.hu: timed out fetching book details")
                    for future in futures:
                        future.cancel()

            ranked.sort(key=lambda pair: pair[0])
            val = [record for _, record in ranked]
            if generic_cover:
                for record in val:
                    if not record.cover:
                        record.cover = generic_cover
            return val

        def _absolute_url(self, href: str) -> str:
            """Resolve ``href`` against ``BASE_URL``.

            Absolute URLs are returned unchanged; root-relative, relative and
            protocol-relative references are resolved against the site root.
            """
            from urllib.parse import urljoin

            return urljoin(self.BASE_URL + "/", href.strip())

        def _parse_search_results(self, root, query: str) -> List[str]:
            """Return the unique absolute book URLs found on a search page.

            Args:
                root: Parsed search results document.
                query: The search string; not used for the extraction.
            """
            book_urls = []
            seen = set()
            for href in root.xpath('//a[@class="book_selector"]/@href'):
                if not href or not href.strip():
                    continue
                url = self._absolute_url(href)
                # Compare absolute URLs so repeated links are really dropped.
                if url in seen:
                    continue
                seen.add(url)
                book_urls.append(url)
            log.info(f"Moly.hu found {len(book_urls)} results")
            return book_urls

        def _get_book_details(self, url: str, index: int) -> Optional[MetaRecord]:
            """Download one book page and build a ``MetaRecord`` from it.

            Args:
                url: Absolute URL of the book page.
                index: Position of the book in the search results; not used
                    here.

            Returns:
                The record, or ``None`` when the page has no title or when
                fetching/parsing fails (the failure is logged).
            """
            try:
                response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                # <em> tags are removed before parsing; presumably so that
                # emphasised text stays part of the parent's text nodes.
                raw = response.text.replace('<em>', '').replace('</em>', '')
                root = fromstring(raw)

                title = self._parse_title(root)
                if not title:
                    return None
                authors = self._parse_authors(root)
                moly_id = self._parse_moly_id(url)

                match = MetaRecord(
                    # NOTE(review): upstream calibre-web declares ``id`` as a
                    # required MetaRecord field - confirm CWA matches.
                    id=moly_id or url,
                    title=title,
                    authors=authors if authors else [""],
                    source=MetaSourceInfo(
                        id=self.__id__,
                        description="Moly.hu - Magyar könyves közösség",
                        link=self.BASE_URL
                    ),
                    url=url,
                    identifiers={"moly_hu": moly_id} if moly_id else {},
                )

                # Optional fields
                match.description = self._parse_description(root)
                match.cover = self._parse_cover(root)
                match.publisher = self._parse_publisher(root)
                match.publishedDate = self._parse_published_date(root)
                match.rating = self._parse_rating(root)
                match.tags = self._parse_tags(root)

                series_info = self._parse_series(root)
                if series_info:
                    match.series = series_info[0]
                    try:
                        number = float(series_info[1].replace(',', '.'))
                    except (ValueError, IndexError):
                        number = 1
                    # Whole numbers stay ints; fractional indexes are kept.
                    match.series_index = int(number) if number == int(number) else number

                isbn = self._parse_isbn(root)
                if isbn:
                    match.identifiers["isbn"] = isbn
                return match
            except Exception as e:
                log.warning(f"Moly.hu error fetching {url}: {e}")
                return None

        def _parse_moly_id(self, url: str) -> Optional[str]:
            """Return the path segment after ``/konyvek/`` in ``url``.

            Any trailing path, query string or fragment is excluded. Returns
            ``None`` when ``url`` is empty or has no such segment.
            """
            if not url:
                return None
            found = self._MOLY_ID_RE.search(url)
            return found.group(1) if found else None

        def _parse_title(self, root) -> Optional[str]:
            """Return the first non-empty title text, or ``None``.

            Elements with class ``fn`` are tried first, then class ``item``.
            Zero-width spaces are removed from the title.
            """
            expressions = (
                '//*[@id="content"]//*[@class="fn"]/text()',
                '//*[@id="content"]//*[@class="item"]/text()',
            )
            for expression in expressions:
                for text in root.xpath(expression):
                    cleaned = str(text).replace('\u200b', '').strip()
                    if cleaned:
                        return cleaned
            return None

        def _parse_authors(self, root) -> List[str]:
            """Return the non-empty author names linked in the authors block."""
            author_nodes = root.xpath('//*[@id="content"]//div[@class="authors"]/a/text()')
            names = (str(author).strip() for author in author_nodes)
            return [name for name in names if name]

        def _parse_description(self, root) -> Optional[str]:
            """Return the book description as newline-separated paragraphs.

            The full description is preferred, then the plain ``text`` block,
            then the ``text shrinkable`` block. Blank lines and the site's
            spoiler warning line are dropped. Returns ``None`` when no
            description text remains.
            """
            expressions = (
                '//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()',
                '//*[@id="content"]//*[@class="text"]/p/text()',
                '//*[@id="content"]//*[@class="text shrinkable"]/p/text()',
            )
            description_node = []
            for expression in expressions:
                description_node = root.xpath(expression)
                if description_node:
                    break
            if not description_node:
                return None

            spoiler_warning = 'Vigyázat! Cselekményleírást tartalmaz.'
            lines = []
            for node in description_node:
                for line in str(node).splitlines():
                    line = line.strip()
                    if line and line != spoiler_warning:
                        lines.append(line)
            return '\n'.join(lines) or None

        def _parse_cover(self, root) -> Optional[str]:
            """Return the absolute cover URL, or ``None`` when there is none.

            The first link inside the ``coverbox`` element is preferred; the
            ``src`` of an image inside it is the fallback.
            """
            expressions = (
                '(//*[@class="coverbox"]//a/@href)[1]',
                '//*[@class="coverbox"]//img/@src',
            )
            for expression in expressions:
                for candidate in root.xpath(expression):
                    if candidate and candidate.strip():
                        return self._absolute_url(candidate)
            return None

        def _parse_publisher(self, root) -> Optional[str]:
            """Return the publisher name from the edition info rows, or ``None``.

            The link text of the first row is used unless it is ``+``, in
            which case the second row is read instead (the ``+`` is presumably
            an "add" link rather than a publisher - TODO confirm).
            """
            first_row = root.xpath(
                '//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()'
            )
            if first_row and str(first_row[0]).strip() == '+':
                publisher_node = root.xpath(
                    '//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()'
                )
            else:
                publisher_node = first_row
            for text in publisher_node:
                name = str(text).strip()
                if name:
                    return name
            return None

        def _parse_published_date(self, root) -> Optional[str]:
            """Return the first four-digit year in the edition info rows.

            The first row is searched before the second. Only stand-alone
            four-digit numbers count, so part of a longer number is never
            reported as a year. Returns ``None`` when no year is found.
            """
            expressions = (
                '//*[@id="content"]//*[@class="items"]/div/div[1]/text()',
                '//*[@id="content"]//*[@class="items"]/div/div[2]/text()',
            )
            for expression in expressions:
                for value in root.xpath(expression):
                    found = self._YEAR_RE.search(str(value))
                    if found:
                        return found.group(1)
            return None

        def _parse_rating(self, root) -> int:
            """Return the rating on a 0-5 scale, or 0 when it is unavailable.

            The page value is read as a percentage and multiplied by 0.05.
            """
            rating_node = root.xpath(
                '//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()'
            )
            if not rating_node:
                return 0
            # Strip whitespace first so a padded "85%" still loses its sign.
            text = str(rating_node[0]).strip().rstrip('%').strip().replace(',', '.')
            try:
                scaled = round(float(text) * 0.05)
            except (ValueError, OverflowError):
                return 0
            return max(0, min(5, scaled))

        def _parse_tags(self, root) -> List[str]:
            """Return genre tags (wrapped in brackets) followed by plain tags."""
            tags_genre = root.xpath('//*[@id="book_tags"]//*[@class="tag genre"]/text()')
            tags_genre = [f"[{str(t).strip()}]" for t in tags_genre if str(t).strip()]
            tags_regular = root.xpath('//*[@id="book_tags"]//*[@class="tag"]/text()')
            tags_regular = [str(t).strip() for t in tags_regular if str(t).strip()]
            return tags_genre + tags_regular

        def _parse_series(self, root) -> Optional[List[str]]:
            """Return ``[series name, series index]``, or ``None``.

            The text of the first ``action`` element is read with surrounding
            whitespace, parentheses and dots removed. A trailing number is
            taken as the index; without one the whole text is the series name
            and the index is ``"1"``. Text ending in the word ``kiadás`` is
            edition information, not a series, and yields ``None``.
            """
            series_node = root.xpath('//*[@id="content"]//*[@class="action"]/text()')
            if not series_node:
                return None
            series_text = str(series_node[0]).strip().strip('().').strip()
            if not series_text:
                return None

            name, separator, last_word = series_text.rpartition(' ')
            if separator and last_word == 'kiadás':
                return None
            index_text = last_word.rstrip('.')
            name = name.strip()
            if name and self._SERIES_INDEX_RE.fullmatch(index_text):
                return [name, index_text]
            # No numeric suffix: keep the full name instead of cutting off
            # its last word.
            return [series_text, "1"]

        def _parse_isbn(self, root) -> Optional[str]:
            """Return the first ISBN-13 or ISBN-10 in the edition info rows.

            The second row is searched before the third. Hyphens are removed
            before matching, so hyphenated ISBNs are recognised as well.
            Returns ``None`` when no ISBN is found.
            """
            expressions = (
                '//*[@id="content"]//*[@class="items"]/div/div[2]/text()',
                '//*[@id="content"]//*[@class="items"]/div/div[3]/text()',
            )
            for expression in expressions:
                for value in root.xpath(expression):
                    found = self._ISBN_RE.search(str(value).replace('-', ''))
                    if found:
                        return found.group(1)
            return None
 ---
 # Calibre-Web-Automated Deployment
 apiVersion: apps/v1
 kind: Deployment
@@ -32,6 +374,8 @@ spec:
      annotations:
        # Version checker pattern - CWA uses semantic versioning
        match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$'
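        # Roll the Deployment when the providers ConfigMap changes. The value is the
        # ConfigMap name watched by Stakater Reloader; there is no hash to update by hand.
        # NOTE(review): Reloader normally reads this annotation from the Deployment's own
        # metadata.annotations - confirm it is picked up at this location.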
        configmap.reloader.stakater.com/reload: "calibre-custom-metadata-providers"
    spec:
      containers:
        - name: calibre-web-automated
@@ -98,6 +442,11 @@ spec:
            # Calibre library - your existing library location
            - name: library
              mountPath: /calibre-library
            # Custom metadata providers (moly.hu)
            - name: custom-metadata-providers
              mountPath: /app/calibre-web-automated/cps/metadata_provider/moly_hu.py
              subPath: moly_hu.py
              readOnly: true
      volumes:
        - name: config
          persistentVolumeClaim:
@@ -112,6 +461,10 @@ spec:
          hostPath:
            path: /mnt/4_hdd/data/calibre
            type: DirectoryOrCreate
        # Custom metadata providers from ConfigMap
        - name: custom-metadata-providers
          configMap:
            name: calibre-custom-metadata-providers
 ---
 # Calibre-Web-Automated Service
 apiVersion: v1