# homelab-manifests/calibre-system/cwa.yaml

---
# Calibre-Web-Automated - All-in-one eBook library solution
# Replaces Calibre + Calibre-web with automation features
# Namespace
apiVersion: v1
kind: Namespace
metadata:
  # The namespaced resources below are all created in this namespace
  name: calibre-system
---
# Custom Metadata Providers ConfigMap
# Contains moly.hu provider for Hungarian book metadata
apiVersion: v1
kind: ConfigMap
metadata:
  # Referenced by name from the Deployment's "custom-metadata-providers"
  # volume and from its Reloader annotation
  name: calibre-custom-metadata-providers
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
data:
  # Key name doubles as the file name used by the Deployment's subPath mount.
  # The value is a literal block scalar holding the Python source verbatim.
  moly_hu.py: |
    # -*- coding: utf-8 -*-
    # Calibre-Web Automated - Moly.hu Metadata Provider
    # Based on Calibre plugin by Hokutya <mail@hokutya.com>
    # Adapted for CWA
    # SPDX-License-Identifier: GPL-3.0-or-later

    import concurrent.futures
    import re
    import requests
    from lxml.html import fromstring
    from typing import List, Optional
    from datetime import datetime

    from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
    import cps.logger as logger

    log = logger.create()


    class Moly_hu(Metadata):
        """Metadata provider that scrapes book data from moly.hu.

        Search result pages and book pages are fetched over HTTP with a shared
        ``requests`` session and parsed with lxml XPath queries.
        """

        # Provider name and identifier; the identifier is also used as the
        # MetaSourceInfo id when records are built.
        __name__ = "Moly.hu"
        __id__ = "moly_hu"

        BASE_URL = "https://moly.hu"
        BOOK_URL = BASE_URL + "/konyvek/"
        SEARCH_URL = BASE_URL + "/kereses?utf8=%E2%9C%93&query="

        # Browser-like request headers. Accept-Encoding is intentionally left
        # to requests' own default: requests only advertises encodings it can
        # decode, whereas a hard-coded "br" produces undecodable response
        # bodies when no brotli decoder is installed.
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
        }

        # Class-level session shared by every request this provider makes
        session = requests.Session()
        session.headers.update(headers)

        def search(
            self, query: str, generic_cover: str = "", locale: str = "hu"
        ) -> Optional[List[MetaRecord]]:
            """Search moly.hu for books matching the query.

            Fetches the search page, then loads the first five result pages in
            parallel. ``generic_cover`` and ``locale`` are accepted but not used
            by this provider.

            Returns the records in search-result order. Returns an empty list
            when the provider is inactive, nothing matches, or the search
            request itself fails. If fetching the detail pages times out, the
            records already retrieved are still returned.
            """

            if not self.active:
                return []

            # (search position, record) pairs; completion order is arbitrary
            ranked = []

            try:
                # Search for books
                search_url = self.SEARCH_URL + requests.utils.quote(query)
                log.info(f"Moly.hu searching: {search_url}")

                response = self.session.get(search_url, timeout=15)
                response.raise_for_status()

                # Parse search results
                root = fromstring(response.text)
                book_links = self._parse_search_results(root, query)

                if not book_links:
                    log.info(f"Moly.hu: No results found for '{query}'")
                    return []

                # Fetch details for each book (max 5)
                with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                    futures = {
                        executor.submit(self._get_book_details, link, idx): idx
                        for idx, link in enumerate(book_links[:5])
                    }

                    try:
                        for future in concurrent.futures.as_completed(futures, timeout=20):
                            try:
                                result = future.result()
                                if result:
                                    ranked.append((futures[future], result))
                            except Exception as e:
                                log.warning(f"Moly.hu worker error: {e}")
                    except concurrent.futures.TimeoutError:
                        # Keep what has been fetched so far instead of
                        # discarding it; drop work that has not started yet.
                        log.warning(
                            "Moly.hu: timed out fetching book details, "
                            "returning partial results"
                        )
                        for pending in futures:
                            pending.cancel()

            except requests.exceptions.Timeout:
                log.warning("Moly.hu search timed out")
                return []
            except requests.exceptions.HTTPError as e:
                log.error(f"Moly.hu HTTP error: {e}")
                return []
            except Exception as e:
                log.error_or_exception(f"Moly.hu search error: {e}")
                return []

            # Restore relevance order (position in the search results). Sorting
            # on the record's source id cannot do this because it is the same
            # value for every record.
            ranked.sort(key=lambda pair: pair[0])
            return [record for _, record in ranked]

        def _parse_search_results(self, root, query: str) -> List[str]:
            """Extract absolute book URLs from a parsed search results page.

            ``root`` is the parsed search page; ``query`` is accepted but not
            used. URLs are returned in page order with duplicates removed.
            """
            book_urls = []
            seen = set()

            for href in root.xpath('//a[@class="book_selector"]/@href'):
                if not href:
                    continue
                # De-duplicate on the absolute URL: the list stores absolute
                # URLs, so comparing the relative href against it never matches.
                url = self.BASE_URL + href
                if url not in seen:
                    seen.add(url)
                    book_urls.append(url)

            log.info(f"Moly.hu found {len(book_urls)} results")
            return book_urls

        def _get_book_details(self, url: str, index: int) -> Optional[MetaRecord]:
            """Download one moly.hu book page and build a MetaRecord from it.

            ``index`` is accepted but not used here. Returns None when the page
            yields no title, or when anything fails along the way (the failure
            is logged as a warning).
            """
            try:
                page = self.session.get(url, timeout=15)
                page.raise_for_status()

                # Remove <em> tags from the markup before parsing
                markup = page.text.replace('<em>', '').replace('</em>', '')
                root = fromstring(markup)

                # Mandatory fields first; a page without a title is skipped
                title = self._parse_title(root)
                authors = self._parse_authors(root)
                if not title:
                    return None

                moly_id = self._parse_moly_id(url)
                source_info = MetaSourceInfo(
                    id=self.__id__,
                    description="Moly.hu - Magyar könyves közösség",
                    link=self.BASE_URL
                )
                record = MetaRecord(
                    id=moly_id,
                    title=title,
                    authors=authors or [""],
                    source=source_info,
                    url=url,
                    identifiers={"moly_hu": moly_id},
                )

                # Optional fields, each filled by its own parser
                optional_parsers = (
                    ("description", self._parse_description),
                    ("cover", self._parse_cover),
                    ("publisher", self._parse_publisher),
                    ("publishedDate", self._parse_published_date),
                    ("rating", self._parse_rating),
                    ("tags", self._parse_tags),
                )
                for field_name, parser in optional_parsers:
                    setattr(record, field_name, parser(root))

                # Series name plus index; the index falls back to 1 when it
                # cannot be converted to an integer
                series = self._parse_series(root)
                if series:
                    record.series = series[0]
                    try:
                        record.series_index = int(series[1])
                    except (ValueError, IndexError):
                        record.series_index = 1

                # ISBN is stored as an additional identifier when present
                isbn = self._parse_isbn(root)
                if isbn:
                    record.identifiers["isbn"] = isbn

                return record

            except Exception as e:
                log.warning(f"Moly.hu error fetching {url}: {e}")
                return None

        def _parse_moly_id(self, url: str) -> Optional[str]:
            """Extract moly.hu book ID from URL"""
            try:
                m = re.search(r'/konyvek/(.*)', url)
                if m:
                    return m.group(1)
            except:
                pass
            return None

        def _parse_title(self, root) -> Optional[str]:
            """Parse book title"""
            title_node = root.xpath('//*[@id="content"]//*[@class="fn"]/text()')
            if not title_node:
                title_node = root.xpath('//*[@id="content"]//*[@class="item"]/text()')
            if title_node:
                return title_node[0].strip().replace('\u200b', '')
            return None

        def _parse_authors(self, root) -> List[str]:
            """Parse author names"""
            author_nodes = root.xpath('//*[@id="content"]//div[@class="authors"]/a/text()')
            if author_nodes:
                return [str(author).strip() for author in author_nodes]
            return []

        def _parse_description(self, root) -> Optional[str]:
            """Parse book description/comments"""
            description_node = root.xpath(
                '//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()'
            )
            if not description_node:
                description_node = root.xpath('//*[@id="content"]//*[@class="text"]/p/text()')
            if not description_node:
                description_node = root.xpath(
                    '//*[@id="content"]//*[@class="text shrinkable"]/p/text()'
                )

            if description_node:
                # Clean up description
                desc = '\n'.join(description_node)
                desc = desc.replace('\n\n', '\n').replace('\n \n', '\n')
                desc = desc.replace('Vigyázat! Cselekményleírást tartalmaz.\n', '')
                return desc.strip()
            return None

        def _parse_cover(self, root) -> Optional[str]:
            """Parse cover image URL"""
            cover_nodes = root.xpath('(//*[@class="coverbox"]//a/@href)[1]')
            if cover_nodes:
                cover_url = cover_nodes[0]
                if not cover_url.startswith('http'):
                    cover_url = self.BASE_URL + cover_url
                return cover_url

            # Fallback: try img src directly
            img_nodes = root.xpath('//*[@class="coverbox"]//img/@src')
            if img_nodes:
                img_url = img_nodes[0]
                if not img_url.startswith('http'):
                    img_url = self.BASE_URL + img_url
                return img_url
            return None

        def _parse_publisher(self, root) -> Optional[str]:
            """Parse publisher name"""
            publisher_node_1 = root.xpath(
                '//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()'
            )
            if publisher_node_1 and publisher_node_1[0] == '+':
                publisher_node = root.xpath(
                    '//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()'
                )
            else:
                publisher_node = publisher_node_1

            if publisher_node:
                return publisher_node[0].strip()
            return None

        def _parse_published_date(self, root) -> Optional[str]:
            """Parse publication date (year)"""
            publication_node_1 = root.xpath(
                '//*[@id="content"]//*[@class="items"]/div/div[1]/text()'
            )
            if not publication_node_1:
                publication_node = root.xpath(
                    '//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
                )
            else:
                publication_node = publication_node_1

            for value in publication_node:
                m = re.search(r'(\d{4})', value)
                if m:
                    return m.group(1)
            return None

        def _parse_rating(self, root) -> int:
            """Parse rating (converted to 0-5 scale)"""
            rating_node = root.xpath(
                '//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()'
            )
            if rating_node:
                try:
                    # Moly.hu uses percentage, convert to 0-5 scale
                    percentage = float(rating_node[0].strip('%').strip())
                    return round(percentage * 0.05)
                except (ValueError, IndexError):
                    pass
            return 0

        def _parse_tags(self, root) -> List[str]:
            """Parse tags/genres"""
            # Genre tags (in brackets)
            tags_genre = root.xpath('//*[@id="book_tags"]//*[@class="tag genre"]/text()')
            tags_genre = [f"[{str(t).strip()}]" for t in tags_genre if str(t).strip()]

            # Regular tags
            tags_regular = root.xpath('//*[@id="book_tags"]//*[@class="tag"]/text()')
            tags_regular = [str(t).strip() for t in tags_regular if str(t).strip()]

            return tags_genre + tags_regular

        def _parse_series(self, root) -> Optional[List[str]]:
            """Parse series name and index"""
            series_node = root.xpath('//*[@id="content"]//*[@class="action"]/text()')

            if not series_node:
                return None

            series_text = series_node[0].strip('().')
            parts = series_text.rsplit(' ', 1)

            # Check if it's actually edition info, not series
            if len(parts) > 1 and parts[1] == 'kiadás':
                return None

            if len(parts) == 2:
                return [parts[0], parts[1]]
            elif len(parts) == 1:
                return [parts[0], "1"]

            return None

        def _parse_isbn(self, root) -> Optional[str]:
            """Parse ISBN"""
            # Try first location
            isbn_nodes = root.xpath(
                '//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
            )
            for value in isbn_nodes:
                m = re.search(r'(\d{13}|\d{10})', value)
                if m:
                    return m.group(1)

            # Try second location
            isbn_nodes = root.xpath(
                '//*[@id="content"]//*[@class="items"]/div/div[3]/text()'
            )
            for value in isbn_nodes:
                m = re.search(r'(\d{13}|\d{10})', value)
                if m:
                    return m.group(1)

            return None
---
# Calibre-Web-Automated Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: calibre-web-automated
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
spec:
  # Single instance, and the old pod is stopped before the new one starts
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/instance: calibre
      app.kubernetes.io/name: calibre-web-automated
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: calibre
        app.kubernetes.io/name: calibre-web-automated
      annotations:
        # Version checker pattern - CWA uses semantic versioning
        match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$'
        # Stakater Reloader: restart the pod whenever this ConfigMap changes.
        # Required because the provider file is mounted with subPath, and
        # subPath mounts do not receive ConfigMap updates in a running pod.
        configmap.reloader.stakater.com/reload: "calibre-custom-metadata-providers"
    spec:
      containers:
        - name: calibre-web-automated
          # NOTE(review): a mutable ":latest" tag combined with IfNotPresent
          # means a node keeps running whatever image it cached first. Pin a
          # release tag (which also suits the version-checker regex above) or
          # switch to "Always" - confirm which is intended.
          image: crocodilestick/calibre-web-automated:latest
          imagePullPolicy: IfNotPresent
          env:
            - name: PUID
              value: "1000"
            - name: PGID
              value: "1000"
            - name: TZ
              value: Europe/Budapest
            # Use default port 8083
            - name: CWA_PORT_OVERRIDE
              value: "8083"
            # Disable WAL mode if on network share (set to true if using NFS)
            - name: NETWORK_SHARE_MODE
              value: "false"
            # Number of proxies in chain (Cloudflare -> nginx-ingress -> app)
            - name: TRUSTED_PROXY_COUNT
              value: "2"
          ports:
            - name: http
              containerPort: 8083
              protocol: TCP
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
            limits:
              cpu: "2"
              memory: 2Gi
          # Liveness and readiness probes carry no initialDelaySeconds: the
          # startupProbe below already holds them back until the app answers,
          # and a fixed delay on top of that only keeps the pod NotReady longer.
          livenessProbe:
            httpGet:
              path: /
              port: http
            periodSeconds: 60
            timeoutSeconds: 10
            failureThreshold: 5
          readinessProbe:
            httpGet:
              path: /
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          # Allows up to 60 x 10s = 600s for startup before the container is
          # restarted
          startupProbe:
            httpGet:
              path: /
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            # CWA can take time to initialize, especially first run
            failureThreshold: 60
          volumeMounts:
            # Config directory for app database, logs, processed books backup
            - name: config
              mountPath: /config
            # Book ingest folder - files here are DELETED after processing
            - name: ingest
              mountPath: /cwa-book-ingest
            # Calibre library - your existing library location
            - name: library
              mountPath: /calibre-library
            # Custom metadata providers (moly.hu): a single file from the
            # ConfigMap is placed next to the other provider modules
            - name: custom-metadata-providers
              mountPath: /app/calibre-web-automated/cps/metadata_provider/moly_hu.py
              subPath: moly_hu.py
              readOnly: true
      volumes:
        - name: config
          persistentVolumeClaim:
            claimName: calibre-web-automated-config
        # Ingest folder on hostPath for easy file dropping
        # NOTE(review): hostPath with DirectoryOrCreate and no node pinning
        # creates an empty directory if the pod lands on another node -
        # confirm the cluster has a single schedulable node.
        - name: ingest
          hostPath:
            path: /mnt/4_hdd/data/calibre-ingest
            type: DirectoryOrCreate
        # Your existing Calibre library location
        - name: library
          hostPath:
            path: /mnt/4_hdd/data/calibre
            type: DirectoryOrCreate
        # Custom metadata providers from ConfigMap
        - name: custom-metadata-providers
          configMap:
            name: calibre-custom-metadata-providers
---
# Calibre-Web-Automated Service
apiVersion: v1
kind: Service
metadata:
  name: calibre-web-automated
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
spec:
  # Cluster-internal address only; the Ingress in this file routes to it
  type: ClusterIP
  ports:
    - name: http
      port: 8083
      # Named port: resolves to the container port called "http" on the pod
      targetPort: http
      protocol: TCP
  # Traffic goes to pods that carry both of these labels
  selector:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
---
# Main Ingress (books.dooplex.hu - primary reading interface)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: calibre-web-automated
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    # DNS records are requested for both the public and the LAN hostname
    external-dns.alpha.kubernetes.io/hostname: books.dooplex.hu,books.home
    # "0" removes the request body size limit (large book uploads)
    nginx.ingress.kubernetes.io/proxy-body-size: "0"
    # Proxy read/send timeouts, in seconds
    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    # Forward auth headers for Authentik integration
    # NOTE(review): no auth-url annotation is set on this Ingress, and
    # ingress-nginx ignores auth-snippet without one, so these two
    # annotations appear to have no effect as written - confirm.
    nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
    nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
    # Access filter: a request is allowed when the client address starts with
    # 192.168. or 10., or when the GeoIP2 country code is HU; anything else
    # gets a 403. Raw nginx config - do not put YAML comments inside the block.
    nginx.ingress.kubernetes.io/configuration-snippet: |
      set $geo_allowed 0;
      if ($remote_addr ~ "^192\.168\.") { set $geo_allowed 1; }
      if ($remote_addr ~ "^10\.") { set $geo_allowed 1; }
      if ($geoip2_country_code = "HU") { set $geo_allowed 1; }
      if ($geo_allowed = 0) {
        return 403 "Access restricted to Hungary";
      }
spec:
  ingressClassName: nginx-internal
  # A certificate is requested for the public hostname only; books.home is
  # not listed here
  tls:
    - hosts:
        - books.dooplex.hu
      secretName: calibre-web-automated-tls
  # Both hostnames route every path to the same Service port
  rules:
    - host: books.dooplex.hu
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: calibre-web-automated
                port:
                  number: 8083
    - host: books.home
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: calibre-web-automated
                port:
                  number: 8083
---
# Config PVC - stores app.db, logs, processed_books backup
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  # Claimed by the Deployment's "config" volume (mounted at /config)
  name: calibre-web-automated-config
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
    # Longhorn recurring-job labels; presumably these enrol the volume in the
    # "needbackup" job group - verify against the Longhorn RecurringJob setup
    recurring-job-group.longhorn.io/needbackup: enabled
    recurring-job.longhorn.io/source: enabled
spec:
  # Mountable read-write by a single node at a time
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      # Larger than typical - stores backup of processed books by default
      storage: 10Gi
---
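# Optional: Authentik integration for SSO
# Uncomment and configure if using Authentik proxy authentication.
# This Ingress uses the same host, path and TLS secret as the main Ingress
# above, so use it in place of that one (or copy its auth-url and auth-signin
# annotations there) rather than applying both.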
# apiVersion: networking.k8s.io/v1
# kind: Ingress
# metadata:
#   name: calibre-web-automated-auth
#   namespace: calibre-system
#   annotations:
#     cert-manager.io/cluster-issuer: letsencrypt-prod
#     nginx.ingress.kubernetes.io/auth-url: http://authentik-outpost-proxy.authentik-system.svc.cluster.local:9000/outpost.goauthentik.io/auth/nginx
#     nginx.ingress.kubernetes.io/auth-signin: https://auth.dooplex.hu/outpost.goauthentik.io/start?rd=$escaped_request_uri
#     nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
#     nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
# spec:
#   ingressClassName: nginx-internal
#   tls:
#     - hosts:
#         - books.dooplex.hu
#       secretName: calibre-web-automated-tls
#   rules:
#     - host: books.dooplex.hu
#       http:
#         paths:
#           - path: /
#             pathType: Prefix
#             backend:
#               service:
#                 name: calibre-web-automated
#                 port:
#                   number: 8083