---
# Calibre-Web-Automated - All-in-one eBook library solution
# Replaces Calibre + Calibre-web with automation features

# Namespace
apiVersion: v1
kind: Namespace
metadata:
  name: calibre-system
---
# Custom Metadata Providers ConfigMap
# Contains moly.hu provider for Hungarian book metadata
apiVersion: v1
kind: ConfigMap
metadata:
  name: calibre-custom-metadata-providers
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
data:
  moly_hu.py: |
    # -*- coding: utf-8 -*-
    # Calibre-Web Automated - Moly.hu Metadata Provider
    # Based on Calibre plugin by Hokutya
    # Adapted for CWA
    # SPDX-License-Identifier: GPL-3.0-or-later

    import concurrent.futures
    import re
    import requests
    from lxml.html import fromstring
    from typing import List, Optional
    from datetime import datetime

    from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
    import cps.logger as logger

    log = logger.create()


    class Moly_hu(Metadata):
        """Metadata provider that scrapes book details from moly.hu."""

        __name__ = "Moly.hu"
        __id__ = "moly_hu"

        BASE_URL = "https://moly.hu"
        BOOK_URL = BASE_URL + "/konyvek/"
        SEARCH_URL = BASE_URL + "/kereses?utf8=%E2%9C%93&query="

        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
            # Do not advertise "br": brotli responses can only be decoded when
            # an optional brotli package is installed; gzip/deflate always work.
            'Accept-Encoding': 'gzip, deflate',
        }
        # One session shared by the search request and the detail workers.
        session = requests.Session()
        session.headers.update(headers)

        def search(
            self, query: str, generic_cover: str = "", locale: str = "hu"
        ) -> Optional[List[MetaRecord]]:
            """Search moly.hu for books matching the query.

            Args:
                query: Free-text search string; it is URL-quoted before use.
                generic_cover: Unused here; part of the provider signature.
                locale: Unused here; part of the provider signature.

            Returns:
                Up to five records in the order moly.hu listed them. An empty
                list when the provider is inactive, nothing matched, or the
                search request itself failed.
            """
            if not self.active:
                return []

            # (position in the search results, record) pairs
            ranked = []
            try:
                # Search for books
                search_url = self.SEARCH_URL + requests.utils.quote(query)
                log.info(f"Moly.hu searching: {search_url}")

                response = self.session.get(search_url, timeout=15)
                response.raise_for_status()

                # Parse search results
                root = fromstring(response.text)
                book_links = self._parse_search_results(root, query)

                if not book_links:
                    log.info(f"Moly.hu: No results found for '{query}'")
                    return []

                # Fetch details for each book (max 5)
                with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                    futures = {
                        executor.submit(self._get_book_details, link, idx): idx
                        for idx, link in enumerate(book_links[:5])
                    }
                    try:
                        for future in concurrent.futures.as_completed(futures, timeout=20):
                            try:
                                result = future.result()
                                if result:
                                    ranked.append((futures[future], result))
                            except Exception as e:
                                log.warning(f"Moly.hu worker error: {e}")
                    except concurrent.futures.TimeoutError:
                        # Keep the records that did arrive instead of discarding
                        # them; drop the lookups that have not started yet.
                        for pending in futures:
                            pending.cancel()
                        log.warning("Moly.hu detail lookup timed out, returning partial results")

            except requests.exceptions.Timeout:
                log.warning("Moly.hu search timed out")
                return []
            except requests.exceptions.HTTPError as e:
                log.error(f"Moly.hu HTTP error: {e}")
                return []
            except Exception as e:
                log.error_or_exception(f"Moly.hu search error: {e}")
                return []

            # Workers finish in arbitrary order; restore the search-result order.
            ranked.sort(key=lambda pair: pair[0])
            return [record for _, record in ranked]

        def _parse_search_results(self, root, query: str) -> List[str]:
            """Extract unique absolute book URLs from a search results page.

            Args:
                root: Parsed lxml tree of the search page.
                query: The search string (not used by the parsing itself).

            Returns:
                Absolute URLs in page order, without duplicates.
            """
            results = root.xpath('//a[@class="book_selector"]/@href')
            book_urls = []
            for href in results:
                if not href:
                    continue
                # Compare the absolute URL: the list stores absolute URLs, so
                # checking the relative href would never detect a duplicate.
                url = self.BASE_URL + href
                if url not in book_urls:
                    book_urls.append(url)

            log.info(f"Moly.hu found {len(book_urls)} results")
            return book_urls

        def _get_book_details(self, url: str, index: int) -> Optional[MetaRecord]:
            """Fetch and parse book details from a moly.hu book page.

            Args:
                url: Absolute URL of the book page.
                index: Position of the book in the search results. Not used
                    inside this method; the caller tracks ordering itself.

            Returns:
                A populated MetaRecord, or None when the page has no title or
                any error occurs while fetching/parsing.
            """
            try:
                response = self.session.get(url, timeout=15)
                response.raise_for_status()

                # Clean up HTML
                raw = response.text
                # NOTE(review): as written both replace() arguments are empty
                # strings, which leaves the text unchanged - confirm whether
                # invisible characters were meant to be stripped here.
                raw = raw.replace('', '').replace('', '')
                root = fromstring(raw)

                # Parse all fields
                title = self._parse_title(root)
                authors = self._parse_authors(root)

                if not title:
                    return None

                moly_id = self._parse_moly_id(url)

                match = MetaRecord(
                    id=moly_id,
                    title=title,
                    authors=authors if authors else [""],
                    source=MetaSourceInfo(
                        id=self.__id__,
                        description="Moly.hu - Magyar könyves közösség",
                        link=self.BASE_URL
                    ),
                    url=url,
                    identifiers={"moly_hu": moly_id},
                )

                # Optional fields
                match.description = self._parse_description(root)
                match.cover = self._parse_cover(root)
                match.publisher = self._parse_publisher(root)
                match.publishedDate = self._parse_published_date(root)
                match.rating = self._parse_rating(root)
                match.tags = self._parse_tags(root)

                # Series info
                series_info = self._parse_series(root)
                if series_info:
                    match.series = series_info[0]
                    try:
                        match.series_index = int(series_info[1])
                    except (ValueError, IndexError):
                        # Non-numeric or missing index: fall back to 1
                        match.series_index = 1

                # ISBN
                isbn = self._parse_isbn(root)
                if isbn:
                    match.identifiers["isbn"] = isbn

                return match

            except Exception as e:
                log.warning(f"Moly.hu error fetching {url}: {e}")
                return None

        def _parse_moly_id(self, url: str) -> Optional[str]:
            """Return everything after '/konyvek/' in the URL, or None."""
            m = re.search(r'/konyvek/(.*)', url)
            return m.group(1) if m else None

        def _parse_title(self, root) -> Optional[str]:
            """Parse the book title, with zero-width spaces removed."""
            title_node = root.xpath('//*[@id="content"]//*[@class="fn"]/text()')
            if not title_node:
                title_node = root.xpath('//*[@id="content"]//*[@class="item"]/text()')

            if title_node:
                return title_node[0].strip().replace('\u200b', '')
            return None

        def _parse_authors(self, root) -> List[str]:
            """Parse author names; returns an empty list when none are found."""
            author_nodes = root.xpath('//*[@id="content"]//div[@class="authors"]/a/text()')
            if author_nodes:
                return [str(author).strip() for author in author_nodes]
            return []

        def _parse_description(self, root) -> Optional[str]:
            """Parse the book description, trying three page layouts in turn."""
            description_node = root.xpath(
                '//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()'
            )
            if not description_node:
                description_node = root.xpath('//*[@id="content"]//*[@class="text"]/p/text()')
            if not description_node:
                description_node = root.xpath(
                    '//*[@id="content"]//*[@class="text shrinkable"]/p/text()'
                )

            if description_node:
                # Clean up description
                desc = '\n'.join(description_node)
                desc = desc.replace('\n\n', '\n').replace('\n \n', '\n')
                # Remove the site's spoiler-warning line
                desc = desc.replace('Vigyázat! Cselekményleírást tartalmaz.\n', '')
                return desc.strip()
            return None

        def _parse_cover(self, root) -> Optional[str]:
            """Parse the cover image URL, made absolute when it is relative."""
            cover_nodes = root.xpath('(//*[@class="coverbox"]//a/@href)[1]')
            if cover_nodes:
                cover_url = cover_nodes[0]
                if not cover_url.startswith('http'):
                    cover_url = self.BASE_URL + cover_url
                return cover_url

            # Fallback: try img src directly
            img_nodes = root.xpath('//*[@class="coverbox"]//img/@src')
            if img_nodes:
                img_url = img_nodes[0]
                if not img_url.startswith('http'):
                    img_url = self.BASE_URL + img_url
                return img_url
            return None

        def _parse_publisher(self, root) -> Optional[str]:
            """Parse the publisher name."""
            publisher_node_1 = root.xpath(
                '//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()'
            )
            # When the first link text is just '+', read the second position
            if publisher_node_1 and publisher_node_1[0] == '+':
                publisher_node = root.xpath(
                    '//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()'
                )
            else:
                publisher_node = publisher_node_1

            if publisher_node:
                return publisher_node[0].strip()
            return None

        def _parse_published_date(self, root) -> Optional[str]:
            """Parse the publication year (first four-digit run found)."""
            publication_node_1 = root.xpath(
                '//*[@id="content"]//*[@class="items"]/div/div[1]/text()'
            )
            if not publication_node_1:
                publication_node = root.xpath(
                    '//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
                )
            else:
                publication_node = publication_node_1

            for value in publication_node:
                m = re.search(r'(\d{4})', value)
                if m:
                    return m.group(1)
            return None

        def _parse_rating(self, root) -> int:
            """Parse the rating percentage and convert it to a 0-5 scale."""
            rating_node = root.xpath(
                '//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()'
            )
            if rating_node:
                try:
                    # Moly.hu uses percentage, convert to 0-5 scale.
                    # Trim whitespace first so a trailing '%' is removable
                    # even when the text node is padded.
                    percentage = float(rating_node[0].strip().strip('%').strip())
                    return round(percentage * 0.05)
                except (ValueError, IndexError):
                    pass
            return 0

        def _parse_tags(self, root) -> List[str]:
            """Parse tags; genre tags come first and are wrapped in brackets."""
            # Genre tags (in brackets)
            tags_genre = root.xpath('//*[@id="book_tags"]//*[@class="tag genre"]/text()')
            tags_genre = [f"[{str(t).strip()}]" for t in tags_genre if str(t).strip()]

            # Regular tags
            tags_regular = root.xpath('//*[@id="book_tags"]//*[@class="tag"]/text()')
            tags_regular = [str(t).strip() for t in tags_regular if str(t).strip()]

            return tags_genre + tags_regular

        def _parse_series(self, root) -> Optional[List[str]]:
            """Parse series info as [name, index-string], or None."""
            series_node = root.xpath('//*[@id="content"]//*[@class="action"]/text()')
            if not series_node:
                return None

            series_text = series_node[0].strip('().')
            parts = series_text.rsplit(' ', 1)

            # Check if it's actually edition info, not series
            if len(parts) > 1 and parts[1] == 'kiadás':
                return None

            if len(parts) == 2:
                return [parts[0], parts[1]]
            elif len(parts) == 1:
                return [parts[0], "1"]
            return None

        def _parse_isbn(self, root) -> Optional[str]:
            """Parse the ISBN (first 13- or 10-digit run), or None."""
            # The ISBN shows up in the second or, failing that, third position
            for position in (2, 3):
                isbn_nodes = root.xpath(
                    f'//*[@id="content"]//*[@class="items"]/div/div[{position}]/text()'
                )
                for value in isbn_nodes:
                    m = re.search(r'(\d{13}|\d{10})', value)
                    if m:
                        return m.group(1)
            return None
---
# Calibre-Web-Automated Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: calibre-web-automated
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/instance: calibre
      app.kubernetes.io/name: calibre-web-automated
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: calibre
        app.kubernetes.io/name: calibre-web-automated
      annotations:
        # Version checker pattern - CWA uses semantic versioning
        match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$'
        # Stakater Reloader: roll the pod when this ConfigMap changes.
        # Needed because the provider is mounted via subPath, and subPath
        # mounts do not receive ConfigMap updates in a running pod.
        configmap.reloader.stakater.com/reload: "calibre-custom-metadata-providers"
    spec:
      containers:
        - name: calibre-web-automated
          # NOTE(review): `latest` combined with IfNotPresent means a node keeps
          # using whatever image it already has cached - consider pinning a
          # release tag so rollouts are reproducible.
          image: crocodilestick/calibre-web-automated:latest
          imagePullPolicy: IfNotPresent
          env:
            - name: PUID
              value: "1000"
            - name: PGID
              value: "1000"
            - name: TZ
              value: Europe/Budapest
            # Use default port 8083
            - name: CWA_PORT_OVERRIDE
              value: "8083"
            # Disable WAL mode if on network share (set to true if using NFS)
            - name: NETWORK_SHARE_MODE
              value: "false"
            # Number of proxies in chain (Cloudflare -> nginx-ingress -> app)
            - name: TRUSTED_PROXY_COUNT
              value: "2"
          ports:
            - name: http
              containerPort: 8083
              protocol: TCP
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
            limits:
              cpu: "2"
              memory: 2Gi
          livenessProbe:
            httpGet:
              path: /
              port: http
            initialDelaySeconds: 120
            periodSeconds: 60
            timeoutSeconds: 10
            failureThreshold: 5
          readinessProbe:
            httpGet:
              path: /
              port: http
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          startupProbe:
            httpGet:
              path: /
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            # CWA can take time to initialize, especially first run
            failureThreshold: 60
          volumeMounts:
            # Config directory for app database, logs, processed books backup
            - name: config
              mountPath: /config
            # Book ingest folder - files here are DELETED after processing
            - name: ingest
              mountPath: /cwa-book-ingest
            # Calibre library - your existing library location
            - name: library
              mountPath: /calibre-library
            # Custom metadata providers (moly.hu)
            - name: custom-metadata-providers
              mountPath: /app/calibre-web-automated/cps/metadata_provider/moly_hu.py
              subPath: moly_hu.py
              readOnly: true
      volumes:
        - name: config
          persistentVolumeClaim:
            claimName: calibre-web-automated-config
        # Ingest folder on hostPath for easy file dropping
        - name: ingest
          hostPath:
            path: /mnt/4_hdd/data/calibre-ingest
            type: DirectoryOrCreate
        # Your existing Calibre library location
        - name: library
          hostPath:
            path: /mnt/4_hdd/data/calibre
            type: DirectoryOrCreate
        # Custom metadata providers from ConfigMap
        - name: custom-metadata-providers
          configMap:
            name: calibre-custom-metadata-providers
---
# Calibre-Web-Automated Service
apiVersion: v1
kind: Service
metadata:
  name: calibre-web-automated
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 8083
      targetPort: http
      protocol: TCP
  selector:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
---
# Main Ingress (books.dooplex.hu - primary reading interface)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: calibre-web-automated
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    external-dns.alpha.kubernetes.io/hostname: books.dooplex.hu,books.home
    nginx.ingress.kubernetes.io/proxy-body-size: "0"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    # Forward auth headers for Authentik integration
    # NOTE(review): no auth-url annotation is set on this Ingress, so these two
    # presumably stay inert until one is added - confirm this is intended.
    nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
    nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
    # Allow 192.168.x.x, 10.x.x.x and clients geolocated to HU; reject the rest
    nginx.ingress.kubernetes.io/configuration-snippet: |
      set $geo_allowed 0;
      if ($remote_addr ~ "^192\.168\.") {
        set $geo_allowed 1;
      }
      if ($remote_addr ~ "^10\.") {
        set $geo_allowed 1;
      }
      if ($geoip2_country_code = "HU") {
        set $geo_allowed 1;
      }
      if ($geo_allowed = 0) {
        return 403 "Access restricted to Hungary";
      }
spec:
  ingressClassName: nginx-internal
  tls:
    - hosts:
        - books.dooplex.hu
      secretName: calibre-web-automated-tls
  rules:
    - host: books.dooplex.hu
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: calibre-web-automated
                port:
                  number: 8083
    - host: books.home
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: calibre-web-automated
                port:
                  number: 8083
---
# Config PVC - stores app.db, logs, processed_books backup
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: calibre-web-automated-config
  namespace: calibre-system
  labels:
    app.kubernetes.io/instance: calibre
    app.kubernetes.io/name: calibre-web-automated
    recurring-job-group.longhorn.io/needbackup: enabled
    recurring-job.longhorn.io/source: enabled
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      # Larger than typical - stores backup of processed books by default
      storage: 10Gi
---
# Optional: Authentik integration for SSO
# Uncomment and configure if using Authentik proxy authentication
# apiVersion: networking.k8s.io/v1
# kind: Ingress
# metadata:
#   name: calibre-web-automated-auth
#   namespace: calibre-system
#   annotations:
#     cert-manager.io/cluster-issuer: letsencrypt-prod
#     nginx.ingress.kubernetes.io/auth-url: http://authentik-outpost-proxy.authentik-system.svc.cluster.local:9000/outpost.goauthentik.io/auth/nginx
#     nginx.ingress.kubernetes.io/auth-signin: https://auth.dooplex.hu/outpost.goauthentik.io/start?rd=$escaped_request_uri
#     nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
#     nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
# spec:
#   ingressClassName: nginx-internal
#   tls:
#     - hosts:
#         - books.dooplex.hu
#       secretName: calibre-web-automated-tls
#   rules:
#     - host: books.dooplex.hu
#       http:
#         paths:
#           - path: /
#             pathType: Prefix
#             backend:
#               service:
#                 name: calibre-web-automated
#                 port:
#                   number: 8083