596 lines
21 KiB
YAML
596 lines
21 KiB
YAML
---
|
|
# Calibre-Web-Automated - All-in-one eBook library solution
|
|
# Replaces Calibre + Calibre-web with automation features
|
|
# Namespace
|
|
apiVersion: v1
|
|
kind: Namespace
|
|
metadata:
|
|
name: calibre-system
|
|
---
|
|
# Custom Metadata Providers ConfigMap
|
|
# Contains moly.hu provider for Hungarian book metadata
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: calibre-custom-metadata-providers
|
|
namespace: calibre-system
|
|
labels:
|
|
app.kubernetes.io/instance: calibre
|
|
app.kubernetes.io/name: calibre-web-automated
|
|
data:
|
|
moly_hu.py: |
|
|
# -*- coding: utf-8 -*-
|
|
# Calibre-Web Automated - Moly.hu Metadata Provider
|
|
# Based on Calibre plugin by Hokutya <mail@hokutya.com>
|
|
# Adapted for CWA
|
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
import concurrent.futures
|
|
import re
|
|
import requests
|
|
from lxml.html import fromstring
|
|
from typing import List, Optional
|
|
from datetime import datetime
|
|
|
|
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
|
import cps.logger as logger
|
|
|
|
log = logger.create()
|
|
|
|
|
|
class Moly_hu(Metadata):
    """Metadata provider that scrapes book information from moly.hu.

    The provider runs a search on the site, follows the first few result
    links and parses each book page into a ``MetaRecord``.
    """

    # Provider name and identifier; ``__id__`` is also used as the
    # ``MetaSourceInfo`` id of every record this provider produces.
    # NOTE(review): ``__name__`` is presumably the label shown in the UI.
    __name__ = "Moly.hu"
    __id__ = "moly_hu"

    BASE_URL = "https://moly.hu"
    BOOK_URL = BASE_URL + "/konyvek/"
    SEARCH_URL = BASE_URL + "/kereses?utf8=%E2%9C%93&query="

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
        # Do not advertise "br": requests can only decode Brotli when the
        # optional brotli/brotlicffi package is installed. If the server
        # answered with Brotli without it, response.text would be the raw
        # compressed bytes and every XPath lookup would silently find nothing.
        'Accept-Encoding': 'gzip, deflate',
    }

    # Class-level session: created once when the class body runs and shared
    # by all instances and by the worker threads started in search().
    session = requests.Session()
    session.headers.update(headers)
|
|
|
|
def search(
    self, query: str, generic_cover: str = "", locale: str = "hu"
) -> Optional[List[MetaRecord]]:
    """Search moly.hu and return records for the top matches.

    Args:
        query: Free-text search term; it is URL-quoted before being sent.
        generic_cover: Cover substituted for records whose book page
            yielded no cover image. Ignored when empty.
        locale: Not used by this provider; kept for interface compatibility.

    Returns:
        Up to five records in the order moly.hu listed them. An empty list
        is returned when the provider is inactive, nothing matched, or the
        search request itself failed.
    """
    if not self.active:
        return []

    # Records keyed by their position in the search results, so the site's
    # relevance order can be restored after the out-of-order parallel fetch.
    found = {}

    try:
        search_url = self.SEARCH_URL + requests.utils.quote(query)
        log.info(f"Moly.hu searching: {search_url}")

        response = self.session.get(search_url, timeout=15)
        response.raise_for_status()

        root = fromstring(response.text)
        book_links = self._parse_search_results(root, query)

        if not book_links:
            log.info(f"Moly.hu: No results found for '{query}'")
            return []

        # Fetch details for the first five hits, three at a time.
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            futures = {
                executor.submit(self._get_book_details, link, idx): idx
                for idx, link in enumerate(book_links[:5])
            }

            try:
                for future in concurrent.futures.as_completed(futures, timeout=20):
                    try:
                        result = future.result()
                    except Exception as e:
                        log.warning(f"Moly.hu worker error: {e}")
                        continue
                    if result:
                        found[futures[future]] = result
            except concurrent.futures.TimeoutError:
                # Keep what has already arrived instead of discarding it.
                # Workers that are already running still finish (each request
                # has its own 15 s timeout) before the executor exits.
                log.warning("Moly.hu detail fetch timed out, returning partial results")
                for pending in futures:
                    pending.cancel()

    except requests.exceptions.Timeout:
        log.warning("Moly.hu search timed out")
        return []
    except requests.exceptions.HTTPError as e:
        log.error(f"Moly.hu HTTP error: {e}")
        return []
    except Exception as e:
        log.error_or_exception(f"Moly.hu search error: {e}")
        return []

    # Restore relevance order (position in the search results page).
    val = [found[position] for position in sorted(found)]

    if generic_cover:
        for record in val:
            if not getattr(record, "cover", None):
                record.cover = generic_cover

    return val
|
|
|
|
def _parse_search_results(self, root, query: str) -> List[str]:
    """Extract the unique book URLs from a parsed search results page.

    Args:
        root: Parsed search page; must support ``xpath()``.
        query: The search term. Not used here; kept for the existing
            call signature.

    Returns:
        Absolute book URLs in page order, without duplicates.
    """
    from urllib.parse import urljoin

    hrefs = root.xpath('//a[@class="book_selector"]/@href')

    seen = set()
    book_urls = []
    for href in hrefs:
        if not href:
            continue
        # Resolve first, then de-duplicate: comparing the raw href against
        # the list of absolute URLs would never find a duplicate.
        url = urljoin(self.BASE_URL, href)
        if url in seen:
            continue
        seen.add(url)
        book_urls.append(url)

    log.info(f"Moly.hu found {len(book_urls)} results")
    return book_urls
|
|
|
|
def _get_book_details(self, url: str, index: int) -> Optional[MetaRecord]:
    """Download one moly.hu book page and convert it into a MetaRecord.

    Args:
        url: Absolute URL of the book page.
        index: Position of the book in the search results. Not used in
            this method.

    Returns:
        The populated record, or ``None`` when the page has no title or
        any error occurs while fetching or parsing (the error is logged).
    """
    try:
        page = self.session.get(url, timeout=15)
        page.raise_for_status()

        # Remove <em> tags before parsing so emphasised words stay part of
        # the surrounding text node.
        html = page.text.replace('<em>', '').replace('</em>', '')
        root = fromstring(html)

        title = self._parse_title(root)
        authors = self._parse_authors(root)
        if not title:
            return None

        moly_id = self._parse_moly_id(url)
        source = MetaSourceInfo(
            id=self.__id__,
            description="Moly.hu - Magyar könyves közösség",
            link=self.BASE_URL
        )
        record = MetaRecord(
            id=moly_id,
            title=title,
            authors=authors or [""],
            source=source,
            url=url,
            identifiers={"moly_hu": moly_id},
        )

        # Optional fields, filled in the same order as before.
        optional_fields = (
            ("description", self._parse_description),
            ("cover", self._parse_cover),
            ("publisher", self._parse_publisher),
            ("publishedDate", self._parse_published_date),
            ("rating", self._parse_rating),
            ("tags", self._parse_tags),
        )
        for attribute, parser in optional_fields:
            setattr(record, attribute, parser(root))

        series_info = self._parse_series(root)
        if series_info:
            record.series = series_info[0]
            try:
                record.series_index = int(series_info[1])
            except (ValueError, IndexError):
                # Non-numeric or missing index: default to the first volume.
                record.series_index = 1

        isbn = self._parse_isbn(root)
        if isbn:
            record.identifiers["isbn"] = isbn

        return record

    except Exception as e:
        log.warning(f"Moly.hu error fetching {url}: {e}")
        return None
|
|
|
|
def _parse_moly_id(self, url: str) -> Optional[str]:
|
|
"""Extract moly.hu book ID from URL"""
|
|
try:
|
|
m = re.search(r'/konyvek/(.*)', url)
|
|
if m:
|
|
return m.group(1)
|
|
except:
|
|
pass
|
|
return None
|
|
|
|
def _parse_title(self, root) -> Optional[str]:
|
|
"""Parse book title"""
|
|
title_node = root.xpath('//*[@id="content"]//*[@class="fn"]/text()')
|
|
if not title_node:
|
|
title_node = root.xpath('//*[@id="content"]//*[@class="item"]/text()')
|
|
if title_node:
|
|
return title_node[0].strip().replace('\u200b', '')
|
|
return None
|
|
|
|
def _parse_authors(self, root) -> List[str]:
|
|
"""Parse author names"""
|
|
author_nodes = root.xpath('//*[@id="content"]//div[@class="authors"]/a/text()')
|
|
if author_nodes:
|
|
return [str(author).strip() for author in author_nodes]
|
|
return []
|
|
|
|
def _parse_description(self, root) -> Optional[str]:
|
|
"""Parse book description/comments"""
|
|
description_node = root.xpath(
|
|
'//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()'
|
|
)
|
|
if not description_node:
|
|
description_node = root.xpath('//*[@id="content"]//*[@class="text"]/p/text()')
|
|
if not description_node:
|
|
description_node = root.xpath(
|
|
'//*[@id="content"]//*[@class="text shrinkable"]/p/text()'
|
|
)
|
|
|
|
if description_node:
|
|
# Clean up description
|
|
desc = '\n'.join(description_node)
|
|
desc = desc.replace('\n\n', '\n').replace('\n \n', '\n')
|
|
desc = desc.replace('Vigyázat! Cselekményleírást tartalmaz.\n', '')
|
|
return desc.strip()
|
|
return None
|
|
|
|
def _parse_cover(self, root) -> Optional[str]:
|
|
"""Parse cover image URL"""
|
|
cover_nodes = root.xpath('(//*[@class="coverbox"]//a/@href)[1]')
|
|
if cover_nodes:
|
|
cover_url = cover_nodes[0]
|
|
if not cover_url.startswith('http'):
|
|
cover_url = self.BASE_URL + cover_url
|
|
return cover_url
|
|
|
|
# Fallback: try img src directly
|
|
img_nodes = root.xpath('//*[@class="coverbox"]//img/@src')
|
|
if img_nodes:
|
|
img_url = img_nodes[0]
|
|
if not img_url.startswith('http'):
|
|
img_url = self.BASE_URL + img_url
|
|
return img_url
|
|
return None
|
|
|
|
def _parse_publisher(self, root) -> Optional[str]:
|
|
"""Parse publisher name"""
|
|
publisher_node_1 = root.xpath(
|
|
'//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()'
|
|
)
|
|
if publisher_node_1 and publisher_node_1[0] == '+':
|
|
publisher_node = root.xpath(
|
|
'//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()'
|
|
)
|
|
else:
|
|
publisher_node = publisher_node_1
|
|
|
|
if publisher_node:
|
|
return publisher_node[0].strip()
|
|
return None
|
|
|
|
def _parse_published_date(self, root) -> Optional[str]:
|
|
"""Parse publication date (year)"""
|
|
publication_node_1 = root.xpath(
|
|
'//*[@id="content"]//*[@class="items"]/div/div[1]/text()'
|
|
)
|
|
if not publication_node_1:
|
|
publication_node = root.xpath(
|
|
'//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
|
|
)
|
|
else:
|
|
publication_node = publication_node_1
|
|
|
|
for value in publication_node:
|
|
m = re.search(r'(\d{4})', value)
|
|
if m:
|
|
return m.group(1)
|
|
return None
|
|
|
|
def _parse_rating(self, root) -> int:
|
|
"""Parse rating (converted to 0-5 scale)"""
|
|
rating_node = root.xpath(
|
|
'//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()'
|
|
)
|
|
if rating_node:
|
|
try:
|
|
# Moly.hu uses percentage, convert to 0-5 scale
|
|
percentage = float(rating_node[0].strip('%').strip())
|
|
return round(percentage * 0.05)
|
|
except (ValueError, IndexError):
|
|
pass
|
|
return 0
|
|
|
|
def _parse_tags(self, root) -> List[str]:
|
|
"""Parse tags/genres"""
|
|
# Genre tags (in brackets)
|
|
tags_genre = root.xpath('//*[@id="book_tags"]//*[@class="tag genre"]/text()')
|
|
tags_genre = [f"[{str(t).strip()}]" for t in tags_genre if str(t).strip()]
|
|
|
|
# Regular tags
|
|
tags_regular = root.xpath('//*[@id="book_tags"]//*[@class="tag"]/text()')
|
|
tags_regular = [str(t).strip() for t in tags_regular if str(t).strip()]
|
|
|
|
return tags_genre + tags_regular
|
|
|
|
def _parse_series(self, root) -> Optional[List[str]]:
|
|
"""Parse series name and index"""
|
|
series_node = root.xpath('//*[@id="content"]//*[@class="action"]/text()')
|
|
|
|
if not series_node:
|
|
return None
|
|
|
|
series_text = series_node[0].strip('().')
|
|
parts = series_text.rsplit(' ', 1)
|
|
|
|
# Check if it's actually edition info, not series
|
|
if len(parts) > 1 and parts[1] == 'kiadás':
|
|
return None
|
|
|
|
if len(parts) == 2:
|
|
return [parts[0], parts[1]]
|
|
elif len(parts) == 1:
|
|
return [parts[0], "1"]
|
|
|
|
return None
|
|
|
|
def _parse_isbn(self, root) -> Optional[str]:
|
|
"""Parse ISBN"""
|
|
# Try first location
|
|
isbn_nodes = root.xpath(
|
|
'//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
|
|
)
|
|
for value in isbn_nodes:
|
|
m = re.search(r'(\d{13}|\d{10})', value)
|
|
if m:
|
|
return m.group(1)
|
|
|
|
# Try second location
|
|
isbn_nodes = root.xpath(
|
|
'//*[@id="content"]//*[@class="items"]/div/div[3]/text()'
|
|
)
|
|
for value in isbn_nodes:
|
|
m = re.search(r'(\d{13}|\d{10})', value)
|
|
if m:
|
|
return m.group(1)
|
|
|
|
return None
|
|
---
|
|
# Calibre-Web-Automated Deployment
|
|
apiVersion: apps/v1
|
|
kind: Deployment
|
|
metadata:
|
|
name: calibre-web-automated
|
|
namespace: calibre-system
|
|
labels:
|
|
app.kubernetes.io/instance: calibre
|
|
app.kubernetes.io/name: calibre-web-automated
|
|
spec:
|
|
replicas: 1
|
|
strategy:
|
|
type: Recreate
|
|
selector:
|
|
matchLabels:
|
|
app.kubernetes.io/instance: calibre
|
|
app.kubernetes.io/name: calibre-web-automated
|
|
template:
|
|
metadata:
|
|
labels:
|
|
app.kubernetes.io/instance: calibre
|
|
app.kubernetes.io/name: calibre-web-automated
|
|
annotations:
|
|
# Version checker pattern - CWA uses semantic versioning
|
|
match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$'
|
|
# Stakater Reloader: roll the Deployment automatically whenever the named ConfigMap changes (no manual hash to update)
|
|
configmap.reloader.stakater.com/reload: "calibre-custom-metadata-providers"
|
|
spec:
|
|
containers:
|
|
- name: calibre-web-automated
|
|
image: crocodilestick/calibre-web-automated:latest
|
|
imagePullPolicy: IfNotPresent
|
|
env:
|
|
- name: PUID
|
|
value: "1000"
|
|
- name: PGID
|
|
value: "1000"
|
|
- name: TZ
|
|
value: Europe/Budapest
|
|
# Use default port 8083
|
|
- name: CWA_PORT_OVERRIDE
|
|
value: "8083"
|
|
# Network share mode: set to "true" when the library is on a network share such as NFS (this disables WAL mode)
|
|
- name: NETWORK_SHARE_MODE
|
|
value: "false"
|
|
# Number of proxies in chain (Cloudflare -> nginx-ingress -> app)
|
|
- name: TRUSTED_PROXY_COUNT
|
|
value: "2"
|
|
ports:
|
|
- name: http
|
|
containerPort: 8083
|
|
protocol: TCP
|
|
resources:
|
|
requests:
|
|
cpu: 100m
|
|
memory: 512Mi
|
|
limits:
|
|
cpu: "2"
|
|
memory: 2Gi
|
|
livenessProbe:
|
|
httpGet:
|
|
path: /
|
|
port: http
|
|
initialDelaySeconds: 120
|
|
periodSeconds: 60
|
|
timeoutSeconds: 10
|
|
failureThreshold: 5
|
|
readinessProbe:
|
|
httpGet:
|
|
path: /
|
|
port: http
|
|
initialDelaySeconds: 60
|
|
periodSeconds: 10
|
|
timeoutSeconds: 5
|
|
failureThreshold: 3
|
|
startupProbe:
|
|
httpGet:
|
|
path: /
|
|
port: http
|
|
periodSeconds: 10
|
|
timeoutSeconds: 5
|
|
# CWA can take time to initialize, especially first run
|
|
failureThreshold: 60
|
|
volumeMounts:
|
|
# Config directory for app database, logs, processed books backup
|
|
- name: config
|
|
mountPath: /config
|
|
# Book ingest folder - files here are DELETED after processing
|
|
- name: ingest
|
|
mountPath: /cwa-book-ingest
|
|
# Calibre library - your existing library location
|
|
- name: library
|
|
mountPath: /calibre-library
|
|
# Custom metadata providers (moly.hu)
|
|
- name: custom-metadata-providers
|
|
mountPath: /app/calibre-web-automated/cps/metadata_provider/moly_hu.py
|
|
subPath: moly_hu.py
|
|
readOnly: true
|
|
volumes:
|
|
- name: config
|
|
persistentVolumeClaim:
|
|
claimName: calibre-web-automated-config
|
|
# Ingest folder on hostPath for easy file dropping
|
|
- name: ingest
|
|
hostPath:
|
|
path: /mnt/4_hdd/data/calibre-ingest
|
|
type: DirectoryOrCreate
|
|
# Your existing Calibre library location
|
|
- name: library
|
|
hostPath:
|
|
path: /mnt/4_hdd/data/calibre
|
|
type: DirectoryOrCreate
|
|
# Custom metadata providers from ConfigMap
|
|
- name: custom-metadata-providers
|
|
configMap:
|
|
name: calibre-custom-metadata-providers
|
|
---
|
|
# Calibre-Web-Automated Service
|
|
apiVersion: v1
|
|
kind: Service
|
|
metadata:
|
|
name: calibre-web-automated
|
|
namespace: calibre-system
|
|
labels:
|
|
app.kubernetes.io/instance: calibre
|
|
app.kubernetes.io/name: calibre-web-automated
|
|
spec:
|
|
type: ClusterIP
|
|
ports:
|
|
- name: http
|
|
port: 8083
|
|
targetPort: http
|
|
protocol: TCP
|
|
selector:
|
|
app.kubernetes.io/instance: calibre
|
|
app.kubernetes.io/name: calibre-web-automated
|
|
---
|
|
# Main Ingress (books.dooplex.hu and books.home - primary reading interface)
|
|
apiVersion: networking.k8s.io/v1
|
|
kind: Ingress
|
|
metadata:
|
|
name: calibre-web-automated
|
|
namespace: calibre-system
|
|
labels:
|
|
app.kubernetes.io/instance: calibre
|
|
app.kubernetes.io/name: calibre-web-automated
|
|
annotations:
|
|
cert-manager.io/cluster-issuer: letsencrypt-prod
|
|
external-dns.alpha.kubernetes.io/hostname: books.dooplex.hu,books.home
|
|
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
|
nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
|
|
nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
|
|
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
|
# Forward auth headers for Authentik integration (ignored by ingress-nginx until an auth-url annotation is also set)
|
|
nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
|
|
nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
|
|
nginx.ingress.kubernetes.io/configuration-snippet: |
|
|
set $geo_allowed 0;
|
|
if ($remote_addr ~ "^192\.168\.") { set $geo_allowed 1; }
|
|
if ($remote_addr ~ "^10\.") { set $geo_allowed 1; }
|
|
if ($geoip2_country_code = "HU") { set $geo_allowed 1; }
|
|
if ($geo_allowed = 0) {
|
|
return 403 "Access restricted to Hungary";
|
|
}
|
|
spec:
|
|
ingressClassName: nginx-internal
|
|
tls:
|
|
- hosts:
|
|
- books.dooplex.hu
|
|
secretName: calibre-web-automated-tls
|
|
rules:
|
|
- host: books.dooplex.hu
|
|
http:
|
|
paths:
|
|
- path: /
|
|
pathType: Prefix
|
|
backend:
|
|
service:
|
|
name: calibre-web-automated
|
|
port:
|
|
number: 8083
|
|
- host: books.home
|
|
http:
|
|
paths:
|
|
- path: /
|
|
pathType: Prefix
|
|
backend:
|
|
service:
|
|
name: calibre-web-automated
|
|
port:
|
|
number: 8083
|
|
---
|
|
# Config PVC - stores app.db, logs, processed_books backup
|
|
apiVersion: v1
|
|
kind: PersistentVolumeClaim
|
|
metadata:
|
|
name: calibre-web-automated-config
|
|
namespace: calibre-system
|
|
labels:
|
|
app.kubernetes.io/instance: calibre
|
|
app.kubernetes.io/name: calibre-web-automated
|
|
recurring-job-group.longhorn.io/needbackup: enabled
|
|
recurring-job.longhorn.io/source: enabled
|
|
spec:
|
|
accessModes:
|
|
- ReadWriteOnce
|
|
storageClassName: longhorn
|
|
resources:
|
|
requests:
|
|
# Larger than typical - stores backup of processed books by default
|
|
storage: 10Gi
|
|
---
|
|
# Optional: Authentik integration for SSO
|
|
# Uncomment and configure if using Authentik proxy authentication
|
|
# apiVersion: networking.k8s.io/v1
|
|
# kind: Ingress
|
|
# metadata:
|
|
# name: calibre-web-automated-auth
|
|
# namespace: calibre-system
|
|
# annotations:
|
|
# cert-manager.io/cluster-issuer: letsencrypt-prod
|
|
# nginx.ingress.kubernetes.io/auth-url: http://authentik-outpost-proxy.authentik-system.svc.cluster.local:9000/outpost.goauthentik.io/auth/nginx
|
|
# nginx.ingress.kubernetes.io/auth-signin: https://auth.dooplex.hu/outpost.goauthentik.io/start?rd=$escaped_request_uri
|
|
# nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
|
|
# nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
|
|
# spec:
|
|
# ingressClassName: nginx-internal
|
|
# tls:
|
|
# - hosts:
|
|
# - books.dooplex.hu
|
|
# secretName: calibre-web-automated-tls
|
|
# rules:
|
|
# - host: books.dooplex.hu
|
|
# http:
|
|
# paths:
|
|
# - path: /
|
|
# pathType: Prefix
|
|
# backend:
|
|
# service:
|
|
# name: calibre-web-automated
|
|
# port:
|
|
# number: 8083 |