added moly provider as configmap

This commit is contained in:
2026-01-25 20:28:27 +01:00
parent 54046d2bdf
commit af97c95ef1
+353
View File
@@ -7,6 +7,348 @@ kind: Namespace
metadata: metadata:
name: calibre-system name: calibre-system
--- ---
# Custom Metadata Providers ConfigMap
# Contains moly.hu provider for Hungarian book metadata
apiVersion: v1
kind: ConfigMap
metadata:
name: calibre-custom-metadata-providers
namespace: calibre-system
labels:
app.kubernetes.io/instance: calibre
app.kubernetes.io/name: calibre-web-automated
data:
moly_hu.py: |
# -*- coding: utf-8 -*-
# Calibre-Web Automated - Moly.hu Metadata Provider
# Based on Calibre plugin by Hokutya <mail@hokutya.com>
# Adapted for CWA
# SPDX-License-Identifier: GPL-3.0-or-later
import concurrent.futures
import re
import requests
from lxml.html import fromstring
from typing import List, Optional
from datetime import datetime
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
import cps.logger as logger
log = logger.create()
class Moly_hu(Metadata):
    """Calibre-Web Automated metadata provider that scrapes moly.hu.

    The class-level attributes below hold the provider identity, the URLs
    used for searching, and one HTTP session that every request goes through.
    """

    # Display name and identifier of this provider.
    __name__ = "Moly.hu"
    __id__ = "moly_hu"
    BASE_URL = "https://moly.hu"
    BOOK_URL = BASE_URL + "/konyvek/"
    # The search term is URL-quoted and appended to this prefix.
    SEARCH_URL = BASE_URL + "/kereses?utf8=%E2%9C%93&query="
    # Browser-like headers sent with every request.
    # "Accept-Encoding" is deliberately NOT set: requests/urllib3 already
    # advertise exactly the encodings they are able to decode. Hard-coding
    # "br" invites Brotli-compressed responses, which are only decoded when
    # the optional brotli package is installed; without it response.text
    # would be compressed garbage and every XPath lookup would come up empty.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
    }
    # Created once when the class body runs and shared by all requests.
    session = requests.Session()
    session.headers.update(headers)
def search(
    self, query: str, generic_cover: str = "", locale: str = "hu"
) -> Optional[List[MetaRecord]]:
    """Search moly.hu for books matching ``query``.

    Fetches the search results page, then loads the detail pages of at most
    the first five hits in parallel.

    Args:
        query: Free-text search string; it is URL-quoted before being
            appended to ``SEARCH_URL``.
        generic_cover: Not used by this provider.
        locale: Not used by this provider.

    Returns:
        The parsed records in the order the books appeared on the search
        page. An empty list is returned when the provider is inactive, when
        nothing was found, or when the search request itself fails. If only
        the detail phase runs out of time, the records collected so far are
        still returned.
    """
    if not self.active:
        return []
    # Maps "position on the search page" -> record. Workers finish in
    # arbitrary order, so the position is what restores relevance order.
    found = {}
    try:
        # Search for books
        search_url = self.SEARCH_URL + requests.utils.quote(query)
        log.info(f"Moly.hu searching: {search_url}")
        response = self.session.get(search_url, timeout=15)
        response.raise_for_status()
        # Parse search results
        root = fromstring(response.text)
        book_links = self._parse_search_results(root, query)
        if not book_links:
            log.info(f"Moly.hu: No results found for '{query}'")
            return []
        # Fetch details for each book (max 5)
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            futures = {
                executor.submit(self._get_book_details, link, idx): idx
                for idx, link in enumerate(book_links[:5])
            }
            try:
                for future in concurrent.futures.as_completed(futures, timeout=20):
                    try:
                        result = future.result()
                    except Exception as e:
                        log.warning(f"Moly.hu worker error: {e}")
                        continue
                    if result:
                        found[futures[future]] = result
            except concurrent.futures.TimeoutError:
                # Keep what has been fetched instead of discarding it.
                # cancel() only affects jobs that have not started yet;
                # leaving the "with" block still waits for running ones.
                for pending in futures:
                    pending.cancel()
                log.warning(
                    "Moly.hu: timed out waiting for book details, "
                    "returning partial results"
                )
    except requests.exceptions.Timeout:
        log.warning("Moly.hu search timed out")
        return []
    except requests.exceptions.HTTPError as e:
        log.error(f"Moly.hu HTTP error: {e}")
        return []
    except Exception as e:
        log.error_or_exception(f"Moly.hu search error: {e}")
        return []
    # Sort by relevance (order from search results)
    return [found[idx] for idx in sorted(found)]
def _parse_search_results(self, root, query: str) -> List[str]:
    """Extract absolute book URLs from a parsed search results page.

    Args:
        root: Parsed HTML tree of the search page (must support ``xpath``).
        query: The search string; not used by the extraction itself.

    Returns:
        Absolute URLs in page order, each one at most once.
    """
    hrefs = root.xpath('//a[@class="book_selector"]/@href')
    seen = set()
    book_urls = []
    for href in hrefs:
        if not href:
            continue
        # Make the link absolute first, so that the duplicate check compares
        # like with like and an already-absolute href is not prefixed twice.
        url = href if href.startswith('http') else self.BASE_URL + href
        if url in seen:
            continue
        seen.add(url)
        book_urls.append(url)
    log.info(f"Moly.hu found {len(book_urls)} results")
    return book_urls
def _get_book_details(self, url: str, index: int) -> Optional[MetaRecord]:
    """Download one moly.hu book page and turn it into a ``MetaRecord``.

    Args:
        url: Absolute URL of the book page.
        index: Position of the book in the search results; the body does
            not read it.

    Returns:
        The populated record, or ``None`` when the page has no title or when
        anything goes wrong while fetching or parsing (the error is logged
        as a warning).
    """
    try:
        page = self.session.get(url, timeout=15)
        page.raise_for_status()
        # Remove <em> tags from the markup before parsing.
        html = page.text.replace('<em>', '').replace('</em>', '')
        root = fromstring(html)

        # Mandatory fields: a page without a title is not usable.
        title = self._parse_title(root)
        authors = self._parse_authors(root)
        if not title:
            return None

        record = MetaRecord(
            title=title,
            authors=authors or [""],
            source=MetaSourceInfo(
                id=self.__id__,
                description="Moly.hu - Magyar könyves közösség",
                link=self.BASE_URL
            ),
            url=url,
            identifiers={"moly_hu": self._parse_moly_id(url)},
        )

        # Optional fields: each attribute is filled by its own parser.
        optional_fields = (
            ("description", self._parse_description),
            ("cover", self._parse_cover),
            ("publisher", self._parse_publisher),
            ("publishedDate", self._parse_published_date),
            ("rating", self._parse_rating),
            ("tags", self._parse_tags),
        )
        for attribute, parser in optional_fields:
            setattr(record, attribute, parser(root))

        # Series name and index; a non-integer index falls back to 1.
        series_info = self._parse_series(root)
        if series_info:
            record.series = series_info[0]
            try:
                record.series_index = int(series_info[1])
            except (ValueError, IndexError):
                record.series_index = 1

        # The ISBN is only added when one was found.
        isbn = self._parse_isbn(root)
        if isbn:
            record.identifiers["isbn"] = isbn
        return record
    except Exception as e:
        log.warning(f"Moly.hu error fetching {url}: {e}")
        return None
def _parse_moly_id(self, url: str) -> Optional[str]:
"""Extract moly.hu book ID from URL"""
try:
m = re.search(r'/konyvek/(.*)', url)
if m:
return m.group(1)
except:
pass
return None
def _parse_title(self, root) -> Optional[str]:
"""Parse book title"""
title_node = root.xpath('//*[@id="content"]//*[@class="fn"]/text()')
if not title_node:
title_node = root.xpath('//*[@id="content"]//*[@class="item"]/text()')
if title_node:
return title_node[0].strip().replace('\u200b', '')
return None
def _parse_authors(self, root) -> List[str]:
"""Parse author names"""
author_nodes = root.xpath('//*[@id="content"]//div[@class="authors"]/a/text()')
if author_nodes:
return [str(author).strip() for author in author_nodes]
return []
def _parse_description(self, root) -> Optional[str]:
"""Parse book description/comments"""
description_node = root.xpath(
'//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()'
)
if not description_node:
description_node = root.xpath('//*[@id="content"]//*[@class="text"]/p/text()')
if not description_node:
description_node = root.xpath(
'//*[@id="content"]//*[@class="text shrinkable"]/p/text()'
)
if description_node:
# Clean up description
desc = '\n'.join(description_node)
desc = desc.replace('\n\n', '\n').replace('\n \n', '\n')
desc = desc.replace('Vigyázat! Cselekményleírást tartalmaz.\n', '')
return desc.strip()
return None
def _parse_cover(self, root) -> Optional[str]:
"""Parse cover image URL"""
cover_nodes = root.xpath('(//*[@class="coverbox"]//a/@href)[1]')
if cover_nodes:
cover_url = cover_nodes[0]
if not cover_url.startswith('http'):
cover_url = self.BASE_URL + cover_url
return cover_url
# Fallback: try img src directly
img_nodes = root.xpath('//*[@class="coverbox"]//img/@src')
if img_nodes:
img_url = img_nodes[0]
if not img_url.startswith('http'):
img_url = self.BASE_URL + img_url
return img_url
return None
def _parse_publisher(self, root) -> Optional[str]:
"""Parse publisher name"""
publisher_node_1 = root.xpath(
'//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()'
)
if publisher_node_1 and publisher_node_1[0] == '+':
publisher_node = root.xpath(
'//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()'
)
else:
publisher_node = publisher_node_1
if publisher_node:
return publisher_node[0].strip()
return None
def _parse_published_date(self, root) -> Optional[str]:
"""Parse publication date (year)"""
publication_node_1 = root.xpath(
'//*[@id="content"]//*[@class="items"]/div/div[1]/text()'
)
if not publication_node_1:
publication_node = root.xpath(
'//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
)
else:
publication_node = publication_node_1
for value in publication_node:
m = re.search(r'(\d{4})', value)
if m:
return m.group(1)
return None
def _parse_rating(self, root) -> int:
"""Parse rating (converted to 0-5 scale)"""
rating_node = root.xpath(
'//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()'
)
if rating_node:
try:
# Moly.hu uses percentage, convert to 0-5 scale
percentage = float(rating_node[0].strip('%').strip())
return round(percentage * 0.05)
except (ValueError, IndexError):
pass
return 0
def _parse_tags(self, root) -> List[str]:
"""Parse tags/genres"""
# Genre tags (in brackets)
tags_genre = root.xpath('//*[@id="book_tags"]//*[@class="tag genre"]/text()')
tags_genre = [f"[{str(t).strip()}]" for t in tags_genre if str(t).strip()]
# Regular tags
tags_regular = root.xpath('//*[@id="book_tags"]//*[@class="tag"]/text()')
tags_regular = [str(t).strip() for t in tags_regular if str(t).strip()]
return tags_genre + tags_regular
def _parse_series(self, root) -> Optional[List[str]]:
"""Parse series name and index"""
series_node = root.xpath('//*[@id="content"]//*[@class="action"]/text()')
if not series_node:
return None
series_text = series_node[0].strip('().')
parts = series_text.rsplit(' ', 1)
# Check if it's actually edition info, not series
if len(parts) > 1 and parts[1] == 'kiadás':
return None
if len(parts) == 2:
return [parts[0], parts[1]]
elif len(parts) == 1:
return [parts[0], "1"]
return None
def _parse_isbn(self, root) -> Optional[str]:
"""Parse ISBN"""
# Try first location
isbn_nodes = root.xpath(
'//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
)
for value in isbn_nodes:
m = re.search(r'(\d{13}|\d{10})', value)
if m:
return m.group(1)
# Try second location
isbn_nodes = root.xpath(
'//*[@id="content"]//*[@class="items"]/div/div[3]/text()'
)
for value in isbn_nodes:
m = re.search(r'(\d{13}|\d{10})', value)
if m:
return m.group(1)
return None
---
# Calibre-Web-Automated Deployment # Calibre-Web-Automated Deployment
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
@@ -32,6 +374,8 @@ spec:
annotations: annotations:
# Version checker pattern - CWA uses semantic versioning # Version checker pattern - CWA uses semantic versioning
match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$' match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$'
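# Stakater Reloader: roll the pods whenever the ConfigMap named below changes.
# Required because the provider file is mounted via subPath, and subPath mounts do not pick up ConfigMap updates.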
configmap.reloader.stakater.com/reload: "calibre-custom-metadata-providers"
spec: spec:
containers: containers:
- name: calibre-web-automated - name: calibre-web-automated
@@ -98,6 +442,11 @@ spec:
# Calibre library - your existing library location # Calibre library - your existing library location
- name: library - name: library
mountPath: /calibre-library mountPath: /calibre-library
# Custom metadata providers (moly.hu)
- name: custom-metadata-providers
mountPath: /app/calibre-web-automated/cps/metadata_provider/moly_hu.py
subPath: moly_hu.py
readOnly: true
volumes: volumes:
- name: config - name: config
persistentVolumeClaim: persistentVolumeClaim:
@@ -112,6 +461,10 @@ spec:
hostPath: hostPath:
path: /mnt/4_hdd/data/calibre path: /mnt/4_hdd/data/calibre
type: DirectoryOrCreate type: DirectoryOrCreate
# Custom metadata providers from ConfigMap
- name: custom-metadata-providers
configMap:
name: calibre-custom-metadata-providers
--- ---
# Calibre-Web-Automated Service # Calibre-Web-Automated Service
apiVersion: v1 apiVersion: v1