990 lines
37 KiB
YAML
990 lines
37 KiB
YAML
---
|
||
# Calibre-Web-Automated - All-in-one eBook library solution
|
||
# Namespace
|
||
apiVersion: v1
|
||
kind: Namespace
|
||
metadata:
|
||
name: calibre-system
|
||
---
|
||
# Custom Metadata Providers ConfigMap
|
||
# Contains Hungarian metadata providers: moly.hu and libri.hu
|
||
apiVersion: v1
|
||
kind: ConfigMap
|
||
metadata:
|
||
name: calibre-custom-metadata-providers
|
||
namespace: calibre-system
|
||
labels:
|
||
app.kubernetes.io/instance: calibre
|
||
app.kubernetes.io/name: calibre-web-automated
|
||
data:
|
||
moly_hu.py: |
|
||
# -*- coding: utf-8 -*-
|
||
# Calibre-Web Automated - Moly.hu Metadata Provider
|
||
# Based on Calibre plugin by Hokutya <mail@hokutya.com>
|
||
# Adapted for CWA
|
||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||
|
||
import concurrent.futures
|
||
import re
|
||
import requests
|
||
from lxml.html import fromstring
|
||
from typing import List, Optional, Tuple
|
||
|
||
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
||
import cps.logger as logger
|
||
|
||
log = logger.create()
|
||
|
||
|
||
def strip_accents(s: str) -> str:
    """Fold Hungarian accented letters to plain ASCII and lower-case the text.

    Used to compare titles and author names regardless of diacritics.
    A falsy input (``None`` or ``""``) yields ``""``.
    """
    if s:
        accented = "öÖüÜóÓőŐúÚéÉáÁűŰíÍ"
        plain = "oOuUoOoOuUeEaAuUiI"
        # Positional mapping: accented[i] is rewritten to plain[i].
        table = str.maketrans(accented, plain)
        folded = s.translate(table)
        return folded.lower()
    return ""
|
||
|
||
|
||
def normalize_title(title: str) -> str:
    """Reduce a title to a canonical form suitable for comparison.

    Parenthesised and bracketed segments are removed, punctuation becomes
    a space, whitespace runs collapse to one space, and the result is
    passed through ``strip_accents``. A falsy title yields ``""``.
    """
    if not title:
        return ""

    # Applied in order; each step works on the previous step's output.
    substitutions = (
        (r'\([^)]*\)', ''),
        (r'\[[^\]]*\]', ''),
        (r'[^\w\s]', ' '),
        (r'\s+', ' '),
    )
    cleaned = title
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return strip_accents(cleaned.strip())
|
||
|
||
|
||
def calculate_relevance(query_title: str, query_author: str,
                        result_title: str, result_authors: List[str]) -> int:
    """Score how well a result matches the query (lower is better).

    The score starts at 500 and is adjusted as follows:

    * Title: identical -300, one contains the other -200, shares a query
      word longer than two characters -100, otherwise +200.
    * Author (only when both ``query_author`` and ``result_authors`` are
      given): the best match over all result authors counts; exact or
      "Last First" reversed -200, one contains the other -100, shares a
      name part longer than two characters -50.

    Titles and names are compared after ``normalize_title`` /
    ``strip_accents``. The returned score is never below 0.
    """
    score = 500

    norm_query_title = normalize_title(query_title)
    norm_result_title = normalize_title(result_title)

    # "" is a substring of every string, so equality/containment is only
    # meaningful when both sides carry text; otherwise a blank title would
    # outrank a genuine partial match.
    both_titles = bool(norm_query_title) and bool(norm_result_title)

    if both_titles and norm_query_title == norm_result_title:
        score -= 300
    elif both_titles and (norm_query_title in norm_result_title
                          or norm_result_title in norm_query_title):
        score -= 200
    elif any(word in norm_result_title
             for word in norm_query_title.split() if len(word) > 2):
        score -= 100
    else:
        score += 200

    norm_query_author = strip_accents(query_author).strip() if query_author else ""
    if norm_query_author and result_authors:
        query_parts = norm_query_author.split()
        if len(query_parts) >= 2:
            reversed_author = f"{query_parts[-1]} {' '.join(query_parts[:-1])}"
        else:
            reversed_author = norm_query_author

        # Keep the best bonus over all authors so a weak hit on the first
        # author cannot hide an exact hit on a later one.
        best_bonus = 0
        for author in result_authors:
            author_norm = strip_accents(author).strip()
            if not author_norm:
                continue  # blank placeholder author carries no information
            if author_norm in (norm_query_author, reversed_author):
                best_bonus = 200
                break  # cannot do better than an exact match
            if norm_query_author in author_norm or author_norm in norm_query_author:
                best_bonus = max(best_bonus, 100)
            elif any(part in author_norm for part in query_parts if len(part) > 2):
                best_bonus = max(best_bonus, 50)
        score -= best_bonus

    return max(0, score)
|
||
|
||
|
||
class Moly_hu(Metadata):
    # Display name and identifier of this provider.
    __name__ = "Moly.hu"
    __id__ = "moly_hu"

    # Site endpoints; the search URL expects the quoted query appended.
    BASE_URL = "https://moly.hu"
    BOOK_URL = f"{BASE_URL}/konyvek/"
    SEARCH_URL = f"{BASE_URL}/kereses?utf8=%E2%9C%93&query="

    # Browser-like request headers sent with every request of the session.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
    }

    # One HTTP session created at class-definition time and shared by all
    # instances (and by the worker threads started in search()).
    session = requests.Session()
    session.headers.update(headers)
|
||
|
||
def search(
    self, query: str, generic_cover: str = "", locale: str = "hu"
) -> Optional[List[MetaRecord]]:
    """Search Moly.hu for ``query`` and return detailed records, best first.

    The search page is fetched once; the five most relevant hits are then
    loaded in parallel through ``_get_book_details``.

    Args:
        query: Free-text search string. It is also used as the title when
            scoring relevance (no author is extracted from it).
        generic_cover: Cover substituted for records that have no cover.
        locale: Accepted for interface compatibility; not used here.

    Returns:
        Records sorted by ascending ``_relevance_score``. An empty list
        when the provider is inactive, nothing matched, or the search
        request itself failed.
    """
    if not self.active:
        return []

    val = []
    query_author = ""
    query_title = query.strip()

    try:
        search_url = self.SEARCH_URL + requests.utils.quote(query)
        log.info(f"Moly.hu searching: {search_url}")

        response = self.session.get(search_url, timeout=15)
        response.raise_for_status()

        root = fromstring(response.text)
        book_data = self._parse_search_results(root, query_title, query_author)

        if not book_data:
            log.info(f"Moly.hu: No results found for '{query}'")
            return []

        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            futures = [
                executor.submit(self._get_book_details, url, idx, query_title, query_author)
                for idx, (url, _) in enumerate(book_data[:5])
            ]

            try:
                for future in concurrent.futures.as_completed(futures, timeout=20):
                    try:
                        result = future.result()
                        if result:
                            val.append(result)
                    except Exception as e:
                        log.warning(f"Moly.hu worker error: {e}")
            except concurrent.futures.TimeoutError:
                # Keep what already arrived instead of discarding it all;
                # lookups that have not started yet are cancelled.
                log.warning("Moly.hu detail lookups timed out, returning partial results")
                for future in futures:
                    future.cancel()

    except requests.exceptions.Timeout:
        log.warning("Moly.hu search timed out")
        return []
    except requests.exceptions.HTTPError as e:
        log.error(f"Moly.hu HTTP error: {e}")
        return []
    except Exception as e:
        log.error_or_exception(f"Moly.hu search error: {e}")
        return []

    # Fall back to the caller-supplied placeholder when no cover was found.
    for record in val:
        if not record.cover:
            record.cover = generic_cover

    val.sort(key=lambda x: getattr(x, '_relevance_score', 500))
    return val
|
||
|
||
def _parse_search_results(self, root, query_title: str, query_author: str) -> List[Tuple[str, int]]:
    """Collect ``(url, relevance)`` pairs from a search page, best match first.

    Every ``<a class="book_selector">`` with an ``href`` is a hit. Its text
    is split at the first colon into author and title; without a colon the
    whole text is treated as the title.
    """
    scored = []

    for anchor in root.xpath('//a[@class="book_selector"]'):
        href = anchor.get('href')
        if not href:
            continue

        content = anchor.text_content()
        text = content.strip() if content else ""

        author_part, colon, title_part = text.partition(':')
        if colon:
            result_author, result_title = author_part.strip(), title_part.strip()
        else:
            result_author, result_title = "", text

        relevance = calculate_relevance(query_title, query_author, result_title, [result_author])
        scored.append((self.BASE_URL + href, relevance))

    scored.sort(key=lambda pair: pair[1])
    log.info(f"Moly.hu found {len(scored)} results")
    return scored
|
||
|
||
def _get_book_details(self, url: str, index: int, query_title: str, query_author: str) -> Optional[MetaRecord]:
    """Download one Moly.hu book page and convert it into a ``MetaRecord``.

    ``index`` (the hit's position in the search list) is added to the
    relevance score. Returns ``None`` when the page has no title or when
    anything goes wrong; failures are logged as warnings.
    """
    try:
        page = self.session.get(url, timeout=15)
        page.raise_for_status()

        # <em> markers are removed before parsing so they do not split the
        # text nodes read by the xpath queries.
        markup = page.text.replace('<em>', '').replace('</em>', '')
        root = fromstring(markup)

        title = self._parse_title(root)
        authors = self._parse_authors(root)
        if not title:
            return None

        moly_id = self._parse_moly_id(url)
        source_info = MetaSourceInfo(
            id=self.__id__,
            description="Moly.hu - Magyar könyves közösség",
            link=self.BASE_URL
        )
        match = MetaRecord(
            id=moly_id,
            title=title,
            authors=authors or [""],
            source=source_info,
            url=url,
            identifiers={"moly_hu": moly_id},
        )

        match._relevance_score = calculate_relevance(query_title, query_author, title, authors) + index

        # Simple one-to-one fields, filled in a fixed order.
        field_parsers = (
            ("description", self._parse_description),
            ("cover", self._parse_cover),
            ("publisher", self._parse_publisher),
            ("publishedDate", self._parse_published_date),
            ("rating", self._parse_rating),
            ("tags", self._parse_tags),
        )
        for field_name, parser in field_parsers:
            setattr(match, field_name, parser(root))

        series_info = self._parse_series(root)
        if series_info:
            match.series = series_info[0]
            try:
                series_index = int(series_info[1])
            except (ValueError, IndexError):
                series_index = 1  # unparsable index falls back to 1
            match.series_index = series_index

        isbn = self._parse_isbn(root)
        if isbn:
            match.identifiers["isbn"] = isbn

        return match

    except Exception as e:
        log.warning(f"Moly.hu error fetching {url}: {e}")
        return None
|
||
|
||
def _parse_moly_id(self, url: str) -> Optional[str]:
|
||
try:
|
||
m = re.search(r'/konyvek/(.*)', url)
|
||
if m:
|
||
return m.group(1)
|
||
except:
|
||
pass
|
||
return None
|
||
|
||
def _parse_title(self, root) -> Optional[str]:
    """Return the book title from the page, or ``None`` if none is found.

    The ``fn`` element inside ``#content`` is tried first, then ``item``.
    Zero-width spaces are removed *before* trimming, and blank text nodes
    are skipped, so the first node that actually carries text wins.
    """
    selectors = (
        '//*[@id="content"]//*[@class="fn"]/text()',
        '//*[@id="content"]//*[@class="item"]/text()',
    )
    for selector in selectors:
        for node in root.xpath(selector):
            title = str(node).replace('\u200b', '').strip()
            if title:
                return title
    return None
|
||
|
||
def _parse_authors(self, root) -> List[str]:
    """Return the trimmed author names linked in the ``authors`` div (may be empty)."""
    names = root.xpath('//*[@id="content"]//div[@class="authors"]/a/text()')
    return [str(name).strip() for name in names]
|
||
|
||
def _parse_description(self, root) -> Optional[str]:
    """Return the book description as newline-joined paragraphs, or ``None``.

    Three locations are tried in order; the first that yields paragraph
    text is used. The site's spoiler-warning line is removed.
    """
    selectors = (
        '//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()',
        '//*[@id="content"]//*[@class="text"]/p/text()',
        '//*[@id="content"]//*[@class="text shrinkable"]/p/text()',
    )

    paragraphs = []
    for selector in selectors:
        paragraphs = root.xpath(selector)
        if paragraphs:
            break
    if not paragraphs:
        return None

    text = '\n'.join(paragraphs)
    cleanups = (
        ('\n\n', '\n'),
        ('\n \n', '\n'),
        ('Vigyázat! Cselekményleírást tartalmaz.\n', ''),
    )
    for old, new in cleanups:
        text = text.replace(old, new)
    return text.strip()
|
||
|
||
def _parse_cover(self, root) -> Optional[str]:
    """Return an absolute cover URL, or ``None`` when the page has no cover.

    The link target inside the ``coverbox`` element is preferred; the
    ``<img>`` source inside it is the fallback. The value is resolved
    against ``BASE_URL``, which also handles protocol-relative (``//host``)
    and slash-less relative paths correctly.
    """
    from urllib.parse import urljoin

    selectors = (
        '(//*[@class="coverbox"]//a/@href)[1]',
        '//*[@class="coverbox"]//img/@src',
    )
    for selector in selectors:
        nodes = root.xpath(selector)
        if nodes:
            return urljoin(self.BASE_URL, str(nodes[0]).strip())
    return None
|
||
|
||
def _parse_publisher(self, root) -> Optional[str]:
    """Return the publisher name from the ``items`` block, or ``None``."""
    candidates = root.xpath('//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()')

    # When the first slot's first link is a bare "+", the publisher is
    # read from the second slot instead.
    if candidates and candidates[0] == '+':
        candidates = root.xpath('//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()')

    return candidates[0].strip() if candidates else None
|
||
|
||
def _parse_published_date(self, root) -> Optional[str]:
    """Return the publication date as ``YYYY-01-01``, or ``None``.

    Only a year is read from the page; it is expanded to January 1st of
    that year. The first ``items`` slot is used, or the second when the
    first has no text.
    """
    texts = (
        root.xpath('//*[@id="content"]//*[@class="items"]/div/div[1]/text()')
        or root.xpath('//*[@id="content"]//*[@class="items"]/div/div[2]/text()')
    )

    for text in texts:
        year = re.search(r'(\d{4})', text)
        if year:
            return f"{year.group(1)}-01-01"
    return None
|
||
|
||
def _parse_rating(self, root) -> int:
    """Convert the page's like-percentage into a 0-5 rating.

    The percentage text (e.g. ``" 85% "``) is trimmed of whitespace first
    and of the percent sign second, accepts a decimal comma, and is scaled
    by 0.05. Returns 0 when the value is missing or not a number.
    """
    nodes = root.xpath('//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()')
    if not nodes:
        return 0

    raw = str(nodes[0]).strip().rstrip('%').strip().replace(',', '.')
    try:
        stars = round(float(raw) * 0.05)
    except (ValueError, OverflowError):
        return 0
    # Keep the result on the 0-5 scale even for out-of-range input.
    return max(0, min(5, stars))
|
||
|
||
def _parse_tags(self, root) -> List[str]:
    """Return the page's tags: genre tags wrapped in brackets, then plain tags."""
    genre_texts = [str(t).strip() for t in root.xpath('//*[@id="book_tags"]//*[@class="tag genre"]/text()')]
    genres = [f"[{text}]" for text in genre_texts if text]

    plain_texts = [str(t).strip() for t in root.xpath('//*[@id="book_tags"]//*[@class="tag"]/text()')]
    plain = [text for text in plain_texts if text]

    return genres + plain
|
||
|
||
def _parse_series(self, root) -> Optional[List[str]]:
    """Return ``[series_name, index_text]`` from the ``action`` text, or ``None``.

    The text is expected to look like ``(Series Name 3.)``. Surrounding
    whitespace, parentheses and dots are removed. The last word is taken
    as the index only when it starts with a digit; otherwise the whole
    text is the series name and the index defaults to ``"1"``. Text ending
    in ``kiadás`` (an edition note) and empty text yield ``None``.
    """
    nodes = root.xpath('//*[@id="content"]//*[@class="action"]/text()')
    if not nodes:
        return None

    # Trim whitespace first: otherwise it shields the "(", ")" and "."
    # characters from being stripped.
    series_text = str(nodes[0]).strip().strip('().').strip()
    if not series_text:
        return None

    name, space, last = series_text.rpartition(' ')
    if space and last == 'kiadás':
        return None

    name = name.strip()
    if name and last[:1].isdigit():
        return [name, last]
    # No numeric suffix: do not chop the last word off the series name.
    return [series_text, "1"]
|
||
|
||
def _parse_isbn(self, root) -> Optional[str]:
    """Return the first 13- or 10-digit run found in the details, or ``None``.

    The second ``items`` slot is searched first; the third slot is only
    queried when the second yields nothing.
    """
    isbn_pattern = re.compile(r'(\d{13}|\d{10})')
    selectors = (
        '//*[@id="content"]//*[@class="items"]/div/div[2]/text()',
        '//*[@id="content"]//*[@class="items"]/div/div[3]/text()',
    )

    for selector in selectors:
        for text in root.xpath(selector):
            found = isbn_pattern.search(text)
            if found:
                return found.group(1)
    return None
|
||
|
||
libri_hu.py: |
|
||
# -*- coding: utf-8 -*-
|
||
# Calibre-Web Automated - Libri.hu Metadata Provider
|
||
# Based on Calibre plugin by Hoffer Csaba, Kloon & Hokutya
|
||
# Adapted for CWA
|
||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||
|
||
import concurrent.futures
|
||
import re
|
||
import requests
|
||
from lxml.html import fromstring, tostring
|
||
from lxml import html as lh
|
||
from typing import List, Optional, Tuple, Dict
|
||
|
||
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
||
import cps.logger as logger
|
||
|
||
log = logger.create()
|
||
|
||
|
||
def strip_accents(s: str) -> str:
    """Fold Hungarian and Polish accented letters to ASCII and lower-case.

    Used to compare titles and author names regardless of diacritics.
    A falsy input (``None`` or ``""``) yields ``""``.
    """
    if s:
        accented = "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃśŚźŹżŻ"
        plain = "oOuUoOoOuUeEaAuUiIaAcCeElLnNsSzZzZ"
        # Positional mapping: accented[i] is rewritten to plain[i].
        table = str.maketrans(accented, plain)
        folded = s.translate(table)
        return folded.lower()
    return ""
|
||
|
||
|
||
def normalize_title(title: str) -> str:
    """Reduce a title to a canonical form suitable for comparison.

    Parenthesised and bracketed segments are removed, punctuation becomes
    a space, whitespace runs collapse to one space, and the result is
    passed through ``strip_accents``. A falsy title yields ``""``.
    """
    if not title:
        return ""

    # Applied in order; each step works on the previous step's output.
    substitutions = (
        (r'\([^)]*\)', ''),
        (r'\[[^\]]*\]', ''),
        (r'[^\w\s]', ' '),
        (r'\s+', ' '),
    )
    cleaned = title
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return strip_accents(cleaned.strip())
|
||
|
||
|
||
def calculate_relevance(query_title: str, query_author: str,
                        result_title: str, result_authors: List[str]) -> int:
    """Score how well a result matches the query (lower is better).

    The score starts at 500 and is adjusted as follows:

    * Title: identical -300, one contains the other -200, shares a query
      word longer than two characters -100, otherwise +200.
    * Author (only when both ``query_author`` and ``result_authors`` are
      given): the best match over all result authors counts; exact or
      "Last First" reversed -200, one contains the other -100, shares a
      name part longer than two characters -50.

    Titles and names are compared after ``normalize_title`` /
    ``strip_accents``. The returned score is never below 0.
    """
    score = 500

    norm_query_title = normalize_title(query_title)
    norm_result_title = normalize_title(result_title)

    # "" is a substring of every string, so equality/containment is only
    # meaningful when both sides carry text; otherwise a blank title (for
    # example one that could not be derived from a URL) would outrank a
    # genuine partial match.
    both_titles = bool(norm_query_title) and bool(norm_result_title)

    if both_titles and norm_query_title == norm_result_title:
        score -= 300
    elif both_titles and (norm_query_title in norm_result_title
                          or norm_result_title in norm_query_title):
        score -= 200
    elif any(word in norm_result_title
             for word in norm_query_title.split() if len(word) > 2):
        score -= 100
    else:
        score += 200

    norm_query_author = strip_accents(query_author).strip() if query_author else ""
    if norm_query_author and result_authors:
        query_parts = norm_query_author.split()
        if len(query_parts) >= 2:
            reversed_author = f"{query_parts[-1]} {' '.join(query_parts[:-1])}"
        else:
            reversed_author = norm_query_author

        # Keep the best bonus over all authors so a weak hit on the first
        # author cannot hide an exact hit on a later one.
        best_bonus = 0
        for author in result_authors:
            author_norm = strip_accents(author).strip()
            if not author_norm:
                continue  # blank placeholder author carries no information
            if author_norm in (norm_query_author, reversed_author):
                best_bonus = 200
                break  # cannot do better than an exact match
            if norm_query_author in author_norm or author_norm in norm_query_author:
                best_bonus = max(best_bonus, 100)
            elif any(part in author_norm for part in query_parts if len(part) > 2):
                best_bonus = max(best_bonus, 50)
        score -= best_bonus

    return max(0, score)
|
||
|
||
|
||
class Libri_hu(Metadata):
    # Display name and identifier of this provider.
    __name__ = "Libri.hu"
    __id__ = "libri_hu"

    # Site endpoints. SEARCH_URL is the detailed-search form with the
    # title field last, so the quoted query can simply be appended.
    BASE_URL = "https://www.libri.hu"
    BOOK_URL = f"{BASE_URL}/konyv"
    SEARCH_URL = f"{BASE_URL}/talalati_lista/?reszletes=1&s_det=1&cim="

    # Browser-like request headers sent with every request of the session.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
    }

    # One HTTP session created at class-definition time and shared by all
    # instances (and by the worker threads started in search()).
    session = requests.Session()
    session.headers.update(headers)
|
||
|
||
def search(
    self, query: str, generic_cover: str = "", locale: str = "hu"
) -> Optional[List[MetaRecord]]:
    """Search Libri.hu by title and return detailed records, best first.

    The search page is fetched once; the five most relevant hits are then
    loaded in parallel through ``_get_book_details``.

    Args:
        query: Search string, sent as the title field of the detailed
            search and also used as the title when scoring relevance.
        generic_cover: Cover substituted for records that have no cover.
        locale: Accepted for interface compatibility; not used here.

    Returns:
        Records sorted by ascending ``_relevance_score``. An empty list
        when the provider is inactive, nothing matched, or the search
        request itself failed.
    """
    if not self.active:
        return []

    val = []
    query_author = ""
    query_title = query.strip()

    try:
        # Libri.hu detailed search URL - search by title
        search_url = f"{self.SEARCH_URL}{requests.utils.quote(query)}"
        log.info(f"Libri.hu searching: {search_url}")

        response = self.session.get(search_url, timeout=15)
        response.raise_for_status()

        root = fromstring(response.text)
        book_data = self._parse_search_results(root, query_title, query_author)

        if not book_data:
            log.info(f"Libri.hu: No results found for '{query}'")
            return []

        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            futures = [
                executor.submit(self._get_book_details, url, idx, query_title, query_author)
                for idx, (url, _) in enumerate(book_data[:5])
            ]

            try:
                for future in concurrent.futures.as_completed(futures, timeout=20):
                    try:
                        result = future.result()
                        if result:
                            val.append(result)
                    except Exception as e:
                        log.warning(f"Libri.hu worker error: {e}")
            except concurrent.futures.TimeoutError:
                # Keep what already arrived instead of discarding it all;
                # lookups that have not started yet are cancelled.
                log.warning("Libri.hu detail lookups timed out, returning partial results")
                for future in futures:
                    future.cancel()

    except requests.exceptions.Timeout:
        log.warning("Libri.hu search timed out")
        return []
    except requests.exceptions.HTTPError as e:
        log.error(f"Libri.hu HTTP error: {e}")
        return []
    except Exception as e:
        log.error_or_exception(f"Libri.hu search error: {e}")
        return []

    # Fall back to the caller-supplied placeholder when no cover was found.
    for record in val:
        if not record.cover:
            record.cover = generic_cover

    val.sort(key=lambda x: getattr(x, '_relevance_score', 500))
    return val
|
||
|
||
def _parse_search_results(self, root, query_title: str, query_author: str) -> List[Tuple[str, int]]:
    """Collect ``(url, relevance)`` pairs from a search page, best match first.

    Book links are anchors whose ``href`` contains ``/konyv/`` and
    ``.html``. Duplicates are dropped (first occurrence wins) and only the
    first ten links are scored. The preliminary relevance is computed from
    the title slug in the link's file name, e.g.
    ``/konyv/orvos-toth_noemi.Orokolt-sors-514.html`` -> ``Orokolt sors 514``.
    """
    all_links = root.xpath("//a[contains(@href, '/konyv/') and contains(@href, '.html')]/@href")

    # Order-preserving de-duplication.
    book_links = list(dict.fromkeys(all_links))

    book_data = []
    for href in book_links[:10]:  # Limit to 10 results
        url = href if href.startswith('http') else self.BASE_URL + href

        # Work on the file name only, so dots in the host name of an
        # absolute URL cannot be mistaken for the author/title separator.
        filename = href.split('?', 1)[0].rsplit('/', 1)[-1]
        stem = filename[:-len('.html')] if filename.endswith('.html') else filename
        _author_slug, dot, title_slug = stem.partition('.')
        if not dot:
            title_slug = stem  # no author prefix: the whole stem is the title
        url_title = title_slug.replace('-', ' ').replace('_', ' ')

        relevance = calculate_relevance(query_title, query_author, url_title, [])
        book_data.append((url, relevance))

    book_data.sort(key=lambda pair: pair[1])

    log.info(f"Libri.hu found {len(book_data)} results")
    return book_data
|
||
|
||
def _get_book_details(self, url: str, index: int, query_title: str, query_author: str) -> Optional[MetaRecord]:
    """Download one Libri.hu book page and convert it into a ``MetaRecord``.

    ``index`` (the hit's position in the search list) is added to the
    relevance score. Returns ``None`` when the page has no title or when
    anything goes wrong; failures are logged as warnings.
    """
    try:
        response = self.session.get(url, timeout=15)
        response.raise_for_status()

        # Decode the raw bytes ourselves: UTF-8 first, then ISO-8859-2
        # (Latin-2, used for Hungarian), finally Latin-1 with replacement.
        payload = response.content
        text = None
        for encoding in ('utf-8', 'iso-8859-2'):
            try:
                text = payload.decode(encoding)
            except UnicodeDecodeError:
                continue
            break
        if text is None:
            text = payload.decode('latin-1', errors='replace')

        root = lh.document_fromstring(text)

        # Key/value rows of the product properties table.
        book_props = self._parse_book_properties(root)

        def prop(key: str) -> str:
            """Trimmed property value, or "" when the key is absent."""
            return book_props.get(key, '').strip()

        title = self._parse_title(root)
        authors = self._parse_authors(root)
        if not title:
            return None

        libri_id = self._parse_libri_id(url)
        source_info = MetaSourceInfo(
            id=self.__id__,
            description="Libri.hu - Könyvesbolt",
            link=self.BASE_URL
        )
        match = MetaRecord(
            id=libri_id,
            title=title,
            authors=authors or [""],
            source=source_info,
            url=url,
            identifiers={"libri_hu": libri_id},
        )

        match._relevance_score = calculate_relevance(query_title, query_author, title, authors) + index

        isbn = prop('ISBN')
        if isbn:
            match.identifiers["isbn"] = isbn

        publisher = prop('Kiadó')
        if publisher:
            match.publisher = publisher

        # Only the year is available; expand it to January 1st.
        pub_year = prop('Kiadás éve')
        if pub_year:
            year_match = re.search(r'(\d{4})', pub_year)
            if year_match:
                match.publishedDate = f"{year_match.group(1)}-01-01"

        series = prop('Sorozat')
        if series:
            match.series = series

        lang = prop('Nyelv').lower()
        if lang:
            match.languages = [self._translate_language(lang)]

        match.description = self._parse_description(root)
        match.cover = self._parse_cover(root)
        match.rating = self._parse_rating(root)
        # Tags come from the breadcrumb navigation bar.
        match.tags = self._parse_tags(root)

        return match

    except Exception as e:
        log.warning(f"Libri.hu error fetching {url}: {e}")
        return None
|
||
|
||
def _parse_book_properties(self, root) -> Dict[str, str]:
    """Read the product properties table into a ``{label: value}`` dict.

    Tables inside ``#productPageMainItem`` are used, or any table whose
    class contains ``product`` when none is found there. In each row the
    first cell (header cells before data cells) is the label, with a
    trailing colon removed, and the second is the value. Rows with an
    empty label or value are skipped; later rows overwrite earlier ones.
    """
    tables = (
        root.xpath('//*[@id="productPageMainItem"]//table')
        or root.xpath('//table[contains(@class, "product")]')
    )

    properties = {}
    for table in tables:
        for row in table.findall('.//tr'):
            cells = [*row.findall('.//th'), *row.findall('.//td')]
            if len(cells) < 2:
                continue
            label = cells[0].text_content().strip().rstrip(':')
            value = cells[1].text_content().strip()
            if label and value:
                properties[label] = value

    return properties
|
||
|
||
def _parse_libri_id(self, url: str) -> Optional[str]:
|
||
try:
|
||
# URL format: /konyv/author_name.Book-Title-123.html
|
||
m = re.search(r'/konyv/(.+)\.html', url)
|
||
if m:
|
||
return m.group(1)
|
||
except:
|
||
pass
|
||
return None
|
||
|
||
def _parse_title(self, root) -> Optional[str]:
    """Return the book title (with subtitle, if any), or ``None``.

    Several locations are tried in order; the first whose first value is
    non-blank wins. When the page has a ``subtitle`` element its text is
    appended after an en dash.
    """
    selectors = (
        '//*[@id="productPageMainItem"]//*[@class="h2 mb-2"]/text()',
        '//*[@id="productPageMainItem"]//h1/text()',
        '//h1[@class="book-title"]/text()',
        '//meta[@property="og:title"]/@content',
    )

    for selector in selectors:
        nodes = root.xpath(selector)
        title = nodes[0].strip() if nodes else ""
        if not title:
            continue

        subtitle_nodes = root.xpath('//*[@id="productPageMainItem"]//*[@class="subtitle"]/text()')
        if subtitle_nodes:
            return f"{title} – {subtitle_nodes[0].strip()}"
        return title

    return None
|
||
|
||
def _parse_authors(self, root) -> List[str]:
    """Return the author names found on the page (may be empty).

    Several locations are tried in order; the first that yields at least
    one name wins. Hyphens and whitespace are trimmed from the *ends* of
    each name only, so hyphenated names such as "Orvos-Tóth Noémi" stay
    intact, and entries that are empty after trimming are dropped.
    """
    selectors = (
        '//*[@id="productPageMainItem"]/div/div/div[2]/p[1]/a/text()',
        '//*[@id="productPageMainItem"]//a[contains(@href, "/szerzo/")]/text()',
        '//a[@class="author"]/text()',
    )

    for selector in selectors:
        authors = []
        for node in root.xpath(selector):
            name = str(node).strip().strip('-').strip()
            if name:
                authors.append(name)
        if authors:
            return authors
    return []
|
||
|
||
def _parse_description(self, root) -> Optional[str]:
    """Return the trimmed description text, or ``None`` when none is found.

    Candidate elements are tried in order; for each, only the first
    matching element is read, and the first non-blank text wins.
    """
    selectors = (
        '//*[@id="product-description"]',
        '//*[@class="description"]',
        '//*[@itemprop="description"]',
    )

    for selector in selectors:
        elements = root.xpath(selector)
        if not elements:
            continue
        description = elements[0].text_content().strip()
        if description:
            return description
    return None
|
||
|
||
def _parse_cover(self, root) -> Optional[str]:
    """Return an absolute cover URL, or ``None`` when the page has none.

    The ``og:image`` meta value is preferred, then an image inside a
    ``cover`` element, then any image inside ``#productPageMainItem``.
    The value is resolved against ``BASE_URL``, which also handles
    protocol-relative (``//host``) and slash-less relative paths.
    """
    from urllib.parse import urljoin

    selectors = (
        '//*[@property="og:image"]/@content',
        '//*[@class="cover"]//img/@src',
        '//*[@id="productPageMainItem"]//img/@src',
    )

    for selector in selectors:
        nodes = root.xpath(selector)
        if nodes:
            url = nodes[0].strip()
            if url:
                return urljoin(self.BASE_URL, url)
    return None
|
||
|
||
def _parse_rating(self, root) -> int:
    """Return the page's ``ratingValue`` rounded to an integer, or 0.

    A decimal comma is accepted. 0 is returned when the value is missing
    or is not a number.
    """
    nodes = root.xpath('//*[@id="productPageMainItem"]//*[@itemprop="ratingValue"]/@content')
    if not nodes:
        return 0

    raw = str(nodes[0]).strip().replace(',', '.')
    try:
        return round(float(raw))
    except (ValueError, OverflowError):
        # Not a finite number: treat as "no rating".
        return 0
|
||
|
||
def _parse_tags(self, root) -> List[str]:
    """Return lower-cased tags taken from the navigation bar's text nodes.

    Blank nodes, the separators ``>`` and ``/`` and the home-page entries
    are left out.
    """
    nodes = root.xpath('//*[@id="navigationBar"]//text()')
    if not nodes:
        return []

    # Navigation elements that are not categories.
    ignored = ['>', '/', 'főoldal', 'home']
    lowered = (node.strip().lower() for node in nodes if node.strip())
    return [tag for tag in lowered if tag and tag not in ignored]
|
||
|
||
def _translate_language(self, lang: str) -> str:
|
||
lang_map = {
|
||
'magyar': 'hu',
|
||
'angol': 'en',
|
||
'amerikai': 'en',
|
||
'német': 'de',
|
||
'francia': 'fr',
|
||
'olasz': 'it',
|
||
'spanyol': 'es',
|
||
'orosz': 'ru',
|
||
'török': 'tr',
|
||
'görög': 'el',
|
||
'kínai': 'zh',
|
||
'japán': 'ja',
|
||
}
|
||
return lang_map.get(lang.lower(), 'hu')
|
||
---
|
||
apiVersion: apps/v1
|
||
kind: Deployment
|
||
metadata:
|
||
name: calibre-web-automated
|
||
namespace: calibre-system
|
||
labels:
|
||
app.kubernetes.io/instance: calibre
|
||
app.kubernetes.io/name: calibre-web-automated
|
||
spec:
|
||
replicas: 1
|
||
strategy:
|
||
type: Recreate
|
||
selector:
|
||
matchLabels:
|
||
app.kubernetes.io/instance: calibre
|
||
app.kubernetes.io/name: calibre-web-automated
|
||
template:
|
||
metadata:
|
||
labels:
|
||
app.kubernetes.io/instance: calibre
|
||
app.kubernetes.io/name: calibre-web-automated
|
||
annotations:
|
||
match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$'
|
||
configmap.reloader.stakater.com/reload: "calibre-custom-metadata-providers"
|
||
spec:
|
||
containers:
|
||
- name: calibre-web-automated
|
||
image: crocodilestick/calibre-web-automated:latest
|
||
imagePullPolicy: IfNotPresent
|
||
env:
|
||
- name: PUID
|
||
value: "1000"
|
||
- name: PGID
|
||
value: "1000"
|
||
- name: TZ
|
||
value: Europe/Budapest
|
||
- name: CWA_PORT_OVERRIDE
|
||
value: "8083"
|
||
- name: NETWORK_SHARE_MODE
|
||
value: "false"
|
||
- name: TRUSTED_PROXY_COUNT
|
||
value: "2"
|
||
ports:
|
||
- name: http
|
||
containerPort: 8083
|
||
protocol: TCP
|
||
resources:
|
||
requests:
|
||
cpu: 100m
|
||
memory: 512Mi
|
||
limits:
|
||
cpu: "2"
|
||
memory: 2Gi
|
||
livenessProbe:
|
||
httpGet:
|
||
path: /
|
||
port: http
|
||
initialDelaySeconds: 120
|
||
periodSeconds: 60
|
||
timeoutSeconds: 10
|
||
failureThreshold: 5
|
||
readinessProbe:
|
||
httpGet:
|
||
path: /
|
||
port: http
|
||
initialDelaySeconds: 60
|
||
periodSeconds: 10
|
||
timeoutSeconds: 5
|
||
failureThreshold: 3
|
||
startupProbe:
|
||
httpGet:
|
||
path: /
|
||
port: http
|
||
periodSeconds: 10
|
||
timeoutSeconds: 5
|
||
failureThreshold: 60
|
||
volumeMounts:
|
||
- name: config
|
||
mountPath: /config
|
||
- name: ingest
|
||
mountPath: /cwa-book-ingest
|
||
- name: library
|
||
mountPath: /calibre-library
|
||
- name: custom-metadata-providers
|
||
mountPath: /app/calibre-web-automated/cps/metadata_provider/moly_hu.py
|
||
subPath: moly_hu.py
|
||
readOnly: true
|
||
- name: custom-metadata-providers
|
||
mountPath: /app/calibre-web-automated/cps/metadata_provider/libri_hu.py
|
||
subPath: libri_hu.py
|
||
readOnly: true
|
||
volumes:
|
||
- name: config
|
||
persistentVolumeClaim:
|
||
claimName: calibre-web-automated-config
|
||
- name: ingest
|
||
hostPath:
|
||
path: /mnt/4_hdd/data/calibre-ingest
|
||
type: DirectoryOrCreate
|
||
- name: library
|
||
hostPath:
|
||
path: /mnt/4_hdd/data/calibre
|
||
type: DirectoryOrCreate
|
||
- name: custom-metadata-providers
|
||
configMap:
|
||
name: calibre-custom-metadata-providers
|
||
---
|
||
apiVersion: v1
|
||
kind: Service
|
||
metadata:
|
||
name: calibre-web-automated
|
||
namespace: calibre-system
|
||
labels:
|
||
app.kubernetes.io/instance: calibre
|
||
app.kubernetes.io/name: calibre-web-automated
|
||
spec:
|
||
type: ClusterIP
|
||
ports:
|
||
- name: http
|
||
port: 8083
|
||
targetPort: http
|
||
protocol: TCP
|
||
selector:
|
||
app.kubernetes.io/instance: calibre
|
||
app.kubernetes.io/name: calibre-web-automated
|
||
---
|
||
apiVersion: networking.k8s.io/v1
|
||
kind: Ingress
|
||
metadata:
|
||
name: calibre-web-automated
|
||
namespace: calibre-system
|
||
labels:
|
||
app.kubernetes.io/instance: calibre
|
||
app.kubernetes.io/name: calibre-web-automated
|
||
annotations:
|
||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
||
external-dns.alpha.kubernetes.io/hostname: books.dooplex.hu,books.home
|
||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||
nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
|
||
nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
|
||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||
nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
|
||
nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
|
||
nginx.ingress.kubernetes.io/configuration-snippet: |
|
||
set $geo_allowed 0;
|
||
if ($remote_addr ~ "^192\.168\.") { set $geo_allowed 1; }
|
||
if ($remote_addr ~ "^10\.") { set $geo_allowed 1; }
|
||
if ($geoip2_country_code = "HU") { set $geo_allowed 1; }
|
||
if ($geo_allowed = 0) {
|
||
return 403 "Access restricted to Hungary";
|
||
}
|
||
spec:
|
||
ingressClassName: nginx-internal
|
||
tls:
|
||
- hosts:
|
||
- books.dooplex.hu
|
||
secretName: calibre-web-automated-tls
|
||
rules:
|
||
- host: books.dooplex.hu
|
||
http:
|
||
paths:
|
||
- path: /
|
||
pathType: Prefix
|
||
backend:
|
||
service:
|
||
name: calibre-web-automated
|
||
port:
|
||
number: 8083
|
||
- host: books.home
|
||
http:
|
||
paths:
|
||
- path: /
|
||
pathType: Prefix
|
||
backend:
|
||
service:
|
||
name: calibre-web-automated
|
||
port:
|
||
number: 8083
|
||
---
|
||
apiVersion: v1
|
||
kind: PersistentVolumeClaim
|
||
metadata:
|
||
name: calibre-web-automated-config
|
||
namespace: calibre-system
|
||
labels:
|
||
app.kubernetes.io/instance: calibre
|
||
app.kubernetes.io/name: calibre-web-automated
|
||
recurring-job-group.longhorn.io/needbackup: enabled
|
||
recurring-job.longhorn.io/source: enabled
|
||
spec:
|
||
accessModes:
|
||
- ReadWriteOnce
|
||
storageClassName: longhorn
|
||
resources:
|
||
requests:
|
||
storage: 10Gi |