added libri too
This commit is contained in:
+498
-127
@@ -8,7 +8,7 @@ metadata:
|
||||
name: calibre-system
|
||||
---
|
||||
# Custom Metadata Providers ConfigMap
|
||||
# Contains moly.hu provider for Hungarian book metadata
|
||||
# Contains Hungarian metadata providers: moly.hu and libri.hu
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
@@ -29,8 +29,7 @@ data:
|
||||
import re
|
||||
import requests
|
||||
from lxml.html import fromstring
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
||||
import cps.logger as logger
|
||||
@@ -38,6 +37,65 @@ data:
|
||||
log = logger.create()
|
||||
|
||||
|
||||
# Letters that have no Unicode decomposition and therefore need an explicit
# ASCII replacement (Polish l-with-stroke).
_NON_DECOMPOSING = str.maketrans("łŁ", "lL")


def strip_accents(s: str) -> str:
    """Return *s* lower-cased with all accents removed, for comparison.

    The text is canonically decomposed (NFD) and every combining mark is
    dropped, so both precomposed ("ö") and decomposed ("o" + U+0308)
    spellings of an accented letter collapse to the same base letter.
    All Hungarian accented letters are covered, as is any other letter
    that decomposes into base letter + combining marks.

    Args:
        s: Text to normalize; a falsy value (``""`` or ``None``) is accepted.

    Returns:
        The accent-free, lower-cased text, or ``""`` for falsy input.
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import unicodedata

    if not s:
        return ""
    decomposed = unicodedata.normalize("NFD", s.translate(_NON_DECOMPOSING))
    base_chars = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Lower-case last: lower-casing before stripping can itself introduce
    # combining marks for a few letters.
    return base_chars.lower()
|
||||
|
||||
|
||||
def normalize_title(title: str) -> str:
    """Normalize a book title so that different spellings compare equal.

    Steps, in order: compose the text to NFC, drop ``(...)`` and ``[...]``
    groups, replace every remaining non-word, non-space character with a
    space, collapse whitespace runs, and finally pass the result through
    ``strip_accents`` (which also lower-cases it).

    Args:
        title: Raw title; a falsy value is accepted.

    Returns:
        The normalized title, or ``""`` for falsy input.
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import unicodedata

    if not title:
        return ""
    # Compose first: in decomposed (NFD) text the combining accent marks are
    # not \w, so the punctuation pass below would replace them with spaces
    # and split the word in two.
    title = unicodedata.normalize("NFC", title)
    title = re.sub(r'\([^)]*\)', '', title)
    title = re.sub(r'\[[^\]]*\]', '', title)
    title = re.sub(r'[^\w\s]', ' ', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return strip_accents(title)
|
||||
|
||||
|
||||
def calculate_relevance(query_title: str, query_author: str,
                        result_title: str, result_authors: List[str]) -> int:
    """Score how well a search result matches the query (lower is better).

    The score starts at 500; a title bonus and an author bonus are
    subtracted, so an exact title plus an exact author match yields 0.

    Title (compared after ``normalize_title``):
        exact match -300; one title contained in the other -200; any query
        word longer than two characters found in the result title -100;
        otherwise +200.  An empty query title or result title counts as
        "no match" (+200).

    Author (only when both ``query_author`` and ``result_authors`` are
    given; compared after ``strip_accents`` with whitespace collapsed):
        exact match, or match with the query's last word moved to the
        front, -200; one name contained in the other -100; any query name
        part longer than two characters found in the author -50.  The best
        bonus over all result authors is applied.

    Args:
        query_title: Title the user searched for.
        query_author: Author the user searched for; may be empty.
        result_title: Title of the candidate result.
        result_authors: Author names of the candidate result.

    Returns:
        A non-negative integer; 0 is the best possible match.
    """
    score = 500

    norm_query_title = normalize_title(query_title)
    norm_result_title = normalize_title(result_title)

    if not norm_query_title or not norm_result_title:
        # "" is a substring of every string, so without this guard an empty
        # title would be rewarded as a containment match.
        score += 200
    elif norm_query_title == norm_result_title:
        score -= 300
    elif norm_query_title in norm_result_title or norm_result_title in norm_query_title:
        score -= 200
    elif any(word in norm_result_title for word in norm_query_title.split() if len(word) > 2):
        score -= 100
    else:
        score += 200

    if query_author and result_authors:
        query_parts = strip_accents(query_author).split()
        norm_query_author = " ".join(query_parts)

        if len(query_parts) >= 2:
            # Same name with the last word first, to match the other
            # family-name/given-name order.
            reversed_author = f"{query_parts[-1]} {' '.join(query_parts[:-1])}"
        else:
            reversed_author = norm_query_author
        significant_parts = [part for part in query_parts if len(part) > 2]

        best_bonus = 0
        if norm_query_author:
            for author in result_authors:
                author_norm = " ".join(strip_accents(author).split())
                if not author_norm:
                    # Skip blank authors: "" would match as a substring.
                    continue
                if author_norm == norm_query_author or author_norm == reversed_author:
                    bonus = 200
                elif norm_query_author in author_norm or author_norm in norm_query_author:
                    bonus = 100
                elif any(part in author_norm for part in significant_parts):
                    bonus = 50
                else:
                    bonus = 0
                # Keep the best match over all authors instead of stopping
                # at the first author that matches at any level.
                best_bonus = max(best_bonus, bonus)
                if best_bonus == 200:
                    break
        score -= best_bonus

    return max(0, score)
|
||||
|
||||
|
||||
class Moly_hu(Metadata):
|
||||
__name__ = "Moly.hu"
|
||||
__id__ = "moly_hu"
|
||||
@@ -50,7 +108,6 @@ data:
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
}
|
||||
|
||||
session = requests.Session()
|
||||
@@ -59,34 +116,31 @@ data:
|
||||
def search(
|
||||
self, query: str, generic_cover: str = "", locale: str = "hu"
|
||||
) -> Optional[List[MetaRecord]]:
|
||||
"""Search moly.hu for books matching the query"""
|
||||
|
||||
if not self.active:
|
||||
return []
|
||||
|
||||
val = []
|
||||
query_author = ""
|
||||
query_title = query.strip()
|
||||
|
||||
try:
|
||||
# Search for books
|
||||
search_url = self.SEARCH_URL + requests.utils.quote(query)
|
||||
log.info(f"Moly.hu searching: {search_url}")
|
||||
|
||||
response = self.session.get(search_url, timeout=15)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse search results
|
||||
root = fromstring(response.text)
|
||||
book_links = self._parse_search_results(root, query)
|
||||
book_data = self._parse_search_results(root, query_title, query_author)
|
||||
|
||||
if not book_links:
|
||||
if not book_data:
|
||||
log.info(f"Moly.hu: No results found for '{query}'")
|
||||
return []
|
||||
|
||||
# Fetch details for each book (max 5)
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
|
||||
futures = {
|
||||
executor.submit(self._get_book_details, link, idx): idx
|
||||
for idx, link in enumerate(book_links[:5])
|
||||
executor.submit(self._get_book_details, url, idx, query_title, query_author): idx
|
||||
for idx, (url, _) in enumerate(book_data[:5])
|
||||
}
|
||||
|
||||
for future in concurrent.futures.as_completed(futures, timeout=20):
|
||||
@@ -107,35 +161,42 @@ data:
|
||||
log.error_or_exception(f"Moly.hu search error: {e}")
|
||||
return []
|
||||
|
||||
# Sort by relevance (order from search results)
|
||||
val.sort(key=lambda x: x.source.id if hasattr(x, 'source') else 0)
|
||||
val.sort(key=lambda x: getattr(x, '_relevance_score', 500))
|
||||
return val
|
||||
|
||||
def _parse_search_results(self, root, query: str) -> List[str]:
|
||||
"""Extract book URLs from search results page"""
|
||||
results = root.xpath('//a[@class="book_selector"]/@href')
|
||||
book_urls = []
|
||||
def _parse_search_results(self, root, query_title: str, query_author: str) -> List[Tuple[str, int]]:
|
||||
results = root.xpath('//a[@class="book_selector"]')
|
||||
book_data = []
|
||||
|
||||
for href in results:
|
||||
if href and href not in book_urls:
|
||||
book_urls.append(self.BASE_URL + href)
|
||||
for result in results:
|
||||
href = result.get('href')
|
||||
if not href:
|
||||
continue
|
||||
|
||||
log.info(f"Moly.hu found {len(book_urls)} results")
|
||||
return book_urls
|
||||
text = result.text_content().strip() if result.text_content() else ""
|
||||
result_author = ""
|
||||
result_title = text
|
||||
if ':' in text:
|
||||
parts = text.split(':', 1)
|
||||
result_author = parts[0].strip()
|
||||
result_title = parts[1].strip()
|
||||
|
||||
def _get_book_details(self, url: str, index: int) -> Optional[MetaRecord]:
|
||||
"""Fetch and parse book details from a moly.hu book page"""
|
||||
relevance = calculate_relevance(query_title, query_author, result_title, [result_author])
|
||||
url = self.BASE_URL + href
|
||||
book_data.append((url, relevance))
|
||||
|
||||
book_data.sort(key=lambda x: x[1])
|
||||
log.info(f"Moly.hu found {len(book_data)} results")
|
||||
return book_data
|
||||
|
||||
def _get_book_details(self, url: str, index: int, query_title: str, query_author: str) -> Optional[MetaRecord]:
|
||||
try:
|
||||
response = self.session.get(url, timeout=15)
|
||||
response.raise_for_status()
|
||||
|
||||
# Clean up HTML
|
||||
raw = response.text
|
||||
raw = raw.replace('<em>', '').replace('</em>', '')
|
||||
|
||||
raw = response.text.replace('<em>', '').replace('</em>', '')
|
||||
root = fromstring(raw)
|
||||
|
||||
# Parse all fields
|
||||
title = self._parse_title(root)
|
||||
authors = self._parse_authors(root)
|
||||
|
||||
@@ -157,7 +218,8 @@ data:
|
||||
identifiers={"moly_hu": moly_id},
|
||||
)
|
||||
|
||||
# Optional fields
|
||||
match._relevance_score = calculate_relevance(query_title, query_author, title, authors)
|
||||
|
||||
match.description = self._parse_description(root)
|
||||
match.cover = self._parse_cover(root)
|
||||
match.publisher = self._parse_publisher(root)
|
||||
@@ -165,7 +227,6 @@ data:
|
||||
match.rating = self._parse_rating(root)
|
||||
match.tags = self._parse_tags(root)
|
||||
|
||||
# Series info
|
||||
series_info = self._parse_series(root)
|
||||
if series_info:
|
||||
match.series = series_info[0]
|
||||
@@ -174,7 +235,6 @@ data:
|
||||
except (ValueError, IndexError):
|
||||
match.series_index = 1
|
||||
|
||||
# ISBN
|
||||
isbn = self._parse_isbn(root)
|
||||
if isbn:
|
||||
match.identifiers["isbn"] = isbn
|
||||
@@ -186,7 +246,6 @@ data:
|
||||
return None
|
||||
|
||||
def _parse_moly_id(self, url: str) -> Optional[str]:
|
||||
"""Extract moly.hu book ID from URL"""
|
||||
try:
|
||||
m = re.search(r'/konyvek/(.*)', url)
|
||||
if m:
|
||||
@@ -196,7 +255,6 @@ data:
|
||||
return None
|
||||
|
||||
def _parse_title(self, root) -> Optional[str]:
|
||||
"""Parse book title"""
|
||||
title_node = root.xpath('//*[@id="content"]//*[@class="fn"]/text()')
|
||||
if not title_node:
|
||||
title_node = root.xpath('//*[@id="content"]//*[@class="item"]/text()')
|
||||
@@ -205,26 +263,19 @@ data:
|
||||
return None
|
||||
|
||||
def _parse_authors(self, root) -> List[str]:
    """Collect the author names linked from the book page.

    Reads the text of every ``<a>`` directly inside the ``div`` with class
    ``authors`` under the ``content`` element and strips surrounding
    whitespace from each.

    Returns:
        The author names in document order; an empty list when none exist.
    """
    names = []
    for node in root.xpath('//*[@id="content"]//div[@class="authors"]/a/text()'):
        names.append(str(node).strip())
    return names
|
||||
|
||||
def _parse_description(self, root) -> Optional[str]:
|
||||
"""Parse book description/comments"""
|
||||
description_node = root.xpath(
|
||||
'//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()'
|
||||
)
|
||||
description_node = root.xpath('//*[@id="content"]//*[@class="text" and @id="full_description"]/p/text()')
|
||||
if not description_node:
|
||||
description_node = root.xpath('//*[@id="content"]//*[@class="text"]/p/text()')
|
||||
if not description_node:
|
||||
description_node = root.xpath(
|
||||
'//*[@id="content"]//*[@class="text shrinkable"]/p/text()'
|
||||
)
|
||||
description_node = root.xpath('//*[@id="content"]//*[@class="text shrinkable"]/p/text()')
|
||||
|
||||
if description_node:
|
||||
# Clean up description
|
||||
desc = '\n'.join(description_node)
|
||||
desc = desc.replace('\n\n', '\n').replace('\n \n', '\n')
|
||||
desc = desc.replace('Vigyázat! Cselekményleírást tartalmaz.\n', '')
|
||||
@@ -232,7 +283,6 @@ data:
|
||||
return None
|
||||
|
||||
def _parse_cover(self, root) -> Optional[str]:
|
||||
"""Parse cover image URL"""
|
||||
cover_nodes = root.xpath('(//*[@class="coverbox"]//a/@href)[1]')
|
||||
if cover_nodes:
|
||||
cover_url = cover_nodes[0]
|
||||
@@ -240,7 +290,6 @@ data:
|
||||
cover_url = self.BASE_URL + cover_url
|
||||
return cover_url
|
||||
|
||||
# Fallback: try img src directly
|
||||
img_nodes = root.xpath('//*[@class="coverbox"]//img/@src')
|
||||
if img_nodes:
|
||||
img_url = img_nodes[0]
|
||||
@@ -250,14 +299,9 @@ data:
|
||||
return None
|
||||
|
||||
def _parse_publisher(self, root) -> Optional[str]:
|
||||
"""Parse publisher name"""
|
||||
publisher_node_1 = root.xpath(
|
||||
'//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()'
|
||||
)
|
||||
publisher_node_1 = root.xpath('//*[@id="content"]//*[@class="items"]/div/div[1]/a/text()')
|
||||
if publisher_node_1 and publisher_node_1[0] == '+':
|
||||
publisher_node = root.xpath(
|
||||
'//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()'
|
||||
)
|
||||
publisher_node = root.xpath('//*[@id="content"]//*[@class="items"]/div/div[2]/a/text()')
|
||||
else:
|
||||
publisher_node = publisher_node_1
|
||||
|
||||
@@ -266,14 +310,9 @@ data:
|
||||
return None
|
||||
|
||||
def _parse_published_date(self, root) -> Optional[str]:
|
||||
"""Parse publication date (year)"""
|
||||
publication_node_1 = root.xpath(
|
||||
'//*[@id="content"]//*[@class="items"]/div/div[1]/text()'
|
||||
)
|
||||
publication_node_1 = root.xpath('//*[@id="content"]//*[@class="items"]/div/div[1]/text()')
|
||||
if not publication_node_1:
|
||||
publication_node = root.xpath(
|
||||
'//*[@id="content"]//*[@class="items"]/div/div[2]/text()'
|
||||
)
|
||||
publication_node = root.xpath('//*[@id="content"]//*[@class="items"]/div/div[2]/text()')
|
||||
else:
|
||||
publication_node = publication_node_1
|
||||
|
||||
@@ -284,13 +323,9 @@ data:
|
||||
return None
|
||||
|
||||
def _parse_rating(self, root) -> int:
|
||||
"""Parse rating (converted to 0-5 scale)"""
|
||||
rating_node = root.xpath(
|
||||
'//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()'
|
||||
)
|
||||
rating_node = root.xpath('//*[@id="content"]//*[@class="rating"]//*[@class="like_count"]/text()')
|
||||
if rating_node:
|
||||
try:
|
||||
# Moly.hu uses percentage, convert to 0-5 scale
|
||||
percentage = float(rating_node[0].strip('%').strip())
|
||||
return round(percentage * 0.05)
|
||||
except (ValueError, IndexError):
|
||||
@@ -298,19 +333,15 @@ data:
|
||||
return 0
|
||||
|
||||
def _parse_tags(self, root) -> List[str]:
    """Collect the book's tags, genre tags first.

    Genre tags (class ``tag genre``) are wrapped in square brackets; regular
    tags (class ``tag``) are returned as plain text.  Blank entries are
    dropped from both groups.

    Returns:
        Bracketed genre tags followed by the regular tags.
    """
    def cleaned(xpath_expr):
        # Stripped, non-empty text nodes for one XPath expression.
        stripped = (str(node).strip() for node in root.xpath(xpath_expr))
        return [text for text in stripped if text]

    genre_names = cleaned('//*[@id="book_tags"]//*[@class="tag genre"]/text()')
    plain_names = cleaned('//*[@id="book_tags"]//*[@class="tag"]/text()')

    bracketed = [f"[{name}]" for name in genre_names]
    return bracketed + plain_names
|
||||
|
||||
def _parse_series(self, root) -> Optional[List[str]]:
|
||||
"""Parse series name and index"""
|
||||
series_node = root.xpath('//*[@id="content"]//*[@class="action"]/text()')
|
||||
|
||||
if not series_node:
|
||||
@@ -319,7 +350,6 @@ data:
|
||||
series_text = series_node[0].strip('().')
|
||||
parts = series_text.rsplit(' ', 1)
|
||||
|
||||
# Check if it's actually edition info, not series
|
||||
if len(parts) > 1 and parts[1] == 'kiadás':
|
||||
return None
|
||||
|
||||
@@ -331,26 +361,406 @@ data:
|
||||
return None
|
||||
|
||||
def _parse_isbn(self, root) -> Optional[str]:
    """Return the first ISBN found in the book's detail rows, or None.

    The text nodes of the second ``div`` of the ``items`` element are
    searched first; the third ``div`` is only queried when the second one
    yields nothing.  A match is either 13 digits or a 10-character ISBN-10
    whose final check character may be ``X``.

    Returns:
        The matched ISBN (an ``x`` check character is upper-cased), or
        ``None`` when neither location contains one.
    """
    # ISBN-10 uses "X" for a check value of ten, which plain \d{10} misses.
    isbn_pattern = re.compile(r'(\d{13}|\d{9}[\dXx])')
    locations = (
        '//*[@id="content"]//*[@class="items"]/div/div[2]/text()',
        '//*[@id="content"]//*[@class="items"]/div/div[3]/text()',
    )

    for location in locations:
        for value in root.xpath(location):
            m = isbn_pattern.search(value)
            if m:
                return m.group(1).upper()

    return None
|
||||
|
||||
libri_hu.py: |
|
||||
# -*- coding: utf-8 -*-
|
||||
# Calibre-Web Automated - Libri.hu Metadata Provider
|
||||
# Based on Calibre plugin by Hoffer Csaba, Kloon & Hokutya
|
||||
# Adapted for CWA
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
import concurrent.futures
|
||||
import re
|
||||
import requests
|
||||
from lxml.html import fromstring, tostring
|
||||
from lxml import html as lh
|
||||
from typing import List, Optional, Tuple, Dict
|
||||
|
||||
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
||||
import cps.logger as logger
|
||||
|
||||
log = logger.create()
|
||||
|
||||
|
||||
# Letters that have no Unicode decomposition and therefore need an explicit
# ASCII replacement (Polish l-with-stroke).
_NON_DECOMPOSING = str.maketrans("łŁ", "lL")


def strip_accents(s: str) -> str:
    """Return *s* lower-cased with all accents removed, for comparison.

    The text is canonically decomposed (NFD) and every combining mark is
    dropped, so both precomposed ("ö") and decomposed ("o" + U+0308)
    spellings of an accented letter collapse to the same base letter.
    This covers every Hungarian and Polish letter the previous lookup
    table handled, plus any other letter that decomposes into base letter
    + combining marks.

    Args:
        s: Text to normalize; a falsy value (``""`` or ``None``) is accepted.

    Returns:
        The accent-free, lower-cased text, or ``""`` for falsy input.
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import unicodedata

    if not s:
        return ""
    decomposed = unicodedata.normalize("NFD", s.translate(_NON_DECOMPOSING))
    base_chars = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Lower-case last: lower-casing before stripping can itself introduce
    # combining marks for a few letters.
    return base_chars.lower()
|
||||
|
||||
|
||||
def normalize_title(title: str) -> str:
    """Normalize a book title so that different spellings compare equal.

    Steps, in order: compose the text to NFC, drop ``(...)`` and ``[...]``
    groups, replace every remaining non-word, non-space character with a
    space, collapse whitespace runs, and finally pass the result through
    ``strip_accents`` (which also lower-cases it).

    Args:
        title: Raw title; a falsy value is accepted.

    Returns:
        The normalized title, or ``""`` for falsy input.
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import unicodedata

    if not title:
        return ""
    # Compose first: in decomposed (NFD) text the combining accent marks are
    # not \w, so the punctuation pass below would replace them with spaces
    # and split the word in two.
    title = unicodedata.normalize("NFC", title)
    title = re.sub(r'\([^)]*\)', '', title)
    title = re.sub(r'\[[^\]]*\]', '', title)
    title = re.sub(r'[^\w\s]', ' ', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return strip_accents(title)
|
||||
|
||||
|
||||
def calculate_relevance(query_title: str, query_author: str,
                        result_title: str, result_authors: List[str]) -> int:
    """Score how well a search result matches the query (lower is better).

    The score starts at 500; a title bonus and an author bonus are
    subtracted, so an exact title plus an exact author match yields 0.

    Title (compared after ``normalize_title``):
        exact match -300; one title contained in the other -200; any query
        word longer than two characters found in the result title -100;
        otherwise +200.  An empty query title or result title counts as
        "no match" (+200).

    Author (only when both ``query_author`` and ``result_authors`` are
    given; compared after ``strip_accents`` with whitespace collapsed):
        exact match, or match with the query's last word moved to the
        front, -200; one name contained in the other -100; any query name
        part longer than two characters found in the author -50.  The best
        bonus over all result authors is applied.

    Args:
        query_title: Title the user searched for.
        query_author: Author the user searched for; may be empty.
        result_title: Title of the candidate result.
        result_authors: Author names of the candidate result.

    Returns:
        A non-negative integer; 0 is the best possible match.
    """
    score = 500

    norm_query_title = normalize_title(query_title)
    norm_result_title = normalize_title(result_title)

    if not norm_query_title or not norm_result_title:
        # "" is a substring of every string, so without this guard an empty
        # title would be rewarded as a containment match.
        score += 200
    elif norm_query_title == norm_result_title:
        score -= 300
    elif norm_query_title in norm_result_title or norm_result_title in norm_query_title:
        score -= 200
    elif any(word in norm_result_title for word in norm_query_title.split() if len(word) > 2):
        score -= 100
    else:
        score += 200

    if query_author and result_authors:
        query_parts = strip_accents(query_author).split()
        norm_query_author = " ".join(query_parts)

        if len(query_parts) >= 2:
            # Same name with the last word first, to match the other
            # family-name/given-name order.
            reversed_author = f"{query_parts[-1]} {' '.join(query_parts[:-1])}"
        else:
            reversed_author = norm_query_author
        significant_parts = [part for part in query_parts if len(part) > 2]

        best_bonus = 0
        if norm_query_author:
            for author in result_authors:
                author_norm = " ".join(strip_accents(author).split())
                if not author_norm:
                    # Skip blank authors: "" would match as a substring.
                    continue
                if author_norm == norm_query_author or author_norm == reversed_author:
                    bonus = 200
                elif norm_query_author in author_norm or author_norm in norm_query_author:
                    bonus = 100
                elif any(part in author_norm for part in significant_parts):
                    bonus = 50
                else:
                    bonus = 0
                # Keep the best match over all authors instead of stopping
                # at the first author that matches at any level.
                best_bonus = max(best_bonus, bonus)
                if best_bonus == 200:
                    break
        score -= best_bonus

    return max(0, score)
|
||||
|
||||
|
||||
class Libri_hu(Metadata):
    """Metadata provider that scrapes book data from libri.hu.

    ``search`` requests the site's result list for a query, downloads up to
    five of the listed book pages in parallel and returns one ``MetaRecord``
    per page, ordered by ``calculate_relevance`` (lowest score first).
    """

    __name__ = "Libri.hu"
    __id__ = "libri_hu"

    BASE_URL = "https://www.libri.hu"
    BOOK_URL = BASE_URL + "/konyv"
    SEARCH_URL = BASE_URL + "/talalati-lista"

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'hu-HU,hu;q=0.9,en;q=0.8',
    }

    # Lower-case Hungarian language names mapped to two-letter language
    # codes; used by _translate_language.
    LANGUAGE_CODES = {
        'magyar': 'hu',
        'angol': 'en',
        'amerikai': 'en',
        'német': 'de',
        'francia': 'fr',
        'olasz': 'it',
        'spanyol': 'es',
        'orosz': 'ru',
        'török': 'tr',
        'görög': 'el',
        'kínai': 'zh',
        'japán': 'ja',
    }

    # Class-level session: shared by every instance and by the worker
    # threads started in search().
    session = requests.Session()
    session.headers.update(headers)

    def search(
        self, query: str, generic_cover: str = "", locale: str = "hu"
    ) -> Optional[List[MetaRecord]]:
        """Search libri.hu and return matching records, best match first.

        Args:
            query: Free-text search string; it is also used as the title
                when scoring relevance (no author is extracted from it).
            generic_cover: Accepted for interface compatibility; not used
                here.  NOTE(review): other providers may use this as a
                fallback cover - confirm whether it should be applied.
            locale: Accepted for interface compatibility; not used here.

        Returns:
            A list of ``MetaRecord`` sorted by ascending relevance score.
            An empty list is returned when the provider is inactive, when
            nothing is found, or when the search request fails.
        """
        if not self.active:
            return []

        val = []
        query_author = ""
        query_title = query.strip()

        try:
            # Libri.hu search URL format
            search_url = f"{self.SEARCH_URL}?kereses={requests.utils.quote(query)}"
            log.info(f"Libri.hu searching: {search_url}")

            response = self.session.get(search_url, timeout=15)
            response.raise_for_status()

            root = fromstring(response.text)
            book_data = self._parse_search_results(root, query_title, query_author)

            if not book_data:
                log.info(f"Libri.hu: No results found for '{query}'")
                return []

            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                futures = {
                    executor.submit(self._get_book_details, url, idx, query_title, query_author): idx
                    for idx, (url, _) in enumerate(book_data[:5])
                }

                try:
                    for future in concurrent.futures.as_completed(futures, timeout=20):
                        try:
                            result = future.result()
                            if result:
                                val.append(result)
                        except Exception as e:
                            log.warning(f"Libri.hu worker error: {e}")
                except concurrent.futures.TimeoutError:
                    # as_completed raises when the deadline passes.  Keep the
                    # records already collected instead of letting the
                    # generic handler below discard them, and drop work that
                    # has not started yet.
                    for pending in futures:
                        pending.cancel()
                    log.warning("Libri.hu search timed out")

        except requests.exceptions.Timeout:
            log.warning("Libri.hu search timed out")
            return []
        except requests.exceptions.HTTPError as e:
            log.error(f"Libri.hu HTTP error: {e}")
            return []
        except Exception as e:
            log.error_or_exception(f"Libri.hu search error: {e}")
            return []

        val.sort(key=lambda x: getattr(x, '_relevance_score', 500))
        return val

    def _parse_search_results(self, root, query_title: str, query_author: str) -> List[Tuple[str, int]]:
        """Extract book page URLs from a search results page.

        Args:
            root: Parsed HTML tree of the results page.
            query_title: Unused; the results page is not scored by title.
            query_author: Unused.

        Returns:
            Up to ten ``(url, score)`` tuples in page order, where ``score``
            is ``10 * position`` (a placeholder ranking by position only).
        """
        # Try multiple possible XPath selectors for Libri's search results
        book_links = root.xpath('//*[@id="book-list-result-items"]//h4[@class="book"]/a/@href')

        if not book_links:
            # Alternative selector, filtered to book detail pages
            book_links = [
                link for link in root.xpath('//a[contains(@href, "/konyv/")]/@href')
                if '/konyv/' in link and '.html' in link
            ]

        # De-duplicate while keeping page order; a set would shuffle the
        # links and make both the [:10] cut and the position score arbitrary.
        unique_links = list(dict.fromkeys(str(link) for link in book_links))

        book_data = []
        for position, href in enumerate(unique_links[:10]):  # Limit to 10 results
            url = href if href.startswith('http') else self.BASE_URL + href
            # Can't calculate preliminary relevance without title info from
            # the search page, so use index-based scoring
            book_data.append((url, position * 10))

        log.info(f"Libri.hu found {len(book_data)} results")
        return book_data

    def _get_book_details(self, url: str, index: int, query_title: str, query_author: str) -> Optional[MetaRecord]:
        """Download one book page and convert it to a ``MetaRecord``.

        Args:
            url: Absolute URL of the book page.
            index: Position of the book in the search results (unused).
            query_title: Title used to compute the relevance score.
            query_author: Author used to compute the relevance score.

        Returns:
            The populated record with a ``_relevance_score`` attribute set,
            or ``None`` when the page has no title or any error occurs
            (errors are logged, never raised).
        """
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            # Decode explicitly as UTF-8, replacing undecodable bytes
            root = lh.document_fromstring(response.content.decode('utf-8', errors='replace'))

            # Parse book properties table
            book_props = self._parse_book_properties(root)

            title = self._parse_title(root)
            authors = self._parse_authors(root)

            if not title:
                return None

            libri_id = self._parse_libri_id(url)

            match = MetaRecord(
                id=libri_id,
                title=title,
                authors=authors if authors else [""],
                source=MetaSourceInfo(
                    id=self.__id__,
                    description="Libri.hu - Könyvesbolt",
                    link=self.BASE_URL
                ),
                url=url,
                identifiers={"libri_hu": libri_id},
            )

            # Read back by search() to order the results
            match._relevance_score = calculate_relevance(query_title, query_author, title, authors)

            # ISBN
            isbn = book_props.get('ISBN', '').strip()
            if isbn:
                match.identifiers["isbn"] = isbn

            # Publisher
            publisher = book_props.get('Kiadó', '').strip()
            if publisher:
                match.publisher = publisher

            # Publication date
            pub_year = book_props.get('Kiadás éve', '').strip()
            if pub_year:
                match.publishedDate = pub_year

            # Series
            series = book_props.get('Sorozat', '').strip()
            if series:
                match.series = series

            # Language
            lang = book_props.get('Nyelv', '').strip().lower()
            if lang:
                match.languages = [self._translate_language(lang)]

            match.description = self._parse_description(root)
            match.cover = self._parse_cover(root)
            match.rating = self._parse_rating(root)

            # Tags from breadcrumbs
            match.tags = self._parse_tags(root)

            return match

        except Exception as e:
            log.warning(f"Libri.hu error fetching {url}: {e}")
            return None

    def _parse_book_properties(self, root) -> Dict[str, str]:
        """Read the key/value rows of the book properties table(s).

        For every table row with at least two cells, the first cell's text
        (trailing ``:`` removed) becomes the key and the second cell's text
        the value.  Rows with an empty key or value are skipped.

        Returns:
            Mapping of property label to value; empty when no table matches.
        """
        book_properties = {}

        # Try to find the properties table
        tables = root.xpath('//*[@id="productPageMainItem"]//table')
        if not tables:
            tables = root.xpath('//table[contains(@class, "product")]')

        for table in tables:
            for row in table.findall('.//tr'):
                # Header cells first, so a <th> label precedes its <td> value
                cells = row.findall('.//th') + row.findall('.//td')
                if len(cells) < 2:
                    continue
                key = cells[0].text_content().strip().rstrip(':')
                value = cells[1].text_content().strip()
                if key and value:
                    book_properties[key] = value

        return book_properties

    def _parse_libri_id(self, url: str) -> Optional[str]:
        """Extract the book identifier from a libri.hu book URL.

        Returns:
            The part between ``/konyv/`` and ``.html``; failing that, the
            first path segment after ``/konyv/``; ``None`` when the URL is
            empty or contains neither.
        """
        if not url:
            return None
        for pattern in (r'/konyv/(.*)\.html', r'/konyv/([^/]+)'):
            m = re.search(pattern, url)
            if m:
                return m.group(1)
        return None

    def _parse_title(self, root) -> Optional[str]:
        """Return the book title, with the subtitle appended when present.

        Selectors are tried in order and the first non-blank hit wins.  If a
        subtitle element exists its text is appended after an en dash.

        Returns:
            The title, or ``None`` when no selector yields non-blank text.
        """
        # Try multiple selectors
        selectors = [
            '//*[@id="productPageMainItem"]//*[@class="h2 mb-2"]/text()',
            '//*[@id="productPageMainItem"]//h1/text()',
            '//h1[@class="book-title"]/text()',
            '//meta[@property="og:title"]/@content',
        ]

        for selector in selectors:
            nodes = root.xpath(selector)
            if not nodes:
                continue
            title = nodes[0].strip()
            if not title:
                continue
            # Check for subtitle
            subtitle_nodes = root.xpath('//*[@id="productPageMainItem"]//*[@class="subtitle"]/text()')
            if subtitle_nodes:
                title = f"{title} – {subtitle_nodes[0].strip()}"
            return title
        return None

    def _parse_authors(self, root) -> List[str]:
        """Return the author names listed on the book page.

        Selectors are tried in order; the first one that yields at least one
        non-empty name wins.  Whitespace and dashes are trimmed from the
        ends of each name only, so hyphenated names stay intact.

        Returns:
            The author names, or an empty list when none are found.
        """
        selectors = [
            '//*[@id="productPageMainItem"]/div/div/div[2]/p[1]/a/text()',
            '//*[@id="productPageMainItem"]//a[contains(@href, "/szerzo/")]/text()',
            '//a[@class="author"]/text()',
        ]

        for selector in selectors:
            # Trim only at the edges: removing every "-" would corrupt names
            # such as "Szent-Györgyi".
            cleaned = (str(node).strip().strip('-').strip() for node in root.xpath(selector))
            authors = [name for name in cleaned if name]
            if authors:
                return authors
        return []

    def _parse_description(self, root) -> Optional[str]:
        """Return the text of the first non-empty description element, or None."""
        selectors = [
            '//*[@id="product-description"]',
            '//*[@class="description"]',
            '//*[@itemprop="description"]',
        ]

        for selector in selectors:
            nodes = root.xpath(selector)
            if nodes:
                text = nodes[0].text_content().strip()
                if text:
                    return text
        return None

    def _parse_cover(self, root) -> Optional[str]:
        """Return an absolute cover image URL, or None when none is found.

        Selectors are tried in order.  Protocol-relative URLs (``//host/..``)
        get an ``https:`` scheme; other URLs not starting with ``http`` are
        prefixed with ``BASE_URL``.
        """
        selectors = [
            '//*[@property="og:image"]/@content',
            '//*[@class="cover"]//img/@src',
            '//*[@id="productPageMainItem"]//img/@src',
        ]

        for selector in selectors:
            nodes = root.xpath(selector)
            if not nodes:
                continue
            url = nodes[0].strip()
            if not url:
                continue
            if url.startswith('//'):
                # Protocol-relative: already names a host, so do not prepend
                # the site base.
                return 'https:' + url
            if not url.startswith('http'):
                url = self.BASE_URL + url
            return url
        return None

    def _parse_rating(self, root) -> int:
        """Return the page's ``ratingValue`` rounded to an int, or 0.

        0 is returned when the value is missing or is not a finite number.
        """
        nodes = root.xpath('//*[@id="productPageMainItem"]//*[@itemprop="ratingValue"]/@content')
        if nodes:
            try:
                return round(float(nodes[0].strip()))
            except (ValueError, OverflowError):
                # float() rejects non-numeric text; round() rejects nan/inf
                pass
        return 0

    def _parse_tags(self, root) -> List[str]:
        """Return lower-cased tags taken from the ``navigationBar`` element.

        Blank text nodes and the navigation tokens ``>``, ``/``, ``főoldal``
        and ``home`` are dropped.
        """
        nodes = root.xpath('//*[@id="navigationBar"]//text()')
        if nodes:
            tags = [tag.strip().lower() for tag in nodes if tag.strip()]
            # Filter out navigation elements
            tags = [t for t in tags if t and t not in ['>', '/', 'főoldal', 'home']]
            return tags
        return []

    def _translate_language(self, lang: str) -> str:
        """Map a Hungarian language name to its two-letter code.

        The lookup is case-insensitive; names not present in
        ``LANGUAGE_CODES`` fall back to ``'hu'``.
        """
        return self.LANGUAGE_CODES.get(lang.lower(), 'hu')
|
||||
---
|
||||
# Calibre-Web-Automated Deployment
|
||||
apiVersion: apps/v1
|
||||
@@ -377,7 +787,7 @@ spec:
|
||||
annotations:
|
||||
# Version checker pattern - CWA uses semantic versioning
|
||||
match-regex.version-checker.io/calibre-web-automated: '^V?[0-9]+\.[0-9]+\.[0-9]+$'
|
||||
# Force rollout when ConfigMap changes (update this hash when modifying providers)
|
||||
# Force rollout when ConfigMap changes
|
||||
configmap.reloader.stakater.com/reload: "calibre-custom-metadata-providers"
|
||||
spec:
|
||||
containers:
|
||||
@@ -391,13 +801,10 @@ spec:
|
||||
value: "1000"
|
||||
- name: TZ
|
||||
value: Europe/Budapest
|
||||
# Use default port 8083
|
||||
- name: CWA_PORT_OVERRIDE
|
||||
value: "8083"
|
||||
# Disable WAL mode if on network share (set to true if using NFS)
|
||||
- name: NETWORK_SHARE_MODE
|
||||
value: "false"
|
||||
# Number of proxies in chain (Cloudflare -> nginx-ingress -> app)
|
||||
- name: TRUSTED_PROXY_COUNT
|
||||
value: "2"
|
||||
ports:
|
||||
@@ -433,38 +840,35 @@ spec:
|
||||
port: http
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
# CWA can take time to initialize, especially first run
|
||||
failureThreshold: 60
|
||||
volumeMounts:
|
||||
# Config directory for app database, logs, processed books backup
|
||||
- name: config
|
||||
mountPath: /config
|
||||
# Book ingest folder - files here are DELETED after processing
|
||||
- name: ingest
|
||||
mountPath: /cwa-book-ingest
|
||||
# Calibre library - your existing library location
|
||||
- name: library
|
||||
mountPath: /calibre-library
|
||||
# Custom metadata providers (moly.hu)
|
||||
# Hungarian metadata providers
|
||||
- name: custom-metadata-providers
|
||||
mountPath: /app/calibre-web-automated/cps/metadata_provider/moly_hu.py
|
||||
subPath: moly_hu.py
|
||||
readOnly: true
|
||||
- name: custom-metadata-providers
|
||||
mountPath: /app/calibre-web-automated/cps/metadata_provider/libri_hu.py
|
||||
subPath: libri_hu.py
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: config
|
||||
persistentVolumeClaim:
|
||||
claimName: calibre-web-automated-config
|
||||
# Ingest folder on hostPath for easy file dropping
|
||||
- name: ingest
|
||||
hostPath:
|
||||
path: /mnt/4_hdd/data/calibre-ingest
|
||||
type: DirectoryOrCreate
|
||||
# Your existing Calibre library location
|
||||
- name: library
|
||||
hostPath:
|
||||
path: /mnt/4_hdd/data/calibre
|
||||
type: DirectoryOrCreate
|
||||
# Custom metadata providers from ConfigMap
|
||||
- name: custom-metadata-providers
|
||||
configMap:
|
||||
name: calibre-custom-metadata-providers
|
||||
@@ -489,7 +893,7 @@ spec:
|
||||
app.kubernetes.io/instance: calibre
|
||||
app.kubernetes.io/name: calibre-web-automated
|
||||
---
|
||||
# Main Ingress (books.dooplex.hu - primary reading interface)
|
||||
# Main Ingress (books.dooplex.hu)
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
@@ -505,7 +909,6 @@ metadata:
|
||||
nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
|
||||
nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
# Forward auth headers for Authentik integration
|
||||
nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
|
||||
nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
|
||||
nginx.ingress.kubernetes.io/configuration-snippet: |
|
||||
@@ -544,7 +947,7 @@ spec:
|
||||
port:
|
||||
number: 8083
|
||||
---
|
||||
# Config PVC - stores app.db, logs, processed_books backup
|
||||
# Config PVC
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
@@ -561,36 +964,4 @@ spec:
|
||||
storageClassName: longhorn
|
||||
resources:
|
||||
requests:
|
||||
# Larger than typical - stores backup of processed books by default
|
||||
storage: 10Gi
|
||||
---
|
||||
# Optional: Authentik integration for SSO
|
||||
# Uncomment and configure if using Authentik proxy authentication
|
||||
# apiVersion: networking.k8s.io/v1
|
||||
# kind: Ingress
|
||||
# metadata:
|
||||
# name: calibre-web-automated-auth
|
||||
# namespace: calibre-system
|
||||
# annotations:
|
||||
# cert-manager.io/cluster-issuer: letsencrypt-prod
|
||||
# nginx.ingress.kubernetes.io/auth-url: http://authentik-outpost-proxy.authentik-system.svc.cluster.local:9000/outpost.goauthentik.io/auth/nginx
|
||||
# nginx.ingress.kubernetes.io/auth-signin: https://auth.dooplex.hu/outpost.goauthentik.io/start?rd=$escaped_request_uri
|
||||
# nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid
|
||||
# nginx.ingress.kubernetes.io/auth-snippet: proxy_set_header X-Forwarded-Host $http_host;
|
||||
# spec:
|
||||
# ingressClassName: nginx-internal
|
||||
# tls:
|
||||
# - hosts:
|
||||
# - books.dooplex.hu
|
||||
# secretName: calibre-web-automated-tls
|
||||
# rules:
|
||||
# - host: books.dooplex.hu
|
||||
# http:
|
||||
# paths:
|
||||
# - path: /
|
||||
# pathType: Prefix
|
||||
# backend:
|
||||
# service:
|
||||
# name: calibre-web-automated
|
||||
# port:
|
||||
# number: 8083
|
||||
Reference in New Issue
Block a user