改编,不是乱编

This commit is contained in:
ZaneYork 2024-01-16 12:01:58 +08:00
parent 53099b46c7
commit 516b23abfd
21 changed files with 923 additions and 295 deletions

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
</content>
<orderEntry type="jdk" jdkName="calibre-web-api" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -1,32 +1,13 @@
# calibre-web-douban-api
# calibre-web-novel-api
**2023-07-15**
此项目从[calibre-web-douban-api](https://github.com/fugary/calibre-web-douban-api)项目Fork而来,是calibre的网文搜索api provider,需要放到metadata_provider目录下。
最新豆瓣屏蔽直接访问封面图片,自动实现使用本地代理,封面图片保存时使用requests下载并保存。
**2022-10-08**
原douban列表url被屏蔽老版本不能访问建议更新最新的[NewDouban.py](https://github.com/fugary/calibre-web-douban-api/releases/download/v1.1.0/NewDouban.py)
**2022-08-10**
**最新V0.6.19版本的calbire-web的豆瓣插件已经回来了除了标签外应该都有数据了可以不用此插件了**
新版calibre-web已经移除douban-api了而且把从get_meta.js中直接发起请求获取数据改成了从服务端使用python获取数据。
此项目是添加一个豆瓣api provider实现需要放到metadata_provider目录下
### 使用方法
复制`src/NewDouban.py`到`calibre-web/cps/metadata_provider/`目录下,重启项目即可。
复制`src/cps/metadata_provider/`下全部文件到`calibre-web/cps/metadata_provider/`目录下,重启项目即可。
此应用是基于Python抓取网页的形式获取书籍信息频率过高访问可能被屏蔽。
参考文档:https://fugary.com/?p=238
**新版calibre-web 0.6.17以上使用**
小于0.6.17版本请下载https://github.com/fugary/calibre-web-douban-api/releases/tag/0.6.16

View File

@ -1,266 +0,0 @@
import random
import re
import time
import dataclasses
import urllib
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, unquote
from lxml import etree
from functools import lru_cache
from cps.services.Metadata import Metadata, MetaSourceInfo, MetaRecord
from cps.search_metadata import meta
from flask import request, Response
from cps import helper
# Whether cover URLs are automatically proxied through the local server.
DOUBAN_PROXY_COVER = True
# If the auto-detected server address is wrong, set your own calibre-web
# address here, e.g. http://nas_ip:8083/
DOUBAN_PROXY_COVER_HOST_URL = ''
DOUBAN_PROXY_COVER_PATH = 'metadata/douban_cover?cover='
DOUBAN_SEARCH_URL = "https://www.douban.com/search"
DOUBAN_BASE = "https://book.douban.com/"
DOUBAN_COVER_DOMAIN = 'doubanio.com'
DOUBAN_BOOK_CAT = "1001"
DOUBAN_BOOK_CACHE_SIZE = 500  # maximum number of cached book pages
DOUBAN_CONCURRENCY_SIZE = 5  # number of concurrent detail-page fetches
DOUBAN_BOOK_URL_PATTERN = re.compile(".*/subject/(\\d+)/?")
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': DOUBAN_BASE
}
PROVIDER_NAME = "New Douban Books"
PROVIDER_ID = "new_douban"
class NewDouban(Metadata):
    """calibre-web metadata provider that scrapes douban book pages."""
    __name__ = PROVIDER_NAME
    __id__ = PROVIDER_ID

    def __init__(self):
        self.searcher = DoubanBookSearcher()
        # Patch calibre-web's cover download once, at provider construction.
        self.hack_helper_cover()
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Return a list of DoubanMetaRecord, or None when the provider is disabled."""
        if self.active:
            return self.searcher.search_books(query)

    @staticmethod
    def hack_helper_cover():
        """
        Override helper.save_cover_from_url so douban covers are downloaded
        with douban-friendly headers (douban blocks hot-linked cover fetches).
        :return:
        """
        # Keep a reference to the original implementation for non-douban URLs.
        save_cover = helper.save_cover_from_url

        def new_save_cover(url, book_path):
            if DOUBAN_COVER_DOMAIN in url:
                cover_url = url
                if DOUBAN_PROXY_COVER:
                    # The URL points at our local proxy route; unwrap the real
                    # douban cover URL from its 'cover' query parameter.
                    component = urllib.parse.urlparse(url)
                    query = urllib.parse.parse_qs(component.query)
                    cover_url = urllib.parse.unquote(query.get('cover')[0])
                res = requests.get(cover_url, headers=DEFAULT_HEADERS)
                return helper.save_cover(res, book_path)
            else:
                return save_cover(url, book_path)

        helper.save_cover_from_url = new_save_cover
@dataclasses.dataclass
class DoubanMetaRecord(MetaRecord):
    """MetaRecord whose cover URL is rewritten to go through the local proxy route."""

    def __getattribute__(self, item):  # cover is accessed through the local proxy
        if item == 'cover' and DOUBAN_PROXY_COVER:
            cover_url = super().__getattribute__(item)
            if cover_url:
                try:
                    host_url = DOUBAN_PROXY_COVER_HOST_URL
                    # Fall back to the current request's host when no explicit
                    # host is configured.
                    if not host_url and request.host_url:
                        host_url = request.host_url
                    # Only rewrite once: skip URLs that already point at the proxy.
                    if host_url and host_url not in cover_url:
                        self.cover = host_url + DOUBAN_PROXY_COVER_PATH + urllib.parse.quote(cover_url)
                except BaseException:
                    # request.host_url raises outside a request context;
                    # keep the raw douban URL in that case.
                    pass
        return super().__getattribute__(item)
class DoubanBookSearcher:
    """Searches douban.com and loads matching book detail pages concurrently."""

    def __init__(self):
        self.book_loader = DoubanBookLoader()
        self.thread_pool = ThreadPoolExecutor(max_workers=10, thread_name_prefix='douban_async')

    def calc_url(self, href):
        """Extract the real book URL from a douban search redirect link.

        :param href: redirect href from the search result page
        :return: the target URL when it is a book subject page, else None
        """
        # parse_qs is robust where naive "split('=')" was not: it tolerates
        # missing values, repeated keys and '=' inside the encoded URL.
        params = urllib.parse.parse_qs(urlparse(href).query)
        url_values = params.get('url')
        if not url_values:
            # Redirect link without a 'url' parameter: not a book link.
            return None
        url = unquote(url_values[0])
        if DOUBAN_BOOK_URL_PATTERN.match(url):
            return url

    def load_book_urls_new(self, query):
        """Search douban and return up to DOUBAN_CONCURRENCY_SIZE book URLs."""
        url = DOUBAN_SEARCH_URL
        params = {"cat": DOUBAN_BOOK_CAT, "q": query}
        res = requests.get(url, params, headers=DEFAULT_HEADERS)
        book_urls = []
        if res.status_code in [200, 201]:
            html = etree.HTML(res.content)
            alist = html.xpath('//a[@class="nbg"]')
            for link in alist:
                href = link.attrib['href']
                parsed = self.calc_url(href)
                if parsed and len(book_urls) < DOUBAN_CONCURRENCY_SIZE:
                    book_urls.append(parsed)
        return book_urls

    def search_books(self, query):
        """Load all found detail pages in parallel; returns DoubanMetaRecords."""
        book_urls = self.load_book_urls_new(query)
        books = []
        futures = [self.thread_pool.submit(self.book_loader.load_book, book_url) for book_url in book_urls]
        for future in as_completed(futures):
            book = future.result()
            if book is not None:
                # Fix: reuse the already-fetched result instead of calling
                # future.result() a second time.
                books.append(book)
        return books
class DoubanBookLoader:
    """Downloads and parses a single douban book detail page (LRU-cached)."""

    def __init__(self):
        self.book_parser = DoubanBookHtmlParser()

    # NOTE(review): lru_cache on an instance method keys on self and keeps the
    # instance alive for the cache's lifetime; tolerable here because a single
    # loader is created per searcher — confirm if loaders ever multiply.
    @lru_cache(maxsize=DOUBAN_BOOK_CACHE_SIZE)
    def load_book(self, url):
        """Fetch *url* and return a parsed record, or None on non-2xx status."""
        book = None
        # Small randomized delay to avoid hammering douban.
        self.random_sleep()
        start_time = time.time()
        res = requests.get(url, headers=DEFAULT_HEADERS)
        if res.status_code in [200, 201]:
            print("下载书籍:{}成功,耗时{:.0f}ms".format(url, (time.time() - start_time) * 1000))
            book_detail_content = res.content
            book = self.book_parser.parse_book(url, book_detail_content.decode("utf8"))
        return book

    def random_sleep(self):
        """Sleep a random duration of up to 0.1 seconds."""
        random_sec = random.random() / 10
        print("Random sleep time {}s".format(random_sec))
        time.sleep(random_sec)
class DoubanBookHtmlParser:
    """Parses a douban book detail page into a DoubanMetaRecord."""

    def __init__(self):
        self.id_pattern = DOUBAN_BOOK_URL_PATTERN
        # Publish dates of the form "YYYY-M".
        self.date_pattern = re.compile("(\\d{4})-(\\d+)")
        # Fallback tag source: the page's embedded "criteria = '...'" script.
        self.tag_pattern = re.compile("criteria = '(.+)'")

    def parse_book(self, url, book_content):
        """Build a DoubanMetaRecord from the detail page HTML fetched at *url*."""
        book = DoubanMetaRecord(
            id="",
            title="",
            authors=[],
            publisher="",
            description="",
            url="",
            source=MetaSourceInfo(
                id=PROVIDER_ID,
                description=PROVIDER_NAME,
                link="https://book.douban.com/"
            )
        )
        html = etree.HTML(book_content)
        title_element = html.xpath("//span[@property='v:itemreviewed']")
        book.title = self.get_text(title_element)
        share_element = html.xpath("//a[@data-url]")
        if len(share_element):
            # Prefer the page's canonical share URL over the fetched one.
            url = share_element[0].attrib['data-url']
        book.url = url
        id_match = self.id_pattern.match(url)
        if id_match:
            book.id = id_match.group(1)
        img_element = html.xpath("//a[@class='nbg']")
        if len(img_element):
            cover = img_element[0].attrib['href']
            # 'update_image' is douban's placeholder for a missing cover.
            if not cover or cover.endswith('update_image'):
                book.cover = ''
            else:
                book.cover = cover
        rating_element = html.xpath("//strong[@property='v:average']")
        book.rating = self.get_rating(rating_element)
        # The info panel is a flat list of "<label>: value" spans; dispatch on
        # the Chinese label text.
        elements = html.xpath("//span[@class='pl']")
        for element in elements:
            text = self.get_text(element)
            if text.startswith("作者") or text.startswith("译者"):
                # "作者" = author, "译者" = translator.
                book.authors.extend([self.get_text(author_element) for author_element in
                                     filter(self.author_filter, element.findall("..//a"))])
            elif text.startswith("出版社"):
                # publisher
                book.publisher = self.get_tail(element)
            elif text.startswith("副标题"):
                # subtitle, appended to the main title
                book.title = book.title + ':' + self.get_tail(element)
            elif text.startswith("出版年"):
                # publish year
                book.publishedDate = self.get_publish_date(self.get_tail(element))
            elif text.startswith("丛书"):
                # series name lives in the next sibling element
                book.series = self.get_text(element.getnext())
            elif text.startswith("ISBN"):
                book.identifiers["isbn"] = self.get_tail(element)
        summary_element = html.xpath("//div[@id='link-report']//div[@class='intro']")
        if len(summary_element):
            # Keep the raw HTML of the last (fullest) intro block.
            book.description = etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()
        tag_elements = html.xpath("//a[contains(@class, 'tag')]")
        if len(tag_elements):
            book.tags = [self.get_text(tag_element) for tag_element in tag_elements]
        else:
            book.tags = self.get_tags(book_content)
        return book

    def get_tags(self, book_content):
        """Fallback tag extraction from the '7:'-prefixed criteria script entries."""
        tag_match = self.tag_pattern.findall(book_content)
        if len(tag_match):
            return [tag.replace('7:', '') for tag in
                    filter(lambda tag: tag and tag.startswith('7:'), tag_match[0].split('|'))]
        return []

    def get_publish_date(self, date_str):
        """Normalize "YYYY-M" to "YYYY-M-1"; other formats pass through unchanged."""
        if date_str:
            date_match = self.date_pattern.fullmatch(date_str)
            if date_match:
                date_str = "{}-{}-1".format(date_match.group(1), date_match.group(2))
        return date_str

    def get_rating(self, rating_element):
        """Douban rates out of 10; calibre-web uses a 5-star scale."""
        return float(self.get_text(rating_element, '0')) / 2

    def author_filter(self, a_element):
        """Keep only links pointing at an author page or author search."""
        a_href = a_element.attrib['href']
        return '/author' in a_href or '/search' in a_href

    def get_text(self, element, default_str=''):
        """First non-empty text of an xpath result list or single element, else default."""
        text = default_str
        if len(element) and element[0].text:
            text = element[0].text.strip()
        elif isinstance(element, etree._Element) and element.text:
            text = element.text.strip()
        return text if text else default_str

    def get_tail(self, element, default_str=''):
        """Tail text of *element*, falling back to the next sibling's text."""
        text = default_str
        if isinstance(element, etree._Element) and element.tail:
            text = element.tail.strip()
            if not text:
                text = self.get_text(element.getnext(), default_str)
        return text if text else default_str
@meta.route("/metadata/douban_cover", methods=["GET"])
def proxy_douban_cover():
    """
    Proxy the douban cover image so the browser never hits douban directly.

    :return: image bytes with the upstream content type, or HTTP 400 when the
        ``cover`` query parameter is missing.
    """
    cover_param = request.args.get('cover')
    if not cover_param:
        # Fix: unquote(None) raised TypeError; reject the bad request instead.
        return Response(status=400)
    cover_url = urllib.parse.unquote(cover_param)
    res = requests.get(cover_url, headers=DEFAULT_HEADERS)
    # Fix: tolerate upstream responses without a Content-Type header.
    return Response(res.content, mimetype=res.headers.get('Content-Type', 'image/jpeg'))

View File

@ -0,0 +1,57 @@
import traceback
from cps.metadata_provider.net.QQReader import QQReader
from cps.metadata_provider.net.Qidian import Qidian
from cps.metadata_provider.net.Qimao import Qimao
from cps.metadata_provider.net.Fanqie import Fanqie
from cps.metadata_provider.net.Tadu import Tadu
from cps.metadata_provider.net.Zxcs import Zxcs
from cps.services.Metadata import Metadata
# Provider identity exposed to calibre-web.
PROVIDER_NAME = "网文聚合"
PROVIDER_ID = "netnovel"
class NetNovel(Metadata):
    """Aggregating provider that fans one query out to all net-novel providers.

    Query format is "title" or "title;author". An exact title+author hit is
    returned exclusively; otherwise title-only matches are preferred over the
    remaining results.
    """
    __name__ = PROVIDER_NAME
    __id__ = PROVIDER_ID

    def __init__(self):
        self.searchers = [
            Qidian(),
            QQReader(),
            Qimao(),
            Fanqie(),
            Tadu(),
            Zxcs()
        ]
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Query every sub-provider; returns a list of records or None when inactive."""
        if not self.active:
            return None
        sections = query.split(';')
        author = '未知'  # placeholder author when the query carries none
        if len(sections) > 1:
            title = sections[0]
            author = sections[1]
        else:
            title = query
        result_title_match = []
        result_rest = []
        for searcher in self.searchers:
            try:
                # Fix: an inactive sub-provider returns None; guard before iterating.
                results = searcher.search(query) or []
                for result in results:
                    if result.title == title:
                        if author in result.authors:
                            # Exact title+author hit: trust it exclusively.
                            return [result]
                        result_title_match.append(result)
                    else:
                        result_rest.append(result)
            except Exception:
                # Fix: the bare "except:" also swallowed KeyboardInterrupt and
                # SystemExit; a broken provider must not kill the aggregation.
                traceback.print_exc()
        if result_title_match:
            return result_title_match
        result_title_match.extend(result_rest)
        return result_title_match

View File

@ -0,0 +1,183 @@
import dataclasses
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
import requests
from lxml import etree
from cps.services.Metadata import MetaRecord, MetaSourceInfo
CONST_BOOK_CACHE_SIZE = 500  # maximum number of cached book pages
CONST_CONCURRENCY_SIZE = 5  # number of concurrent queries
class GenericSearchDefine:
    """Per-site search definition: URLs, patterns and page-extraction hooks.

    Subclasses implement the get_* hooks; GenericSearchBookSearcher and
    GenericSearchBookHtmlParser drive them.
    """

    def __init__(self,
                 base_url,
                 book_url_pattern,
                 provider_name,
                 provider_id
                 ):
        # Site root, also sent as the Referer header.
        self.base_url = base_url
        # Regex whose group(1) captures the site's numeric book id.
        self.book_url_pattern = book_url_pattern
        self.provider_name = provider_name
        self.provider_id = provider_id
        self.default_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': base_url
        }

    def get_search_page(self, query):
        """Perform the site's search request; must return a requests.Response."""
        pass

    def get_search_results(self, html, query, author):
        """Return (results, exact_results) extracted from the search page."""
        pass

    def get_title(self, html):
        """Return the book title from a parsed detail page."""
        pass

    def get_author(self, html):
        """Return the list of authors from a parsed detail page."""
        pass

    def get_cover(self, html):
        """Return the cover image URL from a parsed detail page."""
        pass

    def get_description(self, html):
        """Return the description (HTML string) from a parsed detail page."""
        pass

    def get_tags(self, html):
        """Return the list of tags from a parsed detail page."""
        pass

    def get_text(self, element, default_str=''):
        """First non-empty text of an xpath result list or single element, else default."""
        text = default_str
        if len(element) and element[0].text:
            text = element[0].text.strip()
        elif isinstance(element, etree._Element) and element.text:
            text = element.text.strip()
        return text if text else default_str
@dataclasses.dataclass
class GenericSearchMetaRecord(MetaRecord):
    """MetaRecord subtype used by the net-novel providers.

    The previous ``__getattribute__`` override (a leftover from the douban
    cover-proxy record) only delegated to ``super()`` and has been removed
    as a no-op; behavior is unchanged.
    """
class GenericSearchBookSearcher:
    """Drives a GenericSearchDefine: runs the search and loads the results."""

    def __init__(self, definition: GenericSearchDefine):
        self.definition = definition
        self.book_loader = GenericSearchBookLoader(definition)
        self.thread_pool = ThreadPoolExecutor(max_workers=10, thread_name_prefix=definition.provider_id + '_async')

    @staticmethod
    def _split_query(query):
        """Split "title;author" into (title, author); author is None when absent."""
        sections = query.split(';')
        if len(sections) > 1:
            return sections[0], sections[1]
        return query, None

    def _run_search(self, query):
        """Fetch the search page and return (results, exact_results).

        Previously this logic was duplicated verbatim in load_book_urls_new
        and search_books_single.
        """
        query, author = self._split_query(query)
        res = self.definition.get_search_page(query)
        results, exact_results = [], []
        if res.status_code in [200, 201]:
            # JSON search APIs (e.g. Fanqie) yield a dict; HTML pages an lxml tree.
            if res.headers['Content-Type'].find('/json') >= 0:
                html = res.json()
            else:
                html = etree.HTML(res.content)
            results, exact_results = self.definition.get_search_results(html, query, author)
        return results, exact_results

    def load_book_urls_new(self, query):
        """Return exact-match URLs when present, otherwise the first 3 candidates."""
        book_urls, exact_urls = self._run_search(query)
        if len(exact_urls) > 0:
            return exact_urls
        return book_urls[:3]

    def search_books(self, query):
        """Load detail pages for all candidate URLs in parallel."""
        book_urls = self.load_book_urls_new(query)
        books = []
        futures = [self.thread_pool.submit(self.book_loader.load_book, book_url) for book_url in book_urls]
        for future in as_completed(futures):
            book = future.result()
            if book is not None:
                # Fix: reuse the fetched result instead of calling
                # future.result() a second time.
                books.append(book)
        return books

    def search_books_single(self, query):
        """Single-request flow for sites whose search results are full records."""
        books, exact_books = self._run_search(query)
        if len(exact_books) > 0:
            return exact_books
        return books[:3]
class GenericSearchBookLoader:
    """Downloads and parses one book detail page (LRU-cached)."""

    def __init__(self, definition: GenericSearchDefine):
        self.book_parser = GenericSearchBookHtmlParser(definition)
        self.definition = definition

    # NOTE(review): lru_cache on an instance method keys on self and keeps the
    # instance alive for the cache's lifetime; tolerable while one loader
    # exists per provider — confirm if that assumption changes.
    @lru_cache(maxsize=CONST_BOOK_CACHE_SIZE)
    def load_book(self, url):
        """Fetch *url* and return a parsed record, or None on non-2xx status."""
        book = None
        # Small randomized delay to avoid hammering the site.
        self.random_sleep()
        start_time = time.time()
        res = requests.get(url, headers=self.definition.default_headers)
        if res.status_code in [200, 201]:
            print("下载书籍:{}成功,耗时{:.0f}ms".format(url, (time.time() - start_time) * 1000))
            book_detail_content = res.content
            book = self.book_parser.parse_book(url, book_detail_content.decode("utf8"))
        return book

    def random_sleep(self):
        """Sleep a random duration of up to 0.1 seconds."""
        random_sec = random.random() / 10
        print("Random sleep time {}s".format(random_sec))
        time.sleep(random_sec)
class GenericSearchBookHtmlParser:
    """Builds a GenericSearchMetaRecord from a book detail page via the definition's hooks."""

    def __init__(self, definition: GenericSearchDefine):
        self.definition = definition
        self.id_pattern = definition.book_url_pattern

    def parse_book(self, url, book_content):
        """Parse *book_content* (the page fetched from *url*) into a record."""
        site = self.definition
        record = GenericSearchMetaRecord(
            id="",
            title="",
            authors=[],
            publisher="",
            description="",
            url="",
            source=MetaSourceInfo(
                id=site.provider_id,
                description=site.provider_name,
                link=site.base_url
            )
        )
        dom = etree.HTML(book_content)
        record.title = site.get_title(dom)
        record.url = url
        matched = self.id_pattern.match(url)
        if matched:
            record.id = matched.group(1)
        record.cover = site.get_cover(dom)
        # These sites expose no real publisher; the provider name stands in.
        record.publisher = site.provider_name
        record.authors = site.get_author(dom)
        record.description = site.get_description(dom)
        record.tags = site.get_tags(dom)
        return record

View File

@ -0,0 +1,73 @@
import re
import requests
from lxml import etree
from lxml.etree import HTML
from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher, GenericSearchMetaRecord
from cps.services.Metadata import Metadata, MetaSourceInfo
class FanqieSearchDefine(GenericSearchDefine):
    """Search definition for fanqienovel.com, backed by its JSON search API."""

    def __init__(self):
        self.search_url = "https://fanqienovel.com/api/author/search/search_book/v1?filter=127%2C127%2C127%2C127&page_count=10&page_index=0&query_type=0&query_word="
        base_url = "https://fanqienovel.com"
        book_url_pattern = re.compile(".*/page/(\\d+)/?")
        provider_name = "番茄小说网"
        provider_id = "fanqienovel"
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)

    def get_search_page(self, query):
        """Call the JSON search API; the query is appended to the URL."""
        url = self.search_url + query
        return requests.get(url, {}, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Build records from the JSON payload.

        :param html: decoded JSON response body (a dict)
        :return: (all_books, exact_title_matches); an exact title+author hit
            short-circuits as ([], [book]).
        """
        books = []
        exact_books = []
        alist = html["data"]["search_book_data_list"]
        for link in alist:
            book_id = link['book_id']
            parsed = "https://fanqienovel.com/page/" + book_id
            title = link['book_name']
            # Fix: the original reassigned the *author* parameter here, which
            # made the "author ==" comparison below compare a value against
            # itself and therefore always succeed.
            book_author = link['author']
            thumb_url = link['thumb_url']
            category = link['category'].split(',')
            description = link['book_abstract']
            book = GenericSearchMetaRecord(
                id=book_id,
                title=title,
                authors=[book_author],
                cover=thumb_url,
                publisher=self.provider_name,
                description=description,
                url=parsed,
                tags=category,
                source=MetaSourceInfo(
                    id=self.provider_id,
                    description=self.provider_name,
                    link=self.base_url
                )
            )
            books.append(book)
            if title == query:
                # No requested author keeps the original first-title-hit
                # behavior; with an author, require a genuine match.
                if author is None or author == book_author:
                    return [], [book]
                exact_books.append(book)
        return books, exact_books
definition = FanqieSearchDefine()


class Fanqie(Metadata):
    """calibre-web metadata provider backed by FanqieSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Single-request search; inactive providers yield None."""
        return self.searcher.search_books_single(query) if self.active else None

View File

@ -0,0 +1,91 @@
import re
import requests
from lxml import etree
from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher
from cps.services.Metadata import Metadata
class JinjiangSearchDefine(GenericSearchDefine):
    """Search definition for jjwxc.net (HTML search page)."""

    def __init__(self):
        self.search_url = "https://www.jjwxc.net/search.php?kw="
        base_url = "https://www.jjwxc.net"
        book_url_pattern = re.compile(".*/onebook.php\\?novelid=(\\d+)/?")
        provider_name = "晋江文学城"
        provider_id = "jinjiang"
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)
        self.default_headers['Accept-Charset'] = "utf-8"

    def get_search_page(self, query):
        """Run the site search; t=1 and ord=relate select novel results by relevance."""
        url = self.search_url + query + "&t=1&ord=relate"
        return requests.get(url, {}, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Collect book URLs; returns (book_urls, exact_urls).

        An exact title+author hit short-circuits as ([], [url]).
        """
        book_urls = []
        exact_urls = []
        alist = html.xpath("id('search_result')/div/h3[@class='title']/a")
        for link in alist:
            href = link.attrib['href']
            if self.book_url_pattern.match(href):
                parsed = href
                book_urls.append(parsed)
            else:
                continue
            title = link.xpath('text()')[0]
            if title == query:
                item_base = link.getparent().getparent()
                if len(item_base):
                    item_author = item_base.xpath('//div[@class="info"]/a[contains(@href,"oneauthor")]/span/text()')
                    if len(item_author) > 0:
                        if author == item_author[0]:
                            return [], [parsed]
                exact_urls.append(parsed)
        return book_urls, exact_urls

    def get_title(self, html):
        title_element = html.xpath('//h1[@itemprop="name"]/span[@itemprop="articleSection"]')
        return self.get_text(title_element)

    def get_author(self, html):
        author_element = html.xpath('//h2/a/span[@itemprop="author"]')
        return [self.get_text(author_element)]

    def get_cover(self, html):
        img_element = html.xpath('//img[@class="noveldefaultimage"]')
        if len(img_element):
            cover = img_element[0].attrib['src']
            if not cover:
                return ''
            else:
                # src is protocol-relative.
                return "https:" + cover

    def get_description(self, html):
        """Return the intro block's HTML with its trailing links removed."""
        summary_element = html.xpath('id("novelintro")')
        if len(summary_element):
            # Fix: the original called list.remove(list) on the two xpath
            # result lists, which raised ValueError whenever the intro
            # contained links; remove the <a> children from the element itself.
            for link_child in summary_element[0].xpath('a'):
                summary_element[0].remove(link_child)
            return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()

    def get_tags(self, html):
        tag_elements = html.xpath('//div[@class="smallreadbody"]/span/a[contains(@href,"bookbase")]')
        if len(tag_elements):
            return [self.get_text(tag_element) for tag_element in tag_elements]
definition = JinjiangSearchDefine()


class Jinjiang(Metadata):
    """calibre-web metadata provider backed by JinjiangSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Concurrent detail-page search; inactive providers yield None."""
        return self.searcher.search_books(query) if self.active else None

View File

@ -0,0 +1,87 @@
import re
import requests
from lxml import etree
from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher
from cps.services.Metadata import Metadata
class QQReaderSearchDefine(GenericSearchDefine):
    """Search definition for book.qq.com (HTML search page)."""

    def __init__(self):
        self.search_url = "https://book.qq.com/so/"
        base_url = "https://book.qq.com/"
        book_url_pattern = re.compile(".*/book-detail/(\\d+)/?")
        provider_name = "QQ阅读"
        provider_id = "qqreader"
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)

    def get_search_page(self, query):
        """Fetch the search page; the query is appended to the /so/ path."""
        url = self.search_url + query
        return requests.get(url, {}, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Collect book detail URLs; returns (book_urls, exact_urls).

        An exact title+author hit short-circuits as ([], [url]).
        """
        book_urls = []
        exact_urls = []
        alist = html.xpath("//div[contains(@class,'book-large')]/a")
        for link in alist:
            href = link.attrib['href']
            if self.book_url_pattern.match(href):
                # hrefs are protocol-relative; prepend the scheme.
                parsed = "https:" + href
                book_urls.append(parsed)
            else:
                continue
            title = link.attrib['title']
            if title == query:
                item_base = link
                if len(item_base):  # element has children to search within
                    item_author = item_base.xpath('div[@class="content"]/p[@class="other"]/object/a[contains(@href,"book-writer")]/text()')
                    if len(item_author) > 0:
                        if author == item_author[0]:
                            return [], [parsed]
                exact_urls.append(parsed)
        return book_urls, exact_urls

    def get_title(self, html):
        title_element = html.xpath("//h1[@class='book-title']/text()")
        return title_element[0].strip()

    def get_author(self, html):
        author_element = html.xpath("//div[@class='book-meta']/a[contains(@class,'author')]")
        # NOTE(review): replace('', '') is a no-op — it looks like a suffix
        # character (e.g. "著") was lost from the literal; confirm against the
        # live page before changing it.
        return [self.get_text(author_element).replace('', '')]

    def get_cover(self, html):
        img_element = html.xpath("//div[@class='page-header-content']//img[@class='ypc-book-cover']")
        if len(img_element):
            cover = img_element[0].attrib['src']
            if not cover:
                return ''
            else:
                return cover

    def get_description(self, html):
        summary_element = html.xpath("//div[contains(@class,'book-intro')]")
        if len(summary_element):
            # Keep the raw HTML of the intro block.
            return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()

    def get_tags(self, html):
        tag_elements = html.xpath("//div[@class='book-tags']/a[contains(@class,'tag')]")
        if len(tag_elements):
            return [self.get_text(tag_element) for tag_element in tag_elements]
definition = QQReaderSearchDefine()


class QQReader(Metadata):
    """calibre-web metadata provider backed by QQReaderSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Concurrent detail-page search; inactive providers yield None."""
        return self.searcher.search_books(query) if self.active else None

View File

@ -0,0 +1,88 @@
import re
import requests
from lxml import etree
from lxml.etree import HTML
from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher
from cps.services.Metadata import Metadata
class QidianSearchDefine(GenericSearchDefine):
    """Search definition for qidian.com (HTML search page)."""

    def __init__(self):
        self.search_url = "https://www.qidian.com/so/"
        base_url = "https://www.qidian.com/"
        book_url_pattern = re.compile(".*/book/(\\d+)/?")
        provider_name = "起点中文网"
        provider_id = "qidian"
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)

    def get_search_page(self, query):
        """Fetch the search page, addressed as /so/<query>.html."""
        url = self.search_url + query + ".html"
        return requests.get(url, {}, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Collect book URLs; returns (book_urls, exact_urls).

        An exact title+author hit short-circuits as ([], [url]).
        """
        book_urls = []
        exact_urls = []
        # Result links carry a title attribute ending in "在线阅读" (read online).
        alist = html.xpath("//h3/a[contains(@title,'在线阅读')]")
        for link in alist:
            href = link.attrib['href']
            if self.book_url_pattern.match(href):
                # hrefs are protocol-relative; prepend the scheme.
                parsed = "https:" + href
                book_urls.append(parsed)
            else:
                continue
            title = link.attrib['title'].replace('在线阅读', '')
            if title == query:
                item_base = link.getparent().getparent()
                if len(item_base):
                    item_author = item_base.xpath('p[@class="author"]/a[@class="name"]/text()')
                    if len(item_author) > 0:
                        if author == item_author[0]:
                            return [], [parsed]
                exact_urls.append(parsed)
        return book_urls, exact_urls

    def get_title(self, html):
        title_element = html.xpath('id("bookName")')
        return self.get_text(title_element)

    def get_author(self, html):
        author_element = html.xpath("//a[@class='writer-name']")
        return [self.get_text(author_element)]

    def get_cover(self, html):
        img_element = html.xpath('id("bookImg")/img')
        if len(img_element):
            cover = img_element[0].attrib['src']
            if not cover:
                return ''
            else:
                # src is protocol-relative.
                return "https:" + cover

    def get_description(self, html):
        summary_element = html.xpath('id("book-intro-detail")')
        if len(summary_element):
            # Keep the raw HTML of the intro block.
            return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()

    def get_tags(self, html):
        tag_elements = html.xpath("//p[@class='book-attribute']/a")
        if len(tag_elements):
            return [self.get_text(tag_element) for tag_element in tag_elements]
definition = QidianSearchDefine()


class Qidian(Metadata):
    """calibre-web metadata provider backed by QidianSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Concurrent detail-page search; inactive providers yield None."""
        return self.searcher.search_books(query) if self.active else None

View File

@ -0,0 +1,87 @@
import re
import requests
from lxml import etree
from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher
from cps.services.Metadata import Metadata
class QimaoSearchDefine(GenericSearchDefine):
    """Search definition for qimao.com (HTML search page)."""

    def __init__(self):
        self.search_url = "https://www.qimao.com/search/index/?keyword="
        base_url = "https://www.qimao.com"
        book_url_pattern = re.compile(".*/shuku/(\\d+)/?")
        provider_name = "七猫中文网"
        provider_id = "qimao"
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)

    def get_search_page(self, query):
        """Fetch the search page; the query goes in the keyword parameter."""
        url = self.search_url + query
        return requests.get(url, {}, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Collect book URLs; returns (book_urls, exact_urls).

        An exact title+author hit short-circuits as ([], [url]).
        """
        book_urls = []
        exact_urls = []
        alist = html.xpath("//li/div[@class='txt']/span[@class='s-tit']/a")
        for link in alist:
            href = link.attrib['href']
            if self.book_url_pattern.match(href):
                # hrefs are site-relative.
                parsed = self.base_url + href
                book_urls.append(parsed)
            else:
                continue
            # Serialize and re-parse the link to join all its text nodes —
            # presumably because inline markup splits the title; confirm.
            title = ''.join(etree.HTML(etree.tostring(link)).xpath('//text()')).strip()
            if title == query:
                item_base = link.getparent().getparent()
                if len(item_base):
                    item_author = item_base.xpath('p[@class="p-bottom"]/span[1]/a/text()')
                    if len(item_author) > 0:
                        if author == item_author[0]:
                            return [], [parsed]
                exact_urls.append(parsed)
        return book_urls, exact_urls

    def get_title(self, html):
        title_element = html.xpath('//div[starts-with(@class,"title")]/span[@class="txt"]')
        return self.get_text(title_element)

    def get_author(self, html):
        author_element = html.xpath('//div[@class="sub-title"]/span[@class="txt"]/em/a')
        return [self.get_text(author_element)]

    def get_cover(self, html):
        img_element = html.xpath('//div[@class="wrap-pic"]/img')
        if len(img_element):
            cover = img_element[0].attrib['src']
            if not cover:
                return ''
            else:
                return cover

    def get_description(self, html):
        summary_element = html.xpath('//p[@class="intro"]')
        if len(summary_element):
            # Keep the raw HTML of the intro block.
            return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()

    def get_tags(self, html):
        tag_elements = html.xpath('//div[@class="tags-wrap"]/em/a')
        if len(tag_elements):
            return [self.get_text(tag_element) for tag_element in tag_elements]
definition = QimaoSearchDefine()


class Qimao(Metadata):
    """calibre-web metadata provider backed by QimaoSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Concurrent detail-page search; inactive providers yield None."""
        return self.searcher.search_books(query) if self.active else None

View File

@ -0,0 +1,91 @@
import re
import requests
from lxml import etree
from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher
from cps.services.Metadata import Metadata
class TaduSearchDefine(GenericSearchDefine):
    """Search definition for tadu.com (HTML search page; search is a POST)."""

    def __init__(self):
        self.search_url = "https://www.tadu.com/search"
        base_url = "https://www.tadu.com"
        book_url_pattern = re.compile(".*/book/(\\d+)/?")
        provider_name = "塔读文学"
        provider_id = "tadu"
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)

    def get_search_page(self, query):
        """POST the query as form data to the search endpoint."""
        url = self.search_url
        params = {"query": query}
        return requests.post(url, params, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Collect book URLs; returns (book_urls, exact_urls).

        An exact title+author hit short-circuits as ([], [url]).
        """
        book_urls = []
        exact_urls = []
        alist = html.xpath("//a[contains(@class,'bookNm')]")
        for link in alist:
            href = link.attrib['href']
            if self.book_url_pattern.match(href):
                # hrefs are site-relative.
                parsed = self.base_url + href
                book_urls.append(parsed)
            else:
                continue
            if len(link) > 0:
                # Link has child elements: serialize and re-parse to join all
                # descendant text nodes into one title string.
                title = ''.join(etree.HTML(etree.tostring(link)).xpath('//text()')).strip()
            else:
                title = link.text
            if title == query:
                item_base = link.getparent()
                if len(item_base):
                    item_author = item_base.xpath('div[starts-with(@class,"bot_list")]/div[@class="condition"]/a[@class="authorNm"]/text()')
                    if len(item_author) > 0:
                        if author == item_author[0]:
                            return [], [parsed]
                exact_urls.append(parsed)
        return book_urls, exact_urls

    def get_title(self, html):
        title_element = html.xpath("//a[@class='bkNm']/text()")
        return title_element[0].strip()

    def get_author(self, html):
        author_element = html.xpath("//div[@class='bookNm']/span[contains(@class,'author')]")
        # NOTE(review): replace('', '') is a no-op — it looks like a suffix
        # character (e.g. "著") was lost from the literal; confirm against the
        # live page before changing it.
        return [self.get_text(author_element).replace('', '')]

    def get_cover(self, html):
        img_element = html.xpath("//a[@class='bookImg']/img")
        if len(img_element):
            cover = img_element[0].attrib['src']
            if not cover:
                return ''
            else:
                return cover

    def get_description(self, html):
        summary_element = html.xpath("//p[contains(@class,'intro')]")
        if len(summary_element):
            # Keep the raw HTML of the intro block.
            return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()

    def get_tags(self, html):
        # Tag links; entries labelled "无标签" ("no tag") are excluded.
        tag_elements = html.xpath("//div[@class='sortList']/a[text()!='无标签']")
        if len(tag_elements):
            return [self.get_text(tag_element) for tag_element in tag_elements]
definition = TaduSearchDefine()


class Tadu(Metadata):
    """calibre-web metadata provider backed by TaduSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Concurrent detail-page search; inactive providers yield None."""
        return self.searcher.search_books(query) if self.active else None

View File

@ -0,0 +1,96 @@
import re
import requests
from lxml import etree
from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher
from cps.services.Metadata import Metadata
class ZxcsSearchDefine(GenericSearchDefine):
    """Search definition for zxcs.info (HTML search page)."""

    def __init__(self):
        self.search_url = "https://zxcs.info/index.php"
        base_url = "https://zxcs.info"
        book_url_pattern = re.compile(".*/post/(\\d+)/?")
        provider_name = "知轩藏书"
        provider_id = "zxcs"
        # Post titles look like "《title》作者:author"; these extract the parts.
        self.book_name_pattern = re.compile(".*?《(.+?)》.*")
        self.author_name_pattern = re.compile(".*?作者[:\s]+(.+)")
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)

    def get_search_page(self, query):
        """Fetch the search page; the query goes in the keyword parameter."""
        url = self.search_url
        params = {"keyword": query}
        return requests.get(url, params, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Collect post URLs; returns (book_urls, exact_urls).

        An exact title+author hit short-circuits as ([], [url]).
        """
        book_urls = []
        exact_urls = []
        alist = html.xpath("id('plist')/dt/a")
        for link in alist:
            href = link.attrib['href']
            if self.book_url_pattern.match(href):
                parsed = href
                book_urls.append(parsed)
            else:
                continue
            title = link.text
            # Unwrap the 《title》 brackets when present.
            title_groups = self.book_name_pattern.fullmatch(title)
            if title_groups:
                title = title_groups.group(1)
            if title == query:
                # The author is embedded in the same post title text.
                author_groups = self.author_name_pattern.fullmatch(link.text)
                if author_groups:
                    item_author = author_groups.group(1)
                    if author == item_author:
                        return [], [parsed]
                exact_urls.append(parsed)
        return book_urls, exact_urls

    def get_title(self, html):
        title_element = html.xpath("//div[contains(@class,'book-info')]/h1/text()")
        # NOTE(review): both replace("", "") calls are no-ops — the bracket
        # characters (likely 《 and 》) appear lost from the literals; confirm
        # against the live page before changing them.
        return title_element[0].strip().replace("", "").replace("", "")

    def get_author(self, html):
        author_element = html.xpath("//p[@class='intro']")
        # NOTE(review): replace('', '') is a no-op — a prefix such as "作者:"
        # appears lost from the literal; confirm before changing.
        return [self.get_text(author_element).replace('', '')]

    def get_cover(self, html):
        img_element = html.xpath("id('bookImg')/img")
        if len(img_element):
            cover = img_element[0].attrib['src']
            if not cover:
                return ''
            else:
                # Covers are site-relative paths.
                return self.base_url + cover

    def get_description(self, html):
        # Drop the vote widget before serializing the detail block.
        vote_element = html.xpath("id('vote')")
        if len(vote_element) > 0:
            vote_element[0].getparent().remove(vote_element[0])
        summary_element = html.xpath("//div[@class='book-info-detail']")
        if len(summary_element):
            # Strip the pin-icon artifact the layout leaves behind.
            return (etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()
                    .replace('<cite class="icon-pin"/> <br/>', ''))

    def get_tags(self, html):
        tag_elements = html.xpath("//p[@class='tag']/a")
        if len(tag_elements):
            # Drop the "精校" (proofread edition) marker from tag names.
            return [self.get_text(tag_element).replace('精校', '') for tag_element in tag_elements]
definition = ZxcsSearchDefine()


class Zxcs(Metadata):
    """calibre-web metadata provider backed by ZxcsSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Concurrent detail-page search; inactive providers yield None."""
        return self.searcher.search_books(query) if self.active else None

7
tests/FanqieTest.py Normal file
View File

@ -0,0 +1,7 @@
from cps.metadata_provider.net.Fanqie import Fanqie
if __name__ == "__main__":
    # Manual smoke test (hits the live site); query format is "title;author".
    for book in Fanqie().search("夺舍;木牛流猫"):
        print(book)

7
tests/JinjiangTest.py Normal file
View File

@ -0,0 +1,7 @@
from cps.metadata_provider.net.Jinjiang import Jinjiang
if __name__ == "__main__":
    # Manual smoke test (hits the live site); query format is "title;author".
    for book in Jinjiang().search("如意书;蒋牧童"):
        print(book)

7
tests/NetNovelTest.py Normal file
View File

@ -0,0 +1,7 @@
from cps.metadata_provider.NetNovel import NetNovel
if __name__ == "__main__":
    # Manual smoke test (hits the live sites); query format is "title;author".
    for book in NetNovel().search("战皇;傲天无痕"):
        print(book)

View File

@ -1,7 +0,0 @@
from NewDouban import NewDouban
if __name__ == "__main__":
    # Manual smoke test (hits the live douban site).
    for book in NewDouban().search("知识考古学"):
        print(book)

7
tests/QQReaderTest.py Normal file
View File

@ -0,0 +1,7 @@
from cps.metadata_provider.net.QQReader import QQReader
if __name__ == "__main__":
    # Manual smoke test (hits the live site); query format is "title;author".
    for book in QQReader().search("第一序列;会说话的肘子"):
        print(book)

7
tests/QidianTest.py Normal file
View File

@ -0,0 +1,7 @@
from cps.metadata_provider.net.Qidian import Qidian
if __name__ == "__main__":
    # Manual smoke test (hits the live site); a bare title query.
    for book in Qidian().search("第一序列"):
        print(book)

7
tests/QimaoTest.py Normal file
View File

@ -0,0 +1,7 @@
from cps.metadata_provider.net.Qimao import Qimao
if __name__ == "__main__":
    # Manual smoke test (hits the live site); query format is "title;author".
    for book in Qimao().search("综武:人在酒楼,捡尸王语嫣;要长记性啊"):
        print(book)

7
tests/TaduTest.py Normal file
View File

@ -0,0 +1,7 @@
from cps.metadata_provider.net.Tadu import Tadu
if __name__ == "__main__":
    # Manual smoke test (hits the live site); query format is "title;author".
    for book in Tadu().search("全职修真高手;洗剑"):
        print(book)

7
tests/ZxcsTest.py Normal file
View File

@ -0,0 +1,7 @@
from cps.metadata_provider.net.Zxcs import Zxcs
if __name__ == "__main__":
    # Manual smoke test (hits the live site); query format is "title;author".
    for book in Zxcs().search("夜的命名术;会说话的肘子"):
        print(book)