From 516b23abfd36d714a8b1a39bee08a1ff8d8ead03 Mon Sep 17 00:00:00 2001 From: ZaneYork Date: Tue, 16 Jan 2024 12:01:58 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=B9=E7=BC=96=EF=BC=8C=E4=B8=8D=E6=98=AF?= =?UTF-8?q?=E4=B9=B1=E7=BC=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/calibre-web-novel-api.iml | 11 + README.md | 25 +- src/NewDouban.py | 266 ---------------------- src/cps/metadata_provider/NetNovel.py | 57 +++++ src/cps/metadata_provider/net/Base.py | 183 +++++++++++++++ src/cps/metadata_provider/net/Fanqie.py | 73 ++++++ src/cps/metadata_provider/net/Jinjiang.py | 91 ++++++++ src/cps/metadata_provider/net/QQReader.py | 87 +++++++ src/cps/metadata_provider/net/Qidian.py | 88 +++++++ src/cps/metadata_provider/net/Qimao.py | 87 +++++++ src/cps/metadata_provider/net/Tadu.py | 91 ++++++++ src/cps/metadata_provider/net/Zxcs.py | 96 ++++++++ tests/FanqieTest.py | 7 + tests/JinjiangTest.py | 7 + tests/NetNovelTest.py | 7 + tests/NewDoubanTest.py | 7 - tests/QQReaderTest.py | 7 + tests/QidianTest.py | 7 + tests/QimaoTest.py | 7 + tests/TaduTest.py | 7 + tests/ZxcsTest.py | 7 + 21 files changed, 923 insertions(+), 295 deletions(-) create mode 100644 .idea/calibre-web-novel-api.iml delete mode 100644 src/NewDouban.py create mode 100644 src/cps/metadata_provider/NetNovel.py create mode 100644 src/cps/metadata_provider/net/Base.py create mode 100644 src/cps/metadata_provider/net/Fanqie.py create mode 100644 src/cps/metadata_provider/net/Jinjiang.py create mode 100644 src/cps/metadata_provider/net/QQReader.py create mode 100644 src/cps/metadata_provider/net/Qidian.py create mode 100644 src/cps/metadata_provider/net/Qimao.py create mode 100644 src/cps/metadata_provider/net/Tadu.py create mode 100644 src/cps/metadata_provider/net/Zxcs.py create mode 100644 tests/FanqieTest.py create mode 100644 tests/JinjiangTest.py create mode 100644 tests/NetNovelTest.py delete mode 100644 tests/NewDoubanTest.py create mode 100644 
tests/QQReaderTest.py create mode 100644 tests/QidianTest.py create mode 100644 tests/QimaoTest.py create mode 100644 tests/TaduTest.py create mode 100644 tests/ZxcsTest.py diff --git a/.idea/calibre-web-novel-api.iml b/.idea/calibre-web-novel-api.iml new file mode 100644 index 0000000..e21d11f --- /dev/null +++ b/.idea/calibre-web-novel-api.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index b087b36..0ea2c9f 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,13 @@ -# calibre-web-douban-api +# calibre-web-novel-api -**2023-07-15** +此项目从[calibre-web-douban-api](https://github.com/fugary/calibre-web-douban-api)项目Fork而来,是calibre的网文搜索api provider,需要放到metadata_provider目录下 -最新豆瓣屏蔽直接访问封面图片,自动实现使用本地代理封面图片,保存时使用requests下载并保存 - -**2022-10-08** - -原douban列表url被屏蔽,老版本不能访问,建议更新最新的[NewDouban.py](https://github.com/fugary/calibre-web-douban-api/releases/download/v1.1.0/NewDouban.py) - -**2022-08-10** - -**最新V0.6.19版本的calbire-web的豆瓣插件已经回来了,除了标签外,应该都有数据了,可以不用此插件了** - -新版calibre-web已经移除douban-api了,而且把从get_meta.js中直接发起请求获取数据改成了从服务端使用python获取数据。 - -此项目是添加一个豆瓣api provider实现,需要放到metadata_provider目录下 ### 使用方法 -复制`src/NewDouban.py`到`calibre-web/cps/metadata_provider/`目录下,重启项目即可。 +复制`src/cps/metadata_provider/`下全部文件到`calibre-web/cps/metadata_provider/`目录下,重启项目即可。 此应用是基于Python抓取网页的形式获取书籍信息,频率过高访问可能被屏蔽。 -参考文档:https://fugary.com/?p=238 - -**新版calibre-web 0.6.17以上使用** - -小于0.6.17版本,请下载:https://github.com/fugary/calibre-web-douban-api/releases/tag/0.6.16 - diff --git a/src/NewDouban.py b/src/NewDouban.py deleted file mode 100644 index c87bc28..0000000 --- a/src/NewDouban.py +++ /dev/null @@ -1,266 +0,0 @@ -import random -import re -import time -import dataclasses -import urllib - -import requests -from concurrent.futures import ThreadPoolExecutor, as_completed -from urllib.parse import urlparse, unquote -from lxml import etree -from functools import lru_cache - -from cps.services.Metadata import Metadata, MetaSourceInfo, MetaRecord - 
-from cps.search_metadata import meta -from flask import request, Response -from cps import helper - -# 是否自动代理封面地址 -DOUBAN_PROXY_COVER = True -# 如果自动计算的服务器地址不正确,可以填写自己的calibre-web地址,参考:http://nas_ip:8083/ -DOUBAN_PROXY_COVER_HOST_URL = '' -DOUBAN_PROXY_COVER_PATH = 'metadata/douban_cover?cover=' -DOUBAN_SEARCH_URL = "https://www.douban.com/search" -DOUBAN_BASE = "https://book.douban.com/" -DOUBAN_COVER_DOMAIN = 'doubanio.com' -DOUBAN_BOOK_CAT = "1001" -DOUBAN_BOOK_CACHE_SIZE = 500 # 最大缓存数量 -DOUBAN_CONCURRENCY_SIZE = 5 # 并发查询数 -DOUBAN_BOOK_URL_PATTERN = re.compile(".*/subject/(\\d+)/?") -DEFAULT_HEADERS = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36', - 'Accept-Encoding': 'gzip, deflate', - 'Referer': DOUBAN_BASE -} -PROVIDER_NAME = "New Douban Books" -PROVIDER_ID = "new_douban" - - -class NewDouban(Metadata): - __name__ = PROVIDER_NAME - __id__ = PROVIDER_ID - - def __init__(self): - self.searcher = DoubanBookSearcher() - self.hack_helper_cover() - super().__init__() - - def search(self, query: str, generic_cover: str = "", locale: str = "en"): - if self.active: - return self.searcher.search_books(query) - - @staticmethod - def hack_helper_cover(): - """ - 覆盖helper.save_cover_from_url方法实现豆瓣的封面下载 - :return: - """ - save_cover = helper.save_cover_from_url - - def new_save_cover(url, book_path): - if DOUBAN_COVER_DOMAIN in url: - cover_url = url - if DOUBAN_PROXY_COVER: - component = urllib.parse.urlparse(url) - query = urllib.parse.parse_qs(component.query) - cover_url = urllib.parse.unquote(query.get('cover')[0]) - res = requests.get(cover_url, headers=DEFAULT_HEADERS) - return helper.save_cover(res, book_path) - else: - return save_cover(url, book_path) - - helper.save_cover_from_url = new_save_cover - - -@dataclasses.dataclass -class DoubanMetaRecord(MetaRecord): - - def __getattribute__(self, item): # cover通过本地服务代理访问 - if item == 'cover' and DOUBAN_PROXY_COVER: - cover_url = 
super().__getattribute__(item) - if cover_url: - try: - host_url = DOUBAN_PROXY_COVER_HOST_URL - if not host_url and request.host_url: - host_url = request.host_url - if host_url and host_url not in cover_url: - self.cover = host_url + DOUBAN_PROXY_COVER_PATH + urllib.parse.quote(cover_url) - except BaseException: - pass - return super().__getattribute__(item) - - -class DoubanBookSearcher: - - def __init__(self): - self.book_loader = DoubanBookLoader() - self.thread_pool = ThreadPoolExecutor(max_workers=10, thread_name_prefix='douban_async') - - def calc_url(self, href): - query = urlparse(href).query - params = {item.split('=')[0]: item.split('=')[1] for item in query.split('&')} - url = unquote(params['url']) - if DOUBAN_BOOK_URL_PATTERN.match(url): - return url - - def load_book_urls_new(self, query): - url = DOUBAN_SEARCH_URL - params = {"cat": DOUBAN_BOOK_CAT, "q": query} - res = requests.get(url, params, headers=DEFAULT_HEADERS) - book_urls = [] - if res.status_code in [200, 201]: - html = etree.HTML(res.content) - alist = html.xpath('//a[@class="nbg"]') - for link in alist: - href = link.attrib['href'] - parsed = self.calc_url(href) - if parsed and len(book_urls) < DOUBAN_CONCURRENCY_SIZE: - book_urls.append(parsed) - return book_urls - - def search_books(self, query): - book_urls = self.load_book_urls_new(query) - books = [] - futures = [self.thread_pool.submit(self.book_loader.load_book, book_url) for book_url in book_urls] - for future in as_completed(futures): - book = future.result() - if book is not None: - books.append(future.result()) - return books - - -class DoubanBookLoader: - - def __init__(self): - self.book_parser = DoubanBookHtmlParser() - - @lru_cache(maxsize=DOUBAN_BOOK_CACHE_SIZE) - def load_book(self, url): - book = None - self.random_sleep() - start_time = time.time() - res = requests.get(url, headers=DEFAULT_HEADERS) - if res.status_code in [200, 201]: - print("下载书籍:{}成功,耗时{:.0f}ms".format(url, (time.time() - start_time) * 1000)) - 
book_detail_content = res.content - book = self.book_parser.parse_book(url, book_detail_content.decode("utf8")) - return book - - def random_sleep(self): - random_sec = random.random() / 10 - print("Random sleep time {}s".format(random_sec)) - time.sleep(random_sec) - -class DoubanBookHtmlParser: - def __init__(self): - self.id_pattern = DOUBAN_BOOK_URL_PATTERN - self.date_pattern = re.compile("(\\d{4})-(\\d+)") - self.tag_pattern = re.compile("criteria = '(.+)'") - - def parse_book(self, url, book_content): - book = DoubanMetaRecord( - id="", - title="", - authors=[], - publisher="", - description="", - url="", - source=MetaSourceInfo( - id=PROVIDER_ID, - description=PROVIDER_NAME, - link="https://book.douban.com/" - ) - ) - html = etree.HTML(book_content) - title_element = html.xpath("//span[@property='v:itemreviewed']") - book.title = self.get_text(title_element) - share_element = html.xpath("//a[@data-url]") - if len(share_element): - url = share_element[0].attrib['data-url'] - book.url = url - id_match = self.id_pattern.match(url) - if id_match: - book.id = id_match.group(1) - img_element = html.xpath("//a[@class='nbg']") - if len(img_element): - cover = img_element[0].attrib['href'] - if not cover or cover.endswith('update_image'): - book.cover = '' - else: - book.cover = cover - rating_element = html.xpath("//strong[@property='v:average']") - book.rating = self.get_rating(rating_element) - elements = html.xpath("//span[@class='pl']") - for element in elements: - text = self.get_text(element) - if text.startswith("作者") or text.startswith("译者"): - book.authors.extend([self.get_text(author_element) for author_element in - filter(self.author_filter, element.findall("..//a"))]) - elif text.startswith("出版社"): - book.publisher = self.get_tail(element) - elif text.startswith("副标题"): - book.title = book.title + ':' + self.get_tail(element) - elif text.startswith("出版年"): - book.publishedDate = self.get_publish_date(self.get_tail(element)) - elif text.startswith("丛书"): 
- book.series = self.get_text(element.getnext()) - elif text.startswith("ISBN"): - book.identifiers["isbn"] = self.get_tail(element) - summary_element = html.xpath("//div[@id='link-report']//div[@class='intro']") - if len(summary_element): - book.description = etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip() - tag_elements = html.xpath("//a[contains(@class, 'tag')]") - if len(tag_elements): - book.tags = [self.get_text(tag_element) for tag_element in tag_elements] - else: - book.tags = self.get_tags(book_content) - return book - - def get_tags(self, book_content): - tag_match = self.tag_pattern.findall(book_content) - if len(tag_match): - return [tag.replace('7:', '') for tag in - filter(lambda tag: tag and tag.startswith('7:'), tag_match[0].split('|'))] - return [] - - def get_publish_date(self, date_str): - if date_str: - date_match = self.date_pattern.fullmatch(date_str) - if date_match: - date_str = "{}-{}-1".format(date_match.group(1), date_match.group(2)) - return date_str - - def get_rating(self, rating_element): - return float(self.get_text(rating_element, '0')) / 2 - - def author_filter(self, a_element): - a_href = a_element.attrib['href'] - return '/author' in a_href or '/search' in a_href - - def get_text(self, element, default_str=''): - text = default_str - if len(element) and element[0].text: - text = element[0].text.strip() - elif isinstance(element, etree._Element) and element.text: - text = element.text.strip() - return text if text else default_str - - def get_tail(self, element, default_str=''): - text = default_str - if isinstance(element, etree._Element) and element.tail: - text = element.tail.strip() - if not text: - text = self.get_text(element.getnext(), default_str) - return text if text else default_str - - -@meta.route("/metadata/douban_cover", methods=["GET"]) -def proxy_douban_cover(): - """ - 代理豆瓣封面展示 - :return: - """ - cover_url = urllib.parse.unquote(request.args.get('cover')) - res = 
requests.get(cover_url, headers=DEFAULT_HEADERS) - return Response(res.content, mimetype=res.headers['Content-Type']) diff --git a/src/cps/metadata_provider/NetNovel.py b/src/cps/metadata_provider/NetNovel.py new file mode 100644 index 0000000..7dc33c3 --- /dev/null +++ b/src/cps/metadata_provider/NetNovel.py @@ -0,0 +1,57 @@ +import traceback + +from cps.metadata_provider.net.QQReader import QQReader +from cps.metadata_provider.net.Qidian import Qidian +from cps.metadata_provider.net.Qimao import Qimao +from cps.metadata_provider.net.Fanqie import Fanqie +from cps.metadata_provider.net.Tadu import Tadu +from cps.metadata_provider.net.Zxcs import Zxcs +from cps.services.Metadata import Metadata + +PROVIDER_NAME = "网文聚合" +PROVIDER_ID = "netnovel" + + +class NetNovel(Metadata): + __name__ = PROVIDER_NAME + __id__ = PROVIDER_ID + + def __init__(self): + self.searchers = [ + Qidian(), + QQReader(), + Qimao(), + Fanqie(), + Tadu(), + Zxcs() + ] + super().__init__() + + def search(self, query: str, generic_cover: str = "", locale: str = "en"): + if self.active: + sections = query.split(';') + author = '未知' + if len(sections) > 1: + title = sections[0] + author = sections[1] + else: + title = query + result_title_match = [] + result_rest = [] + for searcher in self.searchers: + try: + results = searcher.search(query) + for result in results: + if result.title == title: + if author in result.authors: + return [result] + else: + result_title_match.append(result) + else: + result_rest.append(result) + except: + traceback.print_exc() + if len(result_title_match) > 0: + return result_title_match + result_title_match.extend(result_rest) + return result_title_match diff --git a/src/cps/metadata_provider/net/Base.py b/src/cps/metadata_provider/net/Base.py new file mode 100644 index 0000000..31281a9 --- /dev/null +++ b/src/cps/metadata_provider/net/Base.py @@ -0,0 +1,183 @@ +import dataclasses +import random +import time +from concurrent.futures import ThreadPoolExecutor, 
as_completed +from functools import lru_cache + +import requests +from lxml import etree + +from cps.services.Metadata import MetaRecord, MetaSourceInfo + +CONST_BOOK_CACHE_SIZE = 500 # 最大缓存数量 +CONST_CONCURRENCY_SIZE = 5 # 并发查询数 + + +class GenericSearchDefine: + def __init__(self, + base_url, + book_url_pattern, + provider_name, + provider_id + ): + self.base_url = base_url + self.book_url_pattern = book_url_pattern + self.provider_name = provider_name + self.provider_id = provider_id + self.default_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36', + 'Accept-Encoding': 'gzip, deflate', + 'Referer': base_url + } + + def get_search_page(self, query): + pass + + def get_search_results(self, html, query, author): + pass + + def get_title(self, html): + pass + + def get_author(self, html): + pass + + def get_cover(self, html): + pass + + def get_description(self, html): + pass + + def get_tags(self, html): + pass + + def get_text(self, element, default_str=''): + text = default_str + if len(element) and element[0].text: + text = element[0].text.strip() + elif isinstance(element, etree._Element) and element.text: + text = element.text.strip() + return text if text else default_str + + +@dataclasses.dataclass +class GenericSearchMetaRecord(MetaRecord): + + def __getattribute__(self, item): # cover通过本地服务代理访问 + return super().__getattribute__(item) + + +class GenericSearchBookSearcher: + + def __init__(self, definition: GenericSearchDefine): + self.definition = definition + self.book_loader = GenericSearchBookLoader(definition) + self.thread_pool = ThreadPoolExecutor(max_workers=10, thread_name_prefix=definition.provider_id + '_async') + + def load_book_urls_new(self, query): + sections = query.split(';') + if len(sections) > 1: + query = sections[0] + author = sections[1] + else: + author = None + res = self.definition.get_search_page(query) + book_urls = [] + exact_urls = [] + if 
res.status_code in [200, 201]: + if(res.headers['Content-Type'].find('/json') >= 0): + html = res.json() + else: + html = etree.HTML(res.content) + book_urls, exact_urls = self.definition.get_search_results(html, query, author) + if len(exact_urls) > 0: + return exact_urls + return book_urls[:3] + + def search_books(self, query): + book_urls = self.load_book_urls_new(query) + books = [] + futures = [self.thread_pool.submit(self.book_loader.load_book, book_url) for book_url in book_urls] + for future in as_completed(futures): + book = future.result() + if book is not None: + books.append(future.result()) + return books + + def search_books_single(self, query): + sections = query.split(';') + if len(sections) > 1: + query = sections[0] + author = sections[1] + else: + author = None + res = self.definition.get_search_page(query) + books = [] + exact_books = [] + if res.status_code in [200, 201]: + if (res.headers['Content-Type'].find('/json') >= 0): + html = res.json() + else: + html = etree.HTML(res.content) + books, exact_books = self.definition.get_search_results(html, query, author) + if len(exact_books) > 0: + return exact_books + return books[:3] + +class GenericSearchBookLoader: + + def __init__(self, definition: GenericSearchDefine): + self.book_parser = GenericSearchBookHtmlParser(definition) + self.definition = definition + + @lru_cache(maxsize=CONST_BOOK_CACHE_SIZE) + def load_book(self, url): + book = None + self.random_sleep() + start_time = time.time() + res = requests.get(url, headers=self.definition.default_headers) + if res.status_code in [200, 201]: + print("下载书籍:{}成功,耗时{:.0f}ms".format(url, (time.time() - start_time) * 1000)) + book_detail_content = res.content + book = self.book_parser.parse_book(url, book_detail_content.decode("utf8")) + return book + + def random_sleep(self): + random_sec = random.random() / 10 + print("Random sleep time {}s".format(random_sec)) + time.sleep(random_sec) + + +class GenericSearchBookHtmlParser: + def __init__(self, 
definition: GenericSearchDefine): + self.definition = definition + self.id_pattern = definition.book_url_pattern + + def parse_book(self, url, book_content): + book = GenericSearchMetaRecord( + id="", + title="", + authors=[], + publisher="", + description="", + url="", + source=MetaSourceInfo( + id=self.definition.provider_id, + description=self.definition.provider_name, + link=self.definition.base_url + ) + ) + html = etree.HTML(book_content) + book.title = self.definition.get_title(html) + book.url = url + id_match = self.id_pattern.match(url) + if id_match: + book.id = id_match.group(1) + book.cover = self.definition.get_cover(html) + book.publisher = self.definition.provider_name + # rating_element = html.xpath("//strong[@property='v:average']") + # book.rating = self.get_rating(rating_element) + book.authors = self.definition.get_author(html) + book.description = self.definition.get_description(html) + book.tags = self.definition.get_tags(html) + return book diff --git a/src/cps/metadata_provider/net/Fanqie.py b/src/cps/metadata_provider/net/Fanqie.py new file mode 100644 index 0000000..4620430 --- /dev/null +++ b/src/cps/metadata_provider/net/Fanqie.py @@ -0,0 +1,73 @@ +import re + +import requests +from lxml import etree +from lxml.etree import HTML + +from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher, GenericSearchMetaRecord +from cps.services.Metadata import Metadata, MetaSourceInfo + + +class FanqieSearchDefine(GenericSearchDefine): + + def __init__(self): + self.search_url = "https://fanqienovel.com/api/author/search/search_book/v1?filter=127%2C127%2C127%2C127&page_count=10&page_index=0&query_type=0&query_word=" + base_url = "https://fanqienovel.com" + book_url_pattern = re.compile(".*/page/(\\d+)/?") + provider_name = "番茄小说网" + provider_id = "fanqienovel" + super().__init__(base_url, book_url_pattern, provider_name, provider_id) + + def get_search_page(self, query): + url = self.search_url + query + return 
requests.get(url, {}, headers=self.default_headers) + + def get_search_results(self, html, query, author): + books = [] + exact_books = [] + alist = html["data"]["search_book_data_list"] + for link in alist: + book_id = link['book_id'] + parsed = "https://fanqienovel.com/page/" + book_id + title = link['book_name'] + author = link['author'] + thumb_url = link['thumb_url'] + category = link['category'].split(',') + description = link['book_abstract'] + book = GenericSearchMetaRecord( + id=book_id, + title=title, + authors=[author], + cover=thumb_url, + publisher=self.provider_name, + description=description, + url=parsed, + tags=category, + source=MetaSourceInfo( + id=self.provider_id, + description=self.provider_name, + link=self.base_url + ) + ) + books.append(book) + if title == query: + if author == link['author']: + return [], [book] + exact_books.append(book) + return books, exact_books + + +definition = FanqieSearchDefine() + + +class Fanqie(Metadata): + __name__ = definition.provider_name + __id__ = definition.provider_id + + def __init__(self): + self.searcher = GenericSearchBookSearcher(definition) + super().__init__() + + def search(self, query: str, generic_cover: str = "", locale: str = "en"): + if self.active: + return self.searcher.search_books_single(query) diff --git a/src/cps/metadata_provider/net/Jinjiang.py b/src/cps/metadata_provider/net/Jinjiang.py new file mode 100644 index 0000000..439a549 --- /dev/null +++ b/src/cps/metadata_provider/net/Jinjiang.py @@ -0,0 +1,91 @@ +import re + +import requests +from lxml import etree + +from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher +from cps.services.Metadata import Metadata + + +class JinjiangSearchDefine(GenericSearchDefine): + + def __init__(self): + self.search_url = "https://www.jjwxc.net/search.php?kw=" + base_url = "https://www.jjwxc.net" + book_url_pattern = re.compile(".*/onebook.php\\?novelid=(\\d+)/?") + provider_name = "晋江文学城" + provider_id = 
"jinjiang" + super().__init__(base_url, book_url_pattern, provider_name, provider_id) + self.default_headers['Accept-Charset'] = "utf-8" + + def get_search_page(self, query): + url = self.search_url + query + "&t=1&ord=relate" + return requests.get(url, {}, headers=self.default_headers) + + def get_search_results(self, html, query, author): + book_urls = [] + exact_urls = [] + alist = html.xpath("id('search_result')/div/h3[@class='title']/a") + for link in alist: + href = link.attrib['href'] + if self.book_url_pattern.match(href): + parsed = href + book_urls.append(parsed) + else: + continue + title = link.xpath('text()')[0] + if title == query: + item_base = link.getparent().getparent() + if len(item_base): + item_author = item_base.xpath('//div[@class="info"]/a[contains(@href,"oneauthor")]/span/text()') + if len(item_author) > 0: + if author == item_author[0]: + return [], [parsed] + exact_urls.append(parsed) + return book_urls, exact_urls + + def get_title(self, html): + title_element = html.xpath('//h1[@itemprop="name"]/span[@itemprop="articleSection"]') + return self.get_text(title_element) + + def get_author(self, html): + author_element = html.xpath('//h2/a/span[@itemprop="author"]') + return [self.get_text(author_element)] + + def get_cover(self, html): + img_element = html.xpath('//img[@class="noveldefaultimage"]') + if len(img_element): + cover = img_element[0].attrib['src'] + if not cover: + return '' + else: + return "https:" + cover + + def get_description(self, html): + summary_element = html.xpath('id("novelintro")') + other_element = html.xpath('id("novelintro")/a') + if len(other_element): + summary_element.remove(other_element) + if len(summary_element): + return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip() + + def get_tags(self, html): + tag_elements = html.xpath('//div[@class="smallreadbody"]/span/a[contains(@href,"bookbase")]') + if len(tag_elements): + return [self.get_text(tag_element) for tag_element in 
tag_elements] + + +definition = JinjiangSearchDefine() + + +class Jinjiang(Metadata): + __name__ = definition.provider_name + __id__ = definition.provider_id + + def __init__(self): + self.searcher = GenericSearchBookSearcher(definition) + super().__init__() + + def search(self, query: str, generic_cover: str = "", locale: str = "en"): + if self.active: + return self.searcher.search_books(query) diff --git a/src/cps/metadata_provider/net/QQReader.py b/src/cps/metadata_provider/net/QQReader.py new file mode 100644 index 0000000..72e38a3 --- /dev/null +++ b/src/cps/metadata_provider/net/QQReader.py @@ -0,0 +1,87 @@ +import re + +import requests +from lxml import etree + +from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher +from cps.services.Metadata import Metadata + + +class QQReaderSearchDefine(GenericSearchDefine): + + def __init__(self): + self.search_url = "https://book.qq.com/so/" + base_url = "https://book.qq.com/" + book_url_pattern = re.compile(".*/book-detail/(\\d+)/?") + provider_name = "QQ阅读" + provider_id = "qqreader" + super().__init__(base_url, book_url_pattern, provider_name, provider_id) + + def get_search_page(self, query): + url = self.search_url + query + return requests.get(url, {}, headers=self.default_headers) + + def get_search_results(self, html, query, author): + book_urls = [] + exact_urls = [] + alist = html.xpath("//div[contains(@class,'book-large')]/a") + for link in alist: + href = link.attrib['href'] + if self.book_url_pattern.match(href): + parsed = "https:" + href + book_urls.append(parsed) + else: + continue + title = link.attrib['title'] + if title == query: + item_base = link + if len(item_base): + item_author = item_base.xpath('div[@class="content"]/p[@class="other"]/object/a[contains(@href,"book-writer")]/text()') + if len(item_author) > 0: + if author == item_author[0]: + return [], [parsed] + exact_urls.append(parsed) + return book_urls, exact_urls + + def get_title(self, html): + 
title_element = html.xpath("//h1[@class='book-title']/text()") + return title_element[0].strip() + + def get_author(self, html): + author_element = html.xpath("//div[@class='book-meta']/a[contains(@class,'author')]") + return [self.get_text(author_element).replace(' 著', '')] + + def get_cover(self, html): + img_element = html.xpath("//div[@class='page-header-content']//img[@class='ypc-book-cover']") + if len(img_element): + cover = img_element[0].attrib['src'] + if not cover: + return '' + else: + return cover + + def get_description(self, html): + summary_element = html.xpath("//div[contains(@class,'book-intro')]") + if len(summary_element): + return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip() + + def get_tags(self, html): + tag_elements = html.xpath("//div[@class='book-tags']/a[contains(@class,'tag')]") + if len(tag_elements): + return [self.get_text(tag_element) for tag_element in tag_elements] + + +definition = QQReaderSearchDefine() + + +class QQReader(Metadata): + __name__ = definition.provider_name + __id__ = definition.provider_id + + def __init__(self): + self.searcher = GenericSearchBookSearcher(definition) + super().__init__() + + def search(self, query: str, generic_cover: str = "", locale: str = "en"): + if self.active: + return self.searcher.search_books(query) diff --git a/src/cps/metadata_provider/net/Qidian.py b/src/cps/metadata_provider/net/Qidian.py new file mode 100644 index 0000000..5b65678 --- /dev/null +++ b/src/cps/metadata_provider/net/Qidian.py @@ -0,0 +1,88 @@ +import re + +import requests +from lxml import etree +from lxml.etree import HTML + +from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher +from cps.services.Metadata import Metadata + + +class QidianSearchDefine(GenericSearchDefine): + + def __init__(self): + self.search_url = "https://www.qidian.com/so/" + base_url = "https://www.qidian.com/" + book_url_pattern = re.compile(".*/book/(\\d+)/?") + provider_name = 
import re

import requests
from lxml import etree

from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher
from cps.services.Metadata import Metadata


class QidianSearchDefine(GenericSearchDefine):
    """Scraping rules for qidian.com (起点中文网) search and book pages."""

    def __init__(self):
        # NOTE(review): the head of this __init__ is cut off in the patch view;
        # search_url / base_url / book_url_pattern are reconstructed from how
        # the methods below use them -- TODO confirm against the original file.
        self.search_url = "https://www.qidian.com/so/"
        base_url = "https://www.qidian.com"
        book_url_pattern = re.compile(r".*/book/(\d+)/?")
        provider_name = "起点中文网"
        provider_id = "qidian"
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)

    def get_search_page(self, query):
        """GET the search result page; qidian search URLs end in '.html'."""
        url = self.search_url + query + ".html"
        return requests.get(url, params={}, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Collect candidate book URLs from a parsed search page.

        Returns (book_urls, exact_urls). When both title and author match
        exactly, only that single URL is returned (as the exact list).
        """
        book_urls = []
        exact_urls = []
        for link in html.xpath("//h3/a[contains(@title,'在线阅读')]"):
            href = link.attrib['href']
            if not self.book_url_pattern.match(href):
                continue
            parsed = "https:" + href  # qidian hrefs are protocol-relative
            book_urls.append(parsed)
            title = link.attrib['title'].replace('在线阅读', '')
            if title != query:
                continue
            item_base = link.getparent().getparent()
            if len(item_base):
                item_author = item_base.xpath('p[@class="author"]/a[@class="name"]/text()')
                if len(item_author) > 0 and author == item_author[0]:
                    # Exact title + author hit: short-circuit with it alone.
                    return [], [parsed]
            exact_urls.append(parsed)
        return book_urls, exact_urls

    def get_title(self, html):
        """Book title text from the #bookName element."""
        return self.get_text(html.xpath('id("bookName")'))

    def get_author(self, html):
        """Single-element author list from the writer-name link."""
        return [self.get_text(html.xpath("//a[@class='writer-name']"))]

    def get_cover(self, html):
        """Absolute cover URL, or '' when no usable cover is found.

        Fix: the original implicitly returned None when the <img> element was
        missing but '' when its src attribute was empty; normalized to ''.
        """
        img_element = html.xpath('id("bookImg")/img')
        if len(img_element):
            cover = img_element[0].attrib['src']
            if cover:
                return "https:" + cover
        return ''

    def get_description(self, html):
        """Raw HTML of the intro block (None when absent)."""
        summary_element = html.xpath('id("book-intro-detail")')
        if len(summary_element):
            return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()

    def get_tags(self, html):
        """Tag texts from the book-attribute links (None when absent)."""
        tag_elements = html.xpath("//p[@class='book-attribute']/a")
        if len(tag_elements):
            return [self.get_text(tag_element) for tag_element in tag_elements]


definition = QidianSearchDefine()


class Qidian(Metadata):
    """calibre-web metadata provider backed by QidianSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        # Only query the site when the provider is enabled in calibre-web.
        if self.active:
            return self.searcher.search_books(query)
html.xpath('//div[@class="sub-title"]/span[@class="txt"]/em/a') + return [self.get_text(author_element)] + + def get_cover(self, html): + img_element = html.xpath('//div[@class="wrap-pic"]/img') + if len(img_element): + cover = img_element[0].attrib['src'] + if not cover: + return '' + else: + return cover + + def get_description(self, html): + summary_element = html.xpath('//p[@class="intro"]') + if len(summary_element): + return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip() + + def get_tags(self, html): + tag_elements = html.xpath('//div[@class="tags-wrap"]/em/a') + if len(tag_elements): + return [self.get_text(tag_element) for tag_element in tag_elements] + + +definition = QimaoSearchDefine() + + +class Qimao(Metadata): + __name__ = definition.provider_name + __id__ = definition.provider_id + + def __init__(self): + self.searcher = GenericSearchBookSearcher(definition) + super().__init__() + + def search(self, query: str, generic_cover: str = "", locale: str = "en"): + if self.active: + return self.searcher.search_books(query) diff --git a/src/cps/metadata_provider/net/Tadu.py b/src/cps/metadata_provider/net/Tadu.py new file mode 100644 index 0000000..6b53b62 --- /dev/null +++ b/src/cps/metadata_provider/net/Tadu.py @@ -0,0 +1,91 @@ +import re + +import requests +from lxml import etree + +from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher +from cps.services.Metadata import Metadata + + +class TaduSearchDefine(GenericSearchDefine): + + def __init__(self): + self.search_url = "https://www.tadu.com/search" + base_url = "https://www.tadu.com" + book_url_pattern = re.compile(".*/book/(\\d+)/?") + provider_name = "塔读文学" + provider_id = "tadu" + super().__init__(base_url, book_url_pattern, provider_name, provider_id) + + def get_search_page(self, query): + url = self.search_url + params = {"query": query} + return requests.post(url, params, headers=self.default_headers) + + def 
import re

import requests
from lxml import etree

from cps.metadata_provider.net.Base import GenericSearchDefine, GenericSearchBookSearcher
from cps.services.Metadata import Metadata


class TaduSearchDefine(GenericSearchDefine):
    """Scraping rules for tadu.com (塔读文学) search and book pages."""

    def __init__(self):
        self.search_url = "https://www.tadu.com/search"
        base_url = "https://www.tadu.com"
        book_url_pattern = re.compile(r".*/book/(\d+)/?")
        provider_name = "塔读文学"
        provider_id = "tadu"
        super().__init__(base_url, book_url_pattern, provider_name, provider_id)

    def get_search_page(self, query):
        """POST the search form (tadu uses POST, not GET, for search)."""
        return requests.post(self.search_url, {"query": query}, headers=self.default_headers)

    def get_search_results(self, html, query, author):
        """Collect candidate book URLs from a parsed search page.

        Returns (book_urls, exact_urls). When both title and author match
        exactly, only that single URL is returned (as the exact list).
        """
        book_urls = []
        exact_urls = []
        for link in html.xpath("//a[contains(@class,'bookNm')]"):
            href = link.attrib['href']
            if not self.book_url_pattern.match(href):
                continue
            parsed = self.base_url + href  # hrefs are site-relative
            book_urls.append(parsed)
            if len(link) > 0:
                # Title contains highlight markup; join all text nodes.
                title = ''.join(etree.HTML(etree.tostring(link)).xpath('//text()')).strip()
            else:
                title = link.text
            if title != query:
                continue
            item_base = link.getparent()
            if len(item_base):
                item_author = item_base.xpath(
                    'div[starts-with(@class,"bot_list")]/div[@class="condition"]/a[@class="authorNm"]/text()')
                if len(item_author) > 0 and author == item_author[0]:
                    # Exact title + author hit: short-circuit with it alone.
                    return [], [parsed]
            exact_urls.append(parsed)
        return book_urls, exact_urls

    def get_title(self, html):
        """Book title text.

        Fix: the original indexed [0] unconditionally and raised IndexError on
        a page without the element; return '' instead, matching the other
        providers' "no data" convention.
        """
        title_element = html.xpath("//a[@class='bkNm']/text()")
        return title_element[0].strip() if title_element else ''

    def get_author(self, html):
        """Single-element author list; tadu suffixes the name with ' 著'."""
        author_element = html.xpath("//div[@class='bookNm']/span[contains(@class,'author')]")
        return [self.get_text(author_element).replace(' 著', '')]

    def get_cover(self, html):
        """Cover URL (already absolute on tadu), or '' when missing.

        Fix: the original implicitly returned None when the <img> element was
        missing but '' when its src attribute was empty; normalized to ''.
        """
        img_element = html.xpath("//a[@class='bookImg']/img")
        if len(img_element):
            cover = img_element[0].attrib['src']
            if cover:
                return cover
        return ''

    def get_description(self, html):
        """Raw HTML of the intro paragraph (None when absent)."""
        summary_element = html.xpath("//p[contains(@class,'intro')]")
        if len(summary_element):
            return etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip()

    def get_tags(self, html):
        """Tag texts, skipping the placeholder '无标签' (no tags) entry."""
        tag_elements = html.xpath("//div[@class='sortList']/a[text()!='无标签']")
        if len(tag_elements):
            return [self.get_text(tag_element) for tag_element in tag_elements]


definition = TaduSearchDefine()


class Tadu(Metadata):
    """calibre-web metadata provider backed by TaduSearchDefine."""
    __name__ = definition.provider_name
    __id__ = definition.provider_id

    def __init__(self):
        self.searcher = GenericSearchBookSearcher(definition)
        super().__init__()

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        # Only query the site when the provider is enabled in calibre-web.
        if self.active:
            return self.searcher.search_books(query)
[self.get_text(author_element).replace(' 著', '')] + + def get_cover(self, html): + img_element = html.xpath("id('bookImg')/img") + if len(img_element): + cover = img_element[0].attrib['src'] + if not cover: + return '' + else: + return self.base_url + cover + + def get_description(self, html): + vote_element = html.xpath("id('vote')") + if len(vote_element) > 0: + vote_element[0].getparent().remove(vote_element[0]) + summary_element = html.xpath("//div[@class='book-info-detail']") + if len(summary_element): + return (etree.tostring(summary_element[-1], encoding="utf8").decode("utf8").strip() + .replace('
','')) + + def get_tags(self, html): + tag_elements = html.xpath("//p[@class='tag']/a") + if len(tag_elements): + return [self.get_text(tag_element).replace('精校','') for tag_element in tag_elements] + + +definition = ZxcsSearchDefine() + + +class Zxcs(Metadata): + __name__ = definition.provider_name + __id__ = definition.provider_id + + def __init__(self): + self.searcher = GenericSearchBookSearcher(definition) + super().__init__() + + def search(self, query: str, generic_cover: str = "", locale: str = "en"): + if self.active: + return self.searcher.search_books(query) diff --git a/tests/FanqieTest.py b/tests/FanqieTest.py new file mode 100644 index 0000000..d206a7b --- /dev/null +++ b/tests/FanqieTest.py @@ -0,0 +1,7 @@ +from cps.metadata_provider.net.Fanqie import Fanqie + +if __name__ == "__main__": + fanqie = Fanqie() + result = fanqie.search("夺舍;木牛流猫") + for book in result: + print(book) diff --git a/tests/JinjiangTest.py b/tests/JinjiangTest.py new file mode 100644 index 0000000..d54c5df --- /dev/null +++ b/tests/JinjiangTest.py @@ -0,0 +1,7 @@ +from cps.metadata_provider.net.Jinjiang import Jinjiang + +if __name__ == "__main__": + jinjiang = Jinjiang() + result = jinjiang.search("如意书;蒋牧童") + for book in result: + print(book) diff --git a/tests/NetNovelTest.py b/tests/NetNovelTest.py new file mode 100644 index 0000000..b8d163a --- /dev/null +++ b/tests/NetNovelTest.py @@ -0,0 +1,7 @@ +from cps.metadata_provider.NetNovel import NetNovel + +if __name__ == "__main__": + netnovel = NetNovel() + result = netnovel.search("战皇;傲天无痕") + for book in result: + print(book) diff --git a/tests/NewDoubanTest.py b/tests/NewDoubanTest.py deleted file mode 100644 index e6ab151..0000000 --- a/tests/NewDoubanTest.py +++ /dev/null @@ -1,7 +0,0 @@ -from NewDouban import NewDouban - -if __name__ == "__main__": - douban = NewDouban() - result = douban.search("知识考古学") - for book in result: - print(book) diff --git a/tests/QQReaderTest.py b/tests/QQReaderTest.py new file mode 100644 
index 0000000..c3721f0 --- /dev/null +++ b/tests/QQReaderTest.py @@ -0,0 +1,7 @@ +from cps.metadata_provider.net.QQReader import QQReader + +if __name__ == "__main__": + qqreader = QQReader() + result = qqreader.search("第一序列;会说话的肘子") + for book in result: + print(book) diff --git a/tests/QidianTest.py b/tests/QidianTest.py new file mode 100644 index 0000000..a9f05c6 --- /dev/null +++ b/tests/QidianTest.py @@ -0,0 +1,7 @@ +from cps.metadata_provider.net.Qidian import Qidian + +if __name__ == "__main__": + qidian = Qidian() + result = qidian.search("第一序列") + for book in result: + print(book) diff --git a/tests/QimaoTest.py b/tests/QimaoTest.py new file mode 100644 index 0000000..f728148 --- /dev/null +++ b/tests/QimaoTest.py @@ -0,0 +1,7 @@ +from cps.metadata_provider.net.Qimao import Qimao + +if __name__ == "__main__": + qimao = Qimao() + result = qimao.search("综武:人在酒楼,捡尸王语嫣;要长记性啊") + for book in result: + print(book) diff --git a/tests/TaduTest.py b/tests/TaduTest.py new file mode 100644 index 0000000..eb8e10a --- /dev/null +++ b/tests/TaduTest.py @@ -0,0 +1,7 @@ +from cps.metadata_provider.net.Tadu import Tadu + +if __name__ == "__main__": + tadu = Tadu() + result = tadu.search("全职修真高手;洗剑") + for book in result: + print(book) diff --git a/tests/ZxcsTest.py b/tests/ZxcsTest.py new file mode 100644 index 0000000..796ca8a --- /dev/null +++ b/tests/ZxcsTest.py @@ -0,0 +1,7 @@ +from cps.metadata_provider.net.Zxcs import Zxcs + +if __name__ == "__main__": + zxcs = Zxcs() + result = zxcs.search("夜的命名术;会说话的肘子") + for book in result: + print(book)