From 68de1465f022145f83dc7450f4c1bf4863a1f0f9 Mon Sep 17 00:00:00 2001 From: Rhilip Date: Sat, 9 Jun 2018 12:33:26 +0800 Subject: [PATCH] :construction: Start to solve search mistake in HUDBT --- extractors/base/site.py | 17 +++-------------- extractors/byrbt.py | 2 +- extractors/hudbt.py | 41 +++++++++++++++++++++-------------------- extractors/nwsuaf6.py | 2 +- extractors/tjupt.py | 25 +++++-------------------- utils/constants.py | 6 ++++++ 6 files changed, 37 insertions(+), 56 deletions(-) diff --git a/extractors/base/site.py b/extractors/base/site.py index a59df05..58eeacf 100644 --- a/extractors/base/site.py +++ b/extractors/base/site.py @@ -8,7 +8,6 @@ import requests from bs4 import BeautifulSoup -from html2bbcode.parser import HTML2BBCode import utils.descr as descr from utils.constants import Video_Containers @@ -81,7 +80,7 @@ def online_check(self) -> bool: else: if self.suspended != 0: Logger.info("The Site: {si} is Online now,after {count} times tries." - "Will check the session soon.".format(si=self.url_host, count=self.suspended)) + "Will check the session soon.".format(si=self.url_host, count=self.suspended)) self.suspended = 0 # Set self.suspended as 0 first, then session_check() self.session_check() return True if self.suspended == 0 else False @@ -109,21 +108,11 @@ def _get_torrent(torrent): torrent = tc.get_torrent(torrent) return torrent - @staticmethod - def _descr_html2ubb(string: str) -> str: - """ - Build-in function to make a string from html to bbcode - - :param string: str - :return: str - """ - return str(HTML2BBCode().feed(string)) - def _assist_delay(self): if self._ASSIST_ONLY: Logger.info("Autoseed-{mo} only allowed to assist." - "it will sleep {sl} Seconds to wait the reseed site " - "to have this torrent".format(mo=self.name, sl=self._ASSIST_DELAY_TIME)) + "it will sleep {sl} Seconds to wait the reseed site " + "to have this torrent".format(mo=self.name, sl=self._ASSIST_DELAY_TIME)) time.sleep(self._ASSIST_DELAY_TIME) def _get_torrent_ptn(self, torrent): diff --git a/extractors/byrbt.py b/extractors/byrbt.py index 049c394..e0d3457 100644 --- a/extractors/byrbt.py +++ b/extractors/byrbt.py @@ -153,7 +153,7 @@ def sort_title_info(raw_title, raw_type, raw_sec_type) -> dict: len_split = len(type_dict[raw_type]["split"]) if len_split != len(raw_title_group): Logger.warning("The raw title \"{raw}\" may lack of tag (now: {no},ask: {co})," - "The split may wrong.".format(raw=raw_title, no=len(raw_title_group), co=len_split)) + "The split may wrong.".format(raw=raw_title, no=len(raw_title_group), co=len_split)) while len_split > len(raw_title_group): raw_title_group.append("") raw_title_group.reverse() diff --git a/extractors/hudbt.py b/extractors/hudbt.py index 821dde5..0fa0512 100644 --- a/extractors/hudbt.py +++ b/extractors/hudbt.py @@ -7,7 +7,8 @@ from bs4 import BeautifulSoup from extractors.base.nexusphp import NexusPHP -from utils.constants import ubb_clean, episode_eng2chs +from utils.constants import ubb_clean, episode_eng2chs, html2ubb +from utils.load.handler import rootLogger as Logger def title_clean(noext: str) -> str: @@ -41,25 +42,25 @@ def torrent_clone(self, tid) -> dict: return_dict = {} details_bs = self.page_torrent_detail(tid=tid, bs=True) - return_dict["clone_id"] = tid - - # 解析原种页面 - return_dict["name"] = details_bs.find("h1", id="page-title").text # 标题 - return_dict["small_descr"] = details_bs.find("dt", text="副标题").next_sibling.text # 副标题 - - # IMDb - imdb_another = details_bs.find("a", href=re.compile("http://www.imdb.com/title/tt")) - return_dict["url"] = imdb_another.text if imdb_another else "" - - for key_dict, key_search in [("type", "cat"), ("standard_sel", "standard")]: # 类型, 质量 - temp_reg = re.compile("torrents.php\?{}=(\d+)".format(key_search)) - temp_tag = details_bs.find("a", href=temp_reg) - return_dict[key_dict] = re.search(temp_reg, temp_tag["href"]).group(1) - - # 简介 - descr_html = str((details_bs.select("div#kdescr > div.bbcode") or "")[0]) - descr_ubb = self._descr_html2ubb(descr_html) - return_dict["descr"] = ubb_clean(descr_ubb) + if re.search("没有该ID的种子", str(details_bs)): + Logger.error("Error,this torrent may not exist or ConnectError") + else: # 解析原种页面 + return_dict["clone_id"] = tid # 传入引用种子号 + return_dict["name"] = details_bs.find("h1", id="page-title").text # 标题 + return_dict["small_descr"] = details_bs.find("dt", text="副标题").next_sibling.text # 副标题 + + imdb_another = details_bs.find("a", href=re.compile("http://www.imdb.com/title/tt")) + return_dict["url"] = imdb_another.text if imdb_another else "" # IMDb + + for key_dict, key_search in [("type", "cat"), ("standard_sel", "standard")]: # 类型, 质量 + temp_reg = re.compile("torrents.php\?{}=(\d+)".format(key_search)) + temp_tag = details_bs.find("a", href=temp_reg) + return_dict[key_dict] = re.search(temp_reg, temp_tag["href"]).group(1) + + # 简介 + descr_html = str((details_bs.select("div#kdescr > div.bbcode") or "")[0]) + descr_ubb = html2ubb(descr_html) + return_dict["descr"] = ubb_clean(descr_ubb) return return_dict diff --git a/extractors/nwsuaf6.py b/extractors/nwsuaf6.py index 81f7a3e..48c83c1 100644 --- a/extractors/nwsuaf6.py +++ b/extractors/nwsuaf6.py @@ -142,7 +142,7 @@ def date_raw_update(self, torrent_name_search, raw_info: dict) -> dict: # TODO if len_split == 0: if len_split != len(raw_title_group): Logger.warning("The raw title \"{raw}\" may lack of tag (now: {no},ask: {co})," - "The split may wrong.".format(raw=raw_title, no=len(raw_title_group), co=len_split)) + "The split may wrong.".format(raw=raw_title, no=len(raw_title_group), co=len_split)) while len_split > len(raw_title_group): raw_title_group.append("") raw_title_group.reverse() diff --git a/extractors/tjupt.py b/extractors/tjupt.py index 39f1ad6..faef8c7 100644 --- a/extractors/tjupt.py +++ b/extractors/tjupt.py @@ -102,30 +102,15 @@ def torrent_clone(self, tid): def date_raw_update(self, torrent_name_search, raw_info: dict) -> dict: # TODO Change info due to reseed torrent's name information - if int(raw_info["type"]) == 401: # 电影 + type_ = int(raw_info["type"]) + if type_ == 401: # 电影 pass - elif int(raw_info["type"]) == 402: # 剧集 + elif type_ == 402: # 剧集 raw_info["ename"] = torrent_name_search.group("full_name") # 英文名 raw_info["tvseasoninfo"] = torrent_name_search.group("episode") # 集数 - raw_info["subsinfo"] = "1" # 强制更新字幕情况为"暂无字幕" - elif int(raw_info["type"]) == 403: # 综艺 - pass - elif int(raw_info["type"]) == 404: # 资料 - pass - elif int(raw_info["type"]) == 405: # 动漫 + raw_info["subsinfo"] = 1 # 强制更新字幕情况为"暂无字幕" + elif type_ == 405: # 动漫 raw_info["animenum"] = torrent_name_search.group("episode") # 动漫集数 - elif int(raw_info["type"]) == 407: # 体育 - pass - elif int(raw_info["type"]) == 408: # 软件 - pass - elif int(raw_info["type"]) == 409: # 游戏 - pass - elif int(raw_info["type"]) == 410: # 其他 - pass - elif int(raw_info["type"]) == 411: # 纪录片 - pass - elif int(raw_info["type"]) == 412: # 移动视频 - pass return raw_info diff --git a/utils/constants.py b/utils/constants.py index 5bde9dc..24493c3 100644 --- a/utils/constants.py +++ b/utils/constants.py @@ -5,6 +5,8 @@ import re import time +from html2bbcode.parser import HTML2BBCode + Support_Site = [ # The tuple is like (config_dict_name in setting, Package name, Class name) ("site_byrbt", "extractors.byrbt", "Byrbt"), @@ -48,3 +50,7 @@ def episode_eng2chs(ep: str) -> str: if season_episode_info_search.group("episode"): season_episode_info += " 第{e}集".format(e=season_episode_info_search.group("episode")) return season_episode_info + + +def html2ubb(html: str) -> str: + return str(HTML2BBCode().feed(html))