From e9088942e7f28a4a27c19dd941c452872bdb758d Mon Sep 17 00:00:00 2001 From: cdhigh Date: Sat, 9 Nov 2024 08:29:36 -0300 Subject: [PATCH] minor improvements for translator --- .../lib/ebook_translator/engines/base.py | 1 + .../lib/ebook_translator/html_translator.py | 18 +++++++++++++----- application/static/reader.js | 3 ++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/application/lib/ebook_translator/engines/base.py b/application/lib/ebook_translator/engines/base.py index 19720468..717fbf39 100644 --- a/application/lib/ebook_translator/engines/base.py +++ b/application/lib/ebook_translator/engines/base.py @@ -22,6 +22,7 @@ class Base: request_attempt = 3 request_timeout = 10.0 max_error_count = 10 + max_len_per_request = 3000 def __init__(self, config=None): self.source_lang = None #语种显示的名字 diff --git a/application/lib/ebook_translator/html_translator.py b/application/lib/ebook_translator/html_translator.py index 3aa5a37d..9369bbad 100644 --- a/application/lib/ebook_translator/html_translator.py +++ b/application/lib/ebook_translator/html_translator.py @@ -100,10 +100,11 @@ def translate_soup(self, soup): #提取soup包含文本的节点,返回一个列表 [(tag, text),...] def extract_soup_text(self, soup): elements = [] + maxLen = self.translator.max_len_per_request #确定soup节点是否直接包含文本元素 def _contains_text(tag): - if (tag.name == 'table' or tag.string is not None or + if ((tag.name == 'table') or (tag.string is not None) or [x for x in tag.children if isinstance(x, NavigableString) and str(x).strip()]): return True return False @@ -113,21 +114,28 @@ def _tag_is_filtered(tag): return tag.name in ('pre', 'code', 'abbr', 'style', 'script', 'textarea', 'input', 'select', 'link', 'img', 'option', 'datalist') + #判断节点没有子标签节点,只有文本 + def _tag_has_only_text(tag): + return all(isinstance(e, NavigableString) for e in tag.children) + #递归函数,用于遍历BeautifulSoup元素的所有子节点并提取文本内容 #tag: 开始的BeautifulSoup元素 #position: 翻译后的文本显示的位置 def _extract(tag, position): for child in tag.find_all(recursive=False): if _contains_text(child) and not _tag_is_filtered(child): - text = str(child).strip() if position == 'replace' else child.get_text() - elements.append((child, text)) + text = str(child).strip() if position == 'replace' else child.get_text().strip() + if text and _tag_has_only_text(child) or len(text) < maxLen: + elements.append((child, text)) + continue + #if text: # #因为非AI翻译容易误翻译超链接里面的内容,所以这里去掉超链接 # if position != 'replace' and ']*>', '', text) # text = text.replace('', '') - else: - _extract(child, position) + + _extract(child, position) position = self.params.get('position', 'below') _extract(soup.body, position) diff --git a/application/static/reader.js b/application/static/reader.js index e24f7633..eca7f37d 100644 --- a/application/static/reader.js +++ b/application/static/reader.js @@ -824,11 +824,12 @@ function populateBooks(expandLevel) { if (!article || !article.src || !article.title) { continue; } + var sTitle = article.title.replace(/"/g, '"'); ostr.push( ''); }