Skip to content

Commit

Permalink
improve translator
Browse files Browse the repository at this point in the history
  • Loading branch information
cdhigh committed Dec 17, 2024
1 parent 76777e9 commit 8820a65
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 44 deletions.
17 changes: 8 additions & 9 deletions application/lib/calibre/web/feeds/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ class BasicNewsRecipe(Recipe):
#: ]
#:
#: will remove everything from `<!--Article ends here-->` to `</body>`.
preprocess_regexps = []
preprocess_regexps = [(re.compile(r'\r\n?|\n'), ''),]

#: The CSS that is used to style the templates, i.e., the navigation bars and
#: the Tables of Contents. Rather than overriding this variable, you should
Expand Down Expand Up @@ -1121,15 +1121,14 @@ def _postprocess_html(self, soup, first_fetch, job_info):
if not h_tag.get_text(strip=True):
h_tag.string = title

ans = self.postprocess_html(soup, first_fetch)
soup = self.postprocess_html(soup, first_fetch)

# Nuke HTML5 tags
for x in ans.find_all(['article', 'aside', 'header', 'footer', 'nav', 'main',
'figcaption', 'figure', 'section', 'time']):
for x in soup.find_all(['article', 'aside', 'header', 'footer', 'nav', 'main',
'figcaption', 'figure', 'section']):
x.name = 'div'

#for x in ans.find_all('mark'):
# x.name = 'strong'
for x in soup.find_all(['bdo', 'kbd', 'mark', 'time']):
x.name = 'span'

#If tts need, 'tts' propery is set by WorkerImpl
tts_enable = self.tts.get('enable')
Expand Down Expand Up @@ -1158,8 +1157,8 @@ def _postprocess_html(self, soup, first_fetch, job_info):
self.log.exception('Failed to get article object for postprocessing')
pass
else:
self.populate_article_metadata(article, ans, first_fetch)
return ans
self.populate_article_metadata(article, soup, first_fetch)
return soup

#在文章末尾添加分享链接
def append_share_links(self, soup, url):
Expand Down
12 changes: 6 additions & 6 deletions application/lib/ebook_translator/engines/languages.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
google = {
'English': 'en',
'Afrikaans': 'af',
'Albanian': 'sq',
'Amharic': 'am',
Expand All @@ -25,7 +26,6 @@
'Dhivehi': 'dv',
'Dogri': 'doi',
'Dutch': 'nl',
'English': 'en',
'Esperanto': 'eo',
'Estonian': 'et',
'Ewe': 'ee',
Expand Down Expand Up @@ -179,12 +179,12 @@

deepl = {
'source': {
'English': 'EN',
'Bulgarian': 'BG',
'Czech': 'CS',
'Danish': 'DA',
'German': 'DE',
'Greek': 'EL',
'English': 'EN',
'Spanish': 'ES',
'Estonian': 'ET',
'Finnish': 'FI',
Expand All @@ -210,14 +210,14 @@
'Chinese': 'ZH'
},
'target': {
'English': 'EN',
'English (British)': 'EN-GB',
'English (American)': 'EN-US',
'Bulgarian': 'BG',
'Czech': 'CS',
'Danish': 'DA',
'German': 'DE',
'Greek': 'EL',
'English': 'EN',
'English (British)': 'EN-GB',
'English (American)': 'EN-US',
'Spanish': 'ES',
'Estonian': 'ET',
'Finnish': 'FI',
Expand Down Expand Up @@ -247,6 +247,7 @@
}

microsoft = {
'English': 'en',
'Afrikaans': 'af',
'Albanian': 'sq',
'Amharic': 'am',
Expand All @@ -270,7 +271,6 @@
'Dari': 'prs',
'Divehi': 'dv',
'Dutch': 'nl',
'English': 'en',
'Estonian': 'et',
'Faroese': 'fo',
'Fijian': 'fj',
Expand Down
112 changes: 83 additions & 29 deletions application/lib/ebook_translator/html_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from ebook_translator.engines import *
from application.ke_utils import loc_exc_pos

DEBUG_SPLITED_TRANS_SOUP = False #是否保存分割后的soup,用于调试优化

#生成一个当前所有支持的翻译引擎的字典,在网页内使用
def get_trans_engines():
info = {}
Expand All @@ -17,6 +19,14 @@ def get_trans_engines():
return info

class HtmlTranslator:
#常见的基本不影响布局的行内元素标签集合
INLINE_TAGS = {'a', 'abbr', 'b', 'bdo', 'cite', 'dfn', 'em', 'i', 'img', 'kbd', 'mark',
'q', 's', 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'u', 'var', 'wbr'}

#不需要翻译的标签
NO_TRANS_TAGS = {'pre', 'code', 'abbr', 'style', 'script', 'textarea', 'input', 'select',
'link', 'img', 'option', 'datalist'}

def __init__(self, params: dict, thread_num: int=1):
#params.setdefault('stream', False)
self.thread_num = thread_num
Expand Down Expand Up @@ -85,9 +95,13 @@ def translate_soup(self, soup):
failed = 0
elements = self.extract_soup_text(soup)
count = len(elements)
for idx, (tag, text) in enumerate(elements, 1):
self.debugSave(soup, elements)
for idx, (tag, text, needTrans) in enumerate(elements, 1):
try:
trans = self.translator.translate(text)
if needTrans and not DEBUG_SPLITED_TRANS_SOUP:
trans = self.translator.translate(text)
else:
trans = text
if trans:
self.add_translation_soup(soup, tag, trans, self.dst)
success += 1
Expand All @@ -105,49 +119,71 @@ def extract_soup_text(self, soup):
elements = []
maxLen = self.translator.max_len_per_request

#确定soup节点是否直接包含文本元素
def _contains_text(tag):
if ((tag.name == 'table') or (tag.string is not None) or
[x for x in tag.children if isinstance(x, NavigableString) and str(x).strip()]):
return True
return False

#过滤掉不需要翻译的tag
def _tag_is_filtered(tag):
return tag.name in ('pre', 'code', 'abbr', 'style', 'script', 'textarea',
'input', 'select', 'link', 'img', 'option', 'datalist')

#判断节点没有子标签节点,只有文本
def _tag_has_only_text(tag):
return all(isinstance(e, NavigableString) for e in tag.children)

#递归函数,用于遍历BeautifulSoup元素的所有子节点并提取文本内容
#内嵌递归函数:用于遍历BeautifulSoup元素的所有子节点并提取文本内容
#tag: 开始的BeautifulSoup元素
#position: 翻译后的文本显示的位置
#返回: [(tag, text, needTrans),]
def _extract(tag, position):
for child in tag.find_all(recursive=False):
#跳过AI自动生成的摘要
if isinstance(child, Tag) and 'ai_generated_summary' in child.get('class', []):
continue

if _contains_text(child) and not _tag_is_filtered(child):
text = str(child).strip() if position == 'replace' else child.get_text().strip()
if text and _tag_has_only_text(child) or len(text) < maxLen:
elements.append((child, text))
needTrans = getattr(child, 'name', None) not in self.NO_TRANS_TAGS
if self._contains_text(child):
if needTrans:
if position == 'replace':
text = str(child).strip()
text = re.sub(r'<a\b[^>]*>(.*?)</a>', r'<u>\1</u>', text) #去掉超链接
else:
text = child.get_text().strip()
if text and (self._tag_has_only_text(child) or (len(text) < maxLen)):
elements.append((child, text, needTrans))
continue
elif position == 'replace': #只有替代译文才保留不需要翻译的段落,其他情况使用原段落
elements.append((child, str(child).strip(), needTrans))
continue

#if text:
# #因为非AI翻译容易误翻译超链接里面的内容,所以这里去掉超链接
# if position != 'replace' and '<a' in text:
# text = re.sub(r'<a\b[^>]*>', '<u>', text)
# text = text.replace('</a>', '</u>')

_extract(child, position)

position = self.params.get('position', 'below')
_extract(soup.body, position)
return elements

#确定soup节点是否直接包含文本元素
def _contains_text(self, tag):
    """Decide whether ``tag`` should be handled as a node that directly holds text.

    Returns True as soon as any one of the following holds, else False:
      1. ``tag`` is itself a text node (``NavigableString``);
      2. ``tag`` is a ``p`` or ``table`` element;
      3. ``tag.string`` is not None;
      4. at least one direct child of ``tag`` is a text node;
      5. ``tag`` is a ``div`` and ``_all_inline_elements(tag)`` is True.
    """
    if isinstance(tag, NavigableString):
        return True

    name = getattr(tag, 'name', None)
    if name in ('p', 'table') or tag.string is not None:
        return True

    # Any bare text child counts, with no whitespace filtering.
    # NOTE(review): the original comment says CR/LF were already removed when
    # the soup was created, so a text child is taken to be real content -
    # confirm that holds for every caller.
    if any(isinstance(node, NavigableString) for node in tag.children):
        return True

    # A div wrapping nothing but inline elements is treated like a paragraph.
    return name == 'div' and self._all_inline_elements(tag)

#一个节点内是否只有行内元素
def _all_inline_elements(self, tag):
    """Return True when every ``Tag`` among ``tag``'s descendants is inline.

    "Inline" means the element name is listed in ``self.INLINE_TAGS``.
    Non-``Tag`` descendants (text nodes) are ignored, so a node with no
    element descendants at all yields True.
    """
    return all(node.name in self.INLINE_TAGS
               for node in tag.descendants
               if isinstance(node, Tag))

#判断节点没有子标签节点,只有文本或链接
def _tag_has_only_text(self, tag):
    """Return True when ``tag`` has no child elements other than links.

    A text node passed in directly is accepted as-is. Otherwise every direct
    child must be either a text node or an ``a`` element; the first child
    that is neither makes the result False.
    """
    if isinstance(tag, NavigableString):
        return True

    for node in tag.children:
        if isinstance(node, NavigableString):
            continue
        if getattr(node, 'name', None) != 'a':
            return False
    return True

#将翻译结果添加到DOM树
#tag: 原文的tag
#trans: 译文文本字符串
Expand Down Expand Up @@ -249,3 +285,21 @@ def add_translation_soup_title(self, soup, position, tag, transTag):
else: #replace
tag.string = transTxt or origTxt

#将soup文本分割提取的内容保存到文件,用于调试优化
def debugSave(self, soup, elements):
    """Dump the extracted translation segments to a file for debugging.

    Does nothing unless the module-level ``DEBUG_SPLITED_TRANS_SOUP`` flag is
    truthy. The dump is written to ``debug_trans_soup.html`` next to this
    module.

    soup: the parsed document; its ``<title>`` text is written as a header
          line (``None`` is written when the document has no title).
    elements: iterable of ``(tag, text, needTrans)`` tuples as consumed by
              ``translate_soup``.
    """
    if not DEBUG_SPLITED_TRANS_SOUP:
        return

    import os
    fileName = os.path.join(os.path.dirname(__file__), 'debug_trans_soup.html')

    # Keep appending while the dump is below 500000 bytes; past that (or when
    # the file does not exist yet) start over with a fresh file.
    canAppend = os.path.isfile(fileName) and os.path.getsize(fileName) < 500000

    # find() returns None for a document without <title>; guard the lookup so
    # a debugging aid can never abort the translation with AttributeError.
    # Resolved before open() so a failure cannot truncate an existing dump.
    titleTag = soup.find('title')
    title = titleTag.string if titleTag is not None else None

    with open(fileName, 'a' if canAppend else 'w', encoding='utf-8') as f:
        f.write('TITLE: {}\n'.format(title))
        for tag, text, needTrans in elements:
            f.write(tag.prettify())
            # 'T' marks a segment that would be translated, 'N' one kept as-is.
            if needTrans:
                f.write('T-------------------------\n')
            else:
                f.write('N-------------------------\n')
            f.write(text)
            f.write('\n========================================================\n\n')

0 comments on commit 8820a65

Please sign in to comment.