Skip to content

Commit

Permalink
3.0.0E
Browse files Browse the repository at this point in the history
  • Loading branch information
cdhigh committed Apr 21, 2024
1 parent e3fa272 commit 34d1274
Show file tree
Hide file tree
Showing 28 changed files with 820 additions and 201 deletions.
8 changes: 5 additions & 3 deletions application/back_end/send_mail_adpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,11 @@ def avaliable_sm_services():
#title: 邮件标题
#attachment: 附件二进制内容,或元组 (filename, content)
#fileWithTime: 发送的附件文件名是否附带当前时间
def send_to_kindle(user, title, attachment, fileWithTime=True):
#to: 目标邮件地址,可以为列表或逗号分隔的字符串,如果为空,则使用kindle_email
def send_to_kindle(user, title, attachment, fileWithTime=True, to=None):
lcTime = user.local_time('%Y-%m-%d_%H-%M')
subject = f"KindleEar {lcTime}"
to = to or user.cfg('kindle_email')

if not isinstance(attachment, tuple):
lcTime = "({})".format(lcTime) if fileWithTime else ""
Expand All @@ -72,13 +74,13 @@ def send_to_kindle(user, title, attachment, fileWithTime=True):
status = 'ok'
body = "Deliver from KindleEar"
try:
send_mail(user, user.cfg('kindle_email'), subject, body, attachment)
send_mail(user, to, subject, body, attachment)
except Exception as e:
status = str(e)
default_log.warning(f'Failed to send mail "{title}": {status}')

size = sum([len(a[1]) for a in attachment])
save_delivery_log(user, title, size, status=status)
save_delivery_log(user, title, size, status=status, to=to)

#统一的发送邮件函数
def send_mail(user, to, subject, body, attachments=None, html=None):
Expand Down
3 changes: 2 additions & 1 deletion application/lib/calibre/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ def __getitem__(self, name):

plugins = Plugins()
config_dir = ""
DEBUG = False
DEBUG = False
CONFIG_DIR_MODE = 0o700
4 changes: 3 additions & 1 deletion application/lib/calibre/ebooks/conversion/plumber.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,8 +376,10 @@ def run(self):
# f.write(DEBUG_README)
for x in ('input', '0.parsed', '1.structure', '2.processed'):
x = os.path.join(self.opts.debug_pipeline, x)
if os.path.exists(x):
try:
shutil.rmtree(x)
except:
pass

self.output_plugin.specialize_options(self.log, self.opts, self.input_fmt)
#根据需要,创建临时目录或创建内存缓存
Expand Down
54 changes: 17 additions & 37 deletions application/lib/calibre/ebooks/oeb/polish/parsing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
Expand All @@ -18,6 +18,7 @@
from html5lib.treebuilders.base import TreeBuilder as BaseTreeBuilder
from html5lib._ihatexml import InfosetFilter, DataLossWarning
from html5lib.html5parser import HTMLParser
import html5lib

from calibre import xml_replace_entities
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
Expand Down Expand Up @@ -57,9 +58,9 @@ class Element(ElementBase):
def __str__(self):
    """Debug representation: '<prefix:name attrs (id)>' for this element.

    Uses the element's namespace map to recover the prefix for its
    namespace; attributes are rendered as key="value" pairs.
    """
    attrs = ''
    if self.attrib:
        attrs = ' ' + ' '.join('%s="%s"' % (k, v) for k, v in self.attrib.items())
    # Strip the '{ns}' wrapper from the qualified tag to get the namespace URI
    ns = self.tag.rpartition('}')[0][1:]
    # Reverse-lookup the prefix bound to this namespace (None prefix -> '')
    prefix = {v: k for k, v in self.nsmap.items()}[ns] or ''
    if prefix:
        prefix += ':'
    return '<%s%s%s (%s)>' % (prefix, getattr(self, 'name', self.tag), attrs, hex(id(self)))
Expand Down Expand Up @@ -227,7 +228,7 @@ def clean_attrib(name, val, nsmap, attrib, namespaced_attribs):
return None, True
nsmap_changed = False
if ns == xlink_ns and 'xlink' not in nsmap:
for prefix, nns in tuple(nsmap.iteritems()):
for prefix, nns in tuple(nsmap.items()):
if nns == xlink_ns:
del nsmap[prefix]
nsmap['xlink'] = xlink_ns
Expand All @@ -239,7 +240,7 @@ def clean_attrib(name, val, nsmap, attrib, namespaced_attribs):
if prefix == 'xmlns':
# Use an existing prefix for this namespace, if
# possible
existing = {x:k for k, x in nsmap.iteritems()}.get(val, False)
existing = {x:k for k, x in nsmap.items()}.get(val, False)
if existing is not False:
name = existing
nsmap[name] = val
Expand Down Expand Up @@ -270,7 +271,7 @@ def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap):
# constructor, therefore they have to be set one by one.
nsmap_changed = False
namespaced_attribs = {}
for k, v in attrib.iteritems():
for k, v in attrib.items():
try:
elem.set(k, v)
except (ValueError, TypeError):
Expand All @@ -285,7 +286,7 @@ def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap):
nelem = ctx.makeelement(elem.tag, nsmap=nsmap)
for k, v in elem.items(): # Only elem.items() preserves attrib order
nelem.set(k, v)
for (prefix, name), v in namespaced_attribs.iteritems():
for (prefix, name), v in namespaced_attribs.items():
ns = nsmap.get(prefix, None)
if ns is not None:
try:
Expand All @@ -307,7 +308,7 @@ def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap):

# Ensure that svg and mathml elements get no namespace prefixes
if elem.prefix is not None and namespace in known_namespaces:
for k, v in tuple(nsmap.iteritems()):
for k, v in tuple(nsmap.items()):
if v == namespace:
del nsmap[k]
nsmap[None] = namespace
Expand Down Expand Up @@ -420,7 +421,7 @@ def apply_html_attributes(self, attrs):
if not attrs:
return
html = self.openElements[0]
for k, v in attrs.iteritems():
for k, v in attrs.items():
if k not in html.attrib and k != 'xmlns':
try:
html.set(k, v)
Expand Down Expand Up @@ -448,7 +449,7 @@ def apply_body_attributes(self, attrs):
if not attrs:
return
body = self.openElements[1]
for k, v in attrs.iteritems():
for k, v in attrs.items():
if k not in body.attrib and k !='xmlns':
try:
body.set(k, v)
Expand All @@ -473,7 +474,7 @@ def makeelement(ctx, name, attrib):
elem = ctx.makeelement(name)
except ValueError:
elem = ctx.makeelement(to_xml_name(name))
for k, v in attrib.iteritems():
for k, v in attrib.items():
try:
elem.set(k, v)
except TypeError:
Expand Down Expand Up @@ -517,7 +518,7 @@ def apply_html_attributes(self, attrs):
if not attrs:
return
html = self.openElements[0]
for k, v in attrs.iteritems():
for k, v in attrs.items():
if k not in html.attrib and k != 'xmlns':
try:
html.set(k, v)
Expand All @@ -530,7 +531,7 @@ def apply_body_attributes(self, attrs):
if not attrs:
return
body = self.openElements[1]
for k, v in attrs.iteritems():
for k, v in attrs.items():
if k not in body.attrib and k != 'xmlns':
try:
body.set(k, v)
Expand Down Expand Up @@ -630,35 +631,14 @@ def html5_parse(raw, decoder=None, log=None, discard_namespaces=False, line_numb

def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    """Parse HTML into an lxml tree using html5lib and return the root element.

    raw: str or bytes HTML source. Bytes are decoded with *decoder* when
        provided, otherwise via xml_to_unicode's charset detection.
    decoder: optional callable mapping bytes -> str.
    replace_entities: replace named HTML entities with their characters.
    fix_newlines: normalize '\\r\\n' and '\\r' to '\\n'.
    Returns: the lxml root element of the parsed document.

    NOTE(review): log, discard_namespaces, line_numbers and
    linenumber_attribute are kept for interface compatibility but are no
    longer honoured by this html5lib-based implementation.
    """
    if isinstance(raw, bytes):
        raw = decoder(raw) if decoder else xml_to_unicode(raw)[0]
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    # Drop control characters that lxml rejects
    raw = replace_chars.sub('', raw)

    doc = html5lib.parse(raw, treebuilder="lxml", namespaceHTMLElements=False)
    return doc.getroot()

def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
if isinstance(raw, bytes):
Expand Down
7 changes: 5 additions & 2 deletions application/lib/calibre/utils/img.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,11 @@ def save_cover_data_to(
ratio = min(newWidth / width, newHeight / height)
img = img.resize((int(width * ratio), int(height * ratio)), Image.Resampling.LANCZOS)

if (grayscale or eink) and img.mode != "L":
img = img.convert("L")
if (grayscale or eink) and img.mode != 'L':
img = img.convert('L')
changed = True
elif img.mode == 'LA' or (img.mode == 'P' and 'transparency' in img.info):
img = img.convert('RGBA').convert('RGB')
changed = True
elif img.mode != 'RGB':
img = img.convert('RGB')
Expand Down
65 changes: 52 additions & 13 deletions application/lib/calibre/web/feeds/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ptempfile import PersistentTemporaryFile
from calibre.ptempfile import PersistentTemporaryFile, PersistentTemporaryDirectory
from calibre.utils.img import save_cover_data_to
from calibre.utils.date import now as nowf
from calibre.utils.localization import canonicalize_lang, ngettext
Expand Down Expand Up @@ -433,6 +433,10 @@ class BasicNewsRecipe(Recipe):
#: Set to False if you do not want to use gzipped transfers. Note that some old servers flake out with gzip
handle_gzip = True

# set by worker.py
translator = {}
tts = {}

# See the built-in recipes for examples of these settings.

def short_title(self):
Expand Down Expand Up @@ -960,7 +964,7 @@ def __init__(self, options, log, output_dir, fs, feed_index_start=0):
elif self.scale_news_images_to_device:
self.scale_news_images = options.output_profile.screen_size

self.w2d_opts = wOpts = Web2diskOptions()
self.web2disk_options = wOpts = Web2diskOptions()
for attr in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', 'skip_ad_pages', 'preprocess_html',
'remove_tags_after', 'remove_tags_before', 'is_link_wanted', 'compress_news_images',
'compress_news_images_max_size', 'compress_news_images_auto_size', 'scale_news_images', 'filter_regexps',
Expand Down Expand Up @@ -1063,6 +1067,10 @@ def _postprocess_html(self, soup, first_fetch, job_info):
h_tag = soup.new_tag('h2')
h_tag.string = title
body_tag.insert(0, h_tag)
elif h_tag: #去掉标题前面的部分内容
for tag in h_tag.previous_siblings:
if len(tag.get_text(strip=True)) < 20:
tag.extract()

#job_info.article.url才是真实的url,对于内嵌内容RSS,job_info.url为一个临时文件名
self.append_share_links(soup, url=job_info.article.url)
Expand All @@ -1074,8 +1082,12 @@ def _postprocess_html(self, soup, first_fetch, job_info):
'figcaption', 'figure', 'section', 'time']):
x.name = 'div'

#If tts need, tts propery is set by WorkerImpl
if self.tts.get('enable'):
self.audiofy_html(soup, title, job_info)

#If translation need, translator propery is set by WorkerImpl
if (getattr(self, 'translator', None) or {}).get('enable'):
if self.translator.get('enable'):
self.translate_html(soup, title)

if job_info:
Expand Down Expand Up @@ -1284,10 +1296,10 @@ def feed2index(self, f, feeds):
def _fetch_article(self, job_info, preloaded=None):
url = job_info.url
br = self.browser
self.w2d_opts.browser = br
self.w2d_opts.dir = job_info.art_dir
self.web2disk_options.browser = br
self.web2disk_options.dir = job_info.art_dir

fetcher = RecursiveFetcher(self.w2d_opts, self.fs, self.log, job_info, self.image_map, self.css_map)
fetcher = RecursiveFetcher(self.web2disk_options, self.fs, self.log, job_info, self.image_map, self.css_map)
fetcher.browser = br
fetcher.base_dir = job_info.art_dir
fetcher.current_dir = job_info.art_dir
Expand Down Expand Up @@ -1456,7 +1468,9 @@ def build_index(self):
self.jobs.append(req)

self.jobs_done = 0
if self.simultaneous_downloads > 1:
trans_enable = self.translator.get('enable') or self.tts.get('enable')
#如果翻译使能,则不能使用多线程,否则容易触发流量告警导致IP被封锁
if (self.simultaneous_downloads > 1) and not trans_enable:
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
Expand All @@ -1482,7 +1496,7 @@ def build_index(self):
raise ValueError('No articles downloaded, aborting')

#翻译Feed的标题
if (getattr(self, 'translator', None) or {}).get('enable'):
if self.translator.get('enable'):
self.translate_titles(feeds)

for f, feed in enumerate(feeds, self.feed_index_start):
Expand Down Expand Up @@ -1558,8 +1572,8 @@ def _download_masthead(self, mu):
def download_masthead(self, url):
    """Download the masthead image at *url*; failures are logged, not raised."""
    try:
        self._download_masthead(url)
    except Exception as e:
        # A missing masthead must not abort the whole recipe run
        self.log.exception(f"Failed to download supplied masthead_url: {e}")

def resolve_masthead(self):
self.masthead_path = None
Expand Down Expand Up @@ -2000,6 +2014,31 @@ def translate_titles(self, feeds):
else: #replace
item['obj'].title = item['translated']

#Convert the article html to speech through an online TTS service platform.
#Each generated audio segment is written to disk with a deterministic
#feed/article/segment numbered filename so the pieces can be merged later.
def audiofy_html(self, soup, title, job_info):
    """Audiofy the html in *soup* using the recipe's tts configuration.

    soup: BeautifulSoup of the article html.
    title: article title, used for logging only.
    job_info: provides f_idx/a_idx used to build the audio file names.
    """
    from ebook_tts import HtmlAudiolator
    audiolator = HtmlAudiolator(self.tts)
    self.log.debug(f'Audiofying [{title}]')
    ret = audiolator.audiofy_soup(soup)
    if not ret['error']:
        #Save audio to real disk files: the final mp3 merge step cannot
        #work against the virtual file system (self.fs).
        if not self.tts.get('audio_dir'):
            system_temp_dir = os.environ.get('KE_TEMP_DIR')
            self.tts['audio_dir'] = PersistentTemporaryDirectory(prefix='tts_', dir=system_temp_dir)
        audio_dir = self.tts['audio_dir']
        ext = ret['mime'].split('/')[-1]
        ext = {'mpeg': 'mp3'}.get(ext, ext)  #normalize audio/mpeg -> mp3
        for idx, audio in enumerate(ret['audios']):
            filename = f'{job_info.f_idx:04d}_{job_info.a_idx:04d}_{idx:04d}.{ext}'
            filename = os.path.join(audio_dir, filename)
            try:
                with open(filename, 'wb') as f:
                    f.write(audio)
            except Exception as e:
                #Report which file failed instead of an opaque placeholder
                self.log.warning(f'Failed to write "{filename}": {e}')
    else:
        self.log.warning(f'Failed to audiofy "{title}": {ret["error"]}')

class CustomIndexRecipe(BasicNewsRecipe):

Expand All @@ -2025,8 +2064,8 @@ def create_opf(self):
def download(self):
index = self.custom_index()
url = 'file:'+index if iswindows else 'file://'+index
self.w2d_opts.browser = self.clone_browser(self.browser)
fetcher = RecursiveFetcher(self.w2d_opts, self.fs, self.log)
self.web2disk_options.browser = self.clone_browser(self.browser)
fetcher = RecursiveFetcher(self.web2disk_options, self.fs, self.log)
fetcher.base_dir = self.output_dir
fetcher.current_dir = self.output_dir
fetcher.show_progress = False
Expand Down Expand Up @@ -2109,7 +2148,7 @@ def parse_feeds(self):
continue

added.add(url)
lastTime = LastDelivered.get_or_none(user=self.user.name, url=url)
lastTime = LastDelivered.get_or_none((LastDelivered.user==self.user.name) & (LastDelivered.url==url))
delta = (datetime.datetime.utcnow() - lastTime.datetime) if lastTime else None
#这里oldest_article和其他的recipe不一样,这个参数表示在这个区间内不会重复推送
if ((not lastTime) or (not self.oldest_article) or
Expand Down
Loading

0 comments on commit 34d1274

Please sign in to comment.