From 34d1274d1affbf35e88e2842e466f84ef144f619 Mon Sep 17 00:00:00 2001 From: cdhigh Date: Sat, 20 Apr 2024 22:37:47 -0300 Subject: [PATCH] 3.0.0E --- application/back_end/send_mail_adpt.py | 8 +- application/lib/calibre/constants.py | 3 +- .../lib/calibre/ebooks/conversion/plumber.py | 4 +- .../lib/calibre/ebooks/oeb/polish/parsing.py | 54 ++--- application/lib/calibre/utils/img.py | 7 +- application/lib/calibre/web/feeds/news.py | 65 ++++- application/lib/ebook_tts/engines/google.py | 14 +- application/lib/ebook_tts/html_audiolator.py | 105 ++++++-- application/lib/html5_parser.py | 6 + application/lib/mechanize.py | 6 + application/lib/ssml_builder.py | 227 ++++++++++++++++++ application/lib/urlopener.py | 78 +++++- application/recipes/builtin_recipes.xml | 72 ++++-- application/recipes/builtin_recipes.zip | Bin 4404704 -> 4465726 bytes application/static/base.css | 46 ++-- application/static/base.js | 24 +- application/templates/base.html | 9 +- application/templates/my.html | 7 +- application/view/subscribe.py | 2 +- application/view/translator.py | 3 +- application/work/worker.py | 87 ++++++- docker/postfix/readme.md | 24 +- docs/Chinese/deployment.md | 27 ++- docs/Chinese/faq.md | 7 + docs/English/deployment.md | 29 ++- docs/English/faq.md | 10 + tools/archive_builtin_recipes.py | 12 +- tools/update_req.py | 85 ++++--- 28 files changed, 820 insertions(+), 201 deletions(-) create mode 100644 application/lib/html5_parser.py create mode 100644 application/lib/mechanize.py create mode 100644 application/lib/ssml_builder.py diff --git a/application/back_end/send_mail_adpt.py b/application/back_end/send_mail_adpt.py index df21a2a4..870dc031 100644 --- a/application/back_end/send_mail_adpt.py +++ b/application/back_end/send_mail_adpt.py @@ -57,9 +57,11 @@ def avaliable_sm_services(): #title: 邮件标题 #attachment: 附件二进制内容,或元祖 (filename, content) #fileWithTime: 发送的附件文件名是否附带当前时间 -def send_to_kindle(user, title, attachment, fileWithTime=True): +#to: 目标邮件地址,可以为列表或逗号分隔的字符串,如果为空,则使用kindle_email +def send_to_kindle(user, title, attachment, fileWithTime=True, to=None): lcTime = user.local_time('%Y-%m-%d_%H-%M') subject = f"KindleEar {lcTime}" + to = to or user.cfg('kindle_email') if not isinstance(attachment, tuple): lcTime = "({})".format(lcTime) if fileWithTime else "" @@ -72,13 +74,13 @@ def send_to_kindle(user, title, attachment, fileWithTime=True): status = 'ok' body = "Deliver from KindleEar" try: - send_mail(user, user.cfg('kindle_email'), subject, body, attachment) + send_mail(user, to, subject, body, attachment) except Exception as e: status = str(e) default_log.warning(f'Failed to send mail "{title}": {status}') size = sum([len(a[1]) for a in attachment]) - save_delivery_log(user, title, size, status=status) + save_delivery_log(user, title, size, status=status, to=to) #统一的发送邮件函数 def send_mail(user, to, subject, body, attachments=None, html=None): diff --git a/application/lib/calibre/constants.py b/application/lib/calibre/constants.py index 8500ac62..9f337d99 100644 --- a/application/lib/calibre/constants.py +++ b/application/lib/calibre/constants.py @@ -53,4 +53,5 @@ def __getitem__(self, name): plugins = Plugins() config_dir = "" -DEBUG = False \ No newline at end of file +DEBUG = False +CONFIG_DIR_MODE = 0o700 diff --git a/application/lib/calibre/ebooks/conversion/plumber.py b/application/lib/calibre/ebooks/conversion/plumber.py index 5a821c42..2f9327f0 100644 --- a/application/lib/calibre/ebooks/conversion/plumber.py +++ b/application/lib/calibre/ebooks/conversion/plumber.py @@ -376,8 +376,10 @@ def run(self): # f.write(DEBUG_README) for x in ('input', '0.parsed', '1.structure', '2.processed'): x = os.path.join(self.opts.debug_pipeline, x) - if os.path.exists(x): + try: shutil.rmtree(x) + except: + pass self.output_plugin.specialize_options(self.log, self.opts, self.input_fmt) #根据需要,创建临时目录或创建内存缓存 diff --git a/application/lib/calibre/ebooks/oeb/polish/parsing.py b/application/lib/calibre/ebooks/oeb/polish/parsing.py index 2cf427f3..cacb91d4 100644 --- a/application/lib/calibre/ebooks/oeb/polish/parsing.py +++ b/application/lib/calibre/ebooks/oeb/polish/parsing.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python # vim:fileencoding=utf-8 from __future__ import (unicode_literals, division, absolute_import, print_function) @@ -18,6 +18,7 @@ from html5lib.treebuilders.base import TreeBuilder as BaseTreeBuilder from html5lib._ihatexml import InfosetFilter, DataLossWarning from html5lib.html5parser import HTMLParser +import html5lib from calibre import xml_replace_entities from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations @@ -57,9 +58,9 @@ class Element(ElementBase): def __str__(self): attrs = '' if self.attrib: - attrs = ' ' + ' '.join('%s="%s"' % (k, v) for k, v in self.attrib.iteritems()) + attrs = ' ' + ' '.join('%s="%s"' % (k, v) for k, v in self.attrib.items()) ns = self.tag.rpartition('}')[0][1:] - prefix = {v:k for k, v in self.nsmap.iteritems()}[ns] or '' + prefix = {v:k for k, v in self.nsmap.items()}[ns] or '' if prefix: prefix += ':' return '<%s%s%s (%s)>' % (prefix, getattr(self, 'name', self.tag), attrs, hex(id(self))) @@ -227,7 +228,7 @@ def clean_attrib(name, val, nsmap, attrib, namespaced_attribs): return None, True nsmap_changed = False if ns == xlink_ns and 'xlink' not in nsmap: - for prefix, nns in tuple(nsmap.iteritems()): + for prefix, nns in tuple(nsmap.items()): if nns == xlink_ns: del nsmap[prefix] nsmap['xlink'] = xlink_ns @@ -239,7 +240,7 @@ def clean_attrib(name, val, nsmap, attrib, namespaced_attribs): if prefix == 'xmlns': # Use an existing prefix for this namespace, if # possible - existing = {x:k for k, x in nsmap.iteritems()}.get(val, False) + existing = {x:k for k, x in nsmap.items()}.get(val, False) if existing is not False: name = existing nsmap[name] = val @@ -270,7 +271,7 @@ def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap): # constructor, therefore they have to be set one by one. nsmap_changed = False namespaced_attribs = {} - for k, v in attrib.iteritems(): + for k, v in attrib.items(): try: elem.set(k, v) except (ValueError, TypeError): @@ -285,7 +286,7 @@ def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap): nelem = ctx.makeelement(elem.tag, nsmap=nsmap) for k, v in elem.items(): # Only elem.items() preserves attrib order nelem.set(k, v) - for (prefix, name), v in namespaced_attribs.iteritems(): + for (prefix, name), v in namespaced_attribs.items(): ns = nsmap.get(prefix, None) if ns is not None: try: @@ -307,7 +308,7 @@ def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap): # Ensure that svg and mathml elements get no namespace prefixes if elem.prefix is not None and namespace in known_namespaces: - for k, v in tuple(nsmap.iteritems()): + for k, v in tuple(nsmap.items()): if v == namespace: del nsmap[k] nsmap[None] = namespace @@ -420,7 +421,7 @@ def apply_html_attributes(self, attrs): if not attrs: return html = self.openElements[0] - for k, v in attrs.iteritems(): + for k, v in attrs.items(): if k not in html.attrib and k != 'xmlns': try: html.set(k, v) @@ -448,7 +449,7 @@ def apply_body_attributes(self, attrs): if not attrs: return body = self.openElements[1] - for k, v in attrs.iteritems(): + for k, v in attrs.items(): if k not in body.attrib and k !='xmlns': try: body.set(k, v) @@ -473,7 +474,7 @@ def makeelement(ctx, name, attrib): elem = ctx.makeelement(name) except ValueError: elem = ctx.makeelement(to_xml_name(name)) - for k, v in attrib.iteritems(): + for k, v in attrib.items(): try: elem.set(k, v) except TypeError: @@ -517,7 +518,7 @@ def apply_html_attributes(self, attrs): if not attrs: return html = self.openElements[0] - for k, v in attrs.iteritems(): + for k, v in attrs.items(): if k not in html.attrib and k != 'xmlns': try: html.set(k, v) @@ -530,7 +531,7 @@ def apply_body_attributes(self, attrs): if not attrs: return body = self.openElements[1] - for k, v in attrs.iteritems(): + for k, v in attrs.items(): if k not in body.attrib and k != 'xmlns': try: body.set(k, v) @@ -630,35 +631,14 @@ def html5_parse(raw, decoder=None, log=None, discard_namespaces=False, line_numb def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True): if isinstance(raw, bytes): - raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) + raw = decoder(raw) if decoder else xml_to_unicode(raw)[0] if replace_entities: raw = xml_replace_entities(raw) if fix_newlines: raw = raw.replace('\r\n', '\n').replace('\r', '\n') raw = replace_chars.sub('', raw) - - stream_class = partial(FastStream, track_position=line_numbers) - stream = stream_class(raw) - builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder, linenumber_attribute=linenumber_attribute) - while True: - try: - parser = HTMLParser(tree=builder, track_positions=line_numbers, namespaceHTMLElements=not discard_namespaces) - with warnings.catch_warnings(): - warnings.simplefilter('ignore', category=DataLossWarning) - try: - parser.parse(stream, parseMeta=False, useChardet=False) - finally: - parser.tree.proxy_cache = None - except NamespacedHTMLPresent as err: - raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I) - stream = stream_class(raw) - continue - break - root = parser.tree.getDocument() - if (discard_namespaces and root.tag != 'html') or ( - not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)): - raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix)) - return root + doc = html5lib.parse(raw, treebuilder="lxml", namespaceHTMLElements=False) + return doc.getroot() def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False): if isinstance(raw, bytes): diff --git a/application/lib/calibre/utils/img.py b/application/lib/calibre/utils/img.py index e7b1ae16..c52370ac 100644 --- a/application/lib/calibre/utils/img.py +++ b/application/lib/calibre/utils/img.py @@ -214,8 +214,11 @@ def save_cover_data_to( ratio = min(newWidth / width, newHeight / height) img = img.resize((int(width * ratio), int(height * ratio)), Image.Resampling.LANCZOS) - if (grayscale or eink) and img.mode != "L": - img = img.convert("L") + if (grayscale or eink) and img.mode != 'L': + img = img.convert('L') + changed = True + elif img.mode == 'LA' or (img.mode == 'P' and 'transparency' in img.info): + img = img.convert('RGBA').convert('RGB') changed = True elif img.mode != 'RGB': img = img.convert('RGB') diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py index 3b674d88..093c9689 100644 --- a/application/lib/calibre/web/feeds/news.py +++ b/application/lib/calibre/web/feeds/news.py @@ -17,7 +17,7 @@ from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.toc import TOC -from calibre.ptempfile import PersistentTemporaryFile +from calibre.ptempfile import PersistentTemporaryFile, PersistentTemporaryDirectory from calibre.utils.img import save_cover_data_to from calibre.utils.date import now as nowf from calibre.utils.localization import canonicalize_lang, ngettext @@ -433,6 +433,10 @@ class BasicNewsRecipe(Recipe): #: Set to False if you do not want to use gzipped transfers. Note that some old servers flake out with gzip handle_gzip = True + # set by worker.py + translator = {} + tts = {} + # See the built-in recipes for examples of these settings. def short_title(self): @@ -960,7 +964,7 @@ def __init__(self, options, log, output_dir, fs, feed_index_start=0): elif self.scale_news_images_to_device: self.scale_news_images = options.output_profile.screen_size - self.w2d_opts = wOpts = Web2diskOptions() + self.web2disk_options = wOpts = Web2diskOptions() for attr in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', 'skip_ad_pages', 'preprocess_html', 'remove_tags_after', 'remove_tags_before', 'is_link_wanted', 'compress_news_images', 'compress_news_images_max_size', 'compress_news_images_auto_size', 'scale_news_images', 'filter_regexps', @@ -1063,6 +1067,10 @@ def _postprocess_html(self, soup, first_fetch, job_info): h_tag = soup.new_tag('h2') h_tag.string = title body_tag.insert(0, h_tag) + elif h_tag: #去掉标题前面的部分内容 + for tag in h_tag.previous_siblings: + if len(tag.get_text(strip=True)) < 20: + tag.extract() #job_info.article.url才是真实的url,对于内嵌内容RSS,job_info.url为一个临时文件名 self.append_share_links(soup, url=job_info.article.url) @@ -1074,8 +1082,12 @@ def _postprocess_html(self, soup, first_fetch, job_info): 'figcaption', 'figure', 'section', 'time']): x.name = 'div' + #If tts need, tts propery is set by WorkerImpl + if self.tts.get('enable'): + self.audiofy_html(soup, title, job_info) + #If translation need, translator propery is set by WorkerImpl - if (getattr(self, 'translator', None) or {}).get('enable'): + if self.translator.get('enable'): self.translate_html(soup, title) if job_info: @@ -1284,10 +1296,10 @@ def feed2index(self, f, feeds): def _fetch_article(self, job_info, preloaded=None): url = job_info.url br = self.browser - self.w2d_opts.browser = br - self.w2d_opts.dir = job_info.art_dir + self.web2disk_options.browser = br + self.web2disk_options.dir = job_info.art_dir - fetcher = RecursiveFetcher(self.w2d_opts, self.fs, self.log, job_info, self.image_map, self.css_map) + fetcher = RecursiveFetcher(self.web2disk_options, self.fs, self.log, job_info, self.image_map, self.css_map) fetcher.browser = br fetcher.base_dir = job_info.art_dir fetcher.current_dir = job_info.art_dir @@ -1456,7 +1468,9 @@ def build_index(self): self.jobs.append(req) self.jobs_done = 0 - if self.simultaneous_downloads > 1: + trans_enable = self.translator.get('enable') or self.tts.get('enable') + #如果翻译使能,则不能使用多线程,否则容易触发流量告警导致IP被封锁 + if (self.simultaneous_downloads > 1) and not trans_enable: tp = ThreadPool(self.simultaneous_downloads) for req in self.jobs: tp.putRequest(req, block=True, timeout=0) @@ -1482,7 +1496,7 @@ def build_index(self): raise ValueError('No articles downloaded, aborting') #翻译Feed的标题 - if (getattr(self, 'translator', None) or {}).get('enable'): + if self.translator.get('enable'): self.translate_titles(feeds) for f, feed in enumerate(feeds, self.feed_index_start): @@ -1558,8 +1572,8 @@ def _download_masthead(self, mu): def download_masthead(self, url): try: self._download_masthead(url) - except: - self.log.exception("Failed to download supplied masthead_url") + except Exception as e: + self.log.exception(f"Failed to download supplied masthead_url: {e}") def resolve_masthead(self): self.masthead_path = None @@ -2000,6 +2014,31 @@ def translate_titles(self, feeds): else: #replace item['obj'].title = item['translated'] + #调用在线TTS服务平台,将html转为语音 + #每个音频片段都会调用一次callback(audioDict, title, feed_index, article_index) + def audiofy_html(self, soup, title, job_info): + default_log.info(f'audiofy_html {title}') + from ebook_tts import HtmlAudiolator + audiolator = HtmlAudiolator(self.tts) + self.log.debug(f'Translating [{title}]') + ret = audiolator.audiofy_soup(soup) + if not ret['error']: #保存音频到磁盘,这个地方就不能使用fs了,因为最后合并mp3时无法使用虚拟文件系统 + if not self.tts.get('audio_dir'): + system_temp_dir = os.environ.get('KE_TEMP_DIR') + self.tts['audio_dir'] = PersistentTemporaryDirectory(prefix='tts_', dir=system_temp_dir) + audio_dir = self.tts['audio_dir'] + ext = ret['mime'].split('/')[-1] + ext = {'mpeg': 'mp3'}.get(ext, ext) + for idx, audio in enumerate(ret['audios']): + filename = f'{job_info.f_idx:04d}_{job_info.a_idx:04d}_{idx:04d}.{ext}' + filename = os.path.join(audio_dir, filename) + try: + with open(filename, 'wb') as f: + f.write(audio) + except Exception as e: + self.log.warning(f'Failed to write "{filename}": {e}') + else: + self.log.warning(f'Failed to audiofy "{title}": {ret["error"]}') class CustomIndexRecipe(BasicNewsRecipe): @@ -2025,8 +2064,8 @@ def create_opf(self): def download(self): index = self.custom_index() url = 'file:'+index if iswindows else 'file://'+index - self.w2d_opts.browser = self.clone_browser(self.browser) - fetcher = RecursiveFetcher(self.w2d_opts, self.fs, self.log) + self.web2disk_options.browser = self.clone_browser(self.browser) + fetcher = RecursiveFetcher(self.web2disk_options, self.fs, self.log) fetcher.base_dir = self.output_dir fetcher.current_dir = self.output_dir fetcher.show_progress = False @@ -2109,7 +2148,7 @@ def parse_feeds(self): continue added.add(url) - lastTime = LastDelivered.get_or_none(user=self.user.name, url=url) + lastTime = LastDelivered.get_or_none((LastDelivered.user==self.user.name) & (LastDelivered.url==url)) delta = (datetime.datetime.utcnow() - lastTime.datetime) if lastTime else None #这里oldest_article和其他的recipe不一样,这个参数表示在这个区间内不会重复推送 if ((not lastTime) or (not self.oldest_article) or diff --git a/application/lib/ebook_tts/engines/google.py b/application/lib/ebook_tts/engines/google.py index 876a77cc..92f47a1b 100644 --- a/application/lib/ebook_tts/engines/google.py +++ b/application/lib/ebook_tts/engines/google.py @@ -81,7 +81,8 @@ class GoogleWebTTSFree: api_key_hint = '' default_api_host = 'https://translate.google.com' default_timeout = 60 - request_interval = 0 + request_interval = 10 + max_len_per_request = 500 languages = gtts_languages def __init__(self, params): @@ -111,7 +112,8 @@ class GoogleTextToSpeech: api_key_hint = '' default_api_host = '' default_timeout = 60 - request_interval = 0 + request_interval = 0.5 + max_len_per_request = 500 languages = gtts_languages def __init__(self, params): @@ -127,6 +129,7 @@ def __init__(self, params): #Limit is 5000 bytes per request #https://cloud.google.com/text-to-speech/quotas def tts(self, text): + client = texttospeech.TextToSpeechClient() input_text = texttospeech.SynthesisInput(text=text) # Note: the voice can also be specified by name. @@ -137,12 +140,11 @@ def tts(self, text): ssml_gender=texttospeech.SsmlVoiceGender.FEMALE, ) - audio_config = texttospeech.AudioConfig( - audio_encoding=texttospeech.AudioEncoding.MP3 - ) + audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3) response = client.synthesize_speech( request={"input": input_text, "voice": voice, "audio_config": audio_config} ) - return ('audio/mpeg', response.audio_content) \ No newline at end of file + return ('audio/mpeg', response.audio_content) + diff --git a/application/lib/ebook_tts/html_audiolator.py b/application/lib/ebook_tts/html_audiolator.py index a098077d..dc2c2edb 100644 --- a/application/lib/ebook_tts/html_audiolator.py +++ b/application/lib/ebook_tts/html_audiolator.py @@ -19,11 +19,11 @@ def __init__(self, params: dict): self.params = params self.engineName = self.params.get('engine') self.language = self.params.get('language', 'en') - self.audiolator = builtin_tts_engines.get(self.engineName, GoogleTtsFree)(params) + self.audiolator = builtin_tts_engines.get(self.engineName, GoogleWebTTSFree)(params) - #翻译文本 + #语音化文本,注意文本不要太长,一般几百个字符以内 #data: 文本/字典/列表 {'text': text, ...}, [{},{}] - #返回:{'mime':, 'audiofied': , 'text':, ..., 'error':,} + #返回:{'mime':, 'audio': , 'text':, ..., 'error':,} #如果输入是列表,返回也是列表,否则返回字典 def audiofy_text(self, data): retList = True @@ -39,14 +39,14 @@ def audiofy_text(self, data): for idx, item in enumerate(data): text = item['text'] item['error'] = '' - item['audiofied'] = b'' + item['audio'] = '' item['mime'] = '' if text: - if 1: - item['mime'], item['audiofied'] = self.audiolator.tts(text) - #except Exception as e: - #default_log.warning('audiofy_text failed: ' + str(e)) - #item['error'] = str(e) + try: + item['mime'], item['audio'] = self.audiolator.tts(text) + except Exception as e: + default_log.warning('audiofy_text failed: ' + str(e)) + item['error'] = str(e) else: item['error'] = _('The input text is empty') ret.append(item) @@ -58,23 +58,35 @@ def audiofy_text(self, data): elif ret: return ret[0] else: - return {'error': 'unknown error', 'audiofied': b'', 'mime':'', 'text': ''} + return {'error': 'unknown error', 'audio': '', 'mime':'', 'text': ''} - #语音化BeautifulSoup实例,返回 {'error':, 'audiofied':, 'mime':, 'text':} + #语音化BeautifulSoup实例,返回 {'error':, 'mime':, 'audio':[], 'texts':[]} def audiofy_soup(self, soup): - text = self.extract_soup_text(soup) - ret = {'text': text, 'error': '', 'audiofied': b'', 'mime': ''} - if text: - if 1: - ret['mime'], ret['audiofied'] = self.audiolator.tts(text) - #except Exception as e: - #default_log.warning('audiofy_text failed: ' + str(e)) - ret['error'] = str(e) - else: + ret = {'error': '', 'audios': [], 'mime': '', 'texts':[]} + texts = self.extract_soup_text(soup) + if not texts: ret['error'] = _('The input text is empty') + return ret + + try: + title = soup.find('title').string + except: + title = 'Untitled' + + for text in self.split_strings(texts, self.audiolator.max_len_per_request): + try: + mime, audio = self.audiolator.tts(text) + ret['mime'] = ret['mime'] or mime + if audio: + ret['texts'].append(text) + ret['audios'].append(audio) + else: + default_log.warning(f'audiofy_soup got empty audio for "{title}": {text[:30]}') + except Exception as e: + ret['error'] = str(e) return ret - #提取soup适合语音化的文本,直接返回文本内容 + #提取soup适合语音化的文本,返回文本内容列表 def extract_soup_text(self, soup): texts = [] @@ -100,4 +112,53 @@ def _extract(tag): else: _extract(child) _extract(soup.body) - return '\n'.join(texts) + return texts + + #将字符串数组合并或拆分重组为每个字符串不超过max_len的新数组 + def split_strings(self, strings, max_len): + step1 = [] + current = [] + currLen = 0 + for text in strings: #第一步,先合并短字符串 + thisLen = len(text) + if current and (currLen + thisLen + 1 >= max_len): + step1.append(' '.join(current)) + current = [text] + currLen = thisLen + else: + current.append(text) + currLen += thisLen + 1 + + if current: + step1.append(' '.join(current)) + + #第二步,拆分超长字符串 + result = [] + for item in step1: + if len(item) > max_len + 1: #拆分 + subItems = [] + for line in item.split('\n'): #按照回车进行分割 + if len(line) > max_len: + #再按照空格进行分割 + words = line.split() + current_line = '' + current = [] + currLen = 0 + for word in words: + thisLen = len(word) + if current and (currLen + thisLen + 1 >= max_len): + subItems.append(' '.join(current)) + current = [word] + currLen = thisLen + else: + current.append(word) + currLen += thisLen + 1 + if current: + subItems.append(' '.join(current)) + else: + subItems.append(line) + result.extend(subItems) + else: + result.append(item) + + return result diff --git a/application/lib/html5_parser.py b/application/lib/html5_parser.py new file mode 100644 index 00000000..79b49fb9 --- /dev/null +++ b/application/lib/html5_parser.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +#因为html5_parser不提供二进制安装包,所以KindleEar使用html5lib代替 +#为了让依赖html5_parser的recipe可以继续使用,使用此文件做桩 +from calibre.ebooks.oeb.polish.parsing import parse + diff --git a/application/lib/mechanize.py b/application/lib/mechanize.py new file mode 100644 index 00000000..23c18389 --- /dev/null +++ b/application/lib/mechanize.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +#mechanize 的兼容层,让其他recipe可以不修改就可以使用 +from urlopener import UrlOpener as Browser, Request + + diff --git a/application/lib/ssml_builder.py b/application/lib/ssml_builder.py new file mode 100644 index 00000000..2bd3d5c2 --- /dev/null +++ b/application/lib/ssml_builder.py @@ -0,0 +1,227 @@ +# -*- coding: utf-8 -*- +#https://github.com/Reverseblade/ssml-builder + +import re + + +class Speech: + + VALID_INTERPRET_AS = ('characters', 'spell-out', 'cardinal', 'number', + 'ordinal', 'digits', 'fraction', 'unit', 'date', + 'time', 'telephone', 'address', 'interjection', 'expletive') + + VALID_PROSODY_ATTRIBUTES = { + 'rate': ('x-slow', 'slow', 'medium', 'fast', 'x-fast'), + 'pitch': ('x-low', 'low', 'medium', 'high', 'x-high'), + 'volume': ('silent', 'x-soft', 'soft', 'medium', 'loud', 'x-loud') + } + + VALID_VOICE_NAMES = ('Ivy', 'Joanna', 'Joey', 'Justin', 'Kendra', 'Kimberly', + 'Matthew', 'Salli', 'Nicole', 'Russell', 'Amy', 'Brian', 'Emma', + 'Aditi', 'Raveena', 'Hans', 'Marlene', 'Vicki', 'Conchita', 'Enrique', + 'Carla', 'Giorgio', 'Mizuki', 'Takumi', 'Celine', 'Lea', 'Mathieu') + + VALID_EMPHASIS_LEVELS = ('strong', 'moderate', 'reduced') + + def __init__(self): + self.speech = "" + + def speak(self): + """ + + :return: + """ + return '{}'.format(self.speech) + + def add_text(self, value): + """ + add text + :return: + """ + self.speech += value + return self + + def say_as(self, value, interpret_as, is_nested=False): + """ + + :param value: + :param interpret_as: + :param is_nested: + :return: + """ + + if interpret_as not in self.VALID_INTERPRET_AS: + raise ValueError('The interpret-as provided to say_as is not valid') + + ssml = '' \ + '{value}'.format(interpret_as=interpret_as, value=value) + + if is_nested: + return ssml + + self.speech += ssml + return self + + def prosody(self, value, rate='medium', pitch='medium', volume='medium', is_nested=False): + """ + + :param value: + :param rate: + :param pitch: + :param volume: + :param is_nested: + :return: + """ + + if rate not in self.VALID_PROSODY_ATTRIBUTES['rate']: + if re.match(r'^\d+%$', rate) is None: + raise ValueError('The rate provided to prosody is not valid') + + if pitch not in self.VALID_PROSODY_ATTRIBUTES['pitch']: + if re.match(r'^(\+|\-)+\d+(\.\d+)*%$', pitch) is None: + raise ValueError('The pitch provided to prosody is not valid') + + if volume not in self.VALID_PROSODY_ATTRIBUTES['volume']: + raise ValueError('The volume provided to prosody is not valid') + + ssml = '' \ + '{value}'.format(rate=rate, pitch=pitch, volume=volume, value=value) + + if is_nested: + return ssml + + self.speech += ssml + return self + + def sub(self, value, alias, is_nested=False): + """ + + :param value: + :param alias: + :param is_nested: + :return: + """ + + ssml = '{}'.format(alias, value) + + if is_nested: + return ssml + + self.speech += ssml + return self + + def lang(self, value, lang, is_nested=False): + """ + + :param value: + :param lang: + :param is_nested: + :return: + """ + + ssml = '{}'.format(lang, value) + + if is_nested: + return ssml + + self.speech += ssml + return self + + def voice(self, value, name, is_nested=False): + """ + + :param value: + :param name: + :return: + """ + + #if name not in self.VALID_VOICE_NAMES: + # raise ValueError('The name provided to voice is not valid') + + ssml = '{}'.format(name, value) + + if is_nested: + return ssml + + self.speech += '{}'.format(name, value) + return self + + def pause(self, time, is_nested=False): + """ + + :param time: + :param is_nested: + :return: + """ + + ssml = ''.format(time) + + if is_nested: + return ssml + + self.speech += ssml + return self + + def whisper(self, value, is_nested=False): + """ + :param value: + :param is_nested: + :return: + """ + + ssml = '{}'.format(value) + + if is_nested: + return ssml + + self.speech += ssml + return self + + def audio(self, src, is_nested=False): + """ + :param src: + :param is_nested: + :return: + """ + + ssml = '