3.2.1

1. Add proxy feature 2. Add dsl dictionary feature
cdhigh · Nov 21, 2024 · 94070ab · 94070ab
1 parent e2d2c2d
commit 94070ab
Show file tree

Hide file tree

Showing 48 changed files with 647 additions and 202 deletions.
diff --git a/application/back_end/db_models.py b/application/back_end/db_models.py
@@ -5,7 +5,7 @@
 #Author: cdhigh <https://github.com/cdhigh>
 import os, random, datetime
 from operator import attrgetter
-from ..utils import PasswordManager, ke_encrypt, ke_decrypt, utcnow, compare_version
+from ..ke_utils import PasswordManager, ke_encrypt, ke_decrypt, utcnow, compare_version
 
 if os.getenv('DATABASE_URL', '').startswith(("datastore", "mongodb", "redis", "pickle")):
     from .db_models_nosql import *
@@ -46,7 +46,7 @@ def cfg(self, item, default=None):
             return {'email': '', 'kindle_email': '', 'secret_key': '', 'timezone': 0,
                 'inbound_email': 'save,forward', 'keep_in_email_days': 1,
                 'delivery_mode': 'email,local', 'webshelf_days': 7,
-                'reader_params': {}}.get(item, value)
+                'reader_params': {}, 'proxy': ''}.get(item, value)
         else:
             return value
     def set_cfg(self, item, value):

diff --git a/application/back_end/send_mail_adpt.py b/application/back_end/send_mail_adpt.py
@@ -6,7 +6,7 @@
 #https://cloud.google.com/appengine/docs/standard/python3/reference/services/bundled/google/appengine/api/mail
 #https://cloud.google.com/appengine/docs/standard/python3/services/mail
 import os, datetime, zipfile, base64
-from ..utils import str_to_bool, sanitize_filename
+from ..ke_utils import str_to_bool, sanitize_filename
 from ..base_handler import save_delivery_log
 
 #google.appengine will apply patch for os.env module
@@ -207,7 +207,7 @@ def mailjet_send_mail(apikey, secret_key, sender, to, subject, body, html=None,
 def save_mail_to_local(dest_dir, subject, body, attachments=None, html=None, **kwargs):
     attachments = attachments or []
     mailDir = os.path.join(appDir, dest_dir)
-    if not os.path.exists(mailDir):
+    if not os.path.isdir(mailDir):
         os.makedirs(mailDir)
 
     now = str(datetime.datetime.now().strftime('%H-%M-%S'))

diff --git a/application/back_end/task_queue_celery.py b/application/back_end/task_queue_celery.py
@@ -34,7 +34,7 @@ def __call__(self, *args, **kwargs):
         transport_opts = {'data_folder_in': dir_in, 'data_folder_out': dir_out, 'processed_folder': dir_procsed, 
             'store_processed': True}
         for d in [dir_, dir_in, dir_out, dir_procsed]:
-            if not os.path.exists(d):
+            if not os.path.isdir(d):
                 os.makedirs(d)
         broker_url = 'filesystem://'
 

diff --git a/application/utils.py → application/ke_utils.py b/application/utils.py → application/ke_utils.py
diff --git a/application/lib/build_ebook.py b/application/lib/build_ebook.py
@@ -9,7 +9,7 @@
 from calibre.web.feeds.recipes import compile_recipe
 from recipe_helper import GenerateRecipeSource
 from urlopener import UrlOpener
-from application.utils import loc_exc_pos
+from application.ke_utils import loc_exc_pos
 
 #从输入格式生成对应的输出格式
 #input_: 如果是recipe，为编译后的recipe(或列表)，或者是一个输入文件名，或一个BytesIO

diff --git a/application/lib/calibre/ebooks/conversion/plugins/recipe_input.py b/application/lib/calibre/ebooks/conversion/plugins/recipe_input.py
@@ -14,7 +14,7 @@
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.metadata.toc import TOC
-from application.utils import loc_exc_pos
+from application.ke_utils import loc_exc_pos
 
 class RecipeDisabled(Exception):
     pass

diff --git a/application/lib/calibre/ebooks/conversion/plumber.py b/application/lib/calibre/ebooks/conversion/plumber.py
@@ -25,7 +25,7 @@
 from polyglot.builtins import string_or_bytes
 
 from filesystem_dict import FsDictStub
-from application.utils import get_directory_size, loc_exc_pos
+from application.ke_utils import get_directory_size, loc_exc_pos
 from application.base_handler import save_delivery_log
 
 DEBUG_README=b'''

diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py
@@ -34,7 +34,7 @@
 from requests_file import LocalFileAdapter
 from filesystem_dict import FsDictStub
 from application.back_end.db_models import LastDelivered
-from application.utils import loc_exc_pos
+from application.ke_utils import loc_exc_pos
 
 MASTHEAD_SIZE = (600, 60)
 DEFAULT_MASTHEAD_IMAGE = 'mastheadImage.gif'

diff --git a/application/lib/calibre/web/fetch/simple.py b/application/lib/calibre/web/fetch/simple.py
@@ -32,7 +32,7 @@
     URLError, quote, url2pathname, urljoin, urlparse, urlsplit, urlunparse,
     urlunsplit, urlopen
 )
-from application.utils import loc_exc_pos
+from application.ke_utils import loc_exc_pos
 
 class AbortArticle(Exception):
     pass

diff --git a/application/lib/dictionary/__init__.py b/application/lib/dictionary/__init__.py
@@ -8,10 +8,11 @@
 from .oxford_learners import OxfordLearners
 from .stardict import StarDict
 from .mdict import MDict
+from .lingvo import LingvoDict
 
 all_dict_engines = {DictOrg.name: DictOrg, DictCn.name: DictCn, DictCc.name: DictCc,
     MerriamWebster.name: MerriamWebster, OxfordLearners.name: OxfordLearners,
-    StarDict.name: StarDict, MDict.name: MDict}
+    StarDict.name: StarDict, MDict.name: MDict, LingvoDict.name: LingvoDict}
 
 #创建一个词典实例
 def CreateDictInst(engine, database, host=None):

diff --git a/application/lib/dictionary/lingvo/__init__.py b/application/lib/dictionary/lingvo/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+from .lingvo_dict import LingvoDict
diff --git a/application/lib/dictionary/lingvo/dsl_reader.py b/application/lib/dictionary/lingvo/dsl_reader.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+#dsl离线词典支持，不支持dsl.dz，即使使用indexed_gzip还是慢，建议先解压为dsl再使用
+#Author: cdhigh <https://github.com/cdhigh>
+import os, re, logging, io
+import chardet
+
+try:
+    import marisa_trie
+except:
+    marisa_trie = None
+
+#外部接口
+class DslReader:
+    """Random-access reader for an uncompressed Lingvo ``.dsl`` dictionary.
+
+    Headwords are indexed in a ``marisa_trie.RecordTrie`` that is cached next
+    to the dictionary as ``<name>.trie``; the text encoding in use is cached
+    as ``<name>.enc``.
+    """
+    #Trie record layout: (position of the definition as returned by tell(),
+    #size of the definition block in characters)
+    TRIE_FMT = '>LH'
+    #Upper bound for the stored size so that it always fits into the 'H' field
+    MAX_MEAN_SIZE = 65000
+
+    def __init__(self, fileName):
+        """Open the dictionary, loading the cached trie or building a new one.
+
+        Args:
+            fileName: path of the .dsl file.
+
+        Raises:
+            ImportError: marisa_trie is not installed.
+            Any error raised while reading the dsl file or saving the trie.
+        """
+        self.log = logging.getLogger()
+        self.fileName = fileName
+        self.encoding = None
+        firstPart = os.path.splitext(fileName)[0]
+        self.trieFileName = firstPart + '.trie'
+        self.encFileName = firstPart + '.enc'
+        self.trie = None
+
+        #Fail early with a clear message instead of an AttributeError on None
+        #after the whole dictionary has already been parsed
+        if marisa_trie is None:
+            raise ImportError('marisa_trie is required to use dsl dictionaries')
+
+        #Reuse the encoding remembered by a previous run
+        if os.path.isfile(self.encFileName):
+            with open(self.encFileName, 'r', encoding='utf-8') as f:
+                self.encoding = f.read().strip()
+
+        if os.path.isfile(self.trieFileName):
+            try:
+                trie = marisa_trie.RecordTrie(self.TRIE_FMT) #type:ignore
+                trie.load(self.trieFileName)
+                self.trie = trie
+            except Exception as e:
+                self.trie = None
+                self.log.warning(f'Failed to load dsldict trie data: {fileName}: {e}')
+
+        if self.trie:
+            return
+
+        #No usable cache: parse the dictionary and build the prefix tree
+        self.log.info(f"Building trie for {fileName}")
+        self.buildTrie()
+
+    def buildTrie(self):
+        """Scan the dsl file, index every headword and save the trie to disk.
+
+        Lines starting at column 0 are headwords, indented lines are the
+        definition block. Consecutive headwords share the definition block
+        that follows them. All records are collected in memory to keep the
+        code simple.
+        """
+        records = []
+        pendingWords = [] #headwords still waiting for their definition block
+        meanStart = None
+        meanWordCnt = 0
+        hasBody = False
+
+        with self.openDslFile() as f:
+            #readline() is used on purpose: tell() is disabled while iterating
+            #over a text file with "for line in f"
+            for line in iter(f.readline, ''):
+                if line.startswith(('#', r'{{', '\n', '\r')):
+                    #directives, comments and blank lines still take up room
+                    #between the start position and the end of the block
+                    meanWordCnt += len(line)
+                    continue
+
+                if line.startswith((' ', '\t')): #indented: definition block
+                    meanWordCnt += len(line)
+                    hasBody = True
+                    continue
+
+                word = line.strip()
+                if not word:
+                    meanWordCnt += len(line)
+                    continue
+
+                #A headword after a definition block starts a new card
+                if pendingWords and hasBody:
+                    size = min(meanWordCnt, self.MAX_MEAN_SIZE)
+                    records.extend((wd, (meanStart, size)) for wd in pendingWords)
+                    pendingWords = []
+
+                pendingWords.append(word)
+                meanStart = f.tell() #tell() is slow, call it for headwords only
+                meanWordCnt = 0
+                hasBody = False
+
+        if pendingWords: #last card of the file
+            size = min(meanWordCnt, self.MAX_MEAN_SIZE)
+            records.extend((wd, (meanStart, size)) for wd in pendingWords)
+
+        trie = marisa_trie.RecordTrie(self.TRIE_FMT, records) #type:ignore
+        trie.save(self.trieFileName)
+        #Drop the in-memory build data and use the saved file from now on
+        del records
+        del trie
+        self.trie = marisa_trie.RecordTrie(self.TRIE_FMT) #type:ignore
+        self.trie.load(self.trieFileName)
+
+    def openDslFile(self):
+        """Open the dsl file in text mode and return the file object.
+
+        When the encoding is not known yet it is detected (many dictionaries
+        do not use unicode as the official format requires), stored in
+        self.encoding and written to the .enc file.
+        """
+        if not self.encoding:
+            import chardet
+            with open(self.fileName, 'rb') as f:
+                data = f.read(10000)
+            ret = chardet.detect(data)
+            encoding = ret['encoding'] if ret['confidence'] >= 0.8 else None
+
+            #Detection was not confident enough, probe a few candidates
+            if not encoding:
+                for enc in ['utf-16', 'utf-16-le', 'windows-1252']:
+                    try:
+                        with open(self.fileName, 'r', encoding=enc) as f:
+                            f.readline()
+                        encoding = enc
+                        break
+                    except UnicodeError:
+                        pass
+
+            self.encoding = (encoding or 'utf-16').lower()
+            with open(self.encFileName, 'w', encoding='utf-8') as fEnc:
+                fEnc.write(self.encoding)
+
+        return open(self.fileName, 'r', encoding=self.encoding)
+
+    def get(self, word, default=''): #type:ignore
+        """Look up a word and return its definition as html.
+
+        The word is tried as given, then lowercased, then capitalized.
+        Returns default when none of the variants is in the dictionary.
+        """
+        for wd in (word, word.lower(), word.capitalize()):
+            if wd in self.trie:
+                break
+        else:
+            return default
+
+        start, size = self.trie[wd][0]
+        with self.openDslFile() as f:
+            f.seek(start)
+            lines = f.read(size).splitlines()
+        #Keep the indented definition lines only
+        mean = '\n'.join(line for line in lines if line.startswith((' ', '\t')))
+        return self.dslMeanToHtml(mean)
+
+    def dslMeanToHtml(self, mean):
+        """Convert the raw dsl markup of a definition to an html fragment."""
+        simpleTags = {"[']": '<u>', "[/']": '</u>', '[b]': '<b>', '[/b]': '</b>', '[i]': '<i>', 
+            '[/i]': '</i>', '[u]': '<u>', '[/u]': '</u>',  '[sub]': '<sub>', '[/sub]': '</sub>',
+            '[sup]': '<sup>', '[/sup]': '</sup>', '[/c]': '</span>', '@': '<br/>', '\t': '',
+            '[*]': '<span>', '[/*]': '</span>', '\n': '<br/>',
+            '[ex]': '<span style="color:#808080">', '[/ex]': '</span>',
+            '[p]': '<i style="color:#008000">', '[/p]': '</i>',
+            '[url]': '<span style="color:#0000ff;text-decoration:underline">', '[/url]': '</span>',
+            '[ref]': '<span style="color:#0000ff;text-decoration:underline">', '[/ref]': '</span>',}
+        removeTags = ['[/m]', '[com]', '[/com]', '[trn]', '[/trn]', '[trs]',
+            '[/trs]', '[!trn]', '[/!trn]', '[!trs]', '[/!trs]', '[/lang]']
+
+        #Escaped brackets are literal text. Hide them behind private-use
+        #characters until all tags are processed, otherwise text such as
+        #\[cat\] would be unescaped to [cat] and then taken for a [c...] tag
+        leftBracket, rightBracket = '\ue000', '\ue001'
+        mean = mean.replace('\\[', leftBracket).replace('\\]', rightBracket)
+
+        for tag, repl in simpleTags.items():
+            mean = mean.replace(tag, repl)
+        for tag in removeTags:
+            mean = mean.replace(tag, '')
+
+        #[mN] is an indentation level, emit N non-breaking spaces for it
+        mean = re.sub(r'\[m\d+?\]', lambda match: '&nbsp;' * int(match.group(0)[2:-1]), mean)
+        mean = re.sub(r'\[c.*?\]', '<span style="color:#006400">', mean)
+        mean = re.sub(r'\[lang.*?\]', '', mean)
+        mean = re.sub(r'\[s\].*?\[/s\]', '', mean)
+        #Browsers do not support the entry:// scheme and block such links,
+        #so cross references use an https url instead
+        mean = re.sub(r'<<(.*?)>>', r'<a href="https://kindleear/entry/\1">\1</a>', mean)
+        return mean.replace(leftBracket, '[').replace(rightBracket, ']')
diff --git a/application/lib/dictionary/lingvo/lingvo_dict.py b/application/lib/dictionary/lingvo/lingvo_dict.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+#lingvo dsl 离线词典支持
+#Author: cdhigh <https://github.com/cdhigh>
+import os, re
+from application.ke_utils import loc_exc_pos
+from .dsl_reader import DslReader
+
+#获取本地的dsl文件列表，只有列表，没有校验是否有效
+def getDslDictList():
+    """Return the local .dsl files found below the DICTIONARY_DIR directory.
+
+    The result maps the full path of each file to the dictionary name (the
+    file name without extension), the same layout the other dictionary
+    engines use for the user interface. The files are only listed, they are
+    not checked for validity. An unset or missing directory gives {}.
+    """
+    dictDir = os.environ.get('DICTIONARY_DIR')
+    if not (dictDir and os.path.isdir(dictDir)):
+        return {}
+
+    return {os.path.join(root, name): os.path.splitext(name)[0]
+        for root, _dirs, names in os.walk(dictDir)
+        for name in names
+        if name.lower().endswith('.dsl')}
+
+class LingvoDict:
+    """Dictionary engine backed by local Lingvo .dsl files."""
+    name = "lingvo"
+    #Available dictionaries: key is the full path of the .dsl file, value is the dictionary name
+    databases = getDslDictList()
+
+    @classmethod
+    def refresh(cls):
+        """Rescan the dictionary directory and update the dictionary list."""
+        cls.databases = getDslDictList()
+
+    def __init__(self, database='', host=None):
+        """Open one of the listed dictionaries.
+
+        Failures do not raise: the message is logged and kept in
+        self.initError, and definition() returns it instead of a result.
+
+        Args:
+            database: full path of the .dsl file, a key of self.databases.
+            host: not used by this engine.
+        """
+        self.database = database
+        self.dictionary = None
+        self.initError = None
+        if database in self.databases:
+            try:
+                self.dictionary = DslReader(database)
+            except Exception:
+                self.initError = loc_exc_pos(f'Init LingvoDict failed: {self.databases[database]}')
+                default_log.warning(self.initError)
+        else:
+            #database is not a key of self.databases in this branch, so it
+            #cannot be looked up there; report the requested value itself
+            self.initError = f'Dict not found: {database}'
+            default_log.warning(self.initError)
+
+    def __repr__(self):
+        """Return the engine name followed by the dictionary in use."""
+        return '{} [{}]'.format(self.name, self.databases.get(self.database, ''))
+
+    def definition(self, word, language=''):
+        """Return the definition of word, or the initialization error text."""
+        return self.initError if self.initError else self.dictionary.get(word)
diff --git a/application/lib/dictionary/mdict/mdict.py b/application/lib/dictionary/mdict/mdict.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding:utf-8 -*-
 #mdx离线词典接口
+#Author: cdhigh <https://github.com/cdhigh>
 import os
 from bs4 import BeautifulSoup
-from application.utils import xml_escape
+from application.ke_utils import xml_escape, loc_exc_pos
 from .readmdict import MDX
 try:
     import marisa_trie
@@ -38,20 +39,25 @@ def refresh(cls):
     def __init__(self, database='', host=None):
+        #Open one of the listed mdx dictionaries. Failures do not raise: the
+        #message is logged and kept in self.initError for definition()
         self.database = database
         self.dictionary = None
+        self.initError = None
         if database in self.databases:
             try:
                 self.dictionary = IndexedMdx(database)
-            except Exception as e:
-                default_log.warning(f'Instantiate mdict failed: {self.databases[database]}: {e}')
+            except Exception:
+                self.initError = loc_exc_pos(f'Init mdict failed: {self.databases[database]}')
+                default_log.warning(self.initError)
         else:
-            default_log.warning(f'dict not found: {self.databases[database]}')
+            #database is not a key of self.databases in this branch, so it
+            #cannot be looked up there; report the requested value itself
+            self.initError = f'Dict not found: {database}'
+            default_log.warning(self.initError)
 
     #返回当前使用的词典名字
     def __repr__(self):
+        #'mdict [<name>]', the name is empty when the database is not in the list
         return 'mdict [{}]'.format(self.databases.get(self.database, ''))
 
     def definition(self, word, language=''):
-        return self.dictionary.get(word) if self.dictionary else ''
+        #Return the initialization error text when the dictionary failed to
+        #load, otherwise the definition of word
+        return self.initError if self.initError else self.dictionary.get(word)
 
 #经过词典树缓存的Mdx
 class IndexedMdx:
@@ -94,18 +100,23 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None):
     def get(self, word):
+        #Look up a word and return its content, '' when the trie is not
+        #loaded or the word is unknown. The word is tried as given, then
+        #lowercased, then capitalized
         if not self.trie:
             return ''
-        word = word.lower().strip()
+
-        #和mdict官方应用一样，输入:about返回词典基本信息
+        #Same as the official mdict application: ':about' returns the dictionary information
         if word == ':about':
             return self.dict_html_info()
 
-        indexes = self.trie[word] if word in self.trie else None
+        for wd in [word, word.lower(), word.capitalize()]:
+            if wd in self.trie:
+                #index with the variant that matched, the raw input may be missing
+                indexes = self.trie[wd]
+                break
+        else:
+            return ''
+
         ret = self.get_content_by_Index(indexes)
         if ret.startswith('@@@LINK='):
+            #the entry redirects to another headword, follow it once
             word = ret[8:].strip()
-            if word:
-                indexes = self.trie[word] if word in self.trie else None
-                ret = self.get_content_by_Index(indexes)
+            if word and word in self.trie:
+                ret = self.get_content_by_Index(self.trie[word])
         return ret
 
     def __contains__(self, word) -> bool:

diff --git a/application/lib/dictionary/stardict/__init__.py b/application/lib/dictionary/stardict/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+from .stardict import StarDict