porting to python3
cdhigh committed Apr 3, 2024
1 parent 53e6cc9 commit a2fc2c5
Showing 18 changed files with 385 additions and 244 deletions.
3 changes: 2 additions & 1 deletion application/back_end/db_models.py
@@ -220,7 +220,8 @@ class SharedRssCategory(MyBaseModel):
 
 class LastDelivered(MyBaseModel):
     user = CharField()
-    bookname = CharField()
+    bookname = CharField(default='')
+    url = CharField(default='')
     num = IntegerField(default=0)
     record = CharField(default='')
     datetime = DateTimeField(default=datetime.datetime.utcnow)
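For context: delivery history for url-based recipes is now keyed by article url instead of bookname. A minimal sketch of the lookup/update cycle this enables, mirroring the parse_feeds logic added later in this commit (already_delivered is a hypothetical helper, not part of the commit):

    #Hypothetical helper illustrating the dedup cycle enabled by the new url field
    import datetime
    from application.back_end.db_models import LastDelivered

    def already_delivered(user_name, url):
        last = LastDelivered.get_or_none(user=user_name, url=url)
        if last: #seen before: refresh the delivery timestamp and report a duplicate
            last.datetime = datetime.datetime.utcnow()
            last.save()
            return True
        LastDelivered.create(user=user_name, url=url) #first delivery of this url
        return False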
147 changes: 133 additions & 14 deletions application/lib/calibre/web/feeds/news.py
@@ -8,14 +8,9 @@
 __docformat__ = "restructuredtext en"
 
 
-import io
-import os
-import re
-import sys
-import time
-import traceback
+import io, os, re, sys, time, datetime, traceback
 from collections import defaultdict, namedtuple
-from urllib.parse import urlparse, urlsplit, quote
+from urllib.parse import urlparse, urlsplit, quote, urljoin
 from urllib.error import HTTPError, URLError
 from calibre import __appname__, as_unicode, force_unicode, iswindows, preferred_encoding, strftime
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, CData, NavigableString, Tag
@@ -39,6 +34,7 @@
 from urlopener import UrlOpener
 from requests_file import LocalFileAdapter
 from filesystem_dict import FsDictStub
+from application.back_end.db_models import LastDelivered
 
 MASTHEAD_SIZE = (600, 60)
 DEFAULT_MASTHEAD_IMAGE = 'mastheadImage.gif'
@@ -925,6 +921,7 @@ def __init__(self, options, log, output_dir, fs, feed_index_start=0):
             self.title = str(self.title, 'utf-8', 'replace')
 
         self.options = options
+        self.user = self.options.user
         self.debug = options.verbose > 1
         self.output_dir = output_dir
         self.fs = fs
@@ -1045,7 +1042,7 @@ def _postprocess_html(self, soup, first_fetch, job_info):
                 del img['srcset']
 
         #If needed, strip hyperlinks from the body (marking them with italics and underline) to avoid accidental taps
-        remove_hyperlinks = self.options.user.remove_hyperlinks
+        remove_hyperlinks = self.user.remove_hyperlinks
         if remove_hyperlinks in ('text', 'all'):
             for a_ in soup.find_all('a'):
                 a_.name = 'i'
@@ -1099,18 +1096,17 @@ def append_share_links(self, soup, url):
         if not soup:
             return
 
-        user = self.options.user
-        shareLinks = user.share_links
+        shareLinks = self.user.share_links
         aTags = []
         for type_ in ['Evernote', 'Wiz', 'Pocket', 'Instapaper']:
             if shareLinks.get(type_, {}).get('enable'):
-                ashare = soup.new_tag('a', href=self.make_share_link(type_, user, url, soup))
+                ashare = soup.new_tag('a', href=self.make_share_link(type_, self.user, url, soup))
                 ashare.string = _('Save to {}').format(type_)
                 aTags.append(ashare)
 
         for type_ in ['Weibo', 'Facebook', 'X', 'Tumblr']:
             if shareLinks.get(type_):
-                ashare = soup.new_tag('a', href=self.make_share_link(type_, user, url, soup))
+                ashare = soup.new_tag('a', href=self.make_share_link(type_, self.user, url, soup))
                 ashare.string = _('Share on {}').format(type_)
                 aTags.append(ashare)
 
@@ -1284,8 +1280,8 @@ def _fetch_article(self, job_info, preloaded=None):
         if preloaded is not None:
             fetcher.preloaded_urls[url] = preloaded
 
-        #res is an html filename corresponding to the url
-        res = fetcher.start_fetch(url)
+        res = fetcher.start_fetch(url) #res is an html filename corresponding to the url
+
         path = fetcher.downloaded_paths
         failures = fetcher.failed_links
         if not res or not self.fs.exists(res):
@@ -2053,6 +2049,129 @@ def parse_feeds(self):
         return [feed]
 
 
+#The saved url is a web page url: given a set of rules, article links are extracted from the page, one article per link
+#Typically used for news sites: even if a site provides no RSS feed, its news can still be fetched every day
+#This class is best used together with the KindleEar chrome extension, which can generate the scraping script automatically
+class WebPageUrlNewsRecipe(BasicNewsRecipe):
+    max_articles_per_feed = 30
+
+    #A two-dimensional list that can hold several tag rules; each rule is flexible, any valid BeautifulSoup rule works (a dict or a CSS selector string)
+    #Each top-level element is a list of html tag search rules, going from parent node to child node, down to the last element
+    #The last element must be a link, or contain a link among its children; that link is the final article link and its text is the article title
+    #For example: url_extract_rules = [[{'name': 'div', 'attrs': {'class': 'art', 'data': True}}, {'name': 'a'}],]
+    #Or: url_extract_rules = [['div.art[data]', 'a'],]
+    url_extract_rules = []
+
+    #Same format as url_extract_rules, used to extract the article body from the article page; if empty, automatic extraction is used
+    content_extract_rules = []
+
+    #Returns a list of Feed instances
+    def parse_feeds(self):
+        main_urls = self.get_feeds()
+        if not main_urls:
+            self.log.warning(f'There are no urls in "{self.title}"')
+            return []
+
+        feeds = []
+        id_counter = 0
+        added = set()
+        for obj in main_urls:
+            main_title, main_url = (self.title, obj) if isinstance(obj, str) else obj
+            feed = Feed()
+            feed.title = main_title
+            feed.description = ''
+            feed.image_url = None
+            feed.oldest_article = self.oldest_article
+            feed.articles = []
+            now = time.gmtime()
+
+            for title, url in self.extract_urls(main_title, main_url):
+                if len(feed) >= self.max_articles_per_feed:
+                    break
+                if url in added: #avoid adding duplicates
+                    continue
+
+                added.add(url)
+                lastTime = LastDelivered.get_or_none(user=self.user.name, url=url)
+                delta = (datetime.datetime.utcnow() - lastTime.datetime) if lastTime else None
+                #oldest_article differs from other recipes here: it is the interval within which an article will not be delivered again
+                if ((not lastTime) or (not self.oldest_article) or
+                    (delta.days * 24 * 3600 + delta.seconds > 24 * 3600 * self.oldest_article)):
+                    id_counter += 1
+                    feed.articles.append(Article(f'internal id#{id_counter}', title, url, 'KindleEar', '', now, ''))
+
+                    if lastTime:
+                        lastTime.datetime = datetime.datetime.utcnow()
+                        lastTime.save()
+                    else:
+                        LastDelivered.create(user=self.user.name, url=url)
+                else:
+                    self.log.debug(f'Skipping article {title}({url}) as it is too old.')
+
+            feed.id_counter = id_counter
+            if len(feed) > 0:
+                feeds.append(feed)
+
+        return feeds
+
+    #Find all tags matching a rule inside a soup object
+    def _soup_find_all(self, tag, rule):
+        return tag.find_all(**rule) if isinstance(rule, dict) else tag.select(rule)
+
+    #Extract article links from a web page according to the given rules
+    def extract_urls(self, main_title, main_url):
+        resp = self.browser.open(main_url, timeout=self.timeout)
+        if resp.status_code != 200:
+            self.log.warning(f'Failed to fetch {main_url}: {UrlOpener.CodeMap(resp.status_code)}')
+            return []
+
+        soup = BeautifulSoup(resp.text, 'lxml')
+
+        articles = []
+        for rule in self.url_extract_rules:
+            resultTags = self._soup_find_all(soup, rule[0])
+            for flt in rule[1:]:
+                resultTags = [self._soup_find_all(tag, flt) for tag in resultTags]
+                resultTags = [tag for sublist in resultTags for tag in sublist] #flatten the nested list
+
+            for item in resultTags:
+                #if the final tag is not a link, look for links among its children
+                item = item.find_all('a') if item.name.lower() != 'a' else [item]
+                for tag in item:
+                    title = ' '.join(tag.stripped_strings) or main_title
+                    url = tag.attrs.get('href', None)
+                    if url and not url.startswith('http'):
+                        url = urljoin(main_url, url)
+                    if title and url:
+                        articles.append((title, url))
+
+        self.log.debug(f'Found {len(articles)} articles in {self.title}\n')
+        self.log.debug(str(articles))
+        return articles
+
+    #Extract the article content; called after the article is downloaded and decoded to unicode, before it is turned into a DOM tree
+    def preprocess_raw_html(self, raw_html, url):
+        if self.auto_cleanup or not self.content_extract_rules: #automatic extraction by readability
+            return raw_html
+
+        soup = BeautifulSoup(raw_html, 'lxml')
+        oldBody = soup.find('body')
+        if not oldBody:
+            return raw_html
+
+        newBody = soup.new_tag('body')
+        for rule in self.content_extract_rules:
+            resultTags = self._soup_find_all(soup, rule[0])
+            for flt in rule[1:]:
+                resultTags = [self._soup_find_all(tag, flt) for tag in resultTags]
+                resultTags = [tag for sublist in resultTags for tag in sublist] #flatten the nested list
+
+            newBody.extend(resultTags)
+
+        oldBody.replace_with(newBody)
+        return str(soup)
+
+
 class CalibrePeriodical(BasicNewsRecipe):
 
     #: Set this to the slug for the calibre periodical
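For illustration, a hypothetical recipe built on the new WebPageUrlNewsRecipe class. The site, selectors, and class name below are invented; only the attribute names (feeds, url_extract_rules, content_extract_rules, oldest_article) come from the class added above:

    #Hypothetical subclass; example.com and all selectors are made up
    class DailyExampleNews(WebPageUrlNewsRecipe):
        title = 'Daily Example News'
        oldest_article = 1 #do not deliver the same link again within one day

        #front page(s) to scan; a plain url or a (title, url) tuple both work
        feeds = [('Headlines', 'https://www.example.com/news')]

        #walk from the headline container down to the <a> tag; its href becomes the article url, its text the title
        url_extract_rules = [['div.headline-list', 'a'],]

        #keep only the article body instead of relying on automatic extraction
        content_extract_rules = [['div.article-body'],]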
19 changes: 10 additions & 9 deletions application/lib/calibre/web/feeds/recipes/__init__.py
@@ -7,13 +7,13 @@
 '''
 import re, time, io
 from calibre.web.feeds.news import (BasicNewsRecipe, CustomIndexRecipe,
-    AutomaticNewsRecipe, UrlNewsRecipe, CalibrePeriodical)
+    AutomaticNewsRecipe, UrlNewsRecipe, CalibrePeriodical, WebPageUrlNewsRecipe)
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.utils.config import JSONConfig
 from polyglot.builtins import itervalues, codepoint_to_chr
 
 basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, UrlNewsRecipe, CustomIndexRecipe,
-    CalibrePeriodical)
+    CalibrePeriodical, WebPageUrlNewsRecipe)
 
 custom_recipes = JSONConfig('custom_recipes/index.json')
 
@@ -42,11 +42,13 @@ def compile_recipe(src):
     src = io.StringIO(src, newline=None).getvalue()
 
     namespace = {
-        'BasicNewsRecipe':BasicNewsRecipe,
-        'AutomaticNewsRecipe':AutomaticNewsRecipe,
-        'UrlNewsRecipe':UrlNewsRecipe,
-        'time':time, 're':re,
-        'BeautifulSoup':BeautifulSoup,
+        'BasicNewsRecipe': BasicNewsRecipe,
+        'AutomaticNewsRecipe': AutomaticNewsRecipe,
+        'UrlNewsRecipe': UrlNewsRecipe,
+        'WebPageUrlNewsRecipe': WebPageUrlNewsRecipe,
+        'time': time,
+        're': re,
+        'BeautifulSoup': BeautifulSoup,
         'unicode': str,
         'unichr': codepoint_to_chr,
         'xrange': range,
@@ -55,8 +57,7 @@ def compile_recipe(src):
     ua = namespace.get('calibre_most_common_ua')
 
     for x in itervalues(namespace):
-        if (isinstance(x, type) and issubclass(x, BasicNewsRecipe) and x not
-                in basic_recipes):
+        if (isinstance(x, type) and issubclass(x, BasicNewsRecipe) and x not in basic_recipes):
             x.calibre_most_common_ua = ua
             return x
 
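With the class exported in the compile_recipe namespace, uploaded recipe sources can now subclass it directly. A minimal sketch of how such a source compiles (the recipe body is illustrative):

    #Hypothetical: compile a recipe source string that uses the newly exported class
    src = (
        "class MyNews(WebPageUrlNewsRecipe):\n"
        "    title = 'My News'\n"
        "    feeds = ['https://www.example.com/news']\n"
        "    url_extract_rules = [['div.art[data]', 'a'],]\n"
    )
    recipe_class = compile_recipe(src) #returns the recipe class found in the compiled namespace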
2 changes: 1 addition & 1 deletion application/lib/smtp_mail.py
@@ -13,7 +13,7 @@ def smtp_send_mail(sender, to, subject, body, host, username, password, port=Non
     if ':' in host:
         host, port = host.split(':', 2)
         port = int(port)
-    else:
+    elif not port:
         port = 587 #587-TLS, 465-SSL, 25-No encryption
 
     to = to if isinstance(to, list) else [to]
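The elif matters: an explicit port argument is no longer overwritten when the host string carries no port. A standalone sketch of the resolution order (resolve_smtp_port is a hypothetical mirror of the lines above, for illustration only):

    #Hypothetical mirror of the host/port parsing above
    def resolve_smtp_port(host, port=None):
        if ':' in host: #a port embedded in the host wins
            host, port = host.split(':', 2)
            port = int(port)
        elif not port: #no port given anywhere: default to TLS submission
            port = 587
        return host, port

    assert resolve_smtp_port('smtp.example.com:465') == ('smtp.example.com', 465)
    assert resolve_smtp_port('smtp.example.com', 25) == ('smtp.example.com', 25)
    assert resolve_smtp_port('smtp.example.com') == ('smtp.example.com', 587)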
10 changes: 5 additions & 5 deletions application/static/base.js
Expand Up @@ -672,9 +672,11 @@ function OpenUploadRecipeDialog() {
modal.close();
//更新本地数据
delete data.status;
let language = data.language;
my_uploaded_recipes.unshift(data);
PopulateLibrary('');
ShowSimpleModalDialog('<h2>{0}</h2><p>{1}</p>'.format(i18n.congratulations, i18n.recipeUploadedTips));
ShowSimpleModalDialog('<h2>{0}</h2><p>{1}</p>'.format(i18n.congratulations,
i18n.recipeUploadedTips.format(LanguageName(language))));
} else if (data.status == i18n.loginRequired) {
window.location.href = '/login';
} else {
@@ -718,16 +720,14 @@ function DeleteUploadRecipe(id, title) {
 
 //insert the bookmarklet at the bottom of the page
 function insertBookmarkletGmailThis(subscribeUrl, mailPrefix) {
-    var parser = $('<a>', {
-        href: subscribeUrl
-    });
+    var parser = $('<a>', {href: subscribeUrl});
     var host = parser.prop('hostname');
     var length = host.length;
     var addr = '';
     if ((length > 12) && host.substr(length - 12, 12) == '.appspot.com') {
         addr = '{0}read@{1}.appspotmail.com'.format(mailPrefix, host.substr(0, length - 12));
     } else {
-        return;
+        addr = '{0}read@{1}'.format(mailPrefix, host);
     }
 
     var parent = $('#bookmarklet_content');
2 changes: 1 addition & 1 deletion application/templates/base.html
@@ -51,7 +51,7 @@
       chooseRecipeFile: '{{_("Choose a recipe file to upload")|safe}}',
       congratulations: '{{_("Congratulations")|safe}}',
       thanks: '{{_("Thanks")|safe}}',
-      recipeUploadedTips: '{{_("Your recipe has been uploaded, and it can be found in the Library section. If you dont see it, please make sure to switch to the correct language.")|safe}}',
+      recipeUploadedTips: '{{_("Your recipe has been uploaded, and it can be found in the Library section. If you dont see it, please make sure to switch to the correct language ({0}).")|safe}}',
       recipeDeleted: '{{_("Your recipe has been deleted.")|safe}}',
       kindleifySelection: '{{_("Kindleify Selection")|safe}}',
       verify: '{{_("Verify")|safe}}',
Binary file modified application/translations/tr_TR/LC_MESSAGES/messages.mo