porting to python3
cdhigh committed Apr 3, 2024
1 parent 53e6cc9 commit a2fc2c5
Showing 18 changed files with 385 additions and 244 deletions.
3 changes: 2 additions & 1 deletion application/back_end/db_models.py
@@ -220,7 +220,8 @@ class SharedRssCategory(MyBaseModel):
 
 class LastDelivered(MyBaseModel):
     user = CharField()
-    bookname = CharField()
+    bookname = CharField(default='')
+    url = CharField(default='')
     num = IntegerField(default=0)
     record = CharField(default='')
     datetime = DateTimeField(default=datetime.datetime.utcnow)
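For context: delivery history for url-based recipes is now keyed by article url instead of bookname. A minimal sketch of the lookup/update cycle this enables, mirroring the parse_feeds logic added later in this commit (already_delivered is a hypothetical helper, not part of the commit):

    #Hypothetical helper illustrating the dedup cycle enabled by the new url field
    import datetime
    from application.back_end.db_models import LastDelivered

    def already_delivered(user_name, url):
        last = LastDelivered.get_or_none(user=user_name, url=url)
        if last: #seen before: refresh the delivery timestamp and report a duplicate
            last.datetime = datetime.datetime.utcnow()
            last.save()
            return True
        LastDelivered.create(user=user_name, url=url) #first delivery of this url
        return False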
147 changes: 133 additions & 14 deletions application/lib/calibre/web/feeds/news.py
@@ -8,14 +8,9 @@
 __docformat__ = "restructuredtext en"
 
 
-import io
-import os
-import re
-import sys
-import time
-import traceback
+import io, os, re, sys, time, datetime, traceback
 from collections import defaultdict, namedtuple
-from urllib.parse import urlparse, urlsplit, quote
+from urllib.parse import urlparse, urlsplit, quote, urljoin
 from urllib.error import HTTPError, URLError
 from calibre import __appname__, as_unicode, force_unicode, iswindows, preferred_encoding, strftime
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, CData, NavigableString, Tag
@@ -39,6 +34,7 @@
 from urlopener import UrlOpener
 from requests_file import LocalFileAdapter
 from filesystem_dict import FsDictStub
+from application.back_end.db_models import LastDelivered
 
 MASTHEAD_SIZE = (600, 60)
 DEFAULT_MASTHEAD_IMAGE = 'mastheadImage.gif'
@@ -925,6 +921,7 @@ def __init__(self, options, log, output_dir, fs, feed_index_start=0):
             self.title = str(self.title, 'utf-8', 'replace')
 
         self.options = options
+        self.user = self.options.user
         self.debug = options.verbose > 1
         self.output_dir = output_dir
         self.fs = fs
@@ -1045,7 +1042,7 @@ def _postprocess_html(self, soup, first_fetch, job_info):
                 del img['srcset']
 
         #If needed, strip hyperlinks from the body (marking them with italics and underline) to avoid accidental taps
-        remove_hyperlinks = self.options.user.remove_hyperlinks
+        remove_hyperlinks = self.user.remove_hyperlinks
         if remove_hyperlinks in ('text', 'all'):
             for a_ in soup.find_all('a'):
                 a_.name = 'i'
@@ -1099,18 +1096,17 @@ def append_share_links(self, soup, url):
         if not soup:
             return
 
-        user = self.options.user
-        shareLinks = user.share_links
+        shareLinks = self.user.share_links
         aTags = []
         for type_ in ['Evernote', 'Wiz', 'Pocket', 'Instapaper']:
             if shareLinks.get(type_, {}).get('enable'):
-                ashare = soup.new_tag('a', href=self.make_share_link(type_, user, url, soup))
+                ashare = soup.new_tag('a', href=self.make_share_link(type_, self.user, url, soup))
                 ashare.string = _('Save to {}').format(type_)
                 aTags.append(ashare)
 
         for type_ in ['Weibo', 'Facebook', 'X', 'Tumblr']:
             if shareLinks.get(type_):
-                ashare = soup.new_tag('a', href=self.make_share_link(type_, user, url, soup))
+                ashare = soup.new_tag('a', href=self.make_share_link(type_, self.user, url, soup))
                 ashare.string = _('Share on {}').format(type_)
                 aTags.append(ashare)
 
@@ -1284,8 +1280,8 @@ def _fetch_article(self, job_info, preloaded=None):
         if preloaded is not None:
             fetcher.preloaded_urls[url] = preloaded
 
-        #res is an html filename corresponding to the url
-        res = fetcher.start_fetch(url)
+        res = fetcher.start_fetch(url) #res is an html filename corresponding to the url
+
         path = fetcher.downloaded_paths
         failures = fetcher.failed_links
         if not res or not self.fs.exists(res):
@@ -2053,6 +2049,129 @@ def parse_feeds(self):
         return [feed]
 
 
+#The saved url is a web page url: given a set of rules, article links are extracted from the page, one article per link
+#Typically used for news sites: even if a site provides no RSS feed, its news can still be fetched every day
+#This class is best used together with the KindleEar chrome extension, which can generate the scraping script automatically
+class WebPageUrlNewsRecipe(BasicNewsRecipe):
+    max_articles_per_feed = 30
+
+    #A two-dimensional list that can hold several tag rules; each rule is flexible, any valid BeautifulSoup rule works (a dict or a CSS selector string)
+    #Each top-level element is a list of html tag search rules, going from parent node to child node, down to the last element
+    #The last element must be a link, or contain a link among its children; that link is the final article link and its text is the article title
+    #For example: url_extract_rules = [[{'name': 'div', 'attrs': {'class': 'art', 'data': True}}, {'name': 'a'}],]
+    #Or: url_extract_rules = [['div.art[data]', 'a'],]
+    url_extract_rules = []
+
+    #Same format as url_extract_rules, used to extract the article body from the article page; if empty, automatic extraction is used
+    content_extract_rules = []
+
+    #Returns a list of Feed instances
+    def parse_feeds(self):
+        main_urls = self.get_feeds()
+        if not main_urls:
+            self.log.warning(f'There are no urls in "{self.title}"')
+            return []
+
+        feeds = []
+        id_counter = 0
+        added = set()
+        for obj in main_urls:
+            main_title, main_url = (self.title, obj) if isinstance(obj, str) else obj
+            feed = Feed()
+            feed.title = main_title
+            feed.description = ''
+            feed.image_url = None
+            feed.oldest_article = self.oldest_article
+            feed.articles = []
+            now = time.gmtime()
+
+            for title, url in self.extract_urls(main_title, main_url):
+                if len(feed) >= self.max_articles_per_feed:
+                    break
+                if url in added: #avoid adding duplicates
+                    continue
+
+                added.add(url)
+                lastTime = LastDelivered.get_or_none(user=self.user.name, url=url)
+                delta = (datetime.datetime.utcnow() - lastTime.datetime) if lastTime else None
+                #oldest_article differs from other recipes here: it is the interval within which an article will not be delivered again
+                if ((not lastTime) or (not self.oldest_article) or
+                    (delta.days * 24 * 3600 + delta.seconds > 24 * 3600 * self.oldest_article)):
+                    id_counter += 1
+                    feed.articles.append(Article(f'internal id#{id_counter}', title, url, 'KindleEar', '', now, ''))
+
+                    if lastTime:
+                        lastTime.datetime = datetime.datetime.utcnow()
+                        lastTime.save()
+                    else:
+                        LastDelivered.create(user=self.user.name, url=url)
+                else:
+                    self.log.debug(f'Skipping article {title}({url}) as it is too old.')
+
+            feed.id_counter = id_counter
+            if len(feed) > 0:
+                feeds.append(feed)
+
+        return feeds
+
+    #Find all tags matching a rule inside a soup object
+    def _soup_find_all(self, tag, rule):
+        return tag.find_all(**rule) if isinstance(rule, dict) else tag.select(rule)
+
+    #Extract article links from a web page according to the given rules
+    def extract_urls(self, main_title, main_url):
+        resp = self.browser.open(main_url, timeout=self.timeout)
+        if resp.status_code != 200:
+            self.log.warning(f'Failed to fetch {main_url}: {UrlOpener.CodeMap(resp.status_code)}')
+            return []
+
+        soup = BeautifulSoup(resp.text, 'lxml')
+
+        articles = []
+        for rule in self.url_extract_rules:
+            resultTags = self._soup_find_all(soup, rule[0])
+            for flt in rule[1:]:
+                resultTags = [self._soup_find_all(tag, flt) for tag in resultTags]
+                resultTags = [tag for sublist in resultTags for tag in sublist] #flatten the nested list
+
+            for item in resultTags:
+                #if the final tag is not a link, look for links among its children
+                item = item.find_all('a') if item.name.lower() != 'a' else [item]
+                for tag in item:
+                    title = ' '.join(tag.stripped_strings) or main_title
+                    url = tag.attrs.get('href', None)
+                    if url and not url.startswith('http'):
+                        url = urljoin(main_url, url)
+                    if title and url:
+                        articles.append((title, url))
+
+        self.log.debug(f'Found {len(articles)} articles in {self.title}\n')
+        self.log.debug(str(articles))
+        return articles
+
+    #Extract the article content; called after the article is downloaded and decoded to unicode, before it is turned into a DOM tree
+    def preprocess_raw_html(self, raw_html, url):
+        if self.auto_cleanup or not self.content_extract_rules: #automatic extraction by readability
+            return raw_html
+
+        soup = BeautifulSoup(raw_html, 'lxml')
+        oldBody = soup.find('body')
+        if not oldBody:
+            return raw_html
+
+        newBody = soup.new_tag('body')
+        for rule in self.content_extract_rules:
+            resultTags = self._soup_find_all(soup, rule[0])
+            for flt in rule[1:]:
+                resultTags = [self._soup_find_all(tag, flt) for tag in resultTags]
+                resultTags = [tag for sublist in resultTags for tag in sublist] #flatten the nested list
+
+            newBody.extend(resultTags)
+
+        oldBody.replace_with(newBody)
+        return str(soup)
+
+
 class CalibrePeriodical(BasicNewsRecipe):
 
     #: Set this to the slug for the calibre periodical
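For illustration, a hypothetical recipe built on the new WebPageUrlNewsRecipe class. The site, selectors, and class name below are invented; only the attribute names (feeds, url_extract_rules, content_extract_rules, oldest_article) come from the class added above:

    #Hypothetical subclass; example.com and all selectors are made up
    class DailyExampleNews(WebPageUrlNewsRecipe):
        title = 'Daily Example News'
        oldest_article = 1 #do not deliver the same link again within one day

        #front page(s) to scan; a plain url or a (title, url) tuple both work
        feeds = [('Headlines', 'https://www.example.com/news')]

        #walk from the headline container down to the <a> tag; its href becomes the article url, its text the title
        url_extract_rules = [['div.headline-list', 'a'],]

        #keep only the article body instead of relying on automatic extraction
        content_extract_rules = [['div.article-body'],]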
19 changes: 10 additions & 9 deletions application/lib/calibre/web/feeds/recipes/__init__.py
@@ -7,13 +7,13 @@
 '''
 import re, time, io
 from calibre.web.feeds.news import (BasicNewsRecipe, CustomIndexRecipe,
-    AutomaticNewsRecipe, UrlNewsRecipe, CalibrePeriodical)
+    AutomaticNewsRecipe, UrlNewsRecipe, CalibrePeriodical, WebPageUrlNewsRecipe)
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.utils.config import JSONConfig
 from polyglot.builtins import itervalues, codepoint_to_chr
 
 basic_recipes = (BasicNewsRecipe, AutomaticNewsRecipe, UrlNewsRecipe, CustomIndexRecipe,
-    CalibrePeriodical)
+    CalibrePeriodical, WebPageUrlNewsRecipe)
 
 custom_recipes = JSONConfig('custom_recipes/index.json')
 
@@ -42,11 +42,13 @@ def compile_recipe(src):
     src = io.StringIO(src, newline=None).getvalue()
 
     namespace = {
-        'BasicNewsRecipe':BasicNewsRecipe,
-        'AutomaticNewsRecipe':AutomaticNewsRecipe,
-        'UrlNewsRecipe':UrlNewsRecipe,
-        'time':time, 're':re,
-        'BeautifulSoup':BeautifulSoup,
+        'BasicNewsRecipe': BasicNewsRecipe,
+        'AutomaticNewsRecipe': AutomaticNewsRecipe,
+        'UrlNewsRecipe': UrlNewsRecipe,
+        'WebPageUrlNewsRecipe': WebPageUrlNewsRecipe,
+        'time': time,
+        're': re,
+        'BeautifulSoup': BeautifulSoup,
         'unicode': str,
         'unichr': codepoint_to_chr,
         'xrange': range,
@@ -55,8 +57,7 @@ def compile_recipe(src):
     ua = namespace.get('calibre_most_common_ua')
 
     for x in itervalues(namespace):
-        if (isinstance(x, type) and issubclass(x, BasicNewsRecipe) and x not
-                in basic_recipes):
+        if (isinstance(x, type) and issubclass(x, BasicNewsRecipe) and x not in basic_recipes):
             x.calibre_most_common_ua = ua
             return x
 
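With the class exported in the compile_recipe namespace, uploaded recipe sources can now subclass it directly. A minimal sketch of how such a source compiles (the recipe body is illustrative):

    #Hypothetical: compile a recipe source string that uses the newly exported class
    src = (
        "class MyNews(WebPageUrlNewsRecipe):\n"
        "    title = 'My News'\n"
        "    feeds = ['https://www.example.com/news']\n"
        "    url_extract_rules = [['div.art[data]', 'a'],]\n"
    )
    recipe_class = compile_recipe(src) #returns the recipe class found in the compiled namespace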
2 changes: 1 addition & 1 deletion application/lib/smtp_mail.py
@@ -13,7 +13,7 @@ def smtp_send_mail(sender, to, subject, body, host, username, password, port=Non
     if ':' in host:
         host, port = host.split(':', 2)
         port = int(port)
-    else:
+    elif not port:
         port = 587 #587-TLS, 465-SSL, 25-No encryption
 
     to = to if isinstance(to, list) else [to]
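The elif matters: an explicit port argument is no longer overwritten when the host string carries no port. A standalone sketch of the resolution order (resolve_smtp_port is a hypothetical mirror of the lines above, for illustration only):

    #Hypothetical mirror of the host/port parsing above
    def resolve_smtp_port(host, port=None):
        if ':' in host: #a port embedded in the host wins
            host, port = host.split(':', 2)
            port = int(port)
        elif not port: #no port given anywhere: default to TLS submission
            port = 587
        return host, port

    assert resolve_smtp_port('smtp.example.com:465') == ('smtp.example.com', 465)
    assert resolve_smtp_port('smtp.example.com', 25) == ('smtp.example.com', 25)
    assert resolve_smtp_port('smtp.example.com') == ('smtp.example.com', 587)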
10 changes: 5 additions & 5 deletions application/static/base.js
Expand Up @@ -672,9 +672,11 @@ function OpenUploadRecipeDialog() {
modal.close();
//更新本地数据
delete data.status;
let language = data.language;
my_uploaded_recipes.unshift(data);
PopulateLibrary('');
ShowSimpleModalDialog('<h2>{0}</h2><p>{1}</p>'.format(i18n.congratulations, i18n.recipeUploadedTips));
ShowSimpleModalDialog('<h2>{0}</h2><p>{1}</p>'.format(i18n.congratulations,
i18n.recipeUploadedTips.format(LanguageName(language))));
} else if (data.status == i18n.loginRequired) {
window.location.href = '/login';
} else {
@@ -718,16 +720,14 @@ function DeleteUploadRecipe(id, title) {
 
 //insert the bookmarklet at the bottom of the page
 function insertBookmarkletGmailThis(subscribeUrl, mailPrefix) {
-    var parser = $('<a>', {
-        href: subscribeUrl
-    });
+    var parser = $('<a>', {href: subscribeUrl});
     var host = parser.prop('hostname');
     var length = host.length;
     var addr = '';
     if ((length > 12) && host.substr(length - 12, 12) == '.appspot.com') {
         addr = '{0}read@{1}.appspotmail.com'.format(mailPrefix, host.substr(0, length - 12));
     } else {
-        return;
+        addr = '{0}read@{1}'.format(mailPrefix, host);
     }
 
     var parent = $('#bookmarklet_content');
2 changes: 1 addition & 1 deletion application/templates/base.html
@@ -51,7 +51,7 @@
       chooseRecipeFile: '{{_("Choose a recipe file to upload")|safe}}',
       congratulations: '{{_("Congratulations")|safe}}',
       thanks: '{{_("Thanks")|safe}}',
-      recipeUploadedTips: '{{_("Your recipe has been uploaded, and it can be found in the Library section. If you dont see it, please make sure to switch to the correct language.")|safe}}',
+      recipeUploadedTips: '{{_("Your recipe has been uploaded, and it can be found in the Library section. If you dont see it, please make sure to switch to the correct language ({0}).")|safe}}',
       recipeDeleted: '{{_("Your recipe has been deleted.")|safe}}',
       kindleifySelection: '{{_("Kindleify Selection")|safe}}',
       verify: '{{_("Verify")|safe}}',
Binary file modified application/translations/tr_TR/LC_MESSAGES/messages.mo