From 1ed3285b168177ec6e9611d38ae40a7445c5d520 Mon Sep 17 00:00:00 2001 From: cdhigh Date: Tue, 15 Oct 2024 08:55:04 -0300 Subject: [PATCH] sync some calibre's updates --- application/lib/calibre/ebooks/epub/pages.py | 5 +- .../lib/calibre/ebooks/epub/periodical.py | 5 +- .../lib/calibre/ebooks/html_entities.py | 113 ++++++- application/lib/calibre/library/__init__.py | 7 +- application/lib/calibre/library/comments.py | 13 +- application/lib/calibre/utils/formatter.py | 4 +- application/lib/calibre/utils/imghdr.py | 3 +- application/lib/calibre/utils/serialize.py | 4 +- application/lib/calibre/utils/short_uuid.py | 4 +- application/lib/calibre/utils/smartypants.py | 2 + application/lib/calibre/utils/speedups.py | 8 +- application/lib/calibre/utils/terminal.py | 10 +- application/lib/calibre/utils/threadpool.py | 7 +- application/lib/calibre/utils/xml_parse.py | 12 +- application/lib/calibre/utils/zipfile.py | 2 +- application/lib/calibre/web/feeds/news.py | 2 +- .../lib/calibre/web/site_parsers/natgeo.py | 181 ++++++++++ .../lib/calibre/web/site_parsers/nytimes.py | 311 ++++++++++++------ .../lib/ebook_translator/html_translator.py | 16 +- 19 files changed, 555 insertions(+), 154 deletions(-) create mode 100644 application/lib/calibre/web/site_parsers/natgeo.py diff --git a/application/lib/calibre/ebooks/epub/pages.py b/application/lib/calibre/ebooks/epub/pages.py index da06eef0..7f58aa72 100644 --- a/application/lib/calibre/ebooks/epub/pages.py +++ b/application/lib/calibre/ebooks/epub/pages.py @@ -9,10 +9,11 @@ import re from itertools import count -from calibre.ebooks.oeb.base import XHTML_NS -from calibre.ebooks.oeb.base import OEBBook + from lxml.etree import XPath +from calibre.ebooks.oeb.base import XHTML_NS, OEBBook + NSMAP = {'h': XHTML_NS, 'html': XHTML_NS, 'xhtml': XHTML_NS} PAGE_RE = re.compile(r'page', re.IGNORECASE) ROMAN_RE = re.compile(r'^[ivxlcdm]+$', re.IGNORECASE) diff --git a/application/lib/calibre/ebooks/epub/periodical.py b/application/lib/calibre/ebooks/epub/periodical.py index 8b2a6610..80db09a5 100644 --- a/application/lib/calibre/ebooks/epub/periodical.py +++ b/application/lib/calibre/ebooks/epub/periodical.py @@ -5,11 +5,12 @@ __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -from uuid import uuid4 import time +from uuid import uuid4 +from calibre import prepare_string_for_xml as xml +from calibre import strftime from calibre.constants import __appname__, __version__ -from calibre import strftime, prepare_string_for_xml as xml from calibre.utils.date import parse_date SONY_METADATA = '''\ diff --git a/application/lib/calibre/ebooks/html_entities.py b/application/lib/calibre/ebooks/html_entities.py index ca2dc2fd..6817ac03 100644 --- a/application/lib/calibre/ebooks/html_entities.py +++ b/application/lib/calibre/ebooks/html_entities.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # License: GPLv3 Copyright: 2017, Kovid Goyal +from calibre import my_unichr html5_entities = { # ENTITY_DATA {{{ @@ -91,7 +92,7 @@ 'DifferentialD': 'ⅆ', 'Dopf': '𝔻', 'Dot': '¨', - 'DotDot': '⃜\u20dc', + 'DotDot': '⃜', 'DotEqual': '≐', 'DoubleContourIntegral': '∯', 'DoubleDot': '¨', @@ -502,7 +503,7 @@ 'TRADE': '™', 'TSHcy': 'Ћ', 'TScy': 'Ц', - 'Tab': ' ', + 'Tab': '\t', 'Tau': 'Τ', 'Tcaron': 'Ť', 'Tcedil': 'Ţ', @@ -1105,6 +1106,7 @@ 'hearts': '♥', 'heartsuit': '♥', 'hellip': '…', + 'hellips': '…', 'hercon': '⊹', 'hfr': '𝔥', 'hksearow': '⤥', @@ -1857,6 +1859,7 @@ 'square': '□', 'squarf': '▪', 'squf': '▪', + 'squot': "'", 'srarr': '→', 'sscr': '𝓈', 'ssetmn': '∖', @@ -2133,19 +2136,107 @@ } -if __name__ == '__main__': +def entity_to_unicode_in_python(match, exceptions=(), encoding='cp1252', result_exceptions={}): + def check(ch): + return result_exceptions.get(ch, ch) + + ent = match.group(1) + if ent in exceptions: + return '&'+ent+';' + if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software + return check("'") + if ent == 'hellips': + ent = 'hellip' + if ent.startswith('#'): + try: + if ent[1] in ('x', 'X'): + num = int(ent[2:], 16) + else: + num = int(ent[1:]) + except: + return '&'+ent+';' + if encoding is None or num > 255: + return check(my_unichr(num)) + try: + return check(bytes(bytearray((num,))).decode(encoding)) + except UnicodeDecodeError: + return check(my_unichr(num)) + from calibre.ebooks.html_entities import html5_entities + try: + return check(html5_entities[ent]) + except KeyError: + pass + from polyglot.html_entities import name2codepoint + try: + return check(my_unichr(name2codepoint[ent])) + except KeyError: + return '&'+ent+';' + + +def find_tests(): + import unittest + class TestHTMLEntityReplacement(unittest.TestCase): + def test_html_entity_replacement(self): + from calibre_extensions.fast_html_entities import replace_all_entities + def t(inp, exp): + self.assertEqual(exp, replace_all_entities(inp), f'Failed for input: {inp!r}') + def x(inp, exp): + self.assertEqual(exp, replace_all_entities(inp, True), f'Failed for input: {inp!r}') + t('aӒb', 'aӒb') + t('', '') + t('a', 'a') + t('&', '&') + t('&', '&') + t('&', '&') + t('a&;b &#;c', 'a&;b &#;c') + t('<', '<') + t('&<', '&<') + t('a&b<c', 'a&b b\n\ncd', '

a b

cd

'), ]: - cval = comments_to_html(pat) - self.assertEqual(cval, val) + try: + cval = comments_to_html(pat) + except DeprecationWarning: + pass # new lxml + old Beautiful soup == deprecation warning + else: + self.assertEqual(cval, val) return unittest.defaultTestLoader.loadTestsFromTestCase(Test) diff --git a/application/lib/calibre/utils/formatter.py b/application/lib/calibre/utils/formatter.py index ab7934cc..8f09febd 100644 --- a/application/lib/calibre/utils/formatter.py +++ b/application/lib/calibre/utils/formatter.py @@ -20,9 +20,7 @@ from calibre.constants import DEBUG from calibre.ebooks.metadata.book.base import field_metadata from calibre.utils.config import tweaks -from calibre.utils.formatter_functions import ( - StoredObjectType, formatter_functions, function_object_type, get_database, -) +from calibre.utils.formatter_functions import StoredObjectType, formatter_functions, function_object_type, get_database from calibre.utils.icu import strcmp from calibre.utils.localization import _ from polyglot.builtins import error_message diff --git a/application/lib/calibre/utils/imghdr.py b/application/lib/calibre/utils/imghdr.py index e23c4f8d..186b71c0 100644 --- a/application/lib/calibre/utils/imghdr.py +++ b/application/lib/calibre/utils/imghdr.py @@ -2,8 +2,9 @@ # License: GPLv3 Copyright: 2016, Kovid Goyal -from struct import unpack, error import os +from struct import error, unpack + from calibre.utils.speedups import ReadOnlyFileBuffer from polyglot.builtins import string_or_bytes diff --git a/application/lib/calibre/utils/serialize.py b/application/lib/calibre/utils/serialize.py index 707298df..81e187f0 100644 --- a/application/lib/calibre/utils/serialize.py +++ b/application/lib/calibre/utils/serialize.py @@ -25,9 +25,9 @@ def encoder(obj): if isinstance(obj, (set, frozenset)): return encoded(1, tuple(obj), ExtType) if getattr(obj, '__calibre_serializable__', False): + from calibre.db.categories import Tag from calibre.ebooks.metadata.book.base import Metadata from calibre.library.field_metadata import FieldMetadata, fm_as_dict - from calibre.db.categories import Tag if isinstance(obj, Metadata): from calibre.ebooks.metadata.book.serialize import metadata_as_dict return encoded( @@ -60,8 +60,8 @@ def json_dumps(data, **kw): def decode_metadata(x, for_json): - from polyglot.binary import from_base64_bytes from calibre.ebooks.metadata.book.serialize import metadata_from_dict + from polyglot.binary import from_base64_bytes obj = metadata_from_dict(x) if for_json and obj.cover_data and obj.cover_data[1]: obj.cover_data = obj.cover_data[0], from_base64_bytes(obj.cover_data[1]) diff --git a/application/lib/calibre/utils/short_uuid.py b/application/lib/calibre/utils/short_uuid.py index f97475c4..9b344e20 100644 --- a/application/lib/calibre/utils/short_uuid.py +++ b/application/lib/calibre/utils/short_uuid.py @@ -6,7 +6,9 @@ Generate UUID encoded using a user specified alphabet. ''' -import string, math, uuid as _uuid +import math +import string +import uuid as _uuid def num_to_string(number, alphabet, alphabet_len, pad_to_length=None): diff --git a/application/lib/calibre/utils/smartypants.py b/application/lib/calibre/utils/smartypants.py index 3d44112f..a00a6151 100644 --- a/application/lib/calibre/utils/smartypants.py +++ b/application/lib/calibre/utils/smartypants.py @@ -875,6 +875,8 @@ def test_ordinal_numbers(self): def test_educated_quotes(self): self.assertEqual(sp('''"Isn't this fun?"'''), '''“Isn’t this fun?”''') + self.assertEqual(sp("'abc'"), '‘abc’') + tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestSmartypantsAllAttributes) if return_tests: diff --git a/application/lib/calibre/utils/speedups.py b/application/lib/calibre/utils/speedups.py index 2e835a4c..270cc4c7 100644 --- a/application/lib/calibre/utils/speedups.py +++ b/application/lib/calibre/utils/speedups.py @@ -9,14 +9,15 @@ class ReadOnlyFileBuffer: ''' A zero copy implementation of a file like object. Uses memoryviews for efficiency. ''' - def __init__(self, raw): + def __init__(self, raw: bytes, name: str = ''): self.sz, self.mv = len(raw), (raw if isinstance(raw, memoryview) else memoryview(raw)) self.pos = 0 + self.name: str = name def tell(self): return self.pos - def read(self, n=None): + def read(self, n = None) -> memoryview: if n is None: ans = self.mv[self.pos:] self.pos = self.sz @@ -35,6 +36,9 @@ def seek(self, pos, whence=os.SEEK_SET): self.pos = max(0, min(self.pos, self.sz)) return self.pos + def seekable(self): + return True + def getvalue(self): return self.mv diff --git a/application/lib/calibre/utils/terminal.py b/application/lib/calibre/utils/terminal.py index cd31103b..6aee59ef 100644 --- a/application/lib/calibre/utils/terminal.py +++ b/application/lib/calibre/utils/terminal.py @@ -4,10 +4,12 @@ __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os, sys, re +import os +import re +import sys -from calibre.prints import is_binary from calibre.constants import iswindows +from calibre.prints import is_binary from polyglot.builtins import iteritems if iswindows: @@ -233,7 +235,9 @@ class CONSOLE_SCREEN_BUFFER_INFO(Structure): def get_term_geometry(): - import fcntl, termios, struct + import fcntl + import struct + import termios def ioctl_GWINSZ(fd): try: diff --git a/application/lib/calibre/utils/threadpool.py b/application/lib/calibre/utils/threadpool.py index 818183e4..1bc1db50 100644 --- a/application/lib/calibre/utils/threadpool.py +++ b/application/lib/calibre/utils/threadpool.py @@ -47,6 +47,7 @@ # standard library modules import threading + from polyglot import queue # exceptions @@ -72,7 +73,7 @@ class WorkerThread(threading.Thread): """ def __init__(self, requestsQueue, resultsQueue, **kwds): - """Set up thread in daemonic mode and start it immediatedly. + """Set up thread in daemonic mode and start it immediately. requestsQueue and resultQueue are instances of queue.Queue passed by the ThreadPool class when it creates a new worker thread. @@ -96,9 +97,9 @@ def run(self): break # and exit try: self.resultQueue.put( - (request, request.callable(request.args, **request.kwds)) + (request, request.callable(*request.args, **request.kwds)) ) - except Exception as e: + except: request.exception = True import traceback self.resultQueue.put((request, traceback.format_exc())) diff --git a/application/lib/calibre/utils/xml_parse.py b/application/lib/calibre/utils/xml_parse.py index a31c6ed8..fdd57dca 100644 --- a/application/lib/calibre/utils/xml_parse.py +++ b/application/lib/calibre/utils/xml_parse.py @@ -36,8 +36,16 @@ def safe_xml_fromstring(string_or_bytes, recover=True): return ans +def unsafe_xml_fromstring(string_or_bytes): + parser = etree.XMLParser(resolve_entities=True) + return fs(string_or_bytes, parser=parser) + + def find_tests(): - import unittest, tempfile, os + import os + import tempfile + import unittest + from calibre.constants import iswindows class TestXMLParse(unittest.TestCase): @@ -61,7 +69,7 @@ def t(tid, val, expected, safe=True): raw = templ.format(id=tid, val=val) err = None try: - root = safe_xml_fromstring(raw) if safe else etree.fromstring(raw) + root = safe_xml_fromstring(raw) if safe else unsafe_xml_fromstring(raw) except Exception as e: err = str(e) root = None diff --git a/application/lib/calibre/utils/zipfile.py b/application/lib/calibre/utils/zipfile.py index e493b75c..03c57bc7 100644 --- a/application/lib/calibre/utils/zipfile.py +++ b/application/lib/calibre/utils/zipfile.py @@ -17,7 +17,7 @@ from calibre.constants import filesystem_encoding from calibre.ebooks.chardet import detect from calibre.ptempfile import SpooledTemporaryFile -from polyglot.builtins import string_or_bytes, as_bytes +from polyglot.builtins import as_bytes, string_or_bytes try: import zlib # We may need its compression method diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py index c248ebe4..9f05402a 100644 --- a/application/lib/calibre/web/feeds/news.py +++ b/application/lib/calibre/web/feeds/news.py @@ -2318,7 +2318,7 @@ def get_browser(self): br.select_form(name='login') br['username'] = self.username br['password'] = self.password - raw = br.submit_selected().content + raw = br.submit().read() if 'href="/my-account"' not in raw: raise LoginFailed( _('Failed to log in, check your username and password for' diff --git a/application/lib/calibre/web/site_parsers/natgeo.py b/application/lib/calibre/web/site_parsers/natgeo.py new file mode 100644 index 00000000..3e60a3e9 --- /dev/null +++ b/application/lib/calibre/web/site_parsers/natgeo.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +from pprint import pprint + +from calibre import prepare_string_for_xml as escape +from calibre.utils.iso8601 import parse_iso8601 + +module_version = 1 # needed for live updates +pprint + + +def extract_json(raw): + s = raw.find("window['__natgeo__']") + script = raw[s : raw.find('', s)] + content = json.loads(script[script.find('{') :].rstrip(';'))['page']['content'] + if content.get('prismarticle'): + return content['prismarticle'] + if content.get('article'): + return content['article'] + + +def parse_contributors(grp): + for item in grp: + line = '
' + escape(item['title']) + ' ' + for c in item['contributors']: + line += escape(c['displayName']) + yield line + '
' + + +def parse_lead_image(media): + if 'image' in media: + yield '

' + if 'dsc' in media['image']: + yield ( + f'

' + ) + else: + yield f'
' + if 'caption' in media and 'credit' in media: + yield ( + '
' + + media['caption'] + + ' ' + + media['credit'] + + '
' + ) + elif 'caption' in media: + yield '
' + media['caption'] + '
' + yield '

' + + +def parse_inline(inl): + if inl.get('content', {}).get('name', '') == 'Image': + props = inl['content']['props'] + yield '

' + if 'image' in props: + yield f'

' + if 'caption' in props: + yield ( + f'
{props["caption"].get("text", "")} {props["caption"].get("credit", "")}
' + ) + yield '

' + if inl.get('content', {}).get('name', '') == 'ImageGroup': + if 'images' in inl['content']['props']: + for imgs in inl['content']['props']['images']: + yield '

' + if 'src' in imgs: + yield f'

' + if 'caption' in imgs: + yield ( + f'
{imgs["caption"].get("text", "")} {imgs["caption"].get("credit", "")}
' + ) + yield '

' + + +def parse_cont(content): + for cont in content.get('content', {}): + if isinstance(cont, dict): + yield from parse_body(cont) + if isinstance(cont, str): + yield cont + + +def parse_body(x): + if isinstance(x, dict): + if 'type' in x: + tag = x['type'] + if tag == 'inline': + yield ''.join(parse_inline(x)) + elif 'attrs' in x and 'href' in x.get('attrs', ''): + yield '<' + tag + f' href="{x["attrs"]["href"]}">' + yield from parse_cont(x) + yield '' + else: + yield '<' + tag + '>' + yield from parse_cont(x) + yield '' + elif isinstance(x, list): + for y in x: + if isinstance(y, dict): + yield from parse_body(y) + +def parse_bdy(item): + c = item['cntnt'] + if item.get('type') == 'inline': + if c.get('cmsType') == 'listicle': + if 'title' in c: + yield '

' + escape(c['title']) + '

' + yield c['text'] + elif c.get('cmsType') == 'image': + yield from parse_lead_image(c) + elif c.get('cmsType') == 'imagegroup': + for imgs in c['images']: + yield from parse_lead_image(imgs) + elif c.get('cmsType') == 'pullquote': + if 'quote' in c: + yield '
' + c['quote'] + '
' + elif c.get('cmsType') == 'editorsNote': + if 'note' in c: + yield '
' + c['note'] + '
' + else: + if c['mrkup'].strip().startswith('<'): + yield c['mrkup'] + else: + yield '<{tag}>{markup}'.format( + tag=item['type'], markup=c['mrkup']) + +def parse_article(edg): + sc = edg['schma'] + yield '
' + escape(edg['sctn']) + '
' + yield '

' + escape(sc['sclTtl']) + '

' + if sc.get('sclDsc'): + yield '' + yield '

' + yield from parse_contributors(edg.get('cntrbGrp', {})) + ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') + yield '

Published: ' + escape(ts) + '
' + if 'readTime' in edg: + yield '
' + escape(edg['readTime']) + '
' + yield '

' + if edg.get('ldMda', {}).get('cmsType') == 'image': + yield from parse_lead_image(edg['ldMda']) + if edg.get('prismData'): + for main in edg['prismData']['mainComponents']: + if main['name'] == 'Body': + for item in main['props']['body']: + if isinstance(item, dict): + if item.get('type', '') == 'inline': + yield ''.join(parse_inline(item)) + elif isinstance(item, list): + for line in item: + yield ''.join(parse_body(line)) + elif edg.get('bdy'): + for item in edg['bdy']: + yield from parse_bdy(item) + + +def article_parse(data): + yield '' + for frm in data['frms']: + if not frm: + continue + for mod in frm.get('mods', ()): + for edg in mod.get('edgs', ()): + if edg.get('cmsType') == 'ImmersiveLeadTile': + if 'image' in edg.get('cmsImage', {}): + yield from parse_lead_image(edg['cmsImage']) + if edg.get('cmsType') == 'ArticleBodyTile': + yield from parse_article(edg) + yield '' + + +def extract_html(raw_html): + data = extract_json(raw_html) + return '\n'.join(article_parse(data)) diff --git a/application/lib/calibre/web/site_parsers/nytimes.py b/application/lib/calibre/web/site_parsers/nytimes.py index c78e3edc..17208821 100644 --- a/application/lib/calibre/web/site_parsers/nytimes.py +++ b/application/lib/calibre/web/site_parsers/nytimes.py @@ -9,92 +9,190 @@ from calibre.utils.iso8601 import parse_iso8601 -module_version = 4 # needed for live updates +module_version = 11 # needed for live updates pprint -def is_heading(tn): - return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block') - - -def process_inline_text(lines, block): - text = '' - if 'text@stripHtml' in block: - text = escape(block['text@stripHtml']) - elif 'renderedRepresentation' in block: # happens in byline blocks - text = block['renderedRepresentation'] - elif 'text' in block: - text = block['text'] - if text: - for fmt in block.get('formats', ()): - tn = fmt['__typename'] - if tn == 'LinkFormat': - ab = fmt - text = '{}'.format(ab['url'], ab.get('title') or '', text) - elif tn == 'BoldFormat': - text = '' + text + '' - lines.append(text) - - -def process_paragraph(lines, block, content_key='content'): - tn = block['__typename'] - m = re.match(r'Heading([1-6])Block', tn) - if m is not None: - tag = 'h' + m.group(1) - else: - tag = 'p' - ta = block.get('textAlign') or 'LEFT' - style = f'text-align: {ta.lower()}' - lines.append(f'<{tag} style="{style}">') - for item in block[content_key]: - tn = item['__typename'] - if tn in ('TextInline', 'Byline'): - process_inline_text(lines, item) - lines.append('') - - -def process_timestamp(lines, block): - ts = block['timestamp'] - dt = parse_iso8601(ts, as_utc=False) - lines.append('

' + escape(dt.strftime('%b %d, %Y')) + '

') - - -def process_header(lines, block): - label = block.get('label') - if label: - process_paragraph(lines, label) - headline = block.get('headline') - if headline: - process_paragraph(lines, headline) - summary = block.get('summary') - if summary: - process_paragraph(lines, summary) - lm = block.get('ledeMedia') - if lm and lm.get('__typename') == 'ImageBlock': - process_image_block(lines, lm) - byline = block.get('byline') - if byline: - process_paragraph(lines, byline, content_key='bylines') - timestamp = block.get('timestampBlock') - if timestamp: - process_timestamp(lines, timestamp) - - -def process_image_block(lines, block): - media = block['media'] - caption = media.get('caption') - caption_lines = [] - if caption: - process_inline_text(caption_lines, caption) - crops = media['crops'] - renditions = crops[0]['renditions'] - img = renditions[0]['url'] - if 'web.archive.org' in img: - img = img.partition('/')[-1] - img = img[img.find('https://'):] - lines.append(f'
') - lines.extend(caption_lines) - lines.append('
') +def parse_image(i): + crop = i.get('crops') or i.get('spanImageCrops') + if crop: + yield f'
' + if i.get('caption'): + yield f'
{"".join(parse_types(i["caption"]))}' + if i.get('credit'): + yield f' {i["credit"]}' + yield '
' + elif i.get('legacyHtmlCaption'): + if i['legacyHtmlCaption'].strip(): + yield f'
{i["legacyHtmlCaption"]}
' + yield '
' + + +def parse_img_grid(g): + for grd in g.get('gridMedia', {}): + yield ''.join(parse_image(grd)) + if g.get('caption'): + yield f'
{g["caption"]}' + if g.get('credit'): + yield f' {g["credit"]}' + yield '
' + + +def parse_vid(v): + if v.get('promotionalMedia'): + headline = v.get('headline', {}).get('default', '') + rendition = v.get('renditions') + yield ( + f'
Video: {headline}
' + if rendition + else f'
{headline}
' + ) + yield ''.join(parse_types(v['promotionalMedia'])) + if v.get('promotionalSummary'): + yield f'
{v["promotionalSummary"]}
' + + +def parse_emb(e): + if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''): + dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1) + yield f'
' + elif e.get('promotionalMedia'): + if e.get('headline'): + yield f'
{e["headline"]["default"]}
' + yield ''.join(parse_types(e['promotionalMedia'])) + if e.get('note'): + yield f'
{e["note"]}
' + + +def parse_byline(byl): + for b in byl.get('bylines', {}): + yield f'
{b["renderedRepresentation"]}
' + yield '
' + for rl in byl.get('role', {}): + if ''.join(parse_cnt(rl)).strip(): + yield ''.join(parse_cnt(rl)) + yield '
' + + +def iso_date(x): + dt = parse_iso8601(x, as_utc=False) + return dt.strftime('%b %d, %Y at %I:%M %p') + + +def parse_header(h): + if h.get('label'): + yield f'
{"".join(parse_types(h["label"]))}
' + if h.get('headline'): + yield ''.join(parse_types(h['headline'])) + if h.get('summary'): + yield f'

{"".join(parse_types(h["summary"]))}

' + if h.get('ledeMedia'): + yield ''.join(parse_types(h['ledeMedia'])) + if h.get('byline'): + yield ''.join(parse_types(h['byline'])) + if h.get('timestampBlock'): + yield ''.join(parse_types(h['timestampBlock'])) + + +def parse_fmt_type(fm): + for f in fm.get('formats', {}): + ftype = f.get('__typename', '') + if ftype == 'BoldFormat': + yield '' + if ftype == 'ItalicFormat': + yield '' + if ftype == 'LinkFormat': + hrf = f['url'] + yield f'' + yield fm.get('text', '') + for f in reversed(fm.get('formats', {})): + ftype = f.get('__typename', '') + if ftype == 'BoldFormat': + yield '' + if ftype == 'ItalicFormat': + yield '' + if ftype == 'LinkFormat': + yield '' + + +def parse_cnt(cnt): + for k in cnt: + if isinstance(cnt[k], list): + if k == 'formats': + yield ''.join(parse_fmt_type(cnt)) + else: + for cnt_ in cnt[k]: + yield ''.join(parse_types(cnt_)) + if isinstance(cnt[k], dict): + yield ''.join(parse_types(cnt[k])) + if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt: + if isinstance(cnt['text'], str): + yield cnt['text'] + + +def parse_types(x): + typename = x.get('__typename', '') + + align = '' + if x.get('textAlign'): + align = f' style="text-align: {x["textAlign"].lower()};"' + + if 'Header' in typename: + yield '\n'.join(parse_header(x)) + + elif typename.startswith('Heading'): + htag = 'h' + re.match(r'Heading([1-6])Block', typename).group(1) + yield f'<{htag}{align}>{"".join(parse_cnt(x))}' + + elif typename == 'ParagraphBlock': + yield f'

{"".join(parse_cnt(x))}

' + elif typename in {'DetailBlock', 'TextRunKV'}: + yield f'

{"".join(parse_cnt(x))}

' + + elif typename == 'BylineBlock': + yield f'

{"".join(parse_byline(x))}
' + elif typename == 'LabelBlock': + yield f'
{"".join(parse_cnt(x))}
' + elif typename == 'BlockquoteBlock': + yield f'
{"".join(parse_cnt(x))}
' + elif typename == 'TimestampBlock': + yield f'
{iso_date(x["timestamp"])}
' + elif typename == 'LineBreakInline': + yield '
' + elif typename == 'RuleBlock': + yield '
' + + elif typename == 'Image': + yield ''.join(parse_image(x)) + + elif typename == 'GridBlock': + yield ''.join(parse_img_grid(x)) + + elif typename == 'Video': + yield ''.join(parse_vid(x)) + + elif typename == 'EmbeddedInteractive': + yield ''.join(parse_emb(x)) + + elif typename == 'ListBlock': + yield f'\n
    {"".join(parse_cnt(x))}
' + elif typename == 'ListItemBlock': + yield f'\n
  • {"".join(parse_cnt(x))}
  • ' + + elif typename and typename not in { + 'RelatedLinksBlock', + 'EmailSignupBlock', + 'Dropzone', + 'AudioBlock', + }: + yield ''.join(parse_cnt(x)) + + +def article_parse(data): + yield '' + for d in data: + yield from parse_types(d) + yield '' def json_to_html(raw): @@ -105,18 +203,8 @@ def json_to_html(raw): except TypeError: data = data['initialState'] return live_json_to_html(data) - article = next(iter(data.values())) - body = article['sprinkledBody']['content'] - lines = [] - for item in body: - tn = item['__typename'] - if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'): - process_header(lines, item) - elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn): - process_paragraph(lines, item) - elif tn == 'ImageBlock': - process_image_block(lines, item) - return '' + '\n'.join(lines) + '' + content = data['article']['sprinkledBody']['content'] + return '\n'.join(article_parse(content)) def add_live_item(item, item_type, lines): @@ -161,7 +249,7 @@ def add_live_item(item, item_type, lines): def live_json_to_html(data): - for k, v in data["ROOT_QUERY"].items(): + for k, v in data['ROOT_QUERY'].items(): if isinstance(v, dict) and 'id' in v: root = data[v['id']] s = data[root['storylines'][0]['id']] @@ -178,37 +266,50 @@ def live_json_to_html(data): return '' + '\n'.join(lines) + '' -def extract_html(soup): +def extract_html(soup, url): + if '/interactive/' in url: + return ( + '

    ' + + 'This is an interactive article, which is supposed to be read in a browser.' + + '

    ' + ) script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] script = str(script) - raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';') + raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';') return json_to_html(raw) -def download_url(url=None, br=None): - # Get the URL from the Wayback machine +def download_url_from_wayback(category, url, br=None): from mechanize import Request + host = 'http://localhost:8090' host = 'https://wayback1.calibre-ebook.com' - if url is None: - url = sys.argv[-1] rq = Request( - host + '/nytimes', - data=json.dumps({"url": url}), - headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'} + host + '/' + category, + data=json.dumps({'url': url}), + headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'}, ) if br is None: from calibre import browser + br = browser() br.set_handle_gzip(True) return br.open_novisit(rq, timeout=3 * 60).read() +def download_url(url=None, br=None): + # Get the URL from the Wayback machine + if url is None: + url = sys.argv[-1] + return download_url_from_wayback('nytimes', url, br) + + if __name__ == '__main__': f = sys.argv[-1] raw = open(f).read() if f.endswith('.html'): from calibre.ebooks.BeautifulSoup import BeautifulSoup + soup = BeautifulSoup(raw) print(extract_html(soup)) else: diff --git a/application/lib/ebook_translator/html_translator.py b/application/lib/ebook_translator/html_translator.py index 7498c079..5a24868d 100644 --- a/application/lib/ebook_translator/html_translator.py +++ b/application/lib/ebook_translator/html_translator.py @@ -142,17 +142,17 @@ def add_translation_soup(self, soup, tag, trans, dst): origStyle = self.params.get('orig_style', '') transStyle = self.params.get('trans_style', '') trans = trans.replace('<', '<').replace('>', '>') - transTag = BeautifulSoup(trans, 'html.parser') #'html.parser'解析器不会自动添加 - if not transTag.contents: - return - transTag = transTag.contents[0] - if isinstance(transTag, NavigableString): - oldTxt = str(transTag) + #纯文本,不是html + if '<' not in trans or '>' not in trans: transTagName = 'span' if tag.name in ('title', 'tr', 'td', 'th', 'thead', 'tbody', 'table', 'ul', 'ol', 'li', 'a') else tag.name transTag = soup.new_tag(transTagName) - transTag.string = oldTxt - + transTag.string = trans + else: + transTag = BeautifulSoup(trans, 'html.parser') #'html.parser'解析器不会自动添加 + if not transTag.contents: + return + transTag = transTag.contents[0] if origStyle: old = tag.get('style') tag['style'] = f'{old};{origStyle}' if old else origStyle