From 1ed3285b168177ec6e9611d38ae40a7445c5d520 Mon Sep 17 00:00:00 2001
From: cdhigh
Date: Tue, 15 Oct 2024 08:55:04 -0300
Subject: [PATCH] Sync some of calibre's upstream updates
---
application/lib/calibre/ebooks/epub/pages.py | 5 +-
.../lib/calibre/ebooks/epub/periodical.py | 5 +-
.../lib/calibre/ebooks/html_entities.py | 113 ++++++-
application/lib/calibre/library/__init__.py | 7 +-
application/lib/calibre/library/comments.py | 13 +-
application/lib/calibre/utils/formatter.py | 4 +-
application/lib/calibre/utils/imghdr.py | 3 +-
application/lib/calibre/utils/serialize.py | 4 +-
application/lib/calibre/utils/short_uuid.py | 4 +-
application/lib/calibre/utils/smartypants.py | 2 +
application/lib/calibre/utils/speedups.py | 8 +-
application/lib/calibre/utils/terminal.py | 10 +-
application/lib/calibre/utils/threadpool.py | 7 +-
application/lib/calibre/utils/xml_parse.py | 12 +-
application/lib/calibre/utils/zipfile.py | 2 +-
application/lib/calibre/web/feeds/news.py | 2 +-
.../lib/calibre/web/site_parsers/natgeo.py | 181 ++++++++++
.../lib/calibre/web/site_parsers/nytimes.py | 311 ++++++++++++------
.../lib/ebook_translator/html_translator.py | 16 +-
19 files changed, 555 insertions(+), 154 deletions(-)
create mode 100644 application/lib/calibre/web/site_parsers/natgeo.py
diff --git a/application/lib/calibre/ebooks/epub/pages.py b/application/lib/calibre/ebooks/epub/pages.py
index da06eef0..7f58aa72 100644
--- a/application/lib/calibre/ebooks/epub/pages.py
+++ b/application/lib/calibre/ebooks/epub/pages.py
@@ -9,10 +9,11 @@
import re
from itertools import count
-from calibre.ebooks.oeb.base import XHTML_NS
-from calibre.ebooks.oeb.base import OEBBook
+
from lxml.etree import XPath
+from calibre.ebooks.oeb.base import XHTML_NS, OEBBook
+
NSMAP = {'h': XHTML_NS, 'html': XHTML_NS, 'xhtml': XHTML_NS}
PAGE_RE = re.compile(r'page', re.IGNORECASE)
ROMAN_RE = re.compile(r'^[ivxlcdm]+$', re.IGNORECASE)
diff --git a/application/lib/calibre/ebooks/epub/periodical.py b/application/lib/calibre/ebooks/epub/periodical.py
index 8b2a6610..80db09a5 100644
--- a/application/lib/calibre/ebooks/epub/periodical.py
+++ b/application/lib/calibre/ebooks/epub/periodical.py
@@ -5,11 +5,12 @@
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-from uuid import uuid4
import time
+from uuid import uuid4
+from calibre import prepare_string_for_xml as xml
+from calibre import strftime
from calibre.constants import __appname__, __version__
-from calibre import strftime, prepare_string_for_xml as xml
from calibre.utils.date import parse_date
SONY_METADATA = '''\
diff --git a/application/lib/calibre/ebooks/html_entities.py b/application/lib/calibre/ebooks/html_entities.py
index ca2dc2fd..6817ac03 100644
--- a/application/lib/calibre/ebooks/html_entities.py
+++ b/application/lib/calibre/ebooks/html_entities.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2017, Kovid Goyal
+from calibre import my_unichr
html5_entities = {
# ENTITY_DATA {{{
@@ -91,7 +92,7 @@
'DifferentialD': 'ⅆ',
'Dopf': '𝔻',
'Dot': '¨',
- 'DotDot': '⃜\u20dc',
+ 'DotDot': '⃜',
'DotEqual': '≐',
'DoubleContourIntegral': '∯',
'DoubleDot': '¨',
@@ -502,7 +503,7 @@
'TRADE': '™',
'TSHcy': 'Ћ',
'TScy': 'Ц',
- 'Tab': ' ',
+ 'Tab': '\t',
'Tau': 'Τ',
'Tcaron': 'Ť',
'Tcedil': 'Ţ',
@@ -1105,6 +1106,7 @@
'hearts': '♥',
'heartsuit': '♥',
'hellip': '…',
+ 'hellips': '…',
'hercon': '⊹',
'hfr': '𝔥',
'hksearow': '⤥',
@@ -1857,6 +1859,7 @@
'square': '□',
'squarf': '▪',
'squf': '▪',
+ 'squot': "'",
'srarr': '→',
'sscr': '𝓈',
'ssetmn': '∖',
@@ -2133,19 +2136,107 @@
}
-if __name__ == '__main__':
+def entity_to_unicode_in_python(match, exceptions=(), encoding='cp1252', result_exceptions={}):
+ def check(ch):
+ return result_exceptions.get(ch, ch)
+
+ ent = match.group(1)
+ if ent in exceptions:
+ return '&'+ent+';'
+ if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software
+ return check("'")
+ if ent == 'hellips':
+ ent = 'hellip'
+ if ent.startswith('#'):
+ try:
+ if ent[1] in ('x', 'X'):
+ num = int(ent[2:], 16)
+ else:
+ num = int(ent[1:])
+ except:
+ return '&'+ent+';'
+ if encoding is None or num > 255:
+ return check(my_unichr(num))
+ try:
+ return check(bytes(bytearray((num,))).decode(encoding))
+ except UnicodeDecodeError:
+ return check(my_unichr(num))
+ from calibre.ebooks.html_entities import html5_entities
+ try:
+ return check(html5_entities[ent])
+ except KeyError:
+ pass
+ from polyglot.html_entities import name2codepoint
+ try:
+ return check(my_unichr(name2codepoint[ent]))
+ except KeyError:
+ return '&'+ent+';'
+
+
+def find_tests():
+ import unittest
+ class TestHTMLEntityReplacement(unittest.TestCase):
+ def test_html_entity_replacement(self):
+ from calibre_extensions.fast_html_entities import replace_all_entities
+ def t(inp, exp):
+ self.assertEqual(exp, replace_all_entities(inp), f'Failed for input: {inp!r}')
+ def x(inp, exp):
+ self.assertEqual(exp, replace_all_entities(inp, True), f'Failed for input: {inp!r}')
+ t('aӒb', 'aӒb')
+ t('', '')
+ t('a', 'a')
+ t('&', '&')
+ t('&', '&')
+ t('&', '&')
+ t('a&;b c', 'a&;b c')
+ t('<', '<')
+ t('&<', '&<')
+ t('a&b<c', 'a&b b\n\ncd',
'a b
cd
'),
]:
- cval = comments_to_html(pat)
- self.assertEqual(cval, val)
+ try:
+ cval = comments_to_html(pat)
+ except DeprecationWarning:
+ pass # new lxml + old Beautiful soup == deprecation warning
+ else:
+ self.assertEqual(cval, val)
return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
diff --git a/application/lib/calibre/utils/formatter.py b/application/lib/calibre/utils/formatter.py
index ab7934cc..8f09febd 100644
--- a/application/lib/calibre/utils/formatter.py
+++ b/application/lib/calibre/utils/formatter.py
@@ -20,9 +20,7 @@
from calibre.constants import DEBUG
from calibre.ebooks.metadata.book.base import field_metadata
from calibre.utils.config import tweaks
-from calibre.utils.formatter_functions import (
- StoredObjectType, formatter_functions, function_object_type, get_database,
-)
+from calibre.utils.formatter_functions import StoredObjectType, formatter_functions, function_object_type, get_database
from calibre.utils.icu import strcmp
from calibre.utils.localization import _
from polyglot.builtins import error_message
diff --git a/application/lib/calibre/utils/imghdr.py b/application/lib/calibre/utils/imghdr.py
index e23c4f8d..186b71c0 100644
--- a/application/lib/calibre/utils/imghdr.py
+++ b/application/lib/calibre/utils/imghdr.py
@@ -2,8 +2,9 @@
# License: GPLv3 Copyright: 2016, Kovid Goyal
-from struct import unpack, error
import os
+from struct import error, unpack
+
from calibre.utils.speedups import ReadOnlyFileBuffer
from polyglot.builtins import string_or_bytes
diff --git a/application/lib/calibre/utils/serialize.py b/application/lib/calibre/utils/serialize.py
index 707298df..81e187f0 100644
--- a/application/lib/calibre/utils/serialize.py
+++ b/application/lib/calibre/utils/serialize.py
@@ -25,9 +25,9 @@ def encoder(obj):
if isinstance(obj, (set, frozenset)):
return encoded(1, tuple(obj), ExtType)
if getattr(obj, '__calibre_serializable__', False):
+ from calibre.db.categories import Tag
from calibre.ebooks.metadata.book.base import Metadata
from calibre.library.field_metadata import FieldMetadata, fm_as_dict
- from calibre.db.categories import Tag
if isinstance(obj, Metadata):
from calibre.ebooks.metadata.book.serialize import metadata_as_dict
return encoded(
@@ -60,8 +60,8 @@ def json_dumps(data, **kw):
def decode_metadata(x, for_json):
- from polyglot.binary import from_base64_bytes
from calibre.ebooks.metadata.book.serialize import metadata_from_dict
+ from polyglot.binary import from_base64_bytes
obj = metadata_from_dict(x)
if for_json and obj.cover_data and obj.cover_data[1]:
obj.cover_data = obj.cover_data[0], from_base64_bytes(obj.cover_data[1])
diff --git a/application/lib/calibre/utils/short_uuid.py b/application/lib/calibre/utils/short_uuid.py
index f97475c4..9b344e20 100644
--- a/application/lib/calibre/utils/short_uuid.py
+++ b/application/lib/calibre/utils/short_uuid.py
@@ -6,7 +6,9 @@
Generate UUID encoded using a user specified alphabet.
'''
-import string, math, uuid as _uuid
+import math
+import string
+import uuid as _uuid
def num_to_string(number, alphabet, alphabet_len, pad_to_length=None):
diff --git a/application/lib/calibre/utils/smartypants.py b/application/lib/calibre/utils/smartypants.py
index 3d44112f..a00a6151 100644
--- a/application/lib/calibre/utils/smartypants.py
+++ b/application/lib/calibre/utils/smartypants.py
@@ -875,6 +875,8 @@ def test_ordinal_numbers(self):
def test_educated_quotes(self):
self.assertEqual(sp('''"Isn't this fun?"'''), '''“Isn’t this fun?”''')
+ self.assertEqual(sp("'abc'"), '‘abc’')
+
tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestSmartypantsAllAttributes)
if return_tests:
diff --git a/application/lib/calibre/utils/speedups.py b/application/lib/calibre/utils/speedups.py
index 2e835a4c..270cc4c7 100644
--- a/application/lib/calibre/utils/speedups.py
+++ b/application/lib/calibre/utils/speedups.py
@@ -9,14 +9,15 @@ class ReadOnlyFileBuffer:
''' A zero copy implementation of a file like object. Uses memoryviews for efficiency. '''
- def __init__(self, raw):
+ def __init__(self, raw: bytes, name: str = ''):
self.sz, self.mv = len(raw), (raw if isinstance(raw, memoryview) else memoryview(raw))
self.pos = 0
+ self.name: str = name
def tell(self):
return self.pos
- def read(self, n=None):
+ def read(self, n = None) -> memoryview:
if n is None:
ans = self.mv[self.pos:]
self.pos = self.sz
@@ -35,6 +36,9 @@ def seek(self, pos, whence=os.SEEK_SET):
self.pos = max(0, min(self.pos, self.sz))
return self.pos
+ def seekable(self):
+ return True
+
def getvalue(self):
return self.mv
diff --git a/application/lib/calibre/utils/terminal.py b/application/lib/calibre/utils/terminal.py
index cd31103b..6aee59ef 100644
--- a/application/lib/calibre/utils/terminal.py
+++ b/application/lib/calibre/utils/terminal.py
@@ -4,10 +4,12 @@
__copyright__ = '2012, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import os, sys, re
+import os
+import re
+import sys
-from calibre.prints import is_binary
from calibre.constants import iswindows
+from calibre.prints import is_binary
from polyglot.builtins import iteritems
if iswindows:
@@ -233,7 +235,9 @@ class CONSOLE_SCREEN_BUFFER_INFO(Structure):
def get_term_geometry():
- import fcntl, termios, struct
+ import fcntl
+ import struct
+ import termios
def ioctl_GWINSZ(fd):
try:
diff --git a/application/lib/calibre/utils/threadpool.py b/application/lib/calibre/utils/threadpool.py
index 818183e4..1bc1db50 100644
--- a/application/lib/calibre/utils/threadpool.py
+++ b/application/lib/calibre/utils/threadpool.py
@@ -47,6 +47,7 @@
# standard library modules
import threading
+
from polyglot import queue
# exceptions
@@ -72,7 +73,7 @@ class WorkerThread(threading.Thread):
"""
def __init__(self, requestsQueue, resultsQueue, **kwds):
- """Set up thread in daemonic mode and start it immediatedly.
+ """Set up thread in daemonic mode and start it immediately.
requestsQueue and resultQueue are instances of queue.Queue passed
by the ThreadPool class when it creates a new worker thread.
@@ -96,9 +97,9 @@ def run(self):
break # and exit
try:
self.resultQueue.put(
- (request, request.callable(request.args, **request.kwds))
+ (request, request.callable(*request.args, **request.kwds))
)
- except Exception as e:
+ except:
request.exception = True
import traceback
self.resultQueue.put((request, traceback.format_exc()))
diff --git a/application/lib/calibre/utils/xml_parse.py b/application/lib/calibre/utils/xml_parse.py
index a31c6ed8..fdd57dca 100644
--- a/application/lib/calibre/utils/xml_parse.py
+++ b/application/lib/calibre/utils/xml_parse.py
@@ -36,8 +36,16 @@ def safe_xml_fromstring(string_or_bytes, recover=True):
return ans
+def unsafe_xml_fromstring(string_or_bytes):
+ parser = etree.XMLParser(resolve_entities=True)
+ return fs(string_or_bytes, parser=parser)
+
+
def find_tests():
- import unittest, tempfile, os
+ import os
+ import tempfile
+ import unittest
+
from calibre.constants import iswindows
class TestXMLParse(unittest.TestCase):
@@ -61,7 +69,7 @@ def t(tid, val, expected, safe=True):
raw = templ.format(id=tid, val=val)
err = None
try:
- root = safe_xml_fromstring(raw) if safe else etree.fromstring(raw)
+ root = safe_xml_fromstring(raw) if safe else unsafe_xml_fromstring(raw)
except Exception as e:
err = str(e)
root = None
diff --git a/application/lib/calibre/utils/zipfile.py b/application/lib/calibre/utils/zipfile.py
index e493b75c..03c57bc7 100644
--- a/application/lib/calibre/utils/zipfile.py
+++ b/application/lib/calibre/utils/zipfile.py
@@ -17,7 +17,7 @@
from calibre.constants import filesystem_encoding
from calibre.ebooks.chardet import detect
from calibre.ptempfile import SpooledTemporaryFile
-from polyglot.builtins import string_or_bytes, as_bytes
+from polyglot.builtins import as_bytes, string_or_bytes
try:
import zlib # We may need its compression method
diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py
index c248ebe4..9f05402a 100644
--- a/application/lib/calibre/web/feeds/news.py
+++ b/application/lib/calibre/web/feeds/news.py
@@ -2318,7 +2318,7 @@ def get_browser(self):
br.select_form(name='login')
br['username'] = self.username
br['password'] = self.password
- raw = br.submit_selected().content
+ raw = br.submit().read()
if 'href="/my-account"' not in raw:
raise LoginFailed(
_('Failed to log in, check your username and password for'
diff --git a/application/lib/calibre/web/site_parsers/natgeo.py b/application/lib/calibre/web/site_parsers/natgeo.py
new file mode 100644
index 00000000..3e60a3e9
--- /dev/null
+++ b/application/lib/calibre/web/site_parsers/natgeo.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+from pprint import pprint
+
+from calibre import prepare_string_for_xml as escape
+from calibre.utils.iso8601 import parse_iso8601
+
+module_version = 1 # needed for live updates
+pprint
+
+
+def extract_json(raw):
+ s = raw.find("window['__natgeo__']")
+ script = raw[s : raw.find('', s)]
+ content = json.loads(script[script.find('{') :].rstrip(';'))['page']['content']
+ if content.get('prismarticle'):
+ return content['prismarticle']
+ if content.get('article'):
+ return content['article']
+
+
+def parse_contributors(grp):
+ for item in grp:
+ line = '' + escape(item['title']) + ' '
+ for c in item['contributors']:
+ line += escape(c['displayName'])
+ yield line + '
'
+
+
+def parse_lead_image(media):
+ if 'image' in media:
+ yield ''
+ if 'dsc' in media['image']:
+ yield (
+ f'
'
+ )
+ else:
+ yield f''
+ if 'caption' in media and 'credit' in media:
+ yield (
+ ''
+ + media['caption']
+ + ' '
+ + media['credit']
+ + '
'
+ )
+ elif 'caption' in media:
+ yield '' + media['caption'] + '
'
+ yield '
'
+
+
+def parse_inline(inl):
+ if inl.get('content', {}).get('name', '') == 'Image':
+ props = inl['content']['props']
+ yield ''
+ if 'image' in props:
+ yield f'
'
+ if 'caption' in props:
+ yield (
+ f'{props["caption"].get("text", "")} {props["caption"].get("credit", "")}
'
+ )
+ yield ''
+ if inl.get('content', {}).get('name', '') == 'ImageGroup':
+ if 'images' in inl['content']['props']:
+ for imgs in inl['content']['props']['images']:
+ yield ''
+ if 'src' in imgs:
+ yield f'
'
+ if 'caption' in imgs:
+ yield (
+ f'{imgs["caption"].get("text", "")} {imgs["caption"].get("credit", "")}
'
+ )
+ yield ''
+
+
+def parse_cont(content):
+ for cont in content.get('content', {}):
+ if isinstance(cont, dict):
+ yield from parse_body(cont)
+ if isinstance(cont, str):
+ yield cont
+
+
+def parse_body(x):
+ if isinstance(x, dict):
+ if 'type' in x:
+ tag = x['type']
+ if tag == 'inline':
+ yield ''.join(parse_inline(x))
+ elif 'attrs' in x and 'href' in x.get('attrs', ''):
+ yield '<' + tag + f' href="{x["attrs"]["href"]}">'
+ yield from parse_cont(x)
+ yield '' + tag + '>'
+ else:
+ yield '<' + tag + '>'
+ yield from parse_cont(x)
+ yield '' + tag + '>'
+ elif isinstance(x, list):
+ for y in x:
+ if isinstance(y, dict):
+ yield from parse_body(y)
+
+def parse_bdy(item):
+ c = item['cntnt']
+ if item.get('type') == 'inline':
+ if c.get('cmsType') == 'listicle':
+ if 'title' in c:
+ yield '' + escape(c['title']) + '
'
+ yield c['text']
+ elif c.get('cmsType') == 'image':
+ yield from parse_lead_image(c)
+ elif c.get('cmsType') == 'imagegroup':
+ for imgs in c['images']:
+ yield from parse_lead_image(imgs)
+ elif c.get('cmsType') == 'pullquote':
+ if 'quote' in c:
+ yield '' + c['quote'] + '
'
+ elif c.get('cmsType') == 'editorsNote':
+ if 'note' in c:
+ yield '' + c['note'] + '
'
+ else:
+ if c['mrkup'].strip().startswith('<'):
+ yield c['mrkup']
+ else:
+ yield '<{tag}>{markup}{tag}>'.format(
+ tag=item['type'], markup=c['mrkup'])
+
+def parse_article(edg):
+ sc = edg['schma']
+ yield '' + escape(edg['sctn']) + '
'
+ yield '' + escape(sc['sclTtl']) + '
'
+ if sc.get('sclDsc'):
+ yield '' + escape(sc['sclDsc']) + '
'
+ yield ''
+ yield from parse_contributors(edg.get('cntrbGrp', {}))
+ ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
+ yield '
Published: ' + escape(ts) + '
'
+ if 'readTime' in edg:
+ yield '' + escape(edg['readTime']) + '
'
+ yield ''
+ if edg.get('ldMda', {}).get('cmsType') == 'image':
+ yield from parse_lead_image(edg['ldMda'])
+ if edg.get('prismData'):
+ for main in edg['prismData']['mainComponents']:
+ if main['name'] == 'Body':
+ for item in main['props']['body']:
+ if isinstance(item, dict):
+ if item.get('type', '') == 'inline':
+ yield ''.join(parse_inline(item))
+ elif isinstance(item, list):
+ for line in item:
+ yield ''.join(parse_body(line))
+ elif edg.get('bdy'):
+ for item in edg['bdy']:
+ yield from parse_bdy(item)
+
+
+def article_parse(data):
+ yield ''
+ for frm in data['frms']:
+ if not frm:
+ continue
+ for mod in frm.get('mods', ()):
+ for edg in mod.get('edgs', ()):
+ if edg.get('cmsType') == 'ImmersiveLeadTile':
+ if 'image' in edg.get('cmsImage', {}):
+ yield from parse_lead_image(edg['cmsImage'])
+ if edg.get('cmsType') == 'ArticleBodyTile':
+ yield from parse_article(edg)
+ yield ''
+
+
+def extract_html(raw_html):
+ data = extract_json(raw_html)
+ return '\n'.join(article_parse(data))
diff --git a/application/lib/calibre/web/site_parsers/nytimes.py b/application/lib/calibre/web/site_parsers/nytimes.py
index c78e3edc..17208821 100644
--- a/application/lib/calibre/web/site_parsers/nytimes.py
+++ b/application/lib/calibre/web/site_parsers/nytimes.py
@@ -9,92 +9,190 @@
from calibre.utils.iso8601 import parse_iso8601
-module_version = 4 # needed for live updates
+module_version = 11 # needed for live updates
pprint
-def is_heading(tn):
- return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')
-
-
-def process_inline_text(lines, block):
- text = ''
- if 'text@stripHtml' in block:
- text = escape(block['text@stripHtml'])
- elif 'renderedRepresentation' in block: # happens in byline blocks
- text = block['renderedRepresentation']
- elif 'text' in block:
- text = block['text']
- if text:
- for fmt in block.get('formats', ()):
- tn = fmt['__typename']
- if tn == 'LinkFormat':
- ab = fmt
- text = '{}'.format(ab['url'], ab.get('title') or '', text)
- elif tn == 'BoldFormat':
- text = '' + text + ''
- lines.append(text)
-
-
-def process_paragraph(lines, block, content_key='content'):
- tn = block['__typename']
- m = re.match(r'Heading([1-6])Block', tn)
- if m is not None:
- tag = 'h' + m.group(1)
- else:
- tag = 'p'
- ta = block.get('textAlign') or 'LEFT'
- style = f'text-align: {ta.lower()}'
- lines.append(f'<{tag} style="{style}">')
- for item in block[content_key]:
- tn = item['__typename']
- if tn in ('TextInline', 'Byline'):
- process_inline_text(lines, item)
- lines.append('' + tag + '>')
-
-
-def process_timestamp(lines, block):
- ts = block['timestamp']
- dt = parse_iso8601(ts, as_utc=False)
- lines.append('' + escape(dt.strftime('%b %d, %Y')) + '
')
-
-
-def process_header(lines, block):
- label = block.get('label')
- if label:
- process_paragraph(lines, label)
- headline = block.get('headline')
- if headline:
- process_paragraph(lines, headline)
- summary = block.get('summary')
- if summary:
- process_paragraph(lines, summary)
- lm = block.get('ledeMedia')
- if lm and lm.get('__typename') == 'ImageBlock':
- process_image_block(lines, lm)
- byline = block.get('byline')
- if byline:
- process_paragraph(lines, byline, content_key='bylines')
- timestamp = block.get('timestampBlock')
- if timestamp:
- process_timestamp(lines, timestamp)
-
-
-def process_image_block(lines, block):
- media = block['media']
- caption = media.get('caption')
- caption_lines = []
- if caption:
- process_inline_text(caption_lines, caption)
- crops = media['crops']
- renditions = crops[0]['renditions']
- img = renditions[0]['url']
- if 'web.archive.org' in img:
- img = img.partition('/')[-1]
- img = img[img.find('https://'):]
- lines.append(f'')
- lines.extend(caption_lines)
- lines.append('
')
+def parse_image(i):
+ crop = i.get('crops') or i.get('spanImageCrops')
+ if crop:
+ yield f''
+ if i.get('caption'):
+ yield f'
{"".join(parse_types(i["caption"]))}'
+ if i.get('credit'):
+ yield f' {i["credit"]}'
+ yield '
'
+ elif i.get('legacyHtmlCaption'):
+ if i['legacyHtmlCaption'].strip():
+ yield f'
{i["legacyHtmlCaption"]}
'
+ yield '
'
+
+
+def parse_img_grid(g):
+ for grd in g.get('gridMedia', {}):
+ yield ''.join(parse_image(grd))
+ if g.get('caption'):
+ yield f'{g["caption"]}'
+ if g.get('credit'):
+ yield f' {g["credit"]}'
+ yield '
'
+
+
+def parse_vid(v):
+ if v.get('promotionalMedia'):
+ headline = v.get('headline', {}).get('default', '')
+ rendition = v.get('renditions')
+ yield (
+ f''
+ if rendition
+ else f'{headline}
'
+ )
+ yield ''.join(parse_types(v['promotionalMedia']))
+ if v.get('promotionalSummary'):
+ yield f'{v["promotionalSummary"]}
'
+
+
+def parse_emb(e):
+ if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''):
+ dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1)
+ yield f''
+ elif e.get('promotionalMedia'):
+ if e.get('headline'):
+ yield f'{e["headline"]["default"]}
'
+ yield ''.join(parse_types(e['promotionalMedia']))
+ if e.get('note'):
+ yield f'{e["note"]}
'
+
+
+def parse_byline(byl):
+ for b in byl.get('bylines', {}):
+ yield f'{b["renderedRepresentation"]}
'
+ yield ''
+ for rl in byl.get('role', {}):
+ if ''.join(parse_cnt(rl)).strip():
+ yield ''.join(parse_cnt(rl))
+ yield '
'
+
+
+def iso_date(x):
+ dt = parse_iso8601(x, as_utc=False)
+ return dt.strftime('%b %d, %Y at %I:%M %p')
+
+
+def parse_header(h):
+ if h.get('label'):
+ yield f'{"".join(parse_types(h["label"]))}
'
+ if h.get('headline'):
+ yield ''.join(parse_types(h['headline']))
+ if h.get('summary'):
+ yield f'{"".join(parse_types(h["summary"]))}
'
+ if h.get('ledeMedia'):
+ yield ''.join(parse_types(h['ledeMedia']))
+ if h.get('byline'):
+ yield ''.join(parse_types(h['byline']))
+ if h.get('timestampBlock'):
+ yield ''.join(parse_types(h['timestampBlock']))
+
+
+def parse_fmt_type(fm):
+ for f in fm.get('formats', {}):
+ ftype = f.get('__typename', '')
+ if ftype == 'BoldFormat':
+ yield ''
+ if ftype == 'ItalicFormat':
+ yield ''
+ if ftype == 'LinkFormat':
+ hrf = f['url']
+ yield f''
+ yield fm.get('text', '')
+ for f in reversed(fm.get('formats', {})):
+ ftype = f.get('__typename', '')
+ if ftype == 'BoldFormat':
+ yield ''
+ if ftype == 'ItalicFormat':
+ yield ''
+ if ftype == 'LinkFormat':
+ yield ''
+
+
+def parse_cnt(cnt):
+ for k in cnt:
+ if isinstance(cnt[k], list):
+ if k == 'formats':
+ yield ''.join(parse_fmt_type(cnt))
+ else:
+ for cnt_ in cnt[k]:
+ yield ''.join(parse_types(cnt_))
+ if isinstance(cnt[k], dict):
+ yield ''.join(parse_types(cnt[k]))
+ if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt:
+ if isinstance(cnt['text'], str):
+ yield cnt['text']
+
+
+def parse_types(x):
+ typename = x.get('__typename', '')
+
+ align = ''
+ if x.get('textAlign'):
+ align = f' style="text-align: {x["textAlign"].lower()};"'
+
+ if 'Header' in typename:
+ yield '\n'.join(parse_header(x))
+
+ elif typename.startswith('Heading'):
+ htag = 'h' + re.match(r'Heading([1-6])Block', typename).group(1)
+ yield f'<{htag}{align}>{"".join(parse_cnt(x))}{htag}>'
+
+ elif typename == 'ParagraphBlock':
+ yield f'{"".join(parse_cnt(x))}
'
+ elif typename in {'DetailBlock', 'TextRunKV'}:
+ yield f'{"".join(parse_cnt(x))}
'
+
+ elif typename == 'BylineBlock':
+ yield f'
{"".join(parse_byline(x))}
'
+ elif typename == 'LabelBlock':
+ yield f'{"".join(parse_cnt(x))}
'
+ elif typename == 'BlockquoteBlock':
+ yield f'{"".join(parse_cnt(x))}
'
+ elif typename == 'TimestampBlock':
+ yield f'{iso_date(x["timestamp"])}
'
+ elif typename == 'LineBreakInline':
+ yield '
'
+ elif typename == 'RuleBlock':
+ yield '
'
+
+ elif typename == 'Image':
+ yield ''.join(parse_image(x))
+
+ elif typename == 'GridBlock':
+ yield ''.join(parse_img_grid(x))
+
+ elif typename == 'Video':
+ yield ''.join(parse_vid(x))
+
+ elif typename == 'EmbeddedInteractive':
+ yield ''.join(parse_emb(x))
+
+ elif typename == 'ListBlock':
+ yield f'\n'
+ elif typename == 'ListItemBlock':
+ yield f'\n{"".join(parse_cnt(x))}'
+
+ elif typename and typename not in {
+ 'RelatedLinksBlock',
+ 'EmailSignupBlock',
+ 'Dropzone',
+ 'AudioBlock',
+ }:
+ yield ''.join(parse_cnt(x))
+
+
+def article_parse(data):
+ yield ''
+ for d in data:
+ yield from parse_types(d)
+ yield ''
def json_to_html(raw):
@@ -105,18 +203,8 @@ def json_to_html(raw):
except TypeError:
data = data['initialState']
return live_json_to_html(data)
- article = next(iter(data.values()))
- body = article['sprinkledBody']['content']
- lines = []
- for item in body:
- tn = item['__typename']
- if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
- process_header(lines, item)
- elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
- process_paragraph(lines, item)
- elif tn == 'ImageBlock':
- process_image_block(lines, item)
- return '' + '\n'.join(lines) + ''
+ content = data['article']['sprinkledBody']['content']
+ return '\n'.join(article_parse(content))
def add_live_item(item, item_type, lines):
@@ -161,7 +249,7 @@ def add_live_item(item, item_type, lines):
def live_json_to_html(data):
- for k, v in data["ROOT_QUERY"].items():
+ for k, v in data['ROOT_QUERY'].items():
if isinstance(v, dict) and 'id' in v:
root = data[v['id']]
s = data[root['storylines'][0]['id']]
@@ -178,37 +266,50 @@ def live_json_to_html(data):
return '' + '\n'.join(lines) + ''
-def extract_html(soup):
+def extract_html(soup, url):
+ if '/interactive/' in url:
+ return (
+ ''
+ + 'This is an interactive article, which is supposed to be read in a browser.'
+ + '
'
+ )
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
script = str(script)
- raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
+ raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';')
return json_to_html(raw)
-def download_url(url=None, br=None):
- # Get the URL from the Wayback machine
+def download_url_from_wayback(category, url, br=None):
from mechanize import Request
+
host = 'http://localhost:8090'
host = 'https://wayback1.calibre-ebook.com'
- if url is None:
- url = sys.argv[-1]
rq = Request(
- host + '/nytimes',
- data=json.dumps({"url": url}),
- headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'}
+ host + '/' + category,
+ data=json.dumps({'url': url}),
+ headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'},
)
if br is None:
from calibre import browser
+
br = browser()
br.set_handle_gzip(True)
return br.open_novisit(rq, timeout=3 * 60).read()
+def download_url(url=None, br=None):
+ # Get the URL from the Wayback machine
+ if url is None:
+ url = sys.argv[-1]
+ return download_url_from_wayback('nytimes', url, br)
+
+
if __name__ == '__main__':
f = sys.argv[-1]
raw = open(f).read()
if f.endswith('.html'):
from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
soup = BeautifulSoup(raw)
print(extract_html(soup))
else:
diff --git a/application/lib/ebook_translator/html_translator.py b/application/lib/ebook_translator/html_translator.py
index 7498c079..5a24868d 100644
--- a/application/lib/ebook_translator/html_translator.py
+++ b/application/lib/ebook_translator/html_translator.py
@@ -142,17 +142,17 @@ def add_translation_soup(self, soup, tag, trans, dst):
origStyle = self.params.get('orig_style', '')
transStyle = self.params.get('trans_style', '')
trans = trans.replace('<', '<').replace('>', '>')
- transTag = BeautifulSoup(trans, 'html.parser') #'html.parser'解析器不会自动添加
- if not transTag.contents:
- return
- transTag = transTag.contents[0]
- if isinstance(transTag, NavigableString):
- oldTxt = str(transTag)
+ #纯文本,不是html
+ if '<' not in trans or '>' not in trans:
transTagName = 'span' if tag.name in ('title', 'tr', 'td', 'th', 'thead', 'tbody', 'table',
'ul', 'ol', 'li', 'a') else tag.name
transTag = soup.new_tag(transTagName)
- transTag.string = oldTxt
-
+ transTag.string = trans
+ else:
+ transTag = BeautifulSoup(trans, 'html.parser') #'html.parser'解析器不会自动添加
+ if not transTag.contents:
+ return
+ transTag = transTag.contents[0]
if origStyle:
old = tag.get('style')
tag['style'] = f'{old};{origStyle}' if old else origStyle