From 1ed3285b168177ec6e9611d38ae40a7445c5d520 Mon Sep 17 00:00:00 2001
From: cdhigh <cdhigh@users.noreply.github.com>
Date: Tue, 15 Oct 2024 08:55:04 -0300
Subject: [PATCH] Sync some of calibre's updates

---
 application/lib/calibre/ebooks/epub/pages.py  |   5 +-
 .../lib/calibre/ebooks/epub/periodical.py     |   5 +-
 .../lib/calibre/ebooks/html_entities.py       | 113 ++++++-
 application/lib/calibre/library/__init__.py   |   7 +-
 application/lib/calibre/library/comments.py   |  13 +-
 application/lib/calibre/utils/formatter.py    |   4 +-
 application/lib/calibre/utils/imghdr.py       |   3 +-
 application/lib/calibre/utils/serialize.py    |   4 +-
 application/lib/calibre/utils/short_uuid.py   |   4 +-
 application/lib/calibre/utils/smartypants.py  |   2 +
 application/lib/calibre/utils/speedups.py     |   8 +-
 application/lib/calibre/utils/terminal.py     |  10 +-
 application/lib/calibre/utils/threadpool.py   |   7 +-
 application/lib/calibre/utils/xml_parse.py    |  12 +-
 application/lib/calibre/utils/zipfile.py      |   2 +-
 application/lib/calibre/web/feeds/news.py     |   2 +-
 .../lib/calibre/web/site_parsers/natgeo.py    | 181 ++++++++++
 .../lib/calibre/web/site_parsers/nytimes.py   | 311 ++++++++++++------
 .../lib/ebook_translator/html_translator.py   |  16 +-
 19 files changed, 555 insertions(+), 154 deletions(-)
 create mode 100644 application/lib/calibre/web/site_parsers/natgeo.py
diff --git a/application/lib/calibre/ebooks/epub/pages.py b/application/lib/calibre/ebooks/epub/pages.py
index da06eef0..7f58aa72 100644
--- a/application/lib/calibre/ebooks/epub/pages.py
+++ b/application/lib/calibre/ebooks/epub/pages.py
@@ -9,10 +9,11 @@
 
 import re
 from itertools import count
-from calibre.ebooks.oeb.base import XHTML_NS
-from calibre.ebooks.oeb.base import OEBBook
+
 from lxml.etree import XPath
 
+from calibre.ebooks.oeb.base import XHTML_NS, OEBBook
+
 NSMAP = {'h': XHTML_NS, 'html': XHTML_NS, 'xhtml': XHTML_NS}
 PAGE_RE = re.compile(r'page', re.IGNORECASE)
 ROMAN_RE = re.compile(r'^[ivxlcdm]+$', re.IGNORECASE)
diff --git a/application/lib/calibre/ebooks/epub/periodical.py b/application/lib/calibre/ebooks/epub/periodical.py
index 8b2a6610..80db09a5 100644
--- a/application/lib/calibre/ebooks/epub/periodical.py
+++ b/application/lib/calibre/ebooks/epub/periodical.py
@@ -5,11 +5,12 @@
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-from uuid import uuid4
 import time
+from uuid import uuid4
 
+from calibre import prepare_string_for_xml as xml
+from calibre import strftime
 from calibre.constants import __appname__, __version__
-from calibre import strftime, prepare_string_for_xml as xml
 from calibre.utils.date import parse_date
 
 SONY_METADATA = '''\
diff --git a/application/lib/calibre/ebooks/html_entities.py b/application/lib/calibre/ebooks/html_entities.py
index ca2dc2fd..6817ac03 100644
--- a/application/lib/calibre/ebooks/html_entities.py
+++ b/application/lib/calibre/ebooks/html_entities.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
 
+from calibre import my_unichr
 
 html5_entities = {
 # ENTITY_DATA {{{
@@ -91,7 +92,7 @@
     'DifferentialD': 'ⅆ',
     'Dopf': '𝔻',
     'Dot': '¨',
-    'DotDot': '⃜\u20dc',
+    'DotDot': '⃜',
     'DotEqual': '≐',
     'DoubleContourIntegral': '∯',
     'DoubleDot': '¨',
@@ -502,7 +503,7 @@
     'TRADE': '™',
     'TSHcy': 'Ћ',
     'TScy': 'Ц',
-    'Tab': '	',
+    'Tab': '\t',
     'Tau': 'Τ',
     'Tcaron': 'Ť',
     'Tcedil': 'Ţ',
@@ -1105,6 +1106,7 @@
     'hearts': '♥',
     'heartsuit': '♥',
     'hellip': '…',
+    'hellips': '…',
     'hercon': '⊹',
     'hfr': '𝔥',
     'hksearow': '⤥',
@@ -1857,6 +1859,7 @@
     'square': '□',
     'squarf': '▪',
     'squf': '▪',
+    'squot': "'",
     'srarr': '→',
     'sscr': '𝓈',
     'ssetmn': '∖',
@@ -2133,19 +2136,107 @@
 }
 
 
-if __name__ == '__main__':
+def entity_to_unicode_in_python(match, exceptions=(), encoding='cp1252', result_exceptions={}):
+    def check(ch):
+        return result_exceptions.get(ch, ch)
+
+    ent = match.group(1)
+    if ent in exceptions:
+        return '&'+ent+';'
+    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
+        return check("'")
+    if ent == 'hellips':
+        ent = 'hellip'
+    if ent.startswith('#'):
+        try:
+            if ent[1] in ('x', 'X'):
+                num = int(ent[2:], 16)
+            else:
+                num = int(ent[1:])
+        except (ValueError, IndexError):  # malformed numeric reference: emit it unchanged
+            return '&'+ent+';'
+        if encoding is None or num > 255:
+            return check(my_unichr(num))
+        try:
+            return check(bytes(bytearray((num,))).decode(encoding))
+        except UnicodeDecodeError:
+            return check(my_unichr(num))
+    from calibre.ebooks.html_entities import html5_entities
+    try:
+        return check(html5_entities[ent])
+    except KeyError:
+        pass
+    from polyglot.html_entities import name2codepoint
+    try:
+        return check(my_unichr(name2codepoint[ent]))
+    except KeyError:
+        return '&'+ent+';'
+
+
+def find_tests():
+    import unittest
+    class TestHTMLEntityReplacement(unittest.TestCase):
+        def test_html_entity_replacement(self):
+            from calibre_extensions.fast_html_entities import replace_all_entities
+            def t(inp, exp):
+                self.assertEqual(exp, replace_all_entities(inp), f'Failed for input: {inp!r}')
+            def x(inp, exp):
+                self.assertEqual(exp, replace_all_entities(inp, True), f'Failed for input: {inp!r}')
+            t('a&#1234;b', 'aӒb')
+            t('', '')
+            t('a', 'a')
+            t('&', '&')
+            t('&amp', '&amp')
+            t('&amp;', '&')
+            t('a&;b &#;c', 'a&;b &#;c')
+            t('&lt;', '<')
+            t('&amp;&lt;', '&<')
+            t('a&amp;b&lt;c', 'a&b<c')
+            t('a&acE;b', 'a∾̳b')
+            t('a&#1234;b', 'aӒb')
+            t('a&#X1234;b', 'a\u1234b')
+            t('a&#x1034fA;b', 'a\U001034fAb')
+            t('a&#0;b&#x000;c', 'abc')
+            x('&amp;&lt;&gt;&apos;&quot;', '&amp;&lt;&gt;&apos;&quot;')
+
+    return unittest.defaultTestLoader.loadTestsFromTestCase(TestHTMLEntityReplacement)
+
+
+def generate_entity_lists():
     import re
-    from html5lib.constants import entities
-    entities = {k.replace(';', ''): entities[k] for k in entities}
+    from html import entities as e
+    entities = {k.rstrip(';'): e.name2codepoint[k] for k in e.name2codepoint}
+    entities.update({k.rstrip(';'): e.html5[k] for k in e.html5})
+    # common misspelled entity names
+    for k, v in {'squot': "'", 'hellips': entities['hellip']}.items():
+        if k not in entities:
+            entities[k] = v
     lines = []
+    native_lines = '''\
+struct html_entity { const char *name, *val; }
+%%
+'''.splitlines()
+
+    def esc_for_c(x):
+        if x == '\n':
+            return '\\n'
+        if x in '''"\\''':
+            return '\\' + x
+        return x
 
     for k in sorted(entities):
-        lines.append(f"    '{k}': {entities[k]!r},")
+        v = entities[k]
+        lines.append(f"    '{k}': {v!r},")
+        native_lines.append(f'"{esc_for_c(k)}","{esc_for_c(v)}"')
 
     with open(__file__, 'r+b') as f:
         raw = f.read().decode('utf-8')
-        raw = re.sub(r'^# ENTITY_DATA {{{.+^# }}}',
-                     '# ENTITY_DATA {{{\n' + '\n'.join(lines) + '\n# }}}',
-                     raw, flags=re.M | re.DOTALL)
-        f.seek(0), f.truncate()
-        f.write(raw.encode('utf-8'))
+        pat = re.compile(r'^# ENTITY_DATA {{{.+?^# }}}', flags=re.M | re.DOTALL)
+        raw = pat.sub(lambda m: '# ENTITY_DATA {{{\n' + '\n'.join(lines) + '\n# }}}', raw)
+        f.seek(0), f.truncate(), f.write(raw.encode('utf-8'))
+
+    import subprocess
+    with open(__file__.replace('.py', '.h'), 'wb') as f:
+        cp = subprocess.run(['gperf', '--struct-type', '--readonly', '--includes', '--compare-strncmp'], input='\n'.join(native_lines).encode(), stdout=f)
+        if cp.returncode != 0:
+            raise SystemExit(cp.returncode)
diff --git a/application/lib/calibre/library/__init__.py b/application/lib/calibre/library/__init__.py
index 6b9978a8..6ffac893 100644
--- a/application/lib/calibre/library/__init__.py
+++ b/application/lib/calibre/library/__init__.py
@@ -24,7 +24,12 @@ def generate_test_db(library_path,  # {{{
         max_authors=10,
         max_tags=10
         ):
-    import random, string, os, sys, time
+    import os
+    import random
+    import string
+    import sys
+    import time
+
     from calibre.constants import preferred_encoding
 
     if not os.path.exists(library_path):
diff --git a/application/lib/calibre/library/comments.py b/application/lib/calibre/library/comments.py
index 21320e28..661fe829 100644
--- a/application/lib/calibre/library/comments.py
+++ b/application/lib/calibre/library/comments.py
@@ -6,10 +6,7 @@
 
 from calibre import prepare_string_for_xml
 from calibre.constants import preferred_encoding
-from calibre.ebooks.BeautifulSoup import (
-    BeautifulSoup, CData, Comment, Declaration, NavigableString,
-    ProcessingInstruction
-)
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, CData, Comment, Declaration, NavigableString, ProcessingInstruction
 from calibre.utils.html2text import html2text
 
 # Hackish - ignoring sentences ending or beginning in numbers to avoid
@@ -168,7 +165,11 @@ def test_comments_to_html(self):
                     ('a <?xml asd> b\n\ncd',
                         '<p class="description">a  b</p><p class="description">cd</p>'),
             ]:
-                cval = comments_to_html(pat)
-                self.assertEqual(cval, val)
+                try:
+                    cval = comments_to_html(pat)
+                except DeprecationWarning:
+                    pass  # new lxml + old Beautiful soup == deprecation warning
+                else:
+                    self.assertEqual(cval, val)
 
     return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
diff --git a/application/lib/calibre/utils/formatter.py b/application/lib/calibre/utils/formatter.py
index ab7934cc..8f09febd 100644
--- a/application/lib/calibre/utils/formatter.py
+++ b/application/lib/calibre/utils/formatter.py
@@ -20,9 +20,7 @@
 from calibre.constants import DEBUG
 from calibre.ebooks.metadata.book.base import field_metadata
 from calibre.utils.config import tweaks
-from calibre.utils.formatter_functions import (
-    StoredObjectType, formatter_functions, function_object_type, get_database,
-)
+from calibre.utils.formatter_functions import StoredObjectType, formatter_functions, function_object_type, get_database
 from calibre.utils.icu import strcmp
 from calibre.utils.localization import _
 from polyglot.builtins import error_message
diff --git a/application/lib/calibre/utils/imghdr.py b/application/lib/calibre/utils/imghdr.py
index e23c4f8d..186b71c0 100644
--- a/application/lib/calibre/utils/imghdr.py
+++ b/application/lib/calibre/utils/imghdr.py
@@ -2,8 +2,9 @@
 # License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
 
 
-from struct import unpack, error
 import os
+from struct import error, unpack
+
 from calibre.utils.speedups import ReadOnlyFileBuffer
 from polyglot.builtins import string_or_bytes
 
diff --git a/application/lib/calibre/utils/serialize.py b/application/lib/calibre/utils/serialize.py
index 707298df..81e187f0 100644
--- a/application/lib/calibre/utils/serialize.py
+++ b/application/lib/calibre/utils/serialize.py
@@ -25,9 +25,9 @@ def encoder(obj):
         if isinstance(obj, (set, frozenset)):
             return encoded(1, tuple(obj), ExtType)
         if getattr(obj, '__calibre_serializable__', False):
+            from calibre.db.categories import Tag
             from calibre.ebooks.metadata.book.base import Metadata
             from calibre.library.field_metadata import FieldMetadata, fm_as_dict
-            from calibre.db.categories import Tag
             if isinstance(obj, Metadata):
                 from calibre.ebooks.metadata.book.serialize import metadata_as_dict
                 return encoded(
@@ -60,8 +60,8 @@ def json_dumps(data, **kw):
 
 
 def decode_metadata(x, for_json):
-    from polyglot.binary import from_base64_bytes
     from calibre.ebooks.metadata.book.serialize import metadata_from_dict
+    from polyglot.binary import from_base64_bytes
     obj = metadata_from_dict(x)
     if for_json and obj.cover_data and obj.cover_data[1]:
         obj.cover_data = obj.cover_data[0], from_base64_bytes(obj.cover_data[1])
diff --git a/application/lib/calibre/utils/short_uuid.py b/application/lib/calibre/utils/short_uuid.py
index f97475c4..9b344e20 100644
--- a/application/lib/calibre/utils/short_uuid.py
+++ b/application/lib/calibre/utils/short_uuid.py
@@ -6,7 +6,9 @@
 Generate UUID encoded using a user specified alphabet.
 '''
 
-import string, math, uuid as _uuid
+import math
+import string
+import uuid as _uuid
 
 
 def num_to_string(number, alphabet, alphabet_len, pad_to_length=None):
diff --git a/application/lib/calibre/utils/smartypants.py b/application/lib/calibre/utils/smartypants.py
index 3d44112f..a00a6151 100644
--- a/application/lib/calibre/utils/smartypants.py
+++ b/application/lib/calibre/utils/smartypants.py
@@ -875,6 +875,8 @@ def test_ordinal_numbers(self):
 
         def test_educated_quotes(self):
             self.assertEqual(sp('''"Isn't this fun?"'''), '''&#8220;Isn&#8217;t this fun?&#8221;''')
+            self.assertEqual(sp("'abc'"), '&#8216;abc&#8217;')
+
 
     tests = unittest.defaultTestLoader.loadTestsFromTestCase(TestSmartypantsAllAttributes)
     if return_tests:
diff --git a/application/lib/calibre/utils/speedups.py b/application/lib/calibre/utils/speedups.py
index 2e835a4c..270cc4c7 100644
--- a/application/lib/calibre/utils/speedups.py
+++ b/application/lib/calibre/utils/speedups.py
@@ -9,14 +9,15 @@ class ReadOnlyFileBuffer:
 
     ''' A zero copy implementation of a file like object. Uses memoryviews for efficiency. '''
 
-    def __init__(self, raw):
+    def __init__(self, raw: bytes, name: str = ''):
         self.sz, self.mv = len(raw), (raw if isinstance(raw, memoryview) else memoryview(raw))
         self.pos = 0
+        self.name: str = name
 
     def tell(self):
         return self.pos
 
-    def read(self, n=None):
+    def read(self, n = None) -> memoryview:
         if n is None:
             ans = self.mv[self.pos:]
             self.pos = self.sz
@@ -35,6 +36,9 @@ def seek(self, pos, whence=os.SEEK_SET):
         self.pos = max(0, min(self.pos, self.sz))
         return self.pos
 
+    def seekable(self):
+        return True
+
     def getvalue(self):
         return self.mv
 
diff --git a/application/lib/calibre/utils/terminal.py b/application/lib/calibre/utils/terminal.py
index cd31103b..6aee59ef 100644
--- a/application/lib/calibre/utils/terminal.py
+++ b/application/lib/calibre/utils/terminal.py
@@ -4,10 +4,12 @@
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, sys, re
+import os
+import re
+import sys
 
-from calibre.prints import is_binary
 from calibre.constants import iswindows
+from calibre.prints import is_binary
 from polyglot.builtins import iteritems
 
 if iswindows:
@@ -233,7 +235,9 @@ class CONSOLE_SCREEN_BUFFER_INFO(Structure):
 
 
 def get_term_geometry():
-    import fcntl, termios, struct
+    import fcntl
+    import struct
+    import termios
 
     def ioctl_GWINSZ(fd):
         try:
diff --git a/application/lib/calibre/utils/threadpool.py b/application/lib/calibre/utils/threadpool.py
index 818183e4..1bc1db50 100644
--- a/application/lib/calibre/utils/threadpool.py
+++ b/application/lib/calibre/utils/threadpool.py
@@ -47,6 +47,7 @@
 
 # standard library modules
 import threading
+
 from polyglot import queue
 
 # exceptions
@@ -72,7 +73,7 @@ class WorkerThread(threading.Thread):
     """
 
     def __init__(self, requestsQueue, resultsQueue, **kwds):
-        """Set up thread in daemonic mode and start it immediatedly.
+        """Set up thread in daemonic mode and start it immediately.
 
         requestsQueue and resultQueue are instances of queue.Queue passed
         by the ThreadPool class when it creates a new worker thread.
@@ -96,9 +97,9 @@ def run(self):
                 break  # and exit
             try:
                 self.resultQueue.put(
-                    (request, request.callable(request.args, **request.kwds))
+                    (request, request.callable(*request.args, **request.kwds))
                 )
-            except Exception as e:
+            except:
                 request.exception = True
                 import traceback
                 self.resultQueue.put((request, traceback.format_exc()))
diff --git a/application/lib/calibre/utils/xml_parse.py b/application/lib/calibre/utils/xml_parse.py
index a31c6ed8..fdd57dca 100644
--- a/application/lib/calibre/utils/xml_parse.py
+++ b/application/lib/calibre/utils/xml_parse.py
@@ -36,8 +36,16 @@ def safe_xml_fromstring(string_or_bytes, recover=True):
     return ans
 
 
+def unsafe_xml_fromstring(string_or_bytes):
+    parser = etree.XMLParser(resolve_entities=True)
+    return fs(string_or_bytes, parser=parser)
+
+
 def find_tests():
-    import unittest, tempfile, os
+    import os
+    import tempfile
+    import unittest
+
     from calibre.constants import iswindows
 
     class TestXMLParse(unittest.TestCase):
@@ -61,7 +69,7 @@ def t(tid, val, expected, safe=True):
                 raw = templ.format(id=tid, val=val)
                 err = None
                 try:
-                    root = safe_xml_fromstring(raw) if safe else etree.fromstring(raw)
+                    root = safe_xml_fromstring(raw) if safe else unsafe_xml_fromstring(raw)
                 except Exception as e:
                     err = str(e)
                     root = None
diff --git a/application/lib/calibre/utils/zipfile.py b/application/lib/calibre/utils/zipfile.py
index e493b75c..03c57bc7 100644
--- a/application/lib/calibre/utils/zipfile.py
+++ b/application/lib/calibre/utils/zipfile.py
@@ -17,7 +17,7 @@
 from calibre.constants import filesystem_encoding
 from calibre.ebooks.chardet import detect
 from calibre.ptempfile import SpooledTemporaryFile
-from polyglot.builtins import string_or_bytes, as_bytes
+from polyglot.builtins import as_bytes, string_or_bytes
 
 try:
     import zlib  # We may need its compression method
diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py
index c248ebe4..9f05402a 100644
--- a/application/lib/calibre/web/feeds/news.py
+++ b/application/lib/calibre/web/feeds/news.py
@@ -2318,7 +2318,7 @@ def get_browser(self):
         br.select_form(name='login')
         br['username'] = self.username
         br['password'] = self.password
-        raw = br.submit_selected().content
+        raw = br.submit().read()
         if 'href="/my-account"' not in raw:
             raise LoginFailed(
                     _('Failed to log in, check your username and password for'
diff --git a/application/lib/calibre/web/site_parsers/natgeo.py b/application/lib/calibre/web/site_parsers/natgeo.py
new file mode 100644
index 00000000..3e60a3e9
--- /dev/null
+++ b/application/lib/calibre/web/site_parsers/natgeo.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+from pprint import pprint
+
+from calibre import prepare_string_for_xml as escape
+from calibre.utils.iso8601 import parse_iso8601
+
+module_version = 1  # needed for live updates
+pprint
+
+
+def extract_json(raw):
+    s = raw.find("window['__natgeo__']")
+    script = raw[s : raw.find('</script>', s)]
+    content = json.loads(script[script.find('{') :].rstrip(';'))['page']['content']
+    if content.get('prismarticle'):
+        return content['prismarticle']
+    if content.get('article'):
+        return content['article']
+
+
+def parse_contributors(grp):
+    for item in grp:
+        line = '<div class="auth">' + escape(item['title']) + ' '
+        for c in item['contributors']:
+            line += escape(c['displayName'])
+        yield line + '</div>'
+
+
+def parse_lead_image(media):
+    if 'image' in media:
+        yield '<p>'
+        if 'dsc' in media['image']:
+            yield (
+                f'<div><img src="{escape(media["image"]["src"], True)}" '
+                f'alt="{escape(media["image"]["dsc"], True)}"></div>'
+            )
+        else:
+            yield f'<div><img src="{escape(media["image"]["src"], True)}"></div>'
+        if 'caption' in media and 'credit' in media:
+            yield (
+                '<div class="cap">'
+                + media['caption']
+                + '<span class="cred"> '
+                + media['credit']
+                + '</span></div>'
+            )
+        elif 'caption' in media:
+            yield '<div class="cap">' + media['caption'] + '</div>'
+        yield '</p>'
+
+
+def parse_inline(inl):
+    if inl.get('content', {}).get('name', '') == 'Image':
+        props = inl['content']['props']
+        yield '<p>'
+        if 'image' in props:
+            yield f'<div class="img"><img src="{props["image"]["src"]}"></div>'
+        if 'caption' in props:
+            yield (
+                f'<div class="cap">{props["caption"].get("text", "")}<span '
+                f'class="cred"> {props["caption"].get("credit", "")}</span></div>'
+            )
+        yield '</p>'
+    if inl.get('content', {}).get('name', '') == 'ImageGroup':
+        if 'images' in inl['content']['props']:
+            for imgs in inl['content']['props']['images']:
+                yield '<p>'
+                if 'src' in imgs:
+                    yield f'<div class="img"><img src="{imgs["src"]}"></div>'
+                if 'caption' in imgs:
+                    yield (
+                        f'<div class="cap">{imgs["caption"].get("text", "")}<span '
+                        f'class="cred"> {imgs["caption"].get("credit", "")}</span></div>'
+                    )
+                yield '</p>'
+
+
+def parse_cont(content):
+    for cont in content.get('content', {}):
+        if isinstance(cont, dict):
+            yield from parse_body(cont)
+        if isinstance(cont, str):
+            yield cont
+
+
+def parse_body(x):
+    if isinstance(x, dict):
+        if 'type' in x:
+            tag = x['type']
+            if tag == 'inline':
+                yield ''.join(parse_inline(x))
+            elif 'attrs' in x and 'href' in x.get('attrs', ''):
+                yield '<' + tag + f' href="{x["attrs"]["href"]}">'
+                yield from parse_cont(x)
+                yield '</' + tag + '>'
+            else:
+                yield '<' + tag + '>'
+                yield from parse_cont(x)
+                yield '</' + tag + '>'
+    elif isinstance(x, list):
+        for y in x:
+            if isinstance(y, dict):
+                yield from parse_body(y)
+
+def parse_bdy(item):
+    c = item['cntnt']
+    if item.get('type') == 'inline':
+        if c.get('cmsType') == 'listicle':
+            if 'title' in c:
+                yield '<h3>' + escape(c['title']) + '</h3>'
+            yield c['text']
+        elif c.get('cmsType') == 'image':
+            yield from parse_lead_image(c)
+        elif c.get('cmsType') == 'imagegroup':
+            for imgs in c['images']:
+                yield from parse_lead_image(imgs)
+        elif c.get('cmsType') == 'pullquote':
+            if 'quote' in c:
+                yield '<blockquote>' + c['quote'] + '</blockquote>'
+        elif c.get('cmsType') == 'editorsNote':
+            if 'note' in c:
+                yield '<blockquote>' + c['note'] + '</blockquote>'
+    else:
+        if c['mrkup'].strip().startswith('<'):
+            yield c['mrkup']
+        else:
+            yield '<{tag}>{markup}</{tag}>'.format(
+                tag=item['type'], markup=c['mrkup'])
+
+def parse_article(edg):
+    sc = edg['schma']
+    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
+    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
+    if sc.get('sclDsc'):
+        yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
+    yield '<p>'
+    yield from parse_contributors(edg.get('cntrbGrp', {}))
+    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
+    yield '<div class="time">Published: ' + escape(ts) + '</div>'
+    if 'readTime' in edg:
+        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
+    yield '</p>'
+    if edg.get('ldMda', {}).get('cmsType') == 'image':
+        yield from parse_lead_image(edg['ldMda'])
+    if edg.get('prismData'):
+        for main in edg['prismData']['mainComponents']:
+            if main['name'] == 'Body':
+                for item in main['props']['body']:
+                    if isinstance(item, dict):
+                        if item.get('type', '') == 'inline':
+                            yield ''.join(parse_inline(item))
+                    elif isinstance(item, list):
+                        for line in item:
+                            yield ''.join(parse_body(line))
+    elif edg.get('bdy'):
+        for item in edg['bdy']:
+            yield from parse_bdy(item)
+
+
+def article_parse(data):
+    yield '<html><body>'
+    for frm in data['frms']:
+        if not frm:
+            continue
+        for mod in frm.get('mods', ()):
+            for edg in mod.get('edgs', ()):
+                if edg.get('cmsType') == 'ImmersiveLeadTile':
+                    if 'image' in edg.get('cmsImage', {}):
+                        yield from parse_lead_image(edg['cmsImage'])
+                if edg.get('cmsType') == 'ArticleBodyTile':
+                    yield from parse_article(edg)
+    yield '</body></html>'
+
+
+def extract_html(raw_html):
+    data = extract_json(raw_html)
+    return '\n'.join(article_parse(data))
diff --git a/application/lib/calibre/web/site_parsers/nytimes.py b/application/lib/calibre/web/site_parsers/nytimes.py
index c78e3edc..17208821 100644
--- a/application/lib/calibre/web/site_parsers/nytimes.py
+++ b/application/lib/calibre/web/site_parsers/nytimes.py
@@ -9,92 +9,190 @@
 
 from calibre.utils.iso8601 import parse_iso8601
 
-module_version = 4  # needed for live updates
+module_version = 11  # needed for live updates
 pprint
 
 
-def is_heading(tn):
-    return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')
-
-
-def process_inline_text(lines, block):
-    text = ''
-    if 'text@stripHtml' in block:
-        text = escape(block['text@stripHtml'])
-    elif 'renderedRepresentation' in block:  # happens in byline blocks
-        text = block['renderedRepresentation']
-    elif 'text' in block:
-        text = block['text']
-    if text:
-        for fmt in block.get('formats', ()):
-            tn = fmt['__typename']
-            if tn == 'LinkFormat':
-                ab = fmt
-                text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
-            elif tn == 'BoldFormat':
-                text = '<b>' + text + '</b>'
-        lines.append(text)
-
-
-def process_paragraph(lines, block, content_key='content'):
-    tn = block['__typename']
-    m = re.match(r'Heading([1-6])Block', tn)
-    if m is not None:
-        tag = 'h' + m.group(1)
-    else:
-        tag = 'p'
-    ta = block.get('textAlign') or 'LEFT'
-    style = f'text-align: {ta.lower()}'
-    lines.append(f'<{tag} style="{style}">')
-    for item in block[content_key]:
-        tn = item['__typename']
-        if tn in ('TextInline', 'Byline'):
-            process_inline_text(lines, item)
-    lines.append('</' + tag + '>')
-
-
-def process_timestamp(lines, block):
-    ts = block['timestamp']
-    dt = parse_iso8601(ts, as_utc=False)
-    lines.append('<p class="timestamp">' + escape(dt.strftime('%b %d, %Y')) + '</p>')
-
-
-def process_header(lines, block):
-    label = block.get('label')
-    if label:
-        process_paragraph(lines, label)
-    headline = block.get('headline')
-    if headline:
-        process_paragraph(lines, headline)
-    summary = block.get('summary')
-    if summary:
-        process_paragraph(lines, summary)
-    lm = block.get('ledeMedia')
-    if lm and lm.get('__typename') == 'ImageBlock':
-        process_image_block(lines, lm)
-    byline = block.get('byline')
-    if byline:
-        process_paragraph(lines, byline, content_key='bylines')
-    timestamp = block.get('timestampBlock')
-    if timestamp:
-        process_timestamp(lines, timestamp)
-
-
-def process_image_block(lines, block):
-    media = block['media']
-    caption = media.get('caption')
-    caption_lines = []
-    if caption:
-        process_inline_text(caption_lines, caption)
-    crops = media['crops']
-    renditions = crops[0]['renditions']
-    img = renditions[0]['url']
-    if 'web.archive.org' in img:
-        img = img.partition('/')[-1]
-        img = img[img.find('https://'):]
-    lines.append(f'<div style="text-align: center"><div style="text-align: center"><img src={quoteattr(img)}/></div><div style="font-size: smaller">')
-    lines.extend(caption_lines)
-    lines.append('</div></div>')
+def parse_image(i):
+    crop = i.get('crops') or i.get('spanImageCrops')
+    img = f'<img src="{crop[0]["renditions"][0]["url"]}" title="{i.get("altText", "")}">' if crop else ''
+    yield '<div>' + img  # always open the wrapper so the final '</div>' below is balanced
+    if i.get('caption'):
+        yield f'<div class="cap">{"".join(parse_types(i["caption"]))}'
+        if i.get('credit'):
+            yield f'<span class="cred"> {i["credit"]}</span>'
+        yield '</div>'
+    elif i.get('legacyHtmlCaption'):
+        if i['legacyHtmlCaption'].strip():
+            yield f'<div class="cap">{i["legacyHtmlCaption"]}</div>'
+    yield '</div>'
+
+
+def parse_img_grid(g):
+    for grd in g.get('gridMedia', {}):
+        yield ''.join(parse_image(grd))
+    if g.get('caption'):
+        yield f'<div class="cap">{g["caption"]}'
+        if g.get('credit'):
+            yield f'<span class="cred"> {g["credit"]}</span>'
+        yield '</div>'
+
+
+def parse_vid(v):
+    if v.get('promotionalMedia'):
+        headline = v.get('headline', {}).get('default', '')
+        rendition = v.get('renditions')
+        yield (
+            f'<div><b><a href="{rendition[0]["url"]}">Video</a>: {headline}</b></div>'
+            if rendition
+            else f'<div><b>{headline}</b></div>'
+        )
+        yield ''.join(parse_types(v['promotionalMedia']))
+        if v.get('promotionalSummary'):
+            yield f'<div class="cap">{v["promotionalSummary"]}</div>'
+
+
+def parse_emb(e):
+    dw = re.search(r'datawrapper\.dwcdn\.net/(.{5})', e.get('html') or '')  # None unless a chart id follows the host
+    if dw:
+        yield f'<div><img src="https://datawrapper.dwcdn.net/{dw.group(1)}/full.png"></div>'
+    elif e.get('promotionalMedia'):
+        if e.get('headline'):
+            yield f'<div><b>{e["headline"]["default"]}</b></div>'
+        yield ''.join(parse_types(e['promotionalMedia']))
+        if e.get('note'):
+            yield f'<div class="cap">{e["note"]}</div>'
+
+
+def parse_byline(byl):
+    for b in byl.get('bylines', {}):
+        yield f'<div><b>{b["renderedRepresentation"]}</b></div>'
+    yield '<div><i>'
+    for rl in byl.get('role', {}):
+        if ''.join(parse_cnt(rl)).strip():
+            yield ''.join(parse_cnt(rl))
+    yield '</i></div>'
+
+
+def iso_date(x):
+    dt = parse_iso8601(x, as_utc=False)
+    return dt.strftime('%b %d, %Y at %I:%M %p')
+
+
+def parse_header(h):
+    if h.get('label'):
+        yield f'<div class="lbl">{"".join(parse_types(h["label"]))}</div>'
+    if h.get('headline'):
+        yield ''.join(parse_types(h['headline']))
+    if h.get('summary'):
+        yield f'<p><i>{"".join(parse_types(h["summary"]))}</i></p>'
+    if h.get('ledeMedia'):
+        yield ''.join(parse_types(h['ledeMedia']))
+    if h.get('byline'):
+        yield ''.join(parse_types(h['byline']))
+    if h.get('timestampBlock'):
+        yield ''.join(parse_types(h['timestampBlock']))
+
+
+def parse_fmt_type(fm):
+    for f in fm.get('formats', {}):
+        ftype = f.get('__typename', '')
+        if ftype == 'BoldFormat':
+            yield '<strong>'
+        if ftype == 'ItalicFormat':
+            yield '<em>'
+        if ftype == 'LinkFormat':
+            hrf = f['url']
+            yield f'<a href="{hrf}">'
+    yield fm.get('text', '')
+    for f in reversed(fm.get('formats', {})):
+        ftype = f.get('__typename', '')
+        if ftype == 'BoldFormat':
+            yield '</strong>'
+        if ftype == 'ItalicFormat':
+            yield '</em>'
+        if ftype == 'LinkFormat':
+            yield '</a>'
+
+
+def parse_cnt(cnt):
+    for k in cnt:
+        if isinstance(cnt[k], list):
+            if k == 'formats':
+                yield ''.join(parse_fmt_type(cnt))
+            else:
+                for cnt_ in cnt[k]:
+                    yield ''.join(parse_types(cnt_))
+        if isinstance(cnt[k], dict):
+            yield ''.join(parse_types(cnt[k]))
+    if cnt.get('text') and 'formats' not in cnt and 'content' not in cnt:
+        if isinstance(cnt['text'], str):
+            yield cnt['text']
+
+
+def parse_types(x):
+    typename = x.get('__typename', '')
+
+    align = ''
+    if x.get('textAlign'):
+        align = f' style="text-align: {x["textAlign"].lower()};"'
+
+    if 'Header' in typename:
+        yield '\n'.join(parse_header(x))
+
+    elif typename.startswith('Heading'):
+        htag = 'h' + re.match(r'Heading([1-6])Block', typename).group(1)
+        yield f'<{htag}{align}>{"".join(parse_cnt(x))}</{htag}>'
+
+    elif typename == 'ParagraphBlock':
+        yield f'<p>{"".join(parse_cnt(x))}</p>'
+    elif typename in {'DetailBlock', 'TextRunKV'}:
+        yield f'<p style="font-size: small;">{"".join(parse_cnt(x))}</p>'
+
+    elif typename == 'BylineBlock':
+        yield f'<div class="byl"><br/>{"".join(parse_byline(x))}</div>'
+    elif typename == 'LabelBlock':
+        yield f'<div class="sc">{"".join(parse_cnt(x))}</div>'
+    elif typename == 'BlockquoteBlock':
+        yield f'<blockquote>{"".join(parse_cnt(x))}</blockquote>'
+    elif typename == 'TimestampBlock':
+        yield f'<div class="time">{iso_date(x["timestamp"])}</div>'
+    elif typename == 'LineBreakInline':
+        yield '<br/>'
+    elif typename == 'RuleBlock':
+        yield '<hr/>'
+
+    elif typename == 'Image':
+        yield ''.join(parse_image(x))
+
+    elif typename == 'GridBlock':
+        yield ''.join(parse_img_grid(x))
+
+    elif typename == 'Video':
+        yield ''.join(parse_vid(x))
+
+    elif typename == 'EmbeddedInteractive':
+        yield ''.join(parse_emb(x))
+
+    elif typename == 'ListBlock':
+        yield f'\n<ul>{"".join(parse_cnt(x))}</ul>'
+    elif typename == 'ListItemBlock':
+        yield f'\n<li>{"".join(parse_cnt(x))}</li>'
+
+    elif typename and typename not in {
+        'RelatedLinksBlock',
+        'EmailSignupBlock',
+        'Dropzone',
+        'AudioBlock',
+    }:
+        yield ''.join(parse_cnt(x))
+
+
+def article_parse(data):
+    yield '<html><body>'
+    for d in data:
+        yield from parse_types(d)
+    yield '</body></html>'
 
 
 def json_to_html(raw):
@@ -105,18 +203,8 @@ def json_to_html(raw):
     except TypeError:
         data = data['initialState']
         return live_json_to_html(data)
-    article = next(iter(data.values()))
-    body = article['sprinkledBody']['content']
-    lines = []
-    for item in body:
-        tn = item['__typename']
-        if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
-            process_header(lines, item)
-        elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
-            process_paragraph(lines, item)
-        elif tn == 'ImageBlock':
-            process_image_block(lines, item)
-    return '<html><body>' + '\n'.join(lines) + '</body></html>'
+    content = data['article']['sprinkledBody']['content']
+    return '\n'.join(article_parse(content))
 
 
 def add_live_item(item, item_type, lines):
@@ -161,7 +249,7 @@ def add_live_item(item, item_type, lines):
 
 
 def live_json_to_html(data):
-    for k, v in data["ROOT_QUERY"].items():
+    for k, v in data['ROOT_QUERY'].items():
         if isinstance(v, dict) and 'id' in v:
             root = data[v['id']]
     s = data[root['storylines'][0]['id']]
@@ -178,37 +266,50 @@ def live_json_to_html(data):
     return '<html><body>' + '\n'.join(lines) + '</body></html>'
 
 
-def extract_html(soup):
+def extract_html(soup, url):  # url is only inspected for the '/interactive/' marker
+    if '/interactive/' in url:  # such pages get a fixed notice; soup is not examined
+        return (
+            '<html><body><p><em>'
+            + 'This is an interactive article, which is supposed to be read in a browser.'
+            + '</em></p></body></html>'
+        )
     script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
     script = str(script)
-    raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
+    raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';')
     return json_to_html(raw)
 
 
-def download_url(url=None, br=None):
-    # Get the URL from the Wayback machine
+def download_url_from_wayback(category, url, br=None):  # returns the raw response body
     from mechanize import Request
+
     host = 'http://localhost:8090'
     host = 'https://wayback1.calibre-ebook.com'
-    if url is None:
-        url = sys.argv[-1]
     rq = Request(
-        host + '/nytimes',
-        data=json.dumps({"url": url}),
-        headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'}
+        host + '/' + category,  # category picks the server path, e.g. 'nytimes'
+        data=json.dumps({'url': url}),  # request body is JSON: {"url": <url>}
+        headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'},
     )
     if br is None:
         from calibre import browser
+
         br = browser()
     br.set_handle_gzip(True)
     return br.open_novisit(rq, timeout=3 * 60).read()
 
 
+def download_url(url=None, br=None):
+    # Fetch url via the 'nytimes' category of the Wayback machine helper;
+    # when no url is given, the last command-line argument is used.
+    target = sys.argv[-1] if url is None else url
+    return download_url_from_wayback('nytimes', target, br)
+
+
 if __name__ == '__main__':
     f = sys.argv[-1]
     raw = open(f).read()
     if f.endswith('.html'):
         from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
         soup = BeautifulSoup(raw)
-        print(extract_html(soup))
+        print(extract_html(soup, ''))  # local file has no source URL; extract_html now requires one
     else:
diff --git a/application/lib/ebook_translator/html_translator.py b/application/lib/ebook_translator/html_translator.py
index 7498c079..5a24868d 100644
--- a/application/lib/ebook_translator/html_translator.py
+++ b/application/lib/ebook_translator/html_translator.py
@@ -142,17 +142,17 @@ def add_translation_soup(self, soup, tag, trans, dst):
         origStyle = self.params.get('orig_style', '')
         transStyle = self.params.get('trans_style', '')
         trans = trans.replace('&lt;', '<').replace('&gt;', '>')
-        transTag = BeautifulSoup(trans, 'html.parser') #'html.parser'解析器不会自动添加<html><body>
-        if not transTag.contents:
-            return
-        transTag = transTag.contents[0]
-        if isinstance(transTag, NavigableString):
-            oldTxt = str(transTag)
+        # Plain text, not HTML
+        if '<' not in trans or '>' not in trans:
             transTagName = 'span' if tag.name in ('title', 'tr', 'td', 'th', 'thead', 'tbody', 'table', 
                 'ul', 'ol', 'li', 'a') else tag.name
             transTag = soup.new_tag(transTagName)
-            transTag.string = oldTxt
-        
+            transTag.string = trans
+        else:
+            transTag = BeautifulSoup(trans, 'html.parser') #'html.parser'解析器不会自动添加<html><body>
+            if not transTag.contents:
+                return
+            transTag = transTag.contents[0]
         if origStyle:
             old = tag.get('style')
             tag['style'] = f'{old};{origStyle}' if old else origStyle