From 07789a2be0f252360a199bb98a9c55707825ac3b Mon Sep 17 00:00:00 2001 From: Crozzers Date: Mon, 13 Mar 2023 18:28:23 +0000 Subject: [PATCH 01/23] Custom extras initial proof of concept. The way this works is by creating subclasses of the `Extra` class. These subclasses will have an order and a name. The name is the same one specified in the `extras` list/dict given to the Markdown init function. The order will be at which point the function will be executed. This is done by attaching the extra to a "Stage", a distinct step in the markdown process (eg: forming paragraphs, processing links... etc). You can set the extra to run before or after the stage. At the moment, extras are automatically registered, activated and executed by the Markdown class. TODO: * More elegant way to register and init extras * Optimise `Stage.mark` * Convert more extras to new class format --- lib/markdown2.py | 247 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 200 insertions(+), 47 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index ddadb6b0..10ee6894 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -111,6 +111,8 @@ from random import random, randint import codecs from collections import defaultdict +from abc import ABC, abstractmethod +import functools # ---- globals @@ -170,6 +172,92 @@ def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH, use_file_vars=use_file_vars, cli=cli).convert(text) +class Stage(): + PREPROCESS = 50 + HASH_HTML = 150 + LINK_DEFS = 250 + + BLOCK_GAMUT = 350 + HEADERS = 450 + LISTS = 550 + CODE_BLOCKS = 650 + BLOCK_QUOTES = 750 + PARAGRAPHS = 850 + + SPAN_GAMUT = 950 + CODE_SPANS = 1050 + ESCAPE_SPECIAL = 1150 + LINKS = 1250 # and auto links + ITALIC_AND_BOLD = 1350 + + POSTPROCESS = 1450 + UNHASH_HTML = 1550 + + __counts = {} + + @classmethod + def after(cls, *items: 'Stage') -> list: + ret = [] + counts = cls._Stage__counts + for item in items: + if item in counts: + counts[item][1] += 5 + else: + counts[item] 
= [0, 5] + ret.append(item + counts[item][1]) + return ret + + @classmethod + def before(cls, *items: 'Stage') -> list: + ret = [] + counts = cls._Stage__counts + for item in items: + if item in counts: + counts[item][0] -= 5 + else: + counts[item] = [-5, 0] + ret.append(item + counts[item][0]) + return ret + + @staticmethod + def mark(stage): + def wrapper(func): + @functools.wraps(func) + def inner(self, text, *args, **kwargs): + before = [] + after = [] + + for extra in self.extras: + if extra not in Extra._registry: + continue + klass = Extra._registry[extra] + for order in klass.order: + if order // 100 == stage // 100: + if order < stage: + before.append((order, klass)) + else: + after.append((order, klass)) + + before.sort(key=lambda k: k[0]) + after.sort(key=lambda k: k[0]) + + for _, klass in before: + if klass.match(text): + text = klass.run(self, text, **(self.extras[klass.name] or {})) + + text = func(self, text, *args, **kwargs) + + for _, klass in after: + if klass.match(text): + text = klass.run(self, text, **(self.extras[klass.name] or {})) + + return text + + return inner + + return wrapper + + class Markdown(object): # The dict of "extras" to enable in processing -- a mapping of # extra name to argument for the extra. Most extras do not have an @@ -271,6 +359,7 @@ def _setup_extras(self): self._count_from_header_id = defaultdict(int) if "metadata" in self.extras: self.metadata = {} + Extra.collect() # Per "rel" # should only be used in tags with an "href" attribute. @@ -363,9 +452,6 @@ def convert(self, text): if "fenced-code-blocks" in self.extras and self.safe_mode: text = self._do_fenced_code_blocks(text) - if 'admonitions' in self.extras: - text = self._do_admonitions(text) - # Because numbering references aren't links (yet?) 
then we can do everything associated with counters # before we get started if "numbering" in self.extras: @@ -422,6 +508,7 @@ def convert(self, text): rv.metadata = self.metadata return rv + @Stage.mark(Stage.POSTPROCESS) def postprocess(self, text): """A hook for subclasses to do some postprocessing of the html, if desired. This is called before unescaping of special chars and @@ -429,6 +516,7 @@ def postprocess(self, text): """ return text + @Stage.mark(Stage.PREPROCESS) def preprocess(self, text): """A hook for subclasses to do some preprocessing of the Markdown, if desired. This is called after basic formatting of the text, but prior @@ -767,6 +855,7 @@ def _hash_html_block_sub(self, match, raw=False): self.html_blocks[key] = html return "\n\n" + key + "\n\n" + @Stage.mark(Stage.HASH_HTML) def _hash_html_blocks(self, text, raw=False): """Hashify HTML blocks @@ -908,6 +997,7 @@ def _strict_tag_block_sub(self, text, html_tags_re, callback): return result + @Stage.mark(Stage.LINK_DEFS) def _strip_link_definitions(self, text): # Strips link definitions from text, stores the URLs and titles in # hash references. @@ -1045,13 +1135,11 @@ def _strip_footnote_definitions(self, text): _hr_re = re.compile(r'^[ ]{0,3}([-_*])[ ]{0,2}(\1[ ]{0,2}){2,}$', re.M) + @Stage.mark(Stage.BLOCK_GAMUT) def _run_block_gamut(self, text): # These are all the transformations that form block-level # tags like paragraphs, headers, and list items. - if 'admonitions' in self.extras: - text = self._do_admonitions(text) - if 'wavedrom' in self.extras: text = self._do_wavedrom_blocks(text) @@ -1248,6 +1336,7 @@ def _do_wiki_tables(self, text): ''' % less_than_tab, re.M | re.X) return wiki_table_re.sub(self._wiki_table_sub, text) + @Stage.mark(Stage.SPAN_GAMUT) def _run_span_gamut(self, text): # These are all the transformations that occur *within* block-level # tags like paragraphs, headers, and list items. 
@@ -1312,6 +1401,7 @@ def _run_span_gamut(self, text): ) """, re.X) + @Stage.mark(Stage.ESCAPE_SPECIAL) def _escape_special_chars(self, text): # Python markdown note: the HTML tokenization here differs from # that in Markdown.pl, hence the behaviour for subtle cases can @@ -1343,6 +1433,7 @@ def _escape_special_chars(self, text): is_html_markup = not is_html_markup return ''.join(escaped) + @Stage.mark(Stage.HASH_HTML) def _hash_html_spans(self, text): # Used for safe_mode. @@ -1477,6 +1568,7 @@ def _protect_url(self, url): return key _safe_protocols = re.compile(r'(https?|ftp):', re.I) + @Stage.mark(Stage.LINKS) def _do_links(self, text): """Turn Markdown link shortcuts into XHTML and tags. @@ -1749,6 +1841,7 @@ def _h_sub(self, match): self._toc_add_entry(n, header_id, html) return "%s\n\n" % (n, header_id_attr, html, n) + @Stage.mark(Stage.HEADERS) def _do_headers(self, text): # Setext-style headers: # Header 1 @@ -1791,6 +1884,7 @@ def _list_sub(self, match): else: return "<%s%s>\n%s\n\n" % (lst_type, lst_opts, result, lst_type) + @Stage.mark(Stage.LISTS) def _do_lists(self, text): # Form HTML ordered (numbered) and unordered (bulleted) lists. @@ -2059,6 +2153,7 @@ def _html_class_str_from_tag(self, tag): return ' class="%s"' % html_classes_from_tag[tag] return "" + @Stage.mark(Stage.CODE_BLOCKS) def _do_code_blocks(self, text): """Process Markdown `
` blocks."""
         code_block_re = re.compile(r'''
@@ -2114,6 +2209,7 @@ def _code_span_sub(self, match):
         c = self._encode_code(c)
         return "%s" % (self._html_class_str_from_tag("code"), c)
 
+    @Stage.mark(Stage.CODE_SPANS)
     def _do_code_spans(self, text):
         #   *   Backtick quotes are used for  spans.
         #
@@ -2194,44 +2290,6 @@ def _wavedrom_block_sub(self, match):
     def _do_wavedrom_blocks(self, text):
         return self._fenced_code_block_re.sub(self._wavedrom_block_sub, text)
 
-    _admonitions = r'admonition|attention|caution|danger|error|hint|important|note|tip|warning'
-    _admonitions_re = re.compile(r'''
-        ^(\ *)\.\.\ (%s)::\ *                # $1 leading indent, $2 the admonition
-        (.*)?                                # $3 admonition title
-        ((?:\s*\n\1\ {3,}.*)+?)              # $4 admonition body (required)
-        (?=\s*(?:\Z|\n{4,}|\n\1?\ {0,2}\S))  # until EOF, 3 blank lines or something less indented
-        ''' % _admonitions,
-        re.IGNORECASE | re.MULTILINE | re.VERBOSE
-    )
-
-    def _do_admonitions_sub(self, match):
-        lead_indent, admonition_name, title, body = match.groups()
-
-        admonition_type = '%s' % admonition_name
-
-        # figure out the class names to assign the block
-        if admonition_name.lower() == 'admonition':
-            admonition_class = 'admonition'
-        else:
-            admonition_class = 'admonition %s' % admonition_name.lower()
-
-        # titles are generally optional
-        if title:
-            title = '%s' % title
-
-        # process the admonition body like regular markdown
-        body = self._run_block_gamut("\n%s\n" % self._uniform_outdent(body)[1])
-
-        # indent the body before placing inside the aside block
-        admonition = self._uniform_indent('%s\n%s\n\n%s\n' % (admonition_type, title, body), self.tab, False)
-        # wrap it in an aside
-        admonition = '' % (admonition_class, admonition)
-        # now indent the whole admonition back to where it started
-        return self._uniform_indent(admonition, lead_indent, False)
-
-    def _do_admonitions(self, text):
-        return self._admonitions_re.sub(self._do_admonitions_sub, text)
-
     _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
     def _do_strike(self, text):
         text = self._strike_re.sub(r"\1", text)
@@ -2241,7 +2299,7 @@ def _do_strike(self, text):
     def _do_underline(self, text):
         text = self._underline_re.sub(r"\1", text)
         return text
-    
+
     _tg_spoiler_re = re.compile(r"\|\|\s?(.+?)\s?\|\|", re.S)
     def _do_tg_spoiler(self, text):
         text = self._tg_spoiler_re.sub(r"\1", text)
@@ -2251,6 +2309,8 @@ def _do_tg_spoiler(self, text):
     _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
     _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
     _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
+
+    @Stage.mark(Stage.ITALIC_AND_BOLD)
     def _do_italics_and_bold(self, text):
         #  must go first:
         if "code-friendly" in self.extras:
@@ -2353,6 +2413,7 @@ def _block_quote_sub(self, match):
         else:
             return '
\n%s\n
\n\n' % bq + @Stage.mark(Stage.BLOCK_QUOTES) def _do_block_quotes(self, text): if '>' not in text: return text @@ -2361,6 +2422,7 @@ def _do_block_quotes(self, text): else: return self._block_quote_re.sub(self._block_quote_sub, text) + @Stage.mark(Stage.PARAGRAPHS) def _form_paragraphs(self, text): # Strip leading and trailing lines: text = text.strip('\n') @@ -2596,7 +2658,8 @@ def _outdent(self, text): # Remove one level of line-leading tabs or spaces return self._outdent_re.sub('', text) - def _uniform_outdent(self, text, min_outdent=None, max_outdent=None): + @staticmethod + def _uniform_outdent(text, min_outdent=None, max_outdent=None): # Removes the smallest common leading indentation from each (non empty) # line of `text` and returns said indent along with the outdented text. # The `min_outdent` kwarg makes sure the smallest common whitespace @@ -2631,7 +2694,8 @@ def _uniform_outdent(self, text, min_outdent=None, max_outdent=None): return outdent, ''.join(outdented) - def _uniform_indent(self, text, indent, include_empty_lines=False): + @staticmethod + def _uniform_indent(text, indent, include_empty_lines=False): return ''.join( (indent + line if line.strip() or include_empty_lines else '') for line in text.splitlines(True) @@ -2653,6 +2717,95 @@ class MarkdownWithExtras(Markdown): extras = ["footnotes", "fenced-code-blocks"] +# ---------------------------------------------------------- +# Extras +# ---------------------------------------------------------- + +class Extra(ABC): + _registry = {} + + name: str + order: list + + def __init__(self): + self.register() + + @classmethod + def collect(cls): + for s in cls.__subclasses(cls): + s_inst = s() + cls._registry[s_inst.name] = s_inst + + @abstractmethod + def match(self, text: str) -> bool: + ... + + @abstractmethod + def run(self, md: Markdown, text: str, **opts) -> str: + ... 
+ + def register(self): + self.__class__._registry[self.name] = self + + @staticmethod + def __subclasses(cls): + return set(cls.__subclasses__()).union( + s for c in cls.__subclasses__() for s in cls.__subclasses(c) + ) + + +class Admonitions(Extra): + name = 'admonitions' + order = Stage.before(Stage.BLOCK_GAMUT, Stage.LINK_DEFS) + + admonitions = r'admonition|attention|caution|danger|error|hint|important|note|tip|warning' + + admonitions_re = re.compile(r''' + ^(\ *)\.\.\ (%s)::\ * # $1 leading indent, $2 the admonition + (.*)? # $3 admonition title + ((?:\s*\n\1\ {3,}.*)+?) # $4 admonition body (required) + (?=\s*(?:\Z|\n{4,}|\n\1?\ {0,2}\S)) # until EOF, 3 blank lines or something less indented + ''' % admonitions, + re.IGNORECASE | re.MULTILINE | re.VERBOSE + ) + + def match(self, text): + return self.admonitions_re.search(text) is not None + + def sub(self, md: Markdown, match): + lead_indent, admonition_name, title, body = match.groups() + + admonition_type = '%s' % admonition_name + + # figure out the class names to assign the block + if admonition_name.lower() == 'admonition': + admonition_class = 'admonition' + else: + admonition_class = 'admonition %s' % admonition_name.lower() + + # titles are generally optional + if title: + title = '%s' % title + + # process the admonition body like regular markdown + body = md._run_block_gamut("\n%s\n" % md._uniform_outdent(body)[1]) + + # indent the body before placing inside the aside block + admonition = md._uniform_indent( + '%s\n%s\n\n%s\n' % (admonition_type, title, body), + md.tab, False + ) + # wrap it in an aside + admonition = '' % (admonition_class, admonition) + # now indent the whole admonition back to where it started + return md._uniform_indent(admonition, lead_indent, False) + + def run(self, md, text): + return self.admonitions_re.sub(lambda *_: self.sub(md, *_), text) + +# ---------------------------------------------------------- + + # ---- internal support functions From 
aad9e33b81bde058cb4f505dd3b7674d910df582 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Tue, 14 Mar 2023 20:50:09 +0000 Subject: [PATCH 02/23] Convert wavedrom and numbering extras to new format --- lib/markdown2.py | 222 ++++++++++++++++++++++++----------------------- 1 file changed, 112 insertions(+), 110 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 10ee6894..9e628a7a 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -223,7 +223,8 @@ def before(cls, *items: 'Stage') -> list: def mark(stage): def wrapper(func): @functools.wraps(func) - def inner(self, text, *args, **kwargs): + def inner(self: 'Markdown', text, *args, **kwargs): + self.stage = stage before = [] after = [] @@ -242,13 +243,13 @@ def inner(self, text, *args, **kwargs): after.sort(key=lambda k: k[0]) for _, klass in before: - if klass.match(text): + if klass.match(self, text): text = klass.run(self, text, **(self.extras[klass.name] or {})) text = func(self, text, *args, **kwargs) for _, klass in after: - if klass.match(text): + if klass.match(self, text): text = klass.run(self, text, **(self.extras[klass.name] or {})) return text @@ -280,6 +281,8 @@ class Markdown(object): # (see _ProcessListItems() for details): list_level = 0 + stage: Stage = None + _ws_only_line_re = re.compile(r"^[ \t]+$", re.M) def __init__(self, html4tags=False, tab_width=4, safe_mode=None, @@ -437,9 +440,6 @@ def convert(self, text): text = self.preprocess(text) - if 'wavedrom' in self.extras: - text = self._do_wavedrom_blocks(text) - if "fenced-code-blocks" in self.extras and not self.safe_mode: text = self._do_fenced_code_blocks(text) @@ -452,11 +452,6 @@ def convert(self, text): if "fenced-code-blocks" in self.extras and self.safe_mode: text = self._do_fenced_code_blocks(text) - # Because numbering references aren't links (yet?) 
then we can do everything associated with counters - # before we get started - if "numbering" in self.extras: - text = self._do_numbering(text) - # Strip link definitions, store in hashes. if "footnotes" in self.extras: # Must do footnotes first because an unlucky footnote defn @@ -1033,64 +1028,6 @@ def _extract_link_def_sub(self, match): self.titles[key] = title return "" - def _do_numbering(self, text): - ''' We handle the special extension for generic numbering for - tables, figures etc. - ''' - # First pass to define all the references - self.regex_defns = re.compile(r''' - \[\#(\w+) # the counter. Open square plus hash plus a word \1 - ([^@]*) # Some optional characters, that aren't an @. \2 - @(\w+) # the id. Should this be normed? \3 - ([^\]]*)\] # The rest of the text up to the terminating ] \4 - ''', re.VERBOSE) - self.regex_subs = re.compile(r"\[@(\w+)\s*\]") # [@ref_id] - counters = {} - references = {} - replacements = [] - definition_html = '
{}{}{}
' - reference_html = '
{}' - for match in self.regex_defns.finditer(text): - # We must have four match groups otherwise this isn't a numbering reference - if len(match.groups()) != 4: - continue - counter = match.group(1) - text_before = match.group(2).strip() - ref_id = match.group(3) - text_after = match.group(4) - number = counters.get(counter, 1) - references[ref_id] = (number, counter) - replacements.append((match.start(0), - definition_html.format(counter, - ref_id, - text_before, - number, - text_after), - match.end(0))) - counters[counter] = number + 1 - for repl in reversed(replacements): - text = text[:repl[0]] + repl[1] + text[repl[2]:] - - # Second pass to replace the references with the right - # value of the counter - # Fwiw, it's vaguely annoying to have to turn the iterator into - # a list and then reverse it but I can't think of a better thing to do. - for match in reversed(list(self.regex_subs.finditer(text))): - number, counter = references.get(match.group(1), (None, None)) - if number is not None: - repl = reference_html.format(counter, - match.group(1), - number) - else: - repl = reference_html.format(match.group(1), - 'countererror', - '?' + match.group(1) + '?') - if "smarty-pants" in self.extras: - repl = repl.replace('"', self._escape_table['"']) - - text = text[:match.start()] + repl + text[match.end():] - return text - def _extract_footnote_def_sub(self, match): id, text = match.groups() text = _dedent(text, skip_first_line=not text.startswith('\n')).strip() @@ -1140,9 +1077,6 @@ def _run_block_gamut(self, text): # These are all the transformations that form block-level # tags like paragraphs, headers, and list items. 
- if 'wavedrom' in self.extras: - text = self._do_wavedrom_blocks(text) - if "fenced-code-blocks" in self.extras: text = self._do_fenced_code_blocks(text) @@ -2182,6 +2116,7 @@ def _do_code_blocks(self, text): def _fenced_code_block_sub(self, match): return self._code_block_sub(match, is_fenced_code_block=True) + @Stage.mark(Stage.CODE_BLOCKS) def _do_fenced_code_blocks(self, text): """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra).""" return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text) @@ -2254,42 +2189,6 @@ def _encode_code(self, text): self._code_table[text] = hashed return hashed - def _wavedrom_block_sub(self, match): - # if this isn't a wavedrom diagram block, exit now - if match.group(2) != 'wavedrom': - return match.string[match.start():match.end()] - - # dedent the block for processing - lead_indent, waves = self._uniform_outdent(match.group(3)) - # default tags to wrap the wavedrom block in - open_tag, close_tag = '' - - # check if the user would prefer to have the SVG embedded directly - if not isinstance(self.extras['wavedrom'], dict): - embed_svg = True - else: - # default behaviour is to embed SVGs - embed_svg = self.extras['wavedrom'].get('prefer_embed_svg', True) - - if embed_svg: - try: - import wavedrom - waves = wavedrom.render(waves).tostring() - open_tag, close_tag = '
', '\n
' - except ImportError: - pass - - # hash SVG to prevent <> chars being messed with - self._escape_table[waves] = _hash_text(waves) - - return self._uniform_indent( - '\n%s%s%s\n' % (open_tag, self._escape_table[waves], close_tag), - lead_indent, include_empty_lines=True - ) - - def _do_wavedrom_blocks(self, text): - return self._fenced_code_block_re.sub(self._wavedrom_block_sub, text) - _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S) def _do_strike(self, text): text = self._strike_re.sub(r"\1", text) @@ -2737,7 +2636,7 @@ def collect(cls): cls._registry[s_inst.name] = s_inst @abstractmethod - def match(self, text: str) -> bool: + def match(self, md: Markdown, text: str) -> bool: ... @abstractmethod @@ -2769,7 +2668,7 @@ class Admonitions(Extra): re.IGNORECASE | re.MULTILINE | re.VERBOSE ) - def match(self, text): + def match(self, md, text): return self.admonitions_re.search(text) is not None def sub(self, md: Markdown, match): @@ -2803,6 +2702,109 @@ def sub(self, md: Markdown, match): def run(self, md, text): return self.admonitions_re.sub(lambda *_: self.sub(md, *_), text) + +class Numbering(Extra): + name = 'numbering' + order = Stage.before(Stage.LINK_DEFS) + + def match(self, md, text): + return True + + def run(self, md: Markdown, text): + # First pass to define all the references + regex_defns = re.compile(r''' + \[\#(\w+) # the counter. Open square plus hash plus a word \1 + ([^@]*) # Some optional characters, that aren't an @. \2 + @(\w+) # the id. Should this be normed? \3 + ([^\]]*)\] # The rest of the text up to the terminating ] \4 + ''', re.VERBOSE) + regex_subs = re.compile(r"\[@(\w+)\s*\]") # [@ref_id] + counters = {} + references = {} + replacements = [] + definition_html = '
{}{}{}
' + reference_html = '{}' + for match in regex_defns.finditer(text): + # We must have four match groups otherwise this isn't a numbering reference + if len(match.groups()) != 4: + continue + counter = match.group(1) + text_before = match.group(2).strip() + ref_id = match.group(3) + text_after = match.group(4) + number = counters.get(counter, 1) + references[ref_id] = (number, counter) + replacements.append((match.start(0), + definition_html.format(counter, + ref_id, + text_before, + number, + text_after), + match.end(0))) + counters[counter] = number + 1 + for repl in reversed(replacements): + text = text[:repl[0]] + repl[1] + text[repl[2]:] + + # Second pass to replace the references with the right + # value of the counter + # Fwiw, it's vaguely annoying to have to turn the iterator into + # a list and then reverse it but I can't think of a better thing to do. + for match in reversed(list(regex_subs.finditer(text))): + number, counter = references.get(match.group(1), (None, None)) + if number is not None: + repl = reference_html.format(counter, + match.group(1), + number) + else: + repl = reference_html.format(match.group(1), + 'countererror', + '?' 
+ match.group(1) + '?') + if "smarty-pants" in md.extras: + repl = repl.replace('"', md._escape_table['"']) + + text = text[:match.start()] + repl + text[match.end():] + return text + + +class Wavedrom(Extra): + name = 'wavedrom' + order = Stage.before(Stage.CODE_BLOCKS) + Stage.after(Stage.PREPROCESS) + + def match(self, md, text): + match = Markdown._fenced_code_block_re.search(text) + return match is None or match.group(2) == 'wavedrom' + + def sub(self, md: Markdown, match, **opts): + # dedent the block for processing + lead_indent, waves = md._uniform_outdent(match.group(3)) + # default tags to wrap the wavedrom block in + open_tag, close_tag = '' + + # check if the user would prefer to have the SVG embedded directly + embed_svg = opts.get('prefer_embed_svg', True) + + if embed_svg: + try: + import wavedrom + waves = wavedrom.render(waves).tostring() + open_tag, close_tag = '
', '\n
' + except ImportError: + pass + + # hash SVG to prevent <> chars being messed with + md._escape_table[waves] = _hash_text(waves) + + return md._uniform_indent( + '\n%s%s%s\n' % (open_tag, md._escape_table[waves], close_tag), + lead_indent, include_empty_lines=True + ) + + def run(self, md: Markdown, text, **opts): + return Markdown._fenced_code_block_re.sub( + lambda *_: self.sub(md, *_, **opts), text + ) + + # ---------------------------------------------------------- From 212131b8d6042f0a7583d5170c531ad7705372f7 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Mon, 22 May 2023 21:55:16 +0100 Subject: [PATCH 03/23] Convert `fenced-code-blocks` extra to new `Extra` format. Also converted `mermaid` extra as part of this process. As a plus, you no longer need fenced-code-blocks activated to use mermaid. Also added the ability for extras to be triggered before or after another extra --- lib/markdown2.py | 251 ++++++++++++++++++++----------------- test/tm-cases/mermaid.opts | 2 +- 2 files changed, 135 insertions(+), 118 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 7f319631..76345f95 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -109,7 +109,7 @@ import re import sys from collections import defaultdict -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod, abstractproperty import functools from hashlib import sha256 from random import randint, random @@ -196,28 +196,27 @@ class Stage(): __counts = {} @classmethod - def after(cls, *items: 'Stage') -> list: + def _order(cls, items, direction=5): + index = 0 if direction > 0 else 1 ret = [] counts = cls._Stage__counts for item in items: - if item in counts: - counts[item][1] += 5 + if not isinstance(item, int) and issubclass(item, Extra): + ret.extend(o + direction for o in item.order) else: - counts[item] = [0, 5] - ret.append(item + counts[item][1]) + if item not in counts: + counts[item] = [0, 0] + counts[item][index] += direction + ret.append(item + counts[item][1]) 
return ret + @classmethod + def after(cls, *items: 'Stage') -> list: + return cls._order(items, 5) + @classmethod def before(cls, *items: 'Stage') -> list: - ret = [] - counts = cls._Stage__counts - for item in items: - if item in counts: - counts[item][0] -= 5 - else: - counts[item] = [-5, 0] - ret.append(item + counts[item][0]) - return ret + return cls._order(items, -5) @staticmethod def mark(stage): @@ -243,13 +242,13 @@ def inner(self: 'Markdown', text, *args, **kwargs): after.sort(key=lambda k: k[0]) for _, klass in before: - if klass.match(self, text): + if klass.test(self, text): text = klass.run(self, text, **(self.extras[klass.name] or {})) - text = func(self, text, *args, **kwargs) + text = func(self, text, *args, **kwargs) for _, klass in after: - if klass.match(self, text): + if klass.test(self, text): text = klass.run(self, text, **(self.extras[klass.name] or {})) return text @@ -440,8 +439,8 @@ def convert(self, text): text = self.preprocess(text) - if "fenced-code-blocks" in self.extras and not self.safe_mode: - text = self._do_fenced_code_blocks(text) + # if "fenced-code-blocks" in self.extras and not self.safe_mode: + # text = self._do_fenced_code_blocks(text) if self.safe_mode: text = self._hash_html_spans(text) @@ -449,8 +448,8 @@ def convert(self, text): # Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True) - if "fenced-code-blocks" in self.extras and self.safe_mode: - text = self._do_fenced_code_blocks(text) + # if "fenced-code-blocks" in self.extras and self.safe_mode: + # text = self._do_fenced_code_blocks(text) # Strip link definitions, store in hashes. if "footnotes" in self.extras: @@ -1077,8 +1076,8 @@ def _run_block_gamut(self, text): # These are all the transformations that form block-level # tags like paragraphs, headers, and list items. 
- if "fenced-code-blocks" in self.extras: - text = self._do_fenced_code_blocks(text) + # if "fenced-code-blocks" in self.extras: + # text = self._do_fenced_code_blocks(text) text = self._do_headers(text) @@ -1116,7 +1115,7 @@ def _run_block_gamut(self, text): def _pyshell_block_sub(self, match): if "fenced-code-blocks" in self.extras: dedented = _dedent(match.group(0)) - return self._do_fenced_code_blocks("```pycon\n" + dedented + "```\n") + return Extra.get('fenced-code-blocks').run(self, "```pycon\n" + dedented + "```\n") lines = match.group(0).splitlines(0) _dedentlines(lines) indent = ' ' * self.tab_width @@ -1996,80 +1995,20 @@ def wrap(self, source, outfile=None): formatter = HtmlCodeFormatter(**formatter_opts) return pygments.highlight(codeblock, lexer, formatter) - def _code_block_sub(self, match, is_fenced_code_block=False): - lexer_name = None - if is_fenced_code_block: - lexer_name = match.group(2) - codeblock = match.group(3) - codeblock = codeblock[:-1] # drop one trailing newline - else: - codeblock = match.group(1) - codeblock = self._outdent(codeblock) - codeblock = self._detab(codeblock) - codeblock = codeblock.lstrip('\n') # trim leading newlines - codeblock = codeblock.rstrip() # trim trailing whitespace - - # Use pygments only if not using the highlightjs-lang extra - if lexer_name and "highlightjs-lang" not in self.extras: - lexer = self._get_pygments_lexer(lexer_name) - if lexer: - leading_indent = ' '*(len(match.group(1)) - len(match.group(1).lstrip())) - return self._code_block_with_lexer_sub(codeblock, leading_indent, lexer, is_fenced_code_block) + def _code_block_sub(self, match): + codeblock = match.group(1) + codeblock = self._outdent(codeblock) + codeblock = self._detab(codeblock) + codeblock = codeblock.lstrip('\n') # trim leading newlines + codeblock = codeblock.rstrip() # trim trailing whitespace pre_class_str = self._html_class_str_from_tag("pre") + code_class_str = self._html_class_str_from_tag("code") - if "highlightjs-lang" in 
self.extras and lexer_name: - code_class_str = ' class="%s language-%s"' % (lexer_name, lexer_name) - else: - code_class_str = self._html_class_str_from_tag("code") - - if is_fenced_code_block: - # Fenced code blocks need to be outdented before encoding, and then reapplied - leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip())) - if codeblock: - # only run the codeblock through the outdenter if not empty - leading_indent, codeblock = self._uniform_outdent(codeblock, max_outdent=leading_indent) - - codeblock = self._encode_code(codeblock) - - if lexer_name == 'mermaid' and 'mermaid' in self.extras: - return '\n%s
%s\n
\n' % ( - leading_indent, codeblock) - - return "\n%s%s\n
\n" % ( - leading_indent, pre_class_str, code_class_str, codeblock) - else: - codeblock = self._encode_code(codeblock) - - return "\n%s\n\n" % ( - pre_class_str, code_class_str, codeblock) - - def _code_block_with_lexer_sub(self, codeblock, leading_indent, lexer, is_fenced_code_block): - if is_fenced_code_block: - formatter_opts = self.extras['fenced-code-blocks'] or {} - else: - formatter_opts = {} - - def unhash_code(codeblock): - for key, sanitized in list(self.html_spans.items()): - codeblock = codeblock.replace(key, sanitized) - replacements = [ - ("&", "&"), - ("<", "<"), - (">", ">") - ] - for old, new in replacements: - codeblock = codeblock.replace(old, new) - return codeblock - # remove leading indent from code block - _, codeblock = self._uniform_outdent(codeblock, max_outdent=leading_indent) - - codeblock = unhash_code(codeblock) - colored = self._color_with_pygments(codeblock, lexer, - **formatter_opts) + codeblock = self._encode_code(codeblock) - # add back the indent to all lines - return "\n%s\n" % self._uniform_indent(colored, leading_indent, True) + return "\n%s\n\n" % ( + pre_class_str, code_class_str, codeblock) def _html_class_str_from_tag(self, tag): """Get the appropriate ' class="..."' string (note the leading @@ -2106,21 +2045,6 @@ def _do_code_blocks(self, text): re.M | re.X) return code_block_re.sub(self._code_block_sub, text) - _fenced_code_block_re = re.compile(r''' - (?:\n+|\A\n?|(?<=\n)) - (^[ \t]*`{3,})\s{0,99}?([\w+-]+)?\s{0,99}?\n # $1 = opening fence (captured for back-referencing), $2 = optional lang - (.*?) 
# $3 = code block content - \1[ \t]*\n # closing fence - ''', re.M | re.X | re.S) - - def _fenced_code_block_sub(self, match): - return self._code_block_sub(match, is_fenced_code_block=True) - - @Stage.mark(Stage.CODE_BLOCKS) - def _do_fenced_code_blocks(self, text): - """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra).""" - return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text) - # Rules for a code span: # - backslash escapes are not interpreted in a code span # - to include one or or a run of more backticks the delimiters must @@ -2656,9 +2580,13 @@ def collect(cls): s_inst = s() cls._registry[s_inst.name] = s_inst + @classmethod + def get(cls, extra_name: str) -> 'Extra': + return cls._registry[extra_name] + @abstractmethod - def match(self, md: Markdown, text: str) -> bool: - ... + def test(self, md: Markdown, text: str) -> bool: + return self.re.search(text) is not None @abstractmethod def run(self, md: Markdown, text: str, **opts) -> str: @@ -2689,7 +2617,7 @@ class Admonitions(Extra): re.IGNORECASE | re.MULTILINE | re.VERBOSE ) - def match(self, md, text): + def test(self, md, text): return self.admonitions_re.search(text) is not None def sub(self, md: Markdown, match): @@ -2724,11 +2652,100 @@ def run(self, md, text): return self.admonitions_re.sub(lambda *_: self.sub(md, *_), text) +class FencedCodeBlocks(Extra): + name = 'fenced-code-blocks' + order = Stage.before(Stage.LINK_DEFS, Stage.BLOCK_GAMUT) + Stage.after(Stage.PREPROCESS) + + fenced_code_block_re = re.compile(r''' + (?:\n+|\A\n?|(?<=\n)) + (^[ \t]*`{3,})\s{0,99}?([\w+-]+)?\s{0,99}?\n # $1 = opening fence (captured for back-referencing), $2 = optional lang + (.*?) 
# $3 = code block content + \1[ \t]*\n # closing fence + ''', re.M | re.X | re.S) + + def test(self, md, text): + if md.stage == Stage.PREPROCESS and not md.safe_mode: + return True + if md.stage == Stage.LINK_DEFS and md.safe_mode: + return True + return md.stage == Stage.BLOCK_GAMUT + + def _code_block_with_lexer_sub(self, md: Markdown, codeblock, leading_indent, lexer): + formatter_opts = md.extras['fenced-code-blocks'] or {} + + def unhash_code(codeblock): + for key, sanitized in list(md.html_spans.items()): + codeblock = codeblock.replace(key, sanitized) + replacements = [ + ("&", "&"), + ("<", "<"), + (">", ">") + ] + for old, new in replacements: + codeblock = codeblock.replace(old, new) + return codeblock + # remove leading indent from code block + _, codeblock = md._uniform_outdent(codeblock, max_outdent=leading_indent) + + codeblock = unhash_code(codeblock) + colored = md._color_with_pygments(codeblock, lexer, + **formatter_opts) + + # add back the indent to all lines + return "\n%s\n" % md._uniform_indent(colored, leading_indent, True) + + def tags(self, md: Markdown, lexer_name): + pre_class = md._html_class_str_from_tag('pre') + if "highlightjs-lang" in md.extras and lexer_name: + code_class = ' class="%s language-%s"' % (lexer_name, lexer_name) + else: + code_class = md._html_class_str_from_tag('code') + return ('' % (pre_class, code_class), '') + + def sub(self, match: re.Match, md: Markdown): + lexer_name = match.group(2) + codeblock = match.group(3) + codeblock = codeblock[:-1] # drop one trailing newline + + # Use pygments only if not using the highlightjs-lang extra + if lexer_name and "highlightjs-lang" not in md.extras: + lexer = md._get_pygments_lexer(lexer_name) + if lexer: + leading_indent = ' '*(len(match.group(1)) - len(match.group(1).lstrip())) + return self._code_block_with_lexer_sub(md, codeblock, leading_indent, lexer) + + # Fenced code blocks need to be outdented before encoding, and then reapplied + leading_indent = ' ' * 
(len(match.group(1)) - len(match.group(1).lstrip())) + if codeblock: + # only run the codeblock through the outdenter if not empty + leading_indent, codeblock = md._uniform_outdent(codeblock, max_outdent=leading_indent) + + codeblock = md._encode_code(codeblock) + + tags = self.tags(md, lexer_name) + + return "\n%s%s%s\n%s\n" % (leading_indent, tags[0], codeblock, tags[1]) + + def run(self, md, text): + return self.fenced_code_block_re.sub(lambda m: self.sub(m, md), text) + # return self.fenced_code_block_re.sub(md._fenced_code_block_sub, text) + + +class Mermaid(FencedCodeBlocks): + name = 'mermaid' + order = Stage.before(FencedCodeBlocks) + + def tags(self, md: Markdown, lexer_name): + if lexer_name == 'mermaid': + return ('
', '
') + return super().tags(md, lexer_name) + + class Numbering(Extra): name = 'numbering' order = Stage.before(Stage.LINK_DEFS) - def match(self, md, text): + def test(self, md, text): return True def run(self, md: Markdown, text): @@ -2791,8 +2808,8 @@ class Wavedrom(Extra): name = 'wavedrom' order = Stage.before(Stage.CODE_BLOCKS) + Stage.after(Stage.PREPROCESS) - def match(self, md, text): - match = Markdown._fenced_code_block_re.search(text) + def test(self, md, text): + match = FencedCodeBlocks.fenced_code_block_re.search(text) return match is None or match.group(2) == 'wavedrom' def sub(self, md: Markdown, match, **opts): @@ -2821,7 +2838,7 @@ def sub(self, md: Markdown, match, **opts): ) def run(self, md: Markdown, text, **opts): - return Markdown._fenced_code_block_re.sub( + return FencedCodeBlocks.fenced_code_block_re.sub( lambda *_: self.sub(md, *_, **opts), text ) diff --git a/test/tm-cases/mermaid.opts b/test/tm-cases/mermaid.opts index 8bf6712c..fb557ac2 100644 --- a/test/tm-cases/mermaid.opts +++ b/test/tm-cases/mermaid.opts @@ -1 +1 @@ -{"extras": ['fenced-code-blocks', 'mermaid']} +{"extras": ['mermaid']} From 26ae9a5ed44f64ad5dd89ef4652f872b68b891b5 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Tue, 23 May 2023 20:17:57 +0100 Subject: [PATCH 04/23] Initialise extras with instance of `Markdown` and convert pyshell extra --- lib/markdown2.py | 186 ++++++++++++++++++++++------------------------- 1 file changed, 88 insertions(+), 98 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 76345f95..23fc393d 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -242,14 +242,14 @@ def inner(self: 'Markdown', text, *args, **kwargs): after.sort(key=lambda k: k[0]) for _, klass in before: - if klass.test(self, text): - text = klass.run(self, text, **(self.extras[klass.name] or {})) + if klass.test(text): + text = klass.run(text, **(self.extras[klass.name] or {})) text = func(self, text, *args, **kwargs) for _, klass in after: - if 
klass.test(self, text): - text = klass.run(self, text, **(self.extras[klass.name] or {})) + if klass.test(text): + text = klass.run(text, **(self.extras[klass.name] or {})) return text @@ -361,7 +361,7 @@ def _setup_extras(self): self._count_from_header_id = defaultdict(int) if "metadata" in self.extras: self.metadata = {} - Extra.collect() + Extra.collect(self) # Per "rel" # should only be used in tags with an "href" attribute. @@ -439,18 +439,12 @@ def convert(self, text): text = self.preprocess(text) - # if "fenced-code-blocks" in self.extras and not self.safe_mode: - # text = self._do_fenced_code_blocks(text) - if self.safe_mode: text = self._hash_html_spans(text) # Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True) - # if "fenced-code-blocks" in self.extras and self.safe_mode: - # text = self._do_fenced_code_blocks(text) - # Strip link definitions, store in hashes. if "footnotes" in self.extras: # Must do footnotes first because an unlucky footnote defn @@ -1076,9 +1070,6 @@ def _run_block_gamut(self, text): # These are all the transformations that form block-level # tags like paragraphs, headers, and list items. 
- # if "fenced-code-blocks" in self.extras: - # text = self._do_fenced_code_blocks(text) - text = self._do_headers(text) # Do Horizontal Rules: @@ -1091,8 +1082,6 @@ def _run_block_gamut(self, text): text = self._do_lists(text) - if "pyshell" in self.extras: - text = self._prepare_pyshell_blocks(text) if "wiki-tables" in self.extras: text = self._do_wiki_tables(text) if "tables" in self.extras: @@ -1112,34 +1101,6 @@ def _run_block_gamut(self, text): return text - def _pyshell_block_sub(self, match): - if "fenced-code-blocks" in self.extras: - dedented = _dedent(match.group(0)) - return Extra.get('fenced-code-blocks').run(self, "```pycon\n" + dedented + "```\n") - lines = match.group(0).splitlines(0) - _dedentlines(lines) - indent = ' ' * self.tab_width - s = ('\n' # separate from possible cuddled paragraph - + indent + ('\n'+indent).join(lines) - + '\n') - return s - - def _prepare_pyshell_blocks(self, text): - """Ensure that Python interactive shell sessions are put in - code blocks -- even if not properly indented. 
- """ - if ">>>" not in text: - return text - - less_than_tab = self.tab_width - 1 - _pyshell_block_re = re.compile(r""" - ^([ ]{0,%d})>>>[ ].*\n # first line - ^(\1[^\S\n]*\S.*\n)* # any number of subsequent lines with at least one character - (?=^\1?\n|\Z) # ends with a blank line or end of document - """ % less_than_tab, re.M | re.X) - - return _pyshell_block_re.sub(self._pyshell_block_sub, text) - def _table_sub(self, match): trim_space_re = '^[ \t\n]+|[ \t\n]+$' trim_bar_re = r'^\||\|$' @@ -2571,13 +2532,14 @@ class Extra(ABC): name: str order: list - def __init__(self): + def __init__(self, md: Markdown): + self.md = md self.register() @classmethod - def collect(cls): + def collect(cls, md: Markdown): for s in cls.__subclasses(cls): - s_inst = s() + s_inst = s(md) cls._registry[s_inst.name] = s_inst @classmethod @@ -2585,11 +2547,11 @@ def get(cls, extra_name: str) -> 'Extra': return cls._registry[extra_name] @abstractmethod - def test(self, md: Markdown, text: str) -> bool: + def test(self, text: str) -> bool: return self.re.search(text) is not None @abstractmethod - def run(self, md: Markdown, text: str, **opts) -> str: + def run(self, text: str, **opts) -> str: ... 
def register(self): @@ -2617,10 +2579,10 @@ class Admonitions(Extra): re.IGNORECASE | re.MULTILINE | re.VERBOSE ) - def test(self, md, text): + def test(self, text): return self.admonitions_re.search(text) is not None - def sub(self, md: Markdown, match): + def sub(self, match): lead_indent, admonition_name, title, body = match.groups() admonition_type = '%s' % admonition_name @@ -2636,20 +2598,20 @@ def sub(self, md: Markdown, match): title = '%s' % title # process the admonition body like regular markdown - body = md._run_block_gamut("\n%s\n" % md._uniform_outdent(body)[1]) + body = self.md._run_block_gamut("\n%s\n" % self.md._uniform_outdent(body)[1]) # indent the body before placing inside the aside block - admonition = md._uniform_indent( + admonition = self.md._uniform_indent( '%s\n%s\n\n%s\n' % (admonition_type, title, body), - md.tab, False + self.md.tab, False ) # wrap it in an aside admonition = '' % (admonition_class, admonition) # now indent the whole admonition back to where it started - return md._uniform_indent(admonition, lead_indent, False) + return self.md._uniform_indent(admonition, lead_indent, False) - def run(self, md, text): - return self.admonitions_re.sub(lambda *_: self.sub(md, *_), text) + def run(self, text): + return self.admonitions_re.sub(self.sub, text) class FencedCodeBlocks(Extra): @@ -2663,18 +2625,18 @@ class FencedCodeBlocks(Extra): \1[ \t]*\n # closing fence ''', re.M | re.X | re.S) - def test(self, md, text): - if md.stage == Stage.PREPROCESS and not md.safe_mode: + def test(self, text): + if self.md.stage == Stage.PREPROCESS and not self.md.safe_mode: return True - if md.stage == Stage.LINK_DEFS and md.safe_mode: + if self.md.stage == Stage.LINK_DEFS and self.md.safe_mode: return True - return md.stage == Stage.BLOCK_GAMUT + return self.md.stage == Stage.BLOCK_GAMUT - def _code_block_with_lexer_sub(self, md: Markdown, codeblock, leading_indent, lexer): - formatter_opts = md.extras['fenced-code-blocks'] or {} + def 
_code_block_with_lexer_sub(self, codeblock, leading_indent, lexer): + formatter_opts = self.md.extras['fenced-code-blocks'] or {} def unhash_code(codeblock): - for key, sanitized in list(md.html_spans.items()): + for key, sanitized in list(self.md.html_spans.items()): codeblock = codeblock.replace(key, sanitized) replacements = [ ("&", "&"), @@ -2685,70 +2647,69 @@ def unhash_code(codeblock): codeblock = codeblock.replace(old, new) return codeblock # remove leading indent from code block - _, codeblock = md._uniform_outdent(codeblock, max_outdent=leading_indent) + _, codeblock = self.md._uniform_outdent(codeblock, max_outdent=leading_indent) codeblock = unhash_code(codeblock) - colored = md._color_with_pygments(codeblock, lexer, - **formatter_opts) + colored = self.md._color_with_pygments(codeblock, lexer, + **formatter_opts) # add back the indent to all lines - return "\n%s\n" % md._uniform_indent(colored, leading_indent, True) + return "\n%s\n" % self.md._uniform_indent(colored, leading_indent, True) - def tags(self, md: Markdown, lexer_name): - pre_class = md._html_class_str_from_tag('pre') - if "highlightjs-lang" in md.extras and lexer_name: + def tags(self, lexer_name): + pre_class = self.md._html_class_str_from_tag('pre') + if "highlightjs-lang" in self.md.extras and lexer_name: code_class = ' class="%s language-%s"' % (lexer_name, lexer_name) else: - code_class = md._html_class_str_from_tag('code') + code_class = self.md._html_class_str_from_tag('code') return ('' % (pre_class, code_class), '') - def sub(self, match: re.Match, md: Markdown): + def sub(self, match: re.Match): lexer_name = match.group(2) codeblock = match.group(3) codeblock = codeblock[:-1] # drop one trailing newline # Use pygments only if not using the highlightjs-lang extra - if lexer_name and "highlightjs-lang" not in md.extras: - lexer = md._get_pygments_lexer(lexer_name) + if lexer_name and "highlightjs-lang" not in self.md.extras: + lexer = self.md._get_pygments_lexer(lexer_name) if 
lexer: leading_indent = ' '*(len(match.group(1)) - len(match.group(1).lstrip())) - return self._code_block_with_lexer_sub(md, codeblock, leading_indent, lexer) + return self._code_block_with_lexer_sub(codeblock, leading_indent, lexer) # Fenced code blocks need to be outdented before encoding, and then reapplied leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip())) if codeblock: # only run the codeblock through the outdenter if not empty - leading_indent, codeblock = md._uniform_outdent(codeblock, max_outdent=leading_indent) + leading_indent, codeblock = self.md._uniform_outdent(codeblock, max_outdent=leading_indent) - codeblock = md._encode_code(codeblock) + codeblock = self.md._encode_code(codeblock) - tags = self.tags(md, lexer_name) + tags = self.tags(lexer_name) return "\n%s%s%s\n%s\n" % (leading_indent, tags[0], codeblock, tags[1]) - def run(self, md, text): - return self.fenced_code_block_re.sub(lambda m: self.sub(m, md), text) - # return self.fenced_code_block_re.sub(md._fenced_code_block_sub, text) + def run(self, text): + return self.fenced_code_block_re.sub(self.sub, text) class Mermaid(FencedCodeBlocks): name = 'mermaid' order = Stage.before(FencedCodeBlocks) - def tags(self, md: Markdown, lexer_name): + def tags(self, lexer_name): if lexer_name == 'mermaid': return ('
', '
') - return super().tags(md, lexer_name) + return super().tags(lexer_name) class Numbering(Extra): name = 'numbering' order = Stage.before(Stage.LINK_DEFS) - def test(self, md, text): + def test(self, text): return True - def run(self, md: Markdown, text): + def run(self, text): # First pass to define all the references regex_defns = re.compile(r''' \[\#(\w+) # the counter. Open square plus hash plus a word \1 @@ -2797,24 +2758,55 @@ def run(self, md: Markdown, text): repl = reference_html.format(match.group(1), 'countererror', '?' + match.group(1) + '?') - if "smarty-pants" in md.extras: - repl = repl.replace('"', md._escape_table['"']) + if "smarty-pants" in self.md.extras: + repl = repl.replace('"', self.md._escape_table['"']) text = text[:match.start()] + repl + text[match.end():] return text +class PyShell(Extra): + name = 'pyshell' + order = Stage.after(Stage.LISTS) + + def test(self, text): + return ">>>" in text + + def sub(self, match: re.Match): + if "fenced-code-blocks" in self.md.extras: + dedented = _dedent(match.group(0)) + return Extra.get('fenced-code-blocks').run("```pycon\n" + dedented + "```\n") + + lines = match.group(0).splitlines(0) + _dedentlines(lines) + indent = ' ' * self.md.tab_width + s = ('\n' # separate from possible cuddled paragraph + + indent + ('\n'+indent).join(lines) + + '\n') + return s + + def run(self, text): + less_than_tab = self.md.tab_width - 1 + _pyshell_block_re = re.compile(r""" + ^([ ]{0,%d})>>>[ ].*\n # first line + ^(\1[^\S\n]*\S.*\n)* # any number of subsequent lines with at least one character + (?=^\1?\n|\Z) # ends with a blank line or end of document + """ % less_than_tab, re.M | re.X) + + return _pyshell_block_re.sub(self.sub, text) + + class Wavedrom(Extra): name = 'wavedrom' order = Stage.before(Stage.CODE_BLOCKS) + Stage.after(Stage.PREPROCESS) - def test(self, md, text): + def test(self, text): match = FencedCodeBlocks.fenced_code_block_re.search(text) return match is None or match.group(2) == 'wavedrom' - 
def sub(self, md: Markdown, match, **opts): + def sub(self, match, **opts): # dedent the block for processing - lead_indent, waves = md._uniform_outdent(match.group(3)) + lead_indent, waves = self.md._uniform_outdent(match.group(3)) # default tags to wrap the wavedrom block in open_tag, close_tag = '' @@ -2830,17 +2822,15 @@ def sub(self, md: Markdown, match, **opts): pass # hash SVG to prevent <> chars being messed with - md._escape_table[waves] = _hash_text(waves) + self.md._escape_table[waves] = _hash_text(waves) - return md._uniform_indent( - '\n%s%s%s\n' % (open_tag, md._escape_table[waves], close_tag), + return self.md._uniform_indent( + '\n%s%s%s\n' % (open_tag, self.md._escape_table[waves], close_tag), lead_indent, include_empty_lines=True ) - def run(self, md: Markdown, text, **opts): - return FencedCodeBlocks.fenced_code_block_re.sub( - lambda *_: self.sub(md, *_, **opts), text - ) + def run(self, text, **opts): + return FencedCodeBlocks.fenced_code_block_re.sub(_curry(self.sub, **opts), text) # ---------------------------------------------------------- From f4b3d663b94d2d5bd095c0684f9a5382e463574e Mon Sep 17 00:00:00 2001 From: Crozzers Date: Tue, 23 May 2023 20:33:02 +0100 Subject: [PATCH 05/23] Add some docstrings --- lib/markdown2.py | 74 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 23fc393d..49fba2be 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -2530,7 +2530,15 @@ class Extra(ABC): _registry = {} name: str + ''' + An identifiable name that users can use to invoke the extra + in the Markdown class + ''' order: list + ''' + A list of stages at which this extra should be invoked. 
+ See `Stage`, `Stage.before` and `Stage.after` + ''' def __init__(self, md: Markdown): self.md = md @@ -2538,25 +2546,47 @@ def __init__(self, md: Markdown): @classmethod def collect(cls, md: Markdown): + ''' + Collects all subclasses of `Extra`, initialises and registers them + ''' for s in cls.__subclasses(cls): s_inst = s(md) cls._registry[s_inst.name] = s_inst @classmethod def get(cls, extra_name: str) -> 'Extra': + ''' + Get a registered extra by name. + For example, `Extra.get('fenced-code-blocks')` will return an instance + of the `FencedCodeBlocks` extra + ''' return cls._registry[extra_name] - @abstractmethod - def test(self, text: str) -> bool: - return self.re.search(text) is not None - @abstractmethod def run(self, text: str, **opts) -> str: + ''' + Run the extra against the given text. + + Args: + text: the text to process + **opts: any parameters supplied via `Markdown`'s extras dict + ''' ... def register(self): + ''' + Registers the class for use with `Markdown`. This function is + called during initialisation. + ''' self.__class__._registry[self.name] = self + @abstractmethod + def test(self, text: str) -> bool: + ''' + Check a section of markdown to see if this extra should be run upon it. + ''' + ... + @staticmethod def __subclasses(cls): return set(cls.__subclasses__()).union( @@ -2565,6 +2595,10 @@ def __subclasses(cls): class Admonitions(Extra): + ''' + Enable parsing of RST admonitions + ''' + name = 'admonitions' order = Stage.before(Stage.BLOCK_GAMUT, Stage.LINK_DEFS) @@ -2615,6 +2649,13 @@ def run(self, text): class FencedCodeBlocks(Extra): + ''' + Allows a code block to not have to be indented + by fencing it with '```' on a line before and after. Based on + with support for + syntax highlighting. 
+ ''' + name = 'fenced-code-blocks' order = Stage.before(Stage.LINK_DEFS, Stage.BLOCK_GAMUT) + Stage.after(Stage.PREPROCESS) @@ -2656,7 +2697,17 @@ def unhash_code(codeblock): # add back the indent to all lines return "\n%s\n" % self.md._uniform_indent(colored, leading_indent, True) - def tags(self, lexer_name): + def tags(self, lexer_name) -> tuple: + ''' + Returns the tags that the encoded code block will be wrapped in, based + upon the lexer name. + + This function can be overridden by subclasses to piggy-back off of the + fenced code blocks syntax (see `Mermaid` extra). + + Returns: + The opening and closing tags, as strings within a tuple + ''' pre_class = self.md._html_class_str_from_tag('pre') if "highlightjs-lang" in self.md.extras and lexer_name: code_class = ' class="%s language-%s"' % (lexer_name, lexer_name) @@ -2703,6 +2754,11 @@ def tags(self, lexer_name): class Numbering(Extra): + ''' + Support of generic counters. Non standard extension to + allow sequential numbering of figures, tables, equations, exhibits etc. + ''' + name = 'numbering' order = Stage.before(Stage.LINK_DEFS) @@ -2766,6 +2822,11 @@ def run(self, text): class PyShell(Extra): + ''' + Treats unindented Python interactive shell sessions as + blocks. + ''' + name = 'pyshell' order = Stage.after(Stage.LISTS) @@ -2797,6 +2858,9 @@ def run(self, text): class Wavedrom(Extra): + ''' + Support for generating Wavedrom digital timing diagrams + ''' name = 'wavedrom' order = Stage.before(Stage.CODE_BLOCKS) + Stage.after(Stage.PREPROCESS) From 283fcdf07db87c2ca0fd3aa24ba484f695907602 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Tue, 23 May 2023 20:52:44 +0100 Subject: [PATCH 06/23] Convert tables and wiki-tables extras to new format. 
All block extras have now been converted --- lib/markdown2.py | 282 +++++++++++++++++++++++++---------------------- 1 file changed, 148 insertions(+), 134 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 49fba2be..b6e83ba0 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1082,11 +1082,6 @@ def _run_block_gamut(self, text): text = self._do_lists(text) - if "wiki-tables" in self.extras: - text = self._do_wiki_tables(text) - if "tables" in self.extras: - text = self._do_tables(text) - text = self._do_code_blocks(text) text = self._do_block_quotes(text) @@ -1101,135 +1096,6 @@ def _run_block_gamut(self, text): return text - def _table_sub(self, match): - trim_space_re = '^[ \t\n]+|[ \t\n]+$' - trim_bar_re = r'^\||\|$' - split_bar_re = r'^\||(?' % self._html_class_str_from_tag('table'), '' % self._html_class_str_from_tag('thead'), ''] - cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))] - for col_idx, col in enumerate(cols): - hlines.append(' %s' % ( - align_from_col_idx.get(col_idx, ''), - self._run_span_gamut(col) - )) - hlines.append('') - hlines.append('') - - # tbody - hlines.append('') - for line in body.strip('\n').split('\n'): - hlines.append('') - cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))] - for col_idx, col in enumerate(cols): - hlines.append(' %s' % ( - align_from_col_idx.get(col_idx, ''), - self._run_span_gamut(col) - )) - hlines.append('') - hlines.append('') - hlines.append('') - - return '\n'.join(hlines) + '\n' - - def _do_tables(self, text): - """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from - https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538 - """ - less_than_tab = self.tab_width - 1 - table_re = re.compile(r''' - (?:(?<=\n\n)|\A\n?) 
# leading blank line - - ^[ ]{0,%d} # allowed whitespace - (.*[|].*) \n # $1: header row (at least one pipe) - - ^[ ]{0,%d} # allowed whitespace - ( # $2: underline row - # underline row with leading bar - (?: \|\ *:?-+:?\ * )+ \|? \s? \n - | - # or, underline row without leading bar - (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \s? \n - ) - - ( # $3: data rows - (?: - ^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces - .*\|.* \n - )+ - ) - ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X) - return table_re.sub(self._table_sub, text) - - def _wiki_table_sub(self, match): - ttext = match.group(0).strip() - # print('wiki table: %r' % match.group(0)) - rows = [] - for line in ttext.splitlines(0): - line = line.strip()[2:-2].strip() - row = [c.strip() for c in re.split(r'(?' % self._html_class_str_from_tag('table')) - # Check if first cell of first row is a header cell. If so, assume the whole row is a header row. - if rows and rows[0] and re.match(r"^\s*~", rows[0][0]): - add_hline('' % self._html_class_str_from_tag('thead'), 1) - add_hline('', 2) - for cell in rows[0]: - add_hline("{}".format(format_cell(cell)), 3) - add_hline('', 2) - add_hline('', 1) - # Only one header row allowed. - rows = rows[1:] - # If no more rows, don't create a tbody. - if rows: - add_hline('', 1) - for row in rows: - add_hline('', 2) - for cell in row: - add_hline('{}'.format(format_cell(cell)), 3) - add_hline('', 2) - add_hline('', 1) - add_hline('') - return '\n'.join(hlines) + '\n' - - def _do_wiki_tables(self, text): - # Optimization. - if "||" not in text: - return text - - less_than_tab = self.tab_width - 1 - wiki_table_re = re.compile(r''' - (?:(?<=\n\n)|\A\n?) 
# leading blank line - ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line - (^\1\|\|.+?\|\|\n)* # any number of subsequent lines - ''' % less_than_tab, re.M | re.X) - return wiki_table_re.sub(self._wiki_table_sub, text) - @Stage.mark(Stage.SPAN_GAMUT) def _run_span_gamut(self, text): # These are all the transformations that occur *within* block-level @@ -2857,6 +2723,94 @@ def run(self, text): return _pyshell_block_re.sub(self.sub, text) +class Tables(Extra): + ''' + Tables using the same format as GFM + and + PHP-Markdown Extra . + ''' + name = 'tables' + order = Stage.after(Stage.LISTS) + + def run(self, text: str): + """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from + https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538 + """ + less_than_tab = self.md.tab_width - 1 + table_re = re.compile(r''' + (?:(?<=\n\n)|\A\n?) # leading blank line + + ^[ ]{0,%d} # allowed whitespace + (.*[|].*) \n # $1: header row (at least one pipe) + + ^[ ]{0,%d} # allowed whitespace + ( # $2: underline row + # underline row with leading bar + (?: \|\ *:?-+:?\ * )+ \|? \s? \n + | + # or, underline row without leading bar + (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \s? \n + ) + + ( # $3: data rows + (?: + ^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces + .*\|.* \n + )+ + ) + ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X) + return table_re.sub(self.sub, text) + + def sub(self, match: re.Match): + trim_space_re = '^[ \t\n]+|[ \t\n]+$' + trim_bar_re = r'^\||\|$' + split_bar_re = r'^\||(?' 
% self.md._html_class_str_from_tag('table'), '' % self.md._html_class_str_from_tag('thead'), ''] + cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))] + for col_idx, col in enumerate(cols): + hlines.append(' %s' % ( + align_from_col_idx.get(col_idx, ''), + self.md._run_span_gamut(col) + )) + hlines.append('') + hlines.append('') + + # tbody + hlines.append('') + for line in body.strip('\n').split('\n'): + hlines.append('') + cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))] + for col_idx, col in enumerate(cols): + hlines.append(' %s' % ( + align_from_col_idx.get(col_idx, ''), + self.md._run_span_gamut(col) + )) + hlines.append('') + hlines.append('') + hlines.append('') + + return '\n'.join(hlines) + '\n' + + def test(self, text: str): + return True + + class Wavedrom(Extra): ''' Support for generating Wavedrom digital timing diagrams @@ -2897,6 +2851,66 @@ def run(self, text, **opts): return FencedCodeBlocks.fenced_code_block_re.sub(_curry(self.sub, **opts), text) +class WikiTables(Extra): + ''' + Google Code Wiki-style tables. See + . + ''' + name = 'wiki-tables' + order = Stage.before(Tables) + + def run(self, text: str): + less_than_tab = self.md.tab_width - 1 + wiki_table_re = re.compile(r''' + (?:(?<=\n\n)|\A\n?) # leading blank line + ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line + (^\1\|\|.+?\|\|\n)* # any number of subsequent lines + ''' % less_than_tab, re.M | re.X) + return wiki_table_re.sub(self.sub, text) + + def sub(self, match: re.Match): + ttext = match.group(0).strip() + rows = [] + for line in ttext.splitlines(0): + line = line.strip()[2:-2].strip() + row = [c.strip() for c in re.split(r'(?' % self.md._html_class_str_from_tag('table')) + # Check if first cell of first row is a header cell. If so, assume the whole row is a header row. 
+ if rows and rows[0] and re.match(r"^\s*~", rows[0][0]): + add_hline('' % self.md._html_class_str_from_tag('thead'), 1) + add_hline('', 2) + for cell in rows[0]: + add_hline("{}".format(format_cell(cell)), 3) + add_hline('', 2) + add_hline('', 1) + # Only one header row allowed. + rows = rows[1:] + # If no more rows, don't create a tbody. + if rows: + add_hline('', 1) + for row in rows: + add_hline('', 2) + for cell in row: + add_hline('{}'.format(format_cell(cell)), 3) + add_hline('', 2) + add_hline('', 1) + add_hline('') + return '\n'.join(hlines) + '\n' + + def test(self, text: str): + return '||' in text + + # ---------------------------------------------------------- From cb1fd28679161074a9908cbfdd7128245efea475 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Wed, 24 May 2023 18:36:11 +0100 Subject: [PATCH 07/23] Pass extra options to initialiser --- lib/markdown2.py | 49 +++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index b6e83ba0..76778ba2 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -196,27 +196,27 @@ class Stage(): __counts = {} @classmethod - def _order(cls, items, direction=5): - index = 0 if direction > 0 else 1 + def _order(cls, items, step=5): + index = 0 if step > 0 else 1 ret = [] counts = cls._Stage__counts for item in items: if not isinstance(item, int) and issubclass(item, Extra): - ret.extend(o + direction for o in item.order) + ret.extend(o + step for o in item.order) else: if item not in counts: counts[item] = [0, 0] - counts[item][index] += direction + counts[item][index] += step ret.append(item + counts[item][1]) return ret @classmethod - def after(cls, *items: 'Stage') -> list: - return cls._order(items, 5) + def after(cls, *items: 'Stage', step=5) -> list: + return cls._order(items, step) @classmethod - def before(cls, *items: 'Stage') -> list: - return cls._order(items, -5) + def before(cls, *items: 'Stage', step=-5) -> list: + 
return cls._order(items, step) @staticmethod def mark(stage): @@ -243,13 +243,13 @@ def inner(self: 'Markdown', text, *args, **kwargs): for _, klass in before: if klass.test(text): - text = klass.run(text, **(self.extras[klass.name] or {})) + text = klass.run(text) text = func(self, text, *args, **kwargs) for _, klass in after: if klass.test(text): - text = klass.run(text, **(self.extras[klass.name] or {})) + text = klass.run(text) return text @@ -361,7 +361,10 @@ def _setup_extras(self): self._count_from_header_id = defaultdict(int) if "metadata" in self.extras: self.metadata = {} - Extra.collect(self) + + for extra in Extra.collect(self): + instance: Extra = extra(self, (self.extras.get(extra.name) or {})) + instance.register() # Per "rel" # should only be used in
tags with an "href" attribute. @@ -2406,18 +2409,16 @@ class Extra(ABC): See `Stage`, `Stage.before` and `Stage.after` ''' - def __init__(self, md: Markdown): + def __init__(self, md: Markdown, options): self.md = md - self.register() + self.options = options @classmethod - def collect(cls, md: Markdown): + def collect(cls, md: Markdown) -> list: ''' - Collects all subclasses of `Extra`, initialises and registers them + Returns all subclasses of `Extra` ''' - for s in cls.__subclasses(cls): - s_inst = s(md) - cls._registry[s_inst.name] = s_inst + return list(cls.__subclasses(cls)) @classmethod def get(cls, extra_name: str) -> 'Extra': @@ -2429,7 +2430,7 @@ def get(cls, extra_name: str) -> 'Extra': return cls._registry[extra_name] @abstractmethod - def run(self, text: str, **opts) -> str: + def run(self, text: str) -> str: ''' Run the extra against the given text. @@ -2533,6 +2534,8 @@ class FencedCodeBlocks(Extra): ''', re.M | re.X | re.S) def test(self, text): + if '```' not in text: + return False if self.md.stage == Stage.PREPROCESS and not self.md.safe_mode: return True if self.md.stage == Stage.LINK_DEFS and self.md.safe_mode: @@ -2822,14 +2825,14 @@ def test(self, text): match = FencedCodeBlocks.fenced_code_block_re.search(text) return match is None or match.group(2) == 'wavedrom' - def sub(self, match, **opts): + def sub(self, match): # dedent the block for processing lead_indent, waves = self.md._uniform_outdent(match.group(3)) # default tags to wrap the wavedrom block in open_tag, close_tag = '' # check if the user would prefer to have the SVG embedded directly - embed_svg = opts.get('prefer_embed_svg', True) + embed_svg = self.options.get('prefer_embed_svg', True) if embed_svg: try: @@ -2847,8 +2850,8 @@ def sub(self, match, **opts): lead_indent, include_empty_lines=True ) - def run(self, text, **opts): - return FencedCodeBlocks.fenced_code_block_re.sub(_curry(self.sub, **opts), text) + def run(self, text): + return 
FencedCodeBlocks.fenced_code_block_re.sub(self.sub, text) class WikiTables(Extra): From e460b2e6f4aa5dd194705def5feef45c17513f19 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Thu, 25 May 2023 08:33:23 +0100 Subject: [PATCH 08/23] Allow link patterns to be specified via extras dict --- lib/markdown2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 76778ba2..058c6e65 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -322,14 +322,19 @@ def __init__(self, html4tags=False, tab_width=4, safe_mode=None, self._toc_depth = 6 else: self._toc_depth = self.extras["toc"].get("depth", 6) - self._instance_extras = self.extras.copy() if 'link-patterns' in self.extras: + # allow link patterns via extras dict without kwarg explicitly set + link_patterns = link_patterns or extras['link-patterns'] if link_patterns is None: # if you have specified that the link-patterns extra SHOULD # be used (via self.extras) but you haven't provided anything # via the link_patterns argument then an error is raised raise MarkdownError("If the 'link-patterns' extra is used, an argument for 'link_patterns' is required") + self.extras['link-patterns'] = link_patterns + + self._instance_extras = self.extras.copy() + self.link_patterns = link_patterns self.footnote_title = footnote_title self.footnote_return_symbol = footnote_return_symbol From ad7a3ff24602b3c6769c8d4d6b282680a7f8e4ee Mon Sep 17 00:00:00 2001 From: Crozzers Date: Wed, 24 May 2023 18:37:23 +0100 Subject: [PATCH 09/23] Convert span extras to new format. Still TODO is things like `footnotes` extra. 
Also need to think of a system for extras to replace parts of the standard syntax --- lib/markdown2.py | 340 +++++++++++++++++++++++++++-------------------- 1 file changed, 195 insertions(+), 145 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 058c6e65..36bfded7 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1114,9 +1114,6 @@ def _run_span_gamut(self, text): text = self._escape_special_chars(text) # Process anchor and image tags. - if "link-patterns" in self.extras: - text = self._do_link_patterns(text) - text = self._do_links(text) # Make links out of things like `` @@ -1126,25 +1123,10 @@ def _run_span_gamut(self, text): text = self._encode_amps_and_angles(text) - if "strike" in self.extras: - text = self._do_strike(text) - - if "underline" in self.extras: - text = self._do_underline(text) - text = self._do_italics_and_bold(text) - if "tg-spoiler" in self.extras: - text = self._do_tg_spoiler(text) - - if "smarty-pants" in self.extras: - text = self._do_smart_punctuation(text) - # Do hard breaks: - if "break-on-newline" in self.extras: - text = re.sub(r" *\n(?!\<(?:\/?(ul|ol|li))\>)", "\1", text) - return text - - _underline_re = re.compile(r"(?)(?=\S)(.+?)(?<=\S)(?)", re.S) - def _do_underline(self, text): - text = self._underline_re.sub(r"\1", text) - return text - - _tg_spoiler_re = re.compile(r"\|\|\s?(.+?)\s?\|\|", re.S) - def _do_tg_spoiler(self, text): - text = self._tg_spoiler_re.sub(r"\1", text) - return text - _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S) _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S) _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S) @@ -1979,59 +1946,6 @@ def _do_italics_and_bold(self, text): text = self._em_re.sub(r"\2", text) return text - # "smarty-pants" extra: Very liberal in interpreting a single prime as an - # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and - # "twixt" can be written without an initial apostrophe. 
This is fine because - # using scare quotes (single quotation marks) is rare. - _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))") - _contractions = ["tis", "twas", "twer", "neath", "o", "n", - "round", "bout", "twixt", "nuff", "fraid", "sup"] - def _do_smart_contractions(self, text): - text = self._apostrophe_year_re.sub(r"’\1", text) - for c in self._contractions: - text = text.replace("'%s" % c, "’%s" % c) - text = text.replace("'%s" % c.capitalize(), - "’%s" % c.capitalize()) - return text - - # Substitute double-quotes before single-quotes. - _opening_single_quote_re = re.compile(r"(? - See "test/tm-cases/smarty_pants.text" for a full discussion of the - support here and - for a - discussion of some diversion from the original SmartyPants. - """ - if "'" in text: # guard for perf - text = self._do_smart_contractions(text) - text = self._opening_single_quote_re.sub("‘", text) - text = self._closing_single_quote_re.sub("’", text) - - if '"' in text: # guard for perf - text = self._opening_double_quote_re.sub("“", text) - text = self._closing_double_quote_re.sub("”", text) - - text = text.replace("---", "—") - text = text.replace("--", "–") - text = text.replace("...", "…") - text = text.replace(" . . . ", "…") - text = text.replace(". . .", "…") - - # TODO: Temporary hack to fix https://github.com/trentm/python-markdown2/issues/150 - if "footnotes" in self.extras and "footnote-ref" in text: - # Quotes in the footnote back ref get converted to "smart" quotes - # Change them back here to ensure they work. 
- text = text.replace('class="footnote-ref”', 'class="footnote-ref"') - - return text - _block_quote_base = r''' ( # Wrap whole match in \1 ( @@ -2247,64 +2161,6 @@ def _encode_email_address(self, addr): % (''.join(chars), ''.join(chars[7:])) return addr - _basic_link_re = re.compile(r'!?\[.*?\]\(.*?\)') - def _do_link_patterns(self, text): - link_from_hash = {} - for regex, repl in self.link_patterns: - replacements = [] - for match in regex.finditer(text): - if any(self._match_overlaps_substr(text, match, h) for h in link_from_hash): - continue - - if hasattr(repl, "__call__"): - href = repl(match) - else: - href = match.expand(repl) - replacements.append((match.span(), href)) - for (start, end), href in reversed(replacements): - - # Do not match against links inside brackets. - if text[start - 1:start] == '[' and text[end:end + 1] == ']': - continue - - # Do not match against links in the standard markdown syntax. - if text[start - 2:start] == '](' or text[end:end + 2] == '")': - continue - - # Do not match against links which are escaped. 
- if text[start - 3:start] == '"""' and text[end:end + 3] == '"""': - text = text[:start - 3] + text[start:end] + text[end + 3:] - continue - - # search the text for anything that looks like a link - is_inside_link = False - for link_re in (self._auto_link_re, self._basic_link_re): - for match in link_re.finditer(text): - if any((r[0] <= start and end <= r[1]) for r in match.regs): - # if the link pattern start and end pos is within the bounds of - # something that looks like a link, then don't process it - is_inside_link = True - break - else: - continue - break - - if is_inside_link: - continue - - escaped_href = ( - href.replace('"', '"') # b/c of attr quote - # To avoid markdown and : - .replace('*', self._escape_table['*']) - .replace('_', self._escape_table['_'])) - link = '%s' % (escaped_href, text[start:end]) - hash = _hash_text(link) - link_from_hash[hash] = link - text = text[:start] + hash + text[end:] - for hash, link in list(link_from_hash.items()): - text = text.replace(hash, link) - return text - def _unescape_special_chars(self, text): # Swap back in all the special characters we've hidden. 
while True: @@ -2520,6 +2376,17 @@ def run(self, text): return self.admonitions_re.sub(self.sub, text) +class BreakOnNewline(Extra): + name = 'break-on-newline' + order = Stage.after(Stage.ITALIC_AND_BOLD) + + def run(self, text: str): + return re.sub(r" *\n(?!\<(?:\/?(ul|ol|li))\>)", " and : + .replace('*', self.md._escape_table['*']) + .replace('_', self.md._escape_table['_'])) + link = '%s' % (escaped_href, text[start:end]) + hash = _hash_text(link) + link_from_hash[hash] = link + text = text[:start] + hash + text[end:] + for hash, link in list(link_from_hash.items()): + text = text.replace(hash, link) + return text + + def test(self, text: str): + return True + + class Mermaid(FencedCodeBlocks): name = 'mermaid' order = Stage.before(FencedCodeBlocks) @@ -2731,6 +2669,89 @@ def run(self, text): return _pyshell_block_re.sub(self.sub, text) +class SmartyPants(Extra): + ''' + Replaces ' and " with curly quotation marks or curly + apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes, + and ellipses. + ''' + name = 'smarty-pants' + order = Stage.after(Stage.SPAN_GAMUT) + + _opening_single_quote_re = re.compile(r"(? str: + text = self._apostrophe_year_re.sub(r"’\1", text) + for c in self._contractions: + text = text.replace("'%s" % c, "’%s" % c) + text = text.replace("'%s" % c.capitalize(), + "’%s" % c.capitalize()) + return text + + def run(self, text: str): + """Fancifies 'single quotes', "double quotes", and apostrophes. + Converts --, ---, and ... into en dashes, em dashes, and ellipses. + + Inspiration is: + See "test/tm-cases/smarty_pants.text" for a full discussion of the + support here and + for a + discussion of some diversion from the original SmartyPants. 
+ """ + if "'" in text: # guard for perf + text = self.contractions(text) + text = self._opening_single_quote_re.sub("‘", text) + text = self._closing_single_quote_re.sub("’", text) + + if '"' in text: # guard for perf + text = self._opening_double_quote_re.sub("“", text) + text = self._closing_double_quote_re.sub("”", text) + + text = text.replace("---", "—") + text = text.replace("--", "–") + text = text.replace("...", "…") + text = text.replace(" . . . ", "…") + text = text.replace(". . .", "…") + + # TODO: Temporary hack to fix https://github.com/trentm/python-markdown2/issues/150 + if "footnotes" in self.md.extras and "footnote-ref" in text: + # Quotes in the footnote back ref get converted to "smart" quotes + # Change them back here to ensure they work. + text = text.replace('class="footnote-ref”', 'class="footnote-ref"') + + return text + + def test(self, text: str): + return "'" in text or '"' in text + + +class Strike(Extra): + ''' + Text inside of double tilde is ~~strikethrough~~ + ''' + name = 'strike' + order = Stage.before(Stage.ITALIC_AND_BOLD) + + _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S) + + def run(self, text: str): + return self._strike_re.sub(r"\1", text) + + def test(self, text: str): + return '~~' in text + + class Tables(Extra): ''' Tables using the same format as GFM @@ -2819,6 +2840,35 @@ def test(self, text: str): return True +class TelegramSpoiler(Extra): + name = 'tg-spoiler' + order = Stage.after(Stage.ITALIC_AND_BOLD) + + _tg_spoiler_re = re.compile(r"\|\|\s?(.+?)\s?\|\|", re.S) + + def run(self, text: str): + return self._tg_spoiler_re.sub(r"\1", text) + + def test(self, text: str): + return '||' in text + + +class Underline(Extra): + ''' + Text inside of double dash is --underlined--. 
+ ''' + name = 'underline' + order = Stage.before(Stage.ITALIC_AND_BOLD) + + _underline_re = re.compile(r"(?)(?=\S)(.+?)(?<=\S)(?)", re.S) + + def run(self, text: str): + return self._underline_re.sub(r"\1", text) + + def test(self, text: str): + return '--' in text + + class Wavedrom(Extra): ''' Support for generating Wavedrom digital timing diagrams From a2d8baa87b860f86a6cb73bcf63c9eb3aed942a0 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 2 Jul 2023 12:45:38 +0100 Subject: [PATCH 10/23] Convert markdown-in-html to new extra format --- lib/markdown2.py | 51 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index c19d4ff1..7b57e70f 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -367,7 +367,7 @@ def _setup_extras(self): if "metadata" in self.extras: self.metadata = {} - for extra in Extra.collect(self): + for extra in Extra.collect(): instance: Extra = extra(self, (self.extras.get(extra.name) or {})) instance.register() @@ -453,9 +453,6 @@ def convert(self, text): # Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True) - if 'markdown-in-html' in self.extras: - text = self._do_markdown_in_html(text) - # Strip link definitions, store in hashes. 
if "footnotes" in self.extras: # Must do footnotes first because an unlucky footnote defn @@ -1012,15 +1009,6 @@ def _tag_is_closed(self, tag_name, text): # super basic check if number of open tags == number of closing tags return len(re.findall('<%s(?:.*?)>' % tag_name, text)) == len(re.findall('' % tag_name, text)) - def _do_markdown_in_html(self, text): - def callback(block): - indent, block = self._uniform_outdent(block) - block = self._hash_html_block_sub(block) - block = self._uniform_indent(block, indent, include_empty_lines=True, indent_empty_lines=False) - return block - - return self._strict_tag_block_sub(text, self._block_tags_a, callback, True) - @Stage.mark(Stage.LINK_DEFS) def _strip_link_definitions(self, text): # Strips link definitions from text, stores the URLs and titles in @@ -2321,12 +2309,12 @@ class Extra(ABC): See `Stage`, `Stage.before` and `Stage.after` ''' - def __init__(self, md: Markdown, options): + def __init__(self, md: Markdown, options: dict): self.md = md self.options = options @classmethod - def collect(cls, md: Markdown) -> list: + def collect(cls) -> list: ''' Returns all subclasses of `Extra` ''' @@ -2337,7 +2325,10 @@ def get(cls, extra_name: str) -> 'Extra': ''' Get a registered extra by name. For example, `Extra.get('fenced-code-blocks')` will return an instance - of the `FencedCodeBlocks` extra + of the `FencedCodeBlocks` extra. + + Raises: + KeyError: if no extra has been registered under the given name ''' return cls._registry[extra_name] @@ -2349,13 +2340,16 @@ def run(self, text: str) -> str: Args: text: the text to process **opts: any parameters supplied via `Markdown`'s extras dict + + Returns: + The new text after being modified by the extra ''' ... def register(self): ''' Registers the class for use with `Markdown`. This function is - called during initialisation. 
+ called during `Markdown._setup_extras` ''' self.__class__._registry[self.name] = self @@ -2606,6 +2600,29 @@ def test(self, text: str): return True +class MarkdownInHTML(Extra): + ''' + Allow the use of `markdown="1"` in a block HTML tag to + have markdown processing be done on its contents. Similar to + but with + some limitations. + ''' + name = 'markdown-in-html' + order = Stage.after(Stage.HASH_HTML) + + def run(self, text: str) -> str: + def callback(block): + indent, block = self.md._uniform_outdent(block) + block = self.md._hash_html_block_sub(block) + block = self.md._uniform_indent(block, indent, include_empty_lines=True, indent_empty_lines=False) + return block + + return self.md._strict_tag_block_sub(text, self.md._block_tags_a, callback, True) + + def test(self, text: str) -> bool: + return True + + class Mermaid(FencedCodeBlocks): name = 'mermaid' order = Stage.before(FencedCodeBlocks) From 262d7ff8f2feda962c96d6aa61c3e8d78edd779b Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 2 Jul 2023 16:24:23 +0100 Subject: [PATCH 11/23] Fix failing Python 3.6 tests due to missing `re.Match` class. 
Also fix tests not running on Windows --- lib/markdown2.py | 8 ++++---- test/testall.py | 4 ++-- test/tm-cases/empty_fenced_code_blocks.tags | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 test/tm-cases/empty_fenced_code_blocks.tags diff --git a/lib/markdown2.py b/lib/markdown2.py index 7b57e70f..bbb74d5e 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -2501,7 +2501,7 @@ def tags(self, lexer_name) -> tuple: code_class = self.md._html_class_str_from_tag('code') return ('' % (pre_class, code_class), '') - def sub(self, match: re.Match): + def sub(self, match): lexer_name = match.group(2) codeblock = match.group(3) codeblock = codeblock[:-1] # drop one trailing newline @@ -2713,7 +2713,7 @@ class PyShell(Extra): def test(self, text): return ">>>" in text - def sub(self, match: re.Match): + def sub(self, match): if "fenced-code-blocks" in self.md.extras: dedented = _dedent(match.group(0)) return Extra.get('fenced-code-blocks').run("```pycon\n" + dedented + "```\n") @@ -2858,7 +2858,7 @@ def run(self, text: str): ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X) return table_re.sub(self.sub, text) - def sub(self, match: re.Match): + def sub(self, match): trim_space_re = '^[ \t\n]+|[ \t\n]+$' trim_bar_re = r'^\||\|$' split_bar_re = r'^\||(? Date: Sun, 2 Jul 2023 16:35:39 +0100 Subject: [PATCH 12/23] Drop Python 3.5 support. At time of commit, Python 3.5 is 2 years 9 months EOL. 
--- .github/workflows/python.yaml | 2 +- setup.py | 5 +++-- tox.ini | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index ef965895..9c5d692a 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10"] + python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11"] os: - ubuntu-20.04 - macos-latest diff --git a/setup.py b/setup.py index e7c43ad3..9962f64a 100755 --- a/setup.py +++ b/setup.py @@ -18,11 +18,12 @@ License :: OSI Approved :: MIT License Programming Language :: Python Programming Language :: Python :: 3 -Programming Language :: Python :: 3.5 Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 +Programming Language :: Python :: 3.10 +Programming Language :: Python :: 3.11 Operating System :: OS Independent Topic :: Software Development :: Libraries :: Python Modules Topic :: Software Development :: Documentation @@ -55,7 +56,7 @@ ] }, description="A fast and complete Python implementation of Markdown", - python_requires=">=3.5, <4", + python_requires=">=3.6, <4", extras_require=extras_require, classifiers=classifiers.strip().split("\n"), long_description="""markdown2: A fast and complete Python implementation of Markdown. diff --git a/tox.ini b/tox.ini index ec0ca9a5..bebed309 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py35, py36, py37, py38, py39, pypy +envlist = py36, py37, py38, py39, py310, py311, pypy [testenv] commands = make testone From 666ebefc43af1b001ebf335204d2fd8023e03e31 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 2 Jul 2023 16:37:19 +0100 Subject: [PATCH 13/23] Update changes.md --- CHANGES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 36592088..d07dba25 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,7 +2,8 @@ ## python-markdown2 2.4.10 (not yet released) -(nothing yet) +- [pull #519] Add support for custom extras +- [pull #519] Drop Python 3.5 support ## python-markdown2 2.4.9 From 8eb13b4cd0455f4b2dbd6ebdf2f7845181ec58e8 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 2 Jul 2023 17:12:37 +0100 Subject: [PATCH 14/23] Fix wavedrom tests --- lib/markdown2.py | 2 +- test/tm-cases/wavedrom.html | 2 +- test/tm-cases/wavedrom_no_embed.html | 2 +- test/tm-cases/wavedrom_no_embed.text | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index bbb74d5e..7307be94 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -2942,7 +2942,7 @@ class Wavedrom(Extra): Support for generating Wavedrom digital timing diagrams ''' name = 'wavedrom' - order = Stage.before(Stage.CODE_BLOCKS) + Stage.after(Stage.PREPROCESS) + order = Stage.before(Stage.CODE_BLOCKS, FencedCodeBlocks) + Stage.after(Stage.PREPROCESS) def test(self, text): match = FencedCodeBlocks.fenced_code_block_re.search(text) diff --git a/test/tm-cases/wavedrom.html b/test/tm-cases/wavedrom.html index e7a99a47..ddedb0d1 100644 --- a/test/tm-cases/wavedrom.html +++ b/test/tm-cases/wavedrom.html @@ -174,7 +174,7 @@
  • More complex diagram
      -
    • Inside of nested list
      +
    • Inside of nested list