From 3dd45d241f7596261aa083d0f38589f4daa46656 Mon Sep 17 00:00:00 2001
From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com>
Date: Tue, 17 Dec 2024 13:06:31 +0200
Subject: [PATCH 1/2] improve: cache textwrap.TextWrapper objects
---
src/mdformat/_cli.py | 7 ++-----
src/mdformat/_util.py | 13 +++++++++++++
src/mdformat/plugins.py | 6 ++++--
src/mdformat/renderer/_context.py | 10 ++--------
4 files changed, 21 insertions(+), 15 deletions(-)
diff --git a/src/mdformat/_cli.py b/src/mdformat/_cli.py
index 901dbac..6f6744f 100644
--- a/src/mdformat/_cli.py
+++ b/src/mdformat/_cli.py
@@ -8,11 +8,10 @@
from pathlib import Path
import shutil
import sys
-import textwrap
import mdformat
from mdformat._conf import DEFAULT_OPTS, InvalidConfError, read_toml_opts
-from mdformat._util import detect_newline_type, is_md_equal
+from mdformat._util import cached_textwrapper, detect_newline_type, is_md_equal
import mdformat.plugins
import mdformat.renderer
@@ -408,9 +407,7 @@ def wrap_paragraphs(paragraphs: Iterable[str]) -> str:
wrap_width = terminal_width
else:
wrap_width = 80
- wrapper = textwrap.TextWrapper(
- break_long_words=False, break_on_hyphens=False, width=wrap_width
- )
+ wrapper = cached_textwrapper(wrap_width)
return "\n\n".join(wrapper.fill(p) for p in paragraphs) + "\n"
diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py
index d95a2c0..83b25d1 100644
--- a/src/mdformat/_util.py
+++ b/src/mdformat/_util.py
@@ -2,7 +2,9 @@
from collections.abc import Iterable, Mapping
from contextlib import nullcontext
+import functools
import re
+import textwrap
from types import MappingProxyType
from typing import Any, Literal
@@ -114,3 +116,14 @@ def detect_newline_type(md: str, eol_setting: str) -> Literal["\n", "\r\n"]:
if eol_setting == "crlf":
return "\r\n"
return "\n"
+
+
+@functools.lru_cache
+def cached_textwrapper(width: int) -> textwrap.TextWrapper:
+ return textwrap.TextWrapper(
+ break_long_words=False,
+ break_on_hyphens=False,
+ width=width,
+ expand_tabs=False,
+ replace_whitespace=False,
+ )
diff --git a/src/mdformat/plugins.py b/src/mdformat/plugins.py
index 373b7bd..8da514a 100644
--- a/src/mdformat/plugins.py
+++ b/src/mdformat/plugins.py
@@ -2,12 +2,14 @@
import argparse
from collections.abc import Callable, Mapping
-from typing import Any, Protocol
+from typing import TYPE_CHECKING, Any, Protocol
from markdown_it import MarkdownIt
from mdformat._compat import importlib_metadata
-from mdformat.renderer.typing import Postprocess, Render
+
+if TYPE_CHECKING:
+ from mdformat.renderer.typing import Postprocess, Render
def _load_entrypoints(
diff --git a/src/mdformat/renderer/_context.py b/src/mdformat/renderer/_context.py
index f325d6e..fa40a25 100644
--- a/src/mdformat/renderer/_context.py
+++ b/src/mdformat/renderer/_context.py
@@ -5,7 +5,6 @@
from contextlib import contextmanager
import logging
import re
-import textwrap
from types import MappingProxyType
from typing import TYPE_CHECKING, Any, Literal, NamedTuple
@@ -13,6 +12,7 @@
from mdformat import codepoints
from mdformat._conf import DEFAULT_OPTS
+from mdformat._util import cached_textwrapper
from mdformat.renderer._util import (
RE_CHAR_REFERENCE,
decimalify_leading,
@@ -344,13 +344,7 @@ def _wrap(text: str, *, width: int | Literal["no"]) -> str:
if width == "no":
return _recover_preserve_chars(text, replacements)
- wrapper = textwrap.TextWrapper(
- break_long_words=False,
- break_on_hyphens=False,
- width=width,
- expand_tabs=False,
- replace_whitespace=False,
- )
+ wrapper = cached_textwrapper(width)
wrapped = wrapper.fill(text)
wrapped = _recover_preserve_chars(wrapped, replacements)
return wrapped
From f568716143bde1259fa171f806919886e8565fcc Mon Sep 17 00:00:00 2001
From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com>
Date: Wed, 18 Dec 2024 12:59:46 +0200
Subject: [PATCH 2/2] fix: regex in is_md_equal is too greedy
---
src/mdformat/_util.py | 14 ++++++++++++--
tests/test_util.py | 27 +++++++++++++++++++++++++++
2 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py
index 83b25d1..ad5492c 100644
--- a/src/mdformat/_util.py
+++ b/src/mdformat/_util.py
@@ -47,6 +47,15 @@ def build_mdit(
return mdit
+# Chars that markdown-it-py escapes when rendering code_inline:
+# https://github.com/executablebooks/markdown-it-py/blob/c5161b550f3c6c0a98d77e8389872405e8f9f9ee/markdown_it/common/utils.py#L138
+# Note that "&" is not included as it is used in the escape sequences of
+# these characters.
+_invalid_html_code_chars = '<>"'
+# a regex str that matches all except above chars
+_valid_html_code_char_re = rf"[^{re.escape(_invalid_html_code_chars)}]"
+
+
def is_md_equal(
md1: str,
md2: str,
@@ -71,10 +80,11 @@ def is_md_equal(
if codeformatters:
langs_re = "|".join(re.escape(lang) for lang in codeformatters)
html = re.sub(
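- rf'<pre><code class="language-(?:{langs_re})">.*</code></pre>',
+ rf'<pre><code class="language-(?:{langs_re})">'
+ rf"{_valid_html_code_char_re}*"
+ r"</code></pre>",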
"",
html,
- flags=re.DOTALL,
)
# Reduce all whitespace to a single space
diff --git a/tests/test_util.py b/tests/test_util.py
index e3bc89f..62c8dcd 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -21,3 +21,30 @@ def test_is_md_equal():
paragr"""
assert not is_md_equal(md1, md2)
assert is_md_equal(md1, md2, codeformatters=("js", "go"))
+
+
+def test_is_md_equal__not():
+ md1 = """
+```js
+console.log()
+```
+
+paragr
+
+```js
+console.log()
+```
+"""
+ md2 = """
+```js
+bonsole.l()g
+```
+
+A different paragraph
+
+```js
+console.log()
+```
+"""
+ assert not is_md_equal(md1, md2)
+ assert not is_md_equal(md1, md2, codeformatters=("js",))