From 3dd45d241f7596261aa083d0f38589f4daa46656 Mon Sep 17 00:00:00 2001 From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com> Date: Tue, 17 Dec 2024 13:06:31 +0200 Subject: [PATCH 1/2] improve: cache textwrap.TextWrapper objects --- src/mdformat/_cli.py | 7 ++----- src/mdformat/_util.py | 13 +++++++++++++ src/mdformat/plugins.py | 6 ++++-- src/mdformat/renderer/_context.py | 10 ++-------- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/mdformat/_cli.py b/src/mdformat/_cli.py index 901dbac..6f6744f 100644 --- a/src/mdformat/_cli.py +++ b/src/mdformat/_cli.py @@ -8,11 +8,10 @@ from pathlib import Path import shutil import sys -import textwrap import mdformat from mdformat._conf import DEFAULT_OPTS, InvalidConfError, read_toml_opts -from mdformat._util import detect_newline_type, is_md_equal +from mdformat._util import cached_textwrapper, detect_newline_type, is_md_equal import mdformat.plugins import mdformat.renderer @@ -408,9 +407,7 @@ def wrap_paragraphs(paragraphs: Iterable[str]) -> str: wrap_width = terminal_width else: wrap_width = 80 - wrapper = textwrap.TextWrapper( - break_long_words=False, break_on_hyphens=False, width=wrap_width - ) + wrapper = cached_textwrapper(wrap_width) return "\n\n".join(wrapper.fill(p) for p in paragraphs) + "\n" diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py index d95a2c0..83b25d1 100644 --- a/src/mdformat/_util.py +++ b/src/mdformat/_util.py @@ -2,7 +2,9 @@ from collections.abc import Iterable, Mapping from contextlib import nullcontext +import functools import re +import textwrap from types import MappingProxyType from typing import Any, Literal @@ -114,3 +116,14 @@ def detect_newline_type(md: str, eol_setting: str) -> Literal["\n", "\r\n"]: if eol_setting == "crlf": return "\r\n" return "\n" + + +@functools.lru_cache +def cached_textwrapper(width: int) -> textwrap.TextWrapper: + return textwrap.TextWrapper( + break_long_words=False, + break_on_hyphens=False, + width=width, + 
expand_tabs=False, + replace_whitespace=False, + ) diff --git a/src/mdformat/plugins.py b/src/mdformat/plugins.py index 373b7bd..8da514a 100644 --- a/src/mdformat/plugins.py +++ b/src/mdformat/plugins.py @@ -2,12 +2,14 @@ import argparse from collections.abc import Callable, Mapping -from typing import Any, Protocol +from typing import TYPE_CHECKING, Any, Protocol from markdown_it import MarkdownIt from mdformat._compat import importlib_metadata -from mdformat.renderer.typing import Postprocess, Render + +if TYPE_CHECKING: + from mdformat.renderer.typing import Postprocess, Render def _load_entrypoints( diff --git a/src/mdformat/renderer/_context.py b/src/mdformat/renderer/_context.py index f325d6e..fa40a25 100644 --- a/src/mdformat/renderer/_context.py +++ b/src/mdformat/renderer/_context.py @@ -5,7 +5,6 @@ from contextlib import contextmanager import logging import re -import textwrap from types import MappingProxyType from typing import TYPE_CHECKING, Any, Literal, NamedTuple @@ -13,6 +12,7 @@ from mdformat import codepoints from mdformat._conf import DEFAULT_OPTS +from mdformat._util import cached_textwrapper from mdformat.renderer._util import ( RE_CHAR_REFERENCE, decimalify_leading, @@ -344,13 +344,7 @@ def _wrap(text: str, *, width: int | Literal["no"]) -> str: if width == "no": return _recover_preserve_chars(text, replacements) - wrapper = textwrap.TextWrapper( - break_long_words=False, - break_on_hyphens=False, - width=width, - expand_tabs=False, - replace_whitespace=False, - ) + wrapper = cached_textwrapper(width) wrapped = wrapper.fill(text) wrapped = _recover_preserve_chars(wrapped, replacements) return wrapped From f568716143bde1259fa171f806919886e8565fcc Mon Sep 17 00:00:00 2001 From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:59:46 +0200 Subject: [PATCH 2/2] fix: regex in is_md_equal is too greedy --- src/mdformat/_util.py | 14 ++++++++++++-- tests/test_util.py | 27 +++++++++++++++++++++++++++ 2 files changed, 
39 insertions(+), 2 deletions(-) diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py index 83b25d1..ad5492c 100644 --- a/src/mdformat/_util.py +++ b/src/mdformat/_util.py @@ -47,6 +47,15 @@ def build_mdit( return mdit +# Chars that markdown-it-py escapes when rendering code_inline: +# https://github.com/executablebooks/markdown-it-py/blob/c5161b550f3c6c0a98d77e8389872405e8f9f9ee/markdown_it/common/utils.py#L138 +# Note that "&" is not included as it is used in the escape sequences of +# these characters. +_invalid_html_code_chars = '<>"' +# a regex str that matches all except above chars +_valid_html_code_char_re = rf"[^{re.escape(_invalid_html_code_chars)}]" + + def is_md_equal( md1: str, md2: str, @@ -71,10 +80,11 @@ def is_md_equal( if codeformatters: langs_re = "|".join(re.escape(lang) for lang in codeformatters) html = re.sub( - rf'<code class="language-(?:{langs_re})">.*</code>', + rf'<code class="language-(?:{langs_re})">' + rf"{_valid_html_code_char_re}*" + r"</code>", "", html, - flags=re.DOTALL, ) # Reduce all whitespace to a single space diff --git a/tests/test_util.py b/tests/test_util.py index e3bc89f..62c8dcd 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -21,3 +21,30 @@ def test_is_md_equal(): paragr""" assert not is_md_equal(md1, md2) assert is_md_equal(md1, md2, codeformatters=("js", "go")) + + +def test_is_md_equal__not(): + md1 = """ +```js +console.log() +``` + +paragr + +```js +console.log() +``` +""" + md2 = """ +```js +bonsole.l()g +``` + +A different paragraph + +```js +console.log() +``` +""" + assert not is_md_equal(md1, md2) + assert not is_md_equal(md1, md2, codeformatters=("js",))