From a01c59b91a8d3dcf2672936c462db4fa3f73bcdd Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 25 Oct 2024 11:19:52 +0200 Subject: [PATCH] Escaping underscore characters in md export Signed-off-by: Maksym Lysak --- docling_core/types/doc/document.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 50a4e77..f1f249b 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1291,6 +1291,17 @@ def export_to_markdown( # noqa: C901 mdtext = re.sub( r"\n\n\n+", "\n\n", mdtext ) # remove cases of double or more empty lines. + + # Our export markdown doesn't contain any emphasis styling: + # Bold, Italic, or Bold-Italic + # Hence, any underscore that we print into Markdown is coming from document text + # That means we need to escape it, to properly reflect content in the markdown + def escape_underscores(text): + # Replace "_" with "\_" only if it's not already escaped + escaped_text = re.sub(r'(?