diff --git a/RELEASE.md b/RELEASE.md index 7ccd6fd..9453746 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,7 @@ +# Pandoc-Filter 0.2.5 +## Release 0.2.5 +Reorganize the internal implementation of URL decoding. + # Pandoc-Filter 0.2.4 ## Release 0.2.4 Fix a bug in `upload_figure_to_aliyun_filter` when local file path contains spaces. diff --git a/src/pandoc_filter/filters/md2html/hash_anchor_and_internal_link.py b/src/pandoc_filter/filters/md2html/hash_anchor_and_internal_link.py index 6704c38..4f6eaa0 100644 --- a/src/pandoc_filter/filters/md2html/hash_anchor_and_internal_link.py +++ b/src/pandoc_filter/filters/md2html/hash_anchor_and_internal_link.py @@ -5,8 +5,7 @@ import panflute as pf from ...utils import TracingLogger,DocRuntimeDict,InternalLink -from ...utils import get_html_id,sub_html_id,get_html_href,get_text_hash -from ..md2md.norm_internal_link import _decode_internal_link_url +from ...utils import get_html_id,sub_html_id,get_html_href,get_text_hash,decode_internal_link_url r"""A pandoc filter that mainly for converting `markdown` to `html`. @@ -109,7 +108,7 @@ def _url_hash_guess(text:str)->str: if isinstance(elem, pf.Link) and elem.url.startswith('#'): # Olny md internal links need to be decoded since it will be encoded by pandoc before filter. 
- decoded_url = _decode_internal_link_url(elem.url) + decoded_url = decode_internal_link_url(elem.url) url,guessed_url_with_num = _url_hash_guess(decoded_url) doc.runtime_dict['internal_link_record'].append(InternalLink(elem,url=url,guessed_url=guessed_url_with_num)) elif isinstance(elem, pf.RawInline) and elem.format == 'html' and (old_href:=get_html_href(elem.text)) and old_href.startswith('#'): diff --git a/src/pandoc_filter/filters/md2md/norm_internal_link.py b/src/pandoc_filter/filters/md2md/norm_internal_link.py index 4b844b7..3c1f1c2 100644 --- a/src/pandoc_filter/filters/md2md/norm_internal_link.py +++ b/src/pandoc_filter/filters/md2md/norm_internal_link.py @@ -1,9 +1,8 @@ import typeguard -import urllib.parse import panflute as pf from ...utils import TracingLogger -from ...utils import get_html_href,sub_html_href +from ...utils import get_html_href,sub_html_href,decode_internal_link_url r"""A pandoc filter that mainly for converting `markdown` to `markdown`. Normalize internal links' URLs. Decode the URL if it is URL-encoded. @@ -24,15 +23,6 @@ If in html, it is a raw-HTML element with href attribute that starts with `#`. Such as: `bbb` """ -@typeguard.typechecked -def _decode_internal_link_url(url:str)->str: - r"""When converting markdown to any type via pandoc, md internal links' URLs may be automatically URL-encoded before any filter works. - The encoding is done by default and may not be avoided. - This function is used to decode the URL. 
- """ - decoded_url = urllib.parse.unquote(url.lstrip('#')) - header_mimic = pf.convert_text(f"# {decoded_url}",input_format='markdown',output_format='gfm',standalone=True) - return f"#{header_mimic.lstrip('# ')}" def _norm_internal_link(elem:pf.Element,doc:pf.Doc,**kwargs)->None: r"""Follow the general procedure of [Panflute](http://scorreia.com/software/panflute/) @@ -44,11 +34,11 @@ def _norm_internal_link(elem:pf.Element,doc:pf.Doc,**kwargs)->None: if isinstance(elem, pf.Link) and elem.url.startswith('#'): tracing_logger.mark(elem) - elem.url = _decode_internal_link_url(elem.url) + elem.url = decode_internal_link_url(elem.url) tracing_logger.check_and_log('anchor_links',elem) elif isinstance(elem, pf.RawInline) and elem.format == 'html' and (old_href:=get_html_href(elem.text)) and old_href.startswith('#'): tracing_logger.mark(elem) - elem.text = sub_html_href(elem.text,_decode_internal_link_url(old_href)) + elem.text = sub_html_href(elem.text,decode_internal_link_url(old_href)) tracing_logger.check_and_log('raw_anchor_links',elem) def norm_internal_link_filter(doc:pf.Doc=None,**kwargs): diff --git a/src/pandoc_filter/filters/md2md/upload_figure_to_aliyun.py b/src/pandoc_filter/filters/md2md/upload_figure_to_aliyun.py index f639f2b..18150ba 100644 --- a/src/pandoc_filter/filters/md2md/upload_figure_to_aliyun.py +++ b/src/pandoc_filter/filters/md2md/upload_figure_to_aliyun.py @@ -5,8 +5,8 @@ import typeguard import panflute as pf -from ...utils import TracingLogger,OssHelper,DocRuntimeDict -from ...utils import get_html_src,sub_html_src +from ...utils import TracingLogger,OssHelper +from ...utils import get_html_src,sub_html_src,decode_src_url r"""A pandoc filter that mainly for converting `markdown` to `markdown`. Auto upload local pictures to Aliyun OSS. Replace the original `src` with the new one. 
@@ -32,13 +32,13 @@ def _upload_figure_to_aliyun(elem:pf.Element,doc:pf.Doc,**kwargs)->None: typeguard.check_type(kwargs['doc_path'],pathlib.Path) doc_path: pathlib.Path = kwargs['doc_path'] if isinstance(elem, pf.Image) and (old_src:=str(elem.url)).startswith('.'): # reletive path - old_src = urllib.parse.unquote(old_src) + old_src = decode_src_url(old_src) new_src = oss_helper.maybe_upload_file_and_get_src(doc_path.parent/old_src) tracing_logger.mark(elem) elem.url = new_src tracing_logger.check_and_log('image',elem) elif isinstance(elem, pf.RawInline) and elem.format == 'html' and (old_src:=get_html_src(elem.text)) and old_src.startswith('.'): # reletive path - old_src = urllib.parse.unquote(old_src) + old_src = decode_src_url(old_src) new_src = oss_helper.maybe_upload_file_and_get_src(doc_path.parent/old_src) tracing_logger.mark(elem) elem.text = sub_html_src(elem.text,new_src) diff --git a/src/pandoc_filter/utils/panflute_helper.py b/src/pandoc_filter/utils/panflute_helper.py index b069364..1103b75 100644 --- a/src/pandoc_filter/utils/panflute_helper.py +++ b/src/pandoc_filter/utils/panflute_helper.py @@ -1,4 +1,5 @@ from typing import Any,TypedDict +import urllib.parse import typeguard import pathlib from collections import UserDict @@ -29,4 +30,22 @@ class DocRuntimeDict(TypedDict): equations_count:int|None math:bool|None doc_path:pathlib.Path|None - oss_helper:OssHelper|None \ No newline at end of file + oss_helper:OssHelper|None + +@typeguard.typechecked +def decode_internal_link_url(url:str)->str: + r"""When converting markdown to any type via pandoc, internal links' URLs may be automatically URL-encoded before any filter works. + The encoding is done by default and may not be avoided. + This function is used to decode the URL. 
+ """ + decoded_url = urllib.parse.unquote(url.lstrip('#')) + header_mimic = pf.convert_text(f"# {decoded_url}",input_format='markdown',output_format='gfm',standalone=True) + return f"#{header_mimic.lstrip('# ')}" + +@typeguard.typechecked +def decode_src_url(url:str)->str: + r"""When converting markdown to any type via pandoc, some elements' `src` URLs may be automatically URL-encoded before any filter works. + The encoding is done by default and may not be avoided. + This function is used to decode the URL. + """ + return urllib.parse.unquote(url) diff --git a/src/pandoc_filter/version.py b/src/pandoc_filter/version.py index 99cc615..61af8d0 100644 --- a/src/pandoc_filter/version.py +++ b/src/pandoc_filter/version.py @@ -4,4 +4,4 @@ from .utils import check_pandoc_version check_pandoc_version(required_version='3.1.0') -__version__ = '0.2.4' \ No newline at end of file +__version__ = '0.2.5' \ No newline at end of file