Skip to content

Commit

Permalink
modify helping scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Zhaopudark committed Jan 24, 2024
1 parent 252dd2f commit 78276e3
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 72 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ For an example, `md2md_enhance_equation_filter` in [enhance_equation.py](https:/
pandoc_filter.run_filters_pyio(file_path,output_path,'markdown','gfm',[pandoc_filter.md2md_enhance_equation_filter])
```

**Runtime status** can be recorded. In **python mode**, any filter function will return a proposed panflute `Doc`. Some filter functions will add an instance attribute dict `runtime_status_dict` to the returned `Doc`, as a record for **runtime status**, which may be very useful for advanced users. For an example, `md2md_enhance_equation_filter`, will add an instance attribute dict `runtime_status_dict` to the returned `Doc`, which may contain a mapping `{'math':True}` if there is any math element in the `Doc`.
**Runtime status** can be recorded. In **python mode**, any filter function will return a proposed panflute `Doc`. Some filter functions will add an instance attribute dict `runtime_dict` to the returned `Doc`, as a record of **runtime status**, which may be very useful for advanced users. For example, `md2md_enhance_equation_filter` will add an instance attribute dict `runtime_dict` to the returned `Doc`, which may contain a mapping `{'math':True}` if there is any math element in the `Doc`.

All filters with corresponding registered command-line scripts, the specific features, and the recorded **runtime status** are recorded in the following table:

Expand All @@ -74,7 +74,7 @@ All filters with corresponding registered command-line scripts, the specific fe
>
> All filters support cascaded invoking.
| Filter Functions | Command Line | Additional Arguments | Features | Runtime status `doc.runtime_status_dict` |
| Filter Functions | Command Line | Additional Arguments | Features | Runtime status (`doc.runtime_dict`) |
| -------------------------------------------- | -------------------------------------------- | -------------------- | :----------------------------------------------------------- | ------------------------------------------------------------ |
| md2md_enhance_equation_filter | md2md-enhance-equation-filter | - | Enhance math equations. Specifically, this filter will: Adapt AMS rule for math formula. Auto numbering markdown formulations within \begin{equation} \end{equation}, as in Typora. Allow multiple tags, but only take the first one. Allow multiple labels, but only take the first one. | {'math':< bool >,'equations_count':<some_number>} |
| md2md_norm_footnote_filter | md2md-norm-footnote-filter | - | Normalize the footnotes. Remove unnecessary `\n` in the footnote content. | - |
Expand Down
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Pandoc-Filter 0.2.1
## Release 0.2.1
Simplify `runtime_status_dict` to `runtime_dict`.

# Pandoc-Filter 0.2.0
## Release 0.2.0
Expand Down
62 changes: 21 additions & 41 deletions src/pandoc_filter/filters/md2html/hash_anchor_and_internal_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import typeguard
import panflute as pf

from ...utils import TracingLogger,RuntimeStatusDict
from ...utils import get_html_id,sub_html_id,get_html_href,sub_html_href,get_text_hash
from ...utils import TracingLogger,DocRuntimeDict,InternalLink
from ...utils import get_html_id,sub_html_id,get_html_href,get_text_hash
from ..md2md.norm_internal_link import _decode_internal_link_url


Expand Down Expand Up @@ -53,14 +53,12 @@
log a warning message and do nothing
"""

def _prepare_hash_anchor_and_internal_link(doc:pf.Doc)->pf.Doc:
doc.runtime_status_dict = RuntimeStatusDict(
def _prepare_hash_anchor_and_internal_link(doc:pf.Doc):
doc.runtime_dict = DocRuntimeDict(
{'anchor_count':{},
'internal_link_record':[]
})



def _hash_anchor_id(elem:pf.Element,doc:pf.Doc)->None:
r"""Follow the general procedure of [Panflute](http://scorreia.com/software/panflute/)
An `action` function to normalize any anchor's `id` to its hash.
Expand All @@ -69,43 +67,24 @@ def _hash_anchor_id(elem:pf.Element,doc:pf.Doc)->None:
tracing_logger = TracingLogger()
def _text_hash_count(text:str)->str:
text_hash = get_text_hash(text)
if text_hash in doc.runtime_status_dict['anchor_count']: # 按照text_hash值计数, 重复则加1
doc.runtime_status_dict['anchor_count'][text_hash] += 1
if text_hash in doc.runtime_dict['anchor_count']: # 按照text_hash值计数, 重复则加1
doc.runtime_dict['anchor_count'][text_hash] += 1
else:
doc.runtime_status_dict['anchor_count'][text_hash] = 1
doc.runtime_dict['anchor_count'][text_hash] = 1
return text_hash
if isinstance(elem, pf.Header):
tracing_logger.mark(elem)
# 获取header文本内容并剔除#号
header_text = pf.convert_text(elem,input_format='panflute',output_format='gfm',standalone=True).lstrip('#')
text_hash = _text_hash_count(header_text)
elem.identifier = f"{text_hash}-{doc.runtime_status_dict['anchor_count'][text_hash]}"
elem.identifier = f"{text_hash}-{doc.runtime_dict['anchor_count'][text_hash]}"
tracing_logger.check_and_log('headings anchor',elem)
elif isinstance(elem, pf.RawInline) and elem.format == 'html' and (raw_id_text:=get_html_id(elem.text)): # 获取id文本内容但不做任何剔除
tracing_logger.mark(elem)
text_hash = _text_hash_count(raw_id_text)
elem.text = sub_html_id(elem.text,f"{text_hash}-{doc.runtime_status_dict['anchor_count'][text_hash]}")
elem.text = sub_html_id(elem.text,f"{text_hash}-{doc.runtime_dict['anchor_count'][text_hash]}")
tracing_logger.check_and_log('raw-HTML anchor',elem)

class _PatchedInternalLink:
@typeguard.typechecked
def __init__(self,elem:pf.Link) -> None:
self.elem = elem
@typeguard.typechecked
def sub(self,url:str,tracing_logger:TracingLogger)->None:
tracing_logger.mark(self.elem)
self.elem.url = f"#{url}"
tracing_logger.check_and_log('internal_link',self.elem)

class _PatchedInternalRawLink:
@typeguard.typechecked
def __init__(self,elem:pf.RawInline) -> None:
self.elem = elem
@typeguard.typechecked
def sub(self,url:str,tracing_logger:TracingLogger)->None:
tracing_logger.mark(self.elem)
self.elem.text = sub_html_href(self.elem.text,f"#{url}")
tracing_logger.check_and_log('internal_link',self.elem)

def _internal_link_recorder(elem:pf.Element,doc:pf.Doc)->None:
r"""Follow the general procedure of [Panflute](http://scorreia.com/software/panflute/)
Expand All @@ -129,26 +108,27 @@ def _url_hash_guess(text:str)->str:
# Only md internal links need to be decoded, since they are encoded by pandoc before the filter runs.
decoded_url = _decode_internal_link_url(elem.url)
url,guessed_url_with_num = _url_hash_guess(decoded_url)
doc.runtime_status_dict['internal_link_record'].append((_PatchedInternalLink(elem),url,guessed_url_with_num))
doc.runtime_dict['internal_link_record'].append(InternalLink(elem,url=url,guessed_url=guessed_url_with_num))
elif isinstance(elem, pf.RawInline) and elem.format == 'html' and (old_href:=get_html_href(elem.text)) and old_href.startswith('#'):
# raw-HTML internal links will not be encoded by pandoc before filter. So there is no need to decode it.
url,guessed_url_with_num = _url_hash_guess(old_href)
doc.runtime_status_dict['internal_link_record'].append((_PatchedInternalRawLink(elem),url,guessed_url_with_num))
doc.runtime_dict['internal_link_record'].append(InternalLink(elem,url=url,guessed_url=guessed_url_with_num))

def _finalize_hash_anchor_and_internal_link(doc:pf.Doc)->pf.Doc:
def _finalize_hash_anchor_and_internal_link(doc:pf.Doc):
tracing_logger = TracingLogger()
id_set = set()
for k,v in doc.runtime_status_dict['anchor_count'].items():
for k,v in doc.runtime_dict['anchor_count'].items():
for i in range(1,v+1):
id_set.add(f"{k}-{i}")
for patched_elem,url,guessed_url_with_num in doc.runtime_status_dict['internal_link_record']:
if f"{url}-1" in id_set:
patched_elem.sub(f"{url}-1",tracing_logger)
elif guessed_url_with_num in id_set: # None is not in id_set
patched_elem.sub(f"{guessed_url_with_num}",tracing_logger)
for internal_link in doc.runtime_dict['internal_link_record']:
internal_link:InternalLink
if f"{internal_link.url}-1" in id_set:
internal_link.sub(f"{internal_link.url}-1",tracing_logger)
elif internal_link.guessed_url in id_set: # None is not in id_set
internal_link.sub(f"{internal_link.guessed_url}",tracing_logger)
else:
tracing_logger.logger.warning(f"{patched_elem.elem}")
tracing_logger.logger.warning(f"The internal link `{url}` is invalid and will not be changed because no target header is found.")
tracing_logger.logger.warning(f"{internal_link.elem}")
tracing_logger.logger.warning(f"The internal link `{internal_link.url}` is invalid and will not be changed because no target header is found.")

def hash_anchor_and_internal_link_filter(doc:pf.Doc=None)->pf.Doc:
return pf.run_filters(
Expand Down
16 changes: 8 additions & 8 deletions src/pandoc_filter/filters/md2md/enhance_equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import typeguard
import panflute as pf

from ...utils import TracingLogger,RuntimeStatusDict
from ...utils import TracingLogger,DocRuntimeDict

r"""A pandoc filter that mainly for converting `markdown` to `markdown`.
Enhance math equations.
Expand All @@ -13,10 +13,10 @@
- Allow multiple labels, but only take the first one.
"""

def _prepare_enhance_equation(doc:pf.Doc)->None:
doc.runtime_status_dict = RuntimeStatusDict(
{'math':False,
'equations_count':0})
def _prepare_enhance_equation(doc:pf.Doc):
doc.runtime_dict = DocRuntimeDict(
{'equations_count':0,
'math':False})

@typeguard.typechecked
def _enhance_equation(elem:pf.Element,doc:pf.Doc)->None:
Expand All @@ -43,7 +43,7 @@ def _enhance_equation(elem:pf.Element,doc:pf.Doc)->None:
"""
tracing_logger = TracingLogger()
if isinstance(elem, pf.elements.Math):
doc.runtime_status_dict.lazy_update(key='math',value=True)
doc.runtime_dict['math'] = True
if elem.format == "DisplayMath":
tracing_logger.mark(elem)
text = elem.text
Expand All @@ -64,8 +64,8 @@ def _enhance_equation(elem:pf.Element,doc:pf.Doc)->None:
if first_tag != '':
text = f"\\begin{{equation}}{first_label}{first_tag}\n{text.strip(" \n")}\n\\end{{equation}}"
else:
doc.runtime_status_dict['equations_count'] += 1
text = f"\\begin{{equation}}{first_label}\\tag{{{doc.runtime_status_dict['equations_count']}}}\n{text.strip(" \n")}\n\\end{{equation}}"
doc.runtime_dict['equations_count'] += 1
text = f"\\begin{{equation}}{first_label}\\tag{{{doc.runtime_dict['equations_count']}}}\n{text.strip(" \n")}\n\\end{{equation}}"
else:
text = f"{text}\n{first_label}{first_tag}"
elem.text = f"\n{text.strip(" \n")}\n"
Expand Down
8 changes: 4 additions & 4 deletions src/pandoc_filter/filters/md2md/upload_figure_to_aliyun.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import typeguard
import panflute as pf

from ...utils import TracingLogger,OssHelper,RuntimeStatusDict
from ...utils import TracingLogger,OssHelper,DocRuntimeDict
from ...utils import get_html_src,sub_html_src

r"""A pandoc filter that mainly for converting `markdown` to `markdown`.
Expand All @@ -27,7 +27,7 @@ def _prepare_upload_figure_to_aliyun(doc:pf.Doc,*,doc_path:pathlib.Path)->None:
assert os.environ['OSS_BUCKET_NAME'], "OSS_BUCKET_NAME is not given in environment variables."
assert os.environ['OSS_ACCESS_KEY_ID'], "OSS_ACCESS_KEY_ID is not given in environment variables."
assert os.environ['OSS_ACCESS_KEY_SECRET'], "OSS_ACCESS_KEY_SECRET is not given in environment variables."
doc.runtime_status_dict = RuntimeStatusDict(
doc.runtime_dict = DocRuntimeDict(
{'doc_path':doc_path,
'oss_helper':OssHelper(os.environ['OSS_ENDPOINT_NAME'],os.environ['OSS_BUCKET_NAME'])
})
Expand All @@ -38,8 +38,8 @@ def _upload_figure_to_aliyun(elem:pf.Element,doc:pf.Doc)->None:
[modify elements in place]
"""
tracing_logger = TracingLogger()
oss_helper: OssHelper = doc.runtime_status_dict['oss_helper']
doc_path: pathlib.Path = doc.runtime_status_dict['doc_path']
oss_helper: OssHelper = doc.runtime_dict['oss_helper']
doc_path: pathlib.Path = doc.runtime_dict['doc_path']
if isinstance(elem, pf.Image) and (old_src:=str(elem.url)).startswith('.'): # reletive path
new_src = oss_helper.maybe_upload_file_and_get_src(doc_path.parent/old_src)
tracing_logger.mark(elem)
Expand Down
2 changes: 1 addition & 1 deletion src/pandoc_filter/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .logging_helper import *
from .oss_helper import *
from .pandoc_helper import *
from .runtime_helper import *
from .panflute_helper import *
32 changes: 32 additions & 0 deletions src/pandoc_filter/utils/panflute_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import Any,TypedDict
import typeguard
import pathlib
from collections import UserDict
import panflute as pf

from .logging_helper import TracingLogger
from .html_helper import sub_html_href
from .oss_helper import OssHelper

class InternalLink:
    """Record of one internal link element together with its candidate targets.

    Wraps either a markdown link (``pf.Link``) or a raw-HTML inline
    (``pf.RawInline``) so that both can be rewritten through the same
    ``sub`` call.

    Attributes:
        elem: The wrapped panflute element; it is modified in place by ``sub``.
        url: The url stored for this link by the caller.
        guessed_url: An alternative candidate url, or ``None`` when there is none.
    """

    @typeguard.typechecked
    def __init__(self, elem: pf.Link | pf.RawInline, url: str, guessed_url: str | None) -> None:
        self.elem = elem
        self.url = url
        self.guessed_url = guessed_url

    @typeguard.typechecked
    def sub(self, text: str, tracing_logger: TracingLogger) -> None:
        """Point the wrapped element at the anchor ``#<text>``.

        Args:
            text: Anchor name without the leading ``#``.
            tracing_logger: Logger used to mark the element before the change
                and to check/log it afterwards under the ``internal_link`` tag.
        """
        new_target = f"#{text}"
        link_elem = self.elem
        tracing_logger.mark(link_elem)
        if not isinstance(link_elem, pf.Link):
            # Raw-HTML inline: the link lives in the `href` of the raw text.
            link_elem.text = sub_html_href(link_elem.text, new_target)
        else:
            # Markdown link: the target is the element's `url` attribute.
            link_elem.url = new_target
        tracing_logger.check_and_log('internal_link', link_elem)

class DocRuntimeDict(TypedDict, total=False):
    """Typed schema for the ``runtime_dict`` attribute that filters attach to a ``Doc``.

    Declared with ``total=False`` because each filter fills in only the keys
    it needs (for example just ``equations_count`` and ``math``). With the
    default ``total=True`` every key would be required, and static type
    checkers would reject those partial constructions. At runtime the object
    is still a plain ``dict``, so existing callers are unaffected.

    The ``| None`` on each value is kept for backward compatibility with code
    that stores ``None`` explicitly.
    """
    # Number of occurrences recorded per anchor key.
    anchor_count: dict[str, int] | None
    # Internal links collected for later rewriting.
    internal_link_record: list[InternalLink] | None
    # Running counter of equations.
    equations_count: int | None
    # Flag recording whether a math element was encountered.
    math: bool | None
    # Path of the document being processed.
    doc_path: pathlib.Path | None
    # Helper object for OSS access.
    oss_helper: OssHelper | None
13 changes: 0 additions & 13 deletions src/pandoc_filter/utils/runtime_helper.py

This file was deleted.

6 changes: 3 additions & 3 deletions tests/test_md2md_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ def test_md2md_enhance_equation_filter_pyio():
output_path = pathlib.Path(f"./temp/{file_path.name}")
answer_path = pathlib.Path(f"./resources/outputs/{file_path.name}")
def finalize(doc:pf.Doc,**kwargs):
runtime_status_dict:dict = doc.runtime_status_dict
if runtime_status_dict.get('math'):
doc.metadata['math'] = doc.runtime_status_dict['math']
runtime_dict:dict = doc.runtime_dict
if runtime_dict.get('math'):
doc.metadata['math'] = doc.runtime_dict['math']
pandoc_filter.run_filters_pyio(file_path,output_path,'markdown','gfm',[pandoc_filter.md2md_enhance_equation_filter],finalize=finalize)
assert _check_the_same_content(output_path,answer_path)

Expand Down

0 comments on commit 78276e3

Please sign in to comment.