diff --git a/clear_html/formatted_text/main.py b/clear_html/formatted_text/main.py index e4821fb..145ca2e 100644 --- a/clear_html/formatted_text/main.py +++ b/clear_html/formatted_text/main.py @@ -27,6 +27,7 @@ from clear_html.formatted_text.headings import normalize_headings_level from clear_html.formatted_text.utils import ( clean_incomplete_structures, + double_br, kill_tag_content, remove_empty_tags, set_article_tag_as_root, @@ -116,12 +117,10 @@ def paragraphy(doc: HtmlElement): start, end = None, None for idx, child in enumerate(doc): if child.tag == "br": - prev_child = child.getprevious() - if prev_child is None or prev_child.tag != "br" or has_tail(prev_child): + if not double_br(child.getprevious()): # A br without previous consecutive br was found start = idx - next_child = child.getnext() - if next_child is None or next_child.tag != "br" or has_tail(child): + if not double_br(child): # A br without next consecutive br was found end = idx if start == end: diff --git a/clear_html/formatted_text/utils.py b/clear_html/formatted_text/utils.py index aac8210..c6189e2 100644 --- a/clear_html/formatted_text/utils.py +++ b/clear_html/formatted_text/utils.py @@ -163,12 +163,12 @@ def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True): prev_is_inline = ( doc_prev is not None and doc_prev.tag in PHRASING_CONTENT - and not _double_br(doc_prev.getprevious()) + and not double_br(doc_prev.getprevious()) ) after_is_inline = ( doc_next is not None and doc_next.tag in PHRASING_CONTENT - and not _double_br(doc_next) + and not double_br(doc_next) ) has_text_prev = bool(prev_text(doc).strip()) or prev_is_inline @@ -192,7 +192,7 @@ def drop_tag_preserve_spacing(doc: HtmlElement, preserve_content=True): doc.drop_tree() -def _double_br(doc: Optional[HtmlElement]): +def double_br(doc: Optional[HtmlElement]): """True if doc and next element are "br" tags without text in between.""" if doc is None or doc.tag != "br": return False