diff --git a/.gitattributes b/.gitattributes index 57cb076..ce91100 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ tests/** linguist-detectable=false +benchmark/** linguist-detectable=false *.xsl linguist-documentation=true diff --git a/README.md b/README.md index 4fc38cf..8c84724 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ -# common-html-extractor - 通用HTML数据提取器 +# magic-html - 通用HTML数据提取器 -欢迎使用common-html-extractor,这是一个旨在简化从HTML中提取主体区域内容的Python库。 +欢迎使用magic-html,这是一个旨在简化从HTML中提取主体区域内容的Python库。 ## 项目描述 -common-html-extractor提供了一套工具,能够轻松地从HTML中提取主体区域内容。无论您处理的是复杂的HTML结构还是简单的网页,这个库都旨在为您的HTML抽取需求提供一个便捷高效的接口。 +magic-html提供了一套工具,能够轻松地从HTML中提取主体区域内容。无论您处理的是复杂的HTML结构还是简单的网页,这个库都旨在为您的HTML抽取需求提供一个便捷高效的接口。 @@ -22,12 +22,12 @@ common-html-extractor提供了一套工具,能够轻松地从HTML中提取主 ## 使用 ```python -from common_html_extractor import GeneralExtractor +from magic_html import GeneralExtractor # 初始化提取器 extractor = GeneralExtractor() -# http://example.com/ +url = "http://example.com/" html = """ @@ -51,8 +51,15 @@ html = """ """ -# 从HTML中提取数据 -data = extractor.extract(html) +# 文章类型HTML提取数据 +data = extractor.extract(html, base_url=url) + +# 论坛类型HTML提取数据 +# data = extractor.extract(html, base_url=url, html_type="forum") + +# 微信文章HTML提取数据 +# data = extractor.extract(html, base_url=url, html_type="weixin") + print(data) ``` @@ -67,5 +74,5 @@ print(data) ## 鸣谢 - [trafilatura](https://github.com/adbar/trafilatura) -- [python-readability](https://github.com/buriy/python-readability) +- [readability-lxml](https://github.com/buriy/python-readability) diff --git a/benchmark/__init__.py b/benchmark/__init__.py new file mode 100644 index 0000000..44d37d3 --- /dev/null +++ b/benchmark/__init__.py @@ -0,0 +1 @@ +# -*- coding:utf-8 -*- \ No newline at end of file diff --git a/tests/data/article/base.json b/benchmark/data/article/base.json similarity index 100% rename from tests/data/article/base.json rename to benchmark/data/article/base.json diff --git 
a/tests/data/article/htmls/2338d64a03dcd33cdc0e7351495b53b979b8e612813020d58265f6669aa8a979.html b/benchmark/data/article/htmls/2338d64a03dcd33cdc0e7351495b53b979b8e612813020d58265f6669aa8a979.html similarity index 100% rename from tests/data/article/htmls/2338d64a03dcd33cdc0e7351495b53b979b8e612813020d58265f6669aa8a979.html rename to benchmark/data/article/htmls/2338d64a03dcd33cdc0e7351495b53b979b8e612813020d58265f6669aa8a979.html diff --git a/tests/data/article/htmls/3ef3b4e65fab1f387fb5ec483a13a88da9b0bc1533c2d9bf894532322b8e3860.html b/benchmark/data/article/htmls/3ef3b4e65fab1f387fb5ec483a13a88da9b0bc1533c2d9bf894532322b8e3860.html similarity index 100% rename from tests/data/article/htmls/3ef3b4e65fab1f387fb5ec483a13a88da9b0bc1533c2d9bf894532322b8e3860.html rename to benchmark/data/article/htmls/3ef3b4e65fab1f387fb5ec483a13a88da9b0bc1533c2d9bf894532322b8e3860.html diff --git a/tests/data/article/htmls/42076fe0b7e334620dd25c6df7186a69e4c28c084800e3a7a46e99c3b1cf51b4.html b/benchmark/data/article/htmls/42076fe0b7e334620dd25c6df7186a69e4c28c084800e3a7a46e99c3b1cf51b4.html similarity index 100% rename from tests/data/article/htmls/42076fe0b7e334620dd25c6df7186a69e4c28c084800e3a7a46e99c3b1cf51b4.html rename to benchmark/data/article/htmls/42076fe0b7e334620dd25c6df7186a69e4c28c084800e3a7a46e99c3b1cf51b4.html diff --git a/tests/data/article/htmls/42823abb58e8e160a35fafb52f254f6c6e1d92f1916ecc3254ae492e6082b3d4.html b/benchmark/data/article/htmls/42823abb58e8e160a35fafb52f254f6c6e1d92f1916ecc3254ae492e6082b3d4.html similarity index 100% rename from tests/data/article/htmls/42823abb58e8e160a35fafb52f254f6c6e1d92f1916ecc3254ae492e6082b3d4.html rename to benchmark/data/article/htmls/42823abb58e8e160a35fafb52f254f6c6e1d92f1916ecc3254ae492e6082b3d4.html diff --git a/tests/data/article/htmls/452b8128babdcbc570f7b2a9e1f0c33fdafaf93f6bf07a3ed60b7aa3c2dd79cb.html b/benchmark/data/article/htmls/452b8128babdcbc570f7b2a9e1f0c33fdafaf93f6bf07a3ed60b7aa3c2dd79cb.html similarity 
index 100% rename from tests/data/article/htmls/452b8128babdcbc570f7b2a9e1f0c33fdafaf93f6bf07a3ed60b7aa3c2dd79cb.html rename to benchmark/data/article/htmls/452b8128babdcbc570f7b2a9e1f0c33fdafaf93f6bf07a3ed60b7aa3c2dd79cb.html diff --git a/tests/data/article/htmls/4cd29b7d9856e0451dd951d35e88257c3817c3659ed2108165a29eacab9a1276.html b/benchmark/data/article/htmls/4cd29b7d9856e0451dd951d35e88257c3817c3659ed2108165a29eacab9a1276.html similarity index 100% rename from tests/data/article/htmls/4cd29b7d9856e0451dd951d35e88257c3817c3659ed2108165a29eacab9a1276.html rename to benchmark/data/article/htmls/4cd29b7d9856e0451dd951d35e88257c3817c3659ed2108165a29eacab9a1276.html diff --git a/tests/data/article/htmls/4e55eac42f97a6d97d14795891d2e24b715bec8c057c592a81c3d7ca08aafa10.html b/benchmark/data/article/htmls/4e55eac42f97a6d97d14795891d2e24b715bec8c057c592a81c3d7ca08aafa10.html similarity index 100% rename from tests/data/article/htmls/4e55eac42f97a6d97d14795891d2e24b715bec8c057c592a81c3d7ca08aafa10.html rename to benchmark/data/article/htmls/4e55eac42f97a6d97d14795891d2e24b715bec8c057c592a81c3d7ca08aafa10.html diff --git a/tests/data/article/htmls/7e99760d0920deb099946fd607f580080345cf332fae8ac7eb15fd78f38d701a.html b/benchmark/data/article/htmls/7e99760d0920deb099946fd607f580080345cf332fae8ac7eb15fd78f38d701a.html similarity index 100% rename from tests/data/article/htmls/7e99760d0920deb099946fd607f580080345cf332fae8ac7eb15fd78f38d701a.html rename to benchmark/data/article/htmls/7e99760d0920deb099946fd607f580080345cf332fae8ac7eb15fd78f38d701a.html diff --git a/tests/data/article/htmls/83bd4dfc1e736b9a3885adc1ba5e74148b888887f494dc4e3f375a335e419a94.html b/benchmark/data/article/htmls/83bd4dfc1e736b9a3885adc1ba5e74148b888887f494dc4e3f375a335e419a94.html similarity index 100% rename from tests/data/article/htmls/83bd4dfc1e736b9a3885adc1ba5e74148b888887f494dc4e3f375a335e419a94.html rename to 
benchmark/data/article/htmls/83bd4dfc1e736b9a3885adc1ba5e74148b888887f494dc4e3f375a335e419a94.html diff --git a/tests/data/article/htmls/8a5d9673b0c29ddbbf294f1b851c5739e186ad9d2417604d620ed4656e4a942a.html b/benchmark/data/article/htmls/8a5d9673b0c29ddbbf294f1b851c5739e186ad9d2417604d620ed4656e4a942a.html similarity index 100% rename from tests/data/article/htmls/8a5d9673b0c29ddbbf294f1b851c5739e186ad9d2417604d620ed4656e4a942a.html rename to benchmark/data/article/htmls/8a5d9673b0c29ddbbf294f1b851c5739e186ad9d2417604d620ed4656e4a942a.html diff --git a/tests/data/article/htmls/9c962e02099b01b20b1e92b0872e704753fc9905f6ea435449248ae98dd70145.html b/benchmark/data/article/htmls/9c962e02099b01b20b1e92b0872e704753fc9905f6ea435449248ae98dd70145.html similarity index 100% rename from tests/data/article/htmls/9c962e02099b01b20b1e92b0872e704753fc9905f6ea435449248ae98dd70145.html rename to benchmark/data/article/htmls/9c962e02099b01b20b1e92b0872e704753fc9905f6ea435449248ae98dd70145.html diff --git a/tests/data/article/htmls/aa1432d872173fe865badf36b3687cdc114c8e8b2a1186c90d6117160201377a.html b/benchmark/data/article/htmls/aa1432d872173fe865badf36b3687cdc114c8e8b2a1186c90d6117160201377a.html similarity index 100% rename from tests/data/article/htmls/aa1432d872173fe865badf36b3687cdc114c8e8b2a1186c90d6117160201377a.html rename to benchmark/data/article/htmls/aa1432d872173fe865badf36b3687cdc114c8e8b2a1186c90d6117160201377a.html diff --git a/tests/data/article/htmls/c2020fd56735f8a698535b20c512c2978fcae1836ea12fa95c0587c60488af42.html b/benchmark/data/article/htmls/c2020fd56735f8a698535b20c512c2978fcae1836ea12fa95c0587c60488af42.html similarity index 100% rename from tests/data/article/htmls/c2020fd56735f8a698535b20c512c2978fcae1836ea12fa95c0587c60488af42.html rename to benchmark/data/article/htmls/c2020fd56735f8a698535b20c512c2978fcae1836ea12fa95c0587c60488af42.html diff --git a/tests/data/article/htmls/c6154d2bd427327540ef334370f8dd1cd2847c0df2254f5685882bb4a5bf3450.html 
b/benchmark/data/article/htmls/c6154d2bd427327540ef334370f8dd1cd2847c0df2254f5685882bb4a5bf3450.html similarity index 100% rename from tests/data/article/htmls/c6154d2bd427327540ef334370f8dd1cd2847c0df2254f5685882bb4a5bf3450.html rename to benchmark/data/article/htmls/c6154d2bd427327540ef334370f8dd1cd2847c0df2254f5685882bb4a5bf3450.html diff --git a/tests/data/article/htmls/c9d7202a66cd79fcb61bccdc7911d10618ff02bd32296b4c939e44e1a42055fa.html b/benchmark/data/article/htmls/c9d7202a66cd79fcb61bccdc7911d10618ff02bd32296b4c939e44e1a42055fa.html similarity index 100% rename from tests/data/article/htmls/c9d7202a66cd79fcb61bccdc7911d10618ff02bd32296b4c939e44e1a42055fa.html rename to benchmark/data/article/htmls/c9d7202a66cd79fcb61bccdc7911d10618ff02bd32296b4c939e44e1a42055fa.html diff --git a/tests/data/article/htmls/e870e9deb7e33c9675736e8876172d3b0ca3cc001365cb580c4be79d8b09e8f7.html b/benchmark/data/article/htmls/e870e9deb7e33c9675736e8876172d3b0ca3cc001365cb580c4be79d8b09e8f7.html similarity index 100% rename from tests/data/article/htmls/e870e9deb7e33c9675736e8876172d3b0ca3cc001365cb580c4be79d8b09e8f7.html rename to benchmark/data/article/htmls/e870e9deb7e33c9675736e8876172d3b0ca3cc001365cb580c4be79d8b09e8f7.html diff --git a/tests/data/article/htmls/e8dff24871ec675824595dd6c09f523abdff2134977ab766e47ef6c74bd6aa13.html b/benchmark/data/article/htmls/e8dff24871ec675824595dd6c09f523abdff2134977ab766e47ef6c74bd6aa13.html similarity index 100% rename from tests/data/article/htmls/e8dff24871ec675824595dd6c09f523abdff2134977ab766e47ef6c74bd6aa13.html rename to benchmark/data/article/htmls/e8dff24871ec675824595dd6c09f523abdff2134977ab766e47ef6c74bd6aa13.html diff --git a/tests/data/article/htmls/fb45fc029ee1244a433fb44a580c5a4ccbd88a87238d01047979c106f1ac98be.html b/benchmark/data/article/htmls/fb45fc029ee1244a433fb44a580c5a4ccbd88a87238d01047979c106f1ac98be.html similarity index 100% rename from 
tests/data/article/htmls/fb45fc029ee1244a433fb44a580c5a4ccbd88a87238d01047979c106f1ac98be.html rename to benchmark/data/article/htmls/fb45fc029ee1244a433fb44a580c5a4ccbd88a87238d01047979c106f1ac98be.html diff --git a/tests/data/forum/base.json b/benchmark/data/forum/base.json similarity index 100% rename from tests/data/forum/base.json rename to benchmark/data/forum/base.json diff --git a/tests/data/forum/htmls/13e5e1dc5565ced7bc89bcfecafdf56dfad212144e0f411b8119833025875f99.html b/benchmark/data/forum/htmls/13e5e1dc5565ced7bc89bcfecafdf56dfad212144e0f411b8119833025875f99.html similarity index 100% rename from tests/data/forum/htmls/13e5e1dc5565ced7bc89bcfecafdf56dfad212144e0f411b8119833025875f99.html rename to benchmark/data/forum/htmls/13e5e1dc5565ced7bc89bcfecafdf56dfad212144e0f411b8119833025875f99.html diff --git a/tests/data/forum/htmls/19618876358be36995003aae24d8332aca122ec44ab3eccf17e2e40a25768272.html b/benchmark/data/forum/htmls/19618876358be36995003aae24d8332aca122ec44ab3eccf17e2e40a25768272.html similarity index 100% rename from tests/data/forum/htmls/19618876358be36995003aae24d8332aca122ec44ab3eccf17e2e40a25768272.html rename to benchmark/data/forum/htmls/19618876358be36995003aae24d8332aca122ec44ab3eccf17e2e40a25768272.html diff --git a/tests/data/forum/htmls/1f95e1499390a2bc1bc80263ddba93402009d5d50fa72c79fb8bf2ebf9a9e07e.html b/benchmark/data/forum/htmls/1f95e1499390a2bc1bc80263ddba93402009d5d50fa72c79fb8bf2ebf9a9e07e.html similarity index 100% rename from tests/data/forum/htmls/1f95e1499390a2bc1bc80263ddba93402009d5d50fa72c79fb8bf2ebf9a9e07e.html rename to benchmark/data/forum/htmls/1f95e1499390a2bc1bc80263ddba93402009d5d50fa72c79fb8bf2ebf9a9e07e.html diff --git a/tests/data/forum/htmls/5b0ef856b2df5dff20276b5c23b10e7d2553249c020c86bfc8bb5b3b5ccbbb84.html b/benchmark/data/forum/htmls/5b0ef856b2df5dff20276b5c23b10e7d2553249c020c86bfc8bb5b3b5ccbbb84.html similarity index 100% rename from 
tests/data/forum/htmls/5b0ef856b2df5dff20276b5c23b10e7d2553249c020c86bfc8bb5b3b5ccbbb84.html rename to benchmark/data/forum/htmls/5b0ef856b2df5dff20276b5c23b10e7d2553249c020c86bfc8bb5b3b5ccbbb84.html diff --git a/tests/data/forum/htmls/b5bb37137a346f6d341184bf16defecdaa7c0d0011da3f66ae729bca830e98a5.html b/benchmark/data/forum/htmls/b5bb37137a346f6d341184bf16defecdaa7c0d0011da3f66ae729bca830e98a5.html similarity index 100% rename from tests/data/forum/htmls/b5bb37137a346f6d341184bf16defecdaa7c0d0011da3f66ae729bca830e98a5.html rename to benchmark/data/forum/htmls/b5bb37137a346f6d341184bf16defecdaa7c0d0011da3f66ae729bca830e98a5.html diff --git a/benchmark/eval-requirements.txt b/benchmark/eval-requirements.txt new file mode 100644 index 0000000..948f244 --- /dev/null +++ b/benchmark/eval-requirements.txt @@ -0,0 +1,12 @@ +beautifulsoup4 +jieba +ltp +numpy +rouge_score +tabulate +trafilatura +readability-lxml +newspaper3k +goose3 +justext +gne \ No newline at end of file diff --git a/tests/evaluate_articles.py b/benchmark/evaluate_articles.py similarity index 50% rename from tests/evaluate_articles.py rename to benchmark/evaluate_articles.py index d167e71..7befecb 100644 --- a/tests/evaluate_articles.py +++ b/benchmark/evaluate_articles.py @@ -96,8 +96,8 @@ def get_content_text(html: str) -> str: global_datas.append(v) -def run_common_html_extractor(name): - from common_html_extractor import GeneralExtractor +def run_magic_html(name): + from magic_html import GeneralExtractor datas = deepcopy(global_datas) extractor = GeneralExtractor() @@ -133,11 +133,79 @@ def run_trafilatura_fallback(name): evaluate_result(datas) +def run_readability_lxml(name): + from readability import Document + + datas = deepcopy(global_datas) + for x in datas: + x["extract_content"] = get_content_text(Document(x["html"]).summary()) + global_info["func"].append(name) + evaluate_result(datas) + + +def run_newspaper3k(name): + from newspaper import fulltext + + datas = deepcopy(global_datas) + 
for x in datas: + try: + x["extract_content"] = fulltext(x["html"]) + except: + x["extract_content"] = "" + global_info["func"].append(name) + evaluate_result(datas) + + +def run_goose3(name): + from goose3 import Goose + + g = Goose() + datas = deepcopy(global_datas) + for x in datas: + x["extract_content"] = g.extract(raw_html=x["html"]).cleaned_text + global_info["func"].append(name) + evaluate_result(datas) + + +def run_justext(name): + import justext + + datas = deepcopy(global_datas) + for x in datas: + paragraphs = justext.justext(x["html"], justext.get_stoplist("German"), 50, 200, 0.1, 0.2, 0.2, 200, + True) # stop_words + valid = [ + paragraph.text + for paragraph in paragraphs + if not paragraph.is_boilerplate + ] + + x["extract_content"] = ' '.join(valid) + global_info["func"].append(name) + evaluate_result(datas) + + +def run_gne(name): + from gne import GeneralNewsExtractor + + extractor = GeneralNewsExtractor() + datas = deepcopy(global_datas) + for x in datas: + x["extract_content"] = extractor.extract(x["html"])["content"] + global_info["func"].append(name) + evaluate_result(datas) + + # 自定义需要对比的方法 all_funcs = { - "common_html_extractor": run_common_html_extractor, + "magic_html": run_magic_html, "trafilatura": run_trafilatura, "trafilatura_fallback": run_trafilatura_fallback, + "readability-lxml": run_readability_lxml, + "newspaper3k": run_newspaper3k, + "goose3": run_goose3, + "justext": run_justext, + "gne": run_gne } for k, v in all_funcs.items(): @@ -149,14 +217,24 @@ def run_trafilatura_fallback(name): print("基准结果") print( """ -╒═══════════════════════╤═════════════╤════════════╤═══════════╕ -│ func │ prec_mean │ rec_mean │ f1_mean │ -╞═══════════════════════╪═════════════╪════════════╪═══════════╡ -│ common_html_extractor │ 0.955549 │ 0.993721 │ 0.974261 │ -├───────────────────────┼─────────────┼────────────┼───────────┤ -│ trafilatura │ 0.887413 │ 0.84243 │ 0.864336 │ -├───────────────────────┼─────────────┼────────────┼───────────┤ -│ 
trafilatura_fallback │ 0.893388 │ 0.93853 │ 0.915403 │ -╘═══════════════════════╧═════════════╧════════════╧═══════════╛ +╒══════════════════════╤═════════════╤════════════╤═══════════╕ +│ func │ prec_mean │ rec_mean │ f1_mean │ +╞══════════════════════╪═════════════╪════════════╪═══════════╡ +│ magic_html │ 0.955549 │ 0.993721 │ 0.974261 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ trafilatura │ 0.887413 │ 0.84243 │ 0.864336 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ trafilatura_fallback │ 0.893388 │ 0.93853 │ 0.915403 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ readability-lxml │ 0.860823 │ 0.885361 │ 0.87292 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ newspaper3k │ 0.527753 │ 0.551633 │ 0.539429 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ goose3 │ 0.543127 │ 0.531672 │ 0.537338 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ justext │ 0.484097 │ 0.334175 │ 0.395402 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ gne │ 0.869631 │ 0.859659 │ 0.864617 │ +╘══════════════════════╧═════════════╧════════════╧═══════════╛ """.strip() ) diff --git a/tests/evaluate_forums.py b/benchmark/evaluate_forums.py similarity index 50% rename from tests/evaluate_forums.py rename to benchmark/evaluate_forums.py index 9e1820c..6add073 100644 --- a/tests/evaluate_forums.py +++ b/benchmark/evaluate_forums.py @@ -96,8 +96,8 @@ def get_content_text(html: str) -> str: global_datas.append(v) -def run_common_html_extractor(name): - from common_html_extractor import GeneralExtractor +def run_magic_html(name): + from magic_html import GeneralExtractor datas = deepcopy(global_datas) extractor = GeneralExtractor() @@ -135,11 +135,79 @@ def run_trafilatura_fallback(name): evaluate_result(datas) +def run_readability_lxml(name): + from readability import Document + + datas = deepcopy(global_datas) + for x in datas: + 
x["extract_content"] = get_content_text(Document(x["html"]).summary()) + global_info["func"].append(name) + evaluate_result(datas) + + +def run_newspaper3k(name): + from newspaper import fulltext + + datas = deepcopy(global_datas) + for x in datas: + try: + x["extract_content"] = fulltext(x["html"]) + except: + x["extract_content"] = "" + global_info["func"].append(name) + evaluate_result(datas) + + +def run_goose3(name): + from goose3 import Goose + + g = Goose() + datas = deepcopy(global_datas) + for x in datas: + x["extract_content"] = g.extract(raw_html=x["html"]).cleaned_text + global_info["func"].append(name) + evaluate_result(datas) + + +def run_justext(name): + import justext + + datas = deepcopy(global_datas) + for x in datas: + paragraphs = justext.justext(x["html"], justext.get_stoplist("German"), 50, 200, 0.1, 0.2, 0.2, 200, + True) # stop_words + valid = [ + paragraph.text + for paragraph in paragraphs + if not paragraph.is_boilerplate + ] + + x["extract_content"] = ' '.join(valid) + global_info["func"].append(name) + evaluate_result(datas) + + +def run_gne(name): + from gne import GeneralNewsExtractor + + extractor = GeneralNewsExtractor() + datas = deepcopy(global_datas) + for x in datas: + x["extract_content"] = extractor.extract(x["html"])["content"] + global_info["func"].append(name) + evaluate_result(datas) + + # 自定义需要对比的方法 all_funcs = { - "common_html_extractor": run_common_html_extractor, + "magic_html": run_magic_html, "trafilatura": run_trafilatura, "trafilatura_fallback": run_trafilatura_fallback, + "readability-lxml": run_readability_lxml, + "newspaper3k": run_newspaper3k, + "goose3": run_goose3, + "justext": run_justext, + "gne": run_gne } for k, v in all_funcs.items(): @@ -150,13 +218,23 @@ def run_trafilatura_fallback(name): print(tabulate(global_info, headers="keys", tablefmt="fancy_grid")) print("基准结果") print(''' -╒═══════════════════════╤═════════════╤════════════╤═══════════╕ -│ func │ prec_mean │ rec_mean │ f1_mean │ 
-╞═══════════════════════╪═════════════╪════════════╪═══════════╡ -│ common_html_extractor │ 0.752323 │ 0.964762 │ 0.845401 │ -├───────────────────────┼─────────────┼────────────┼───────────┤ -│ trafilatura │ 0.711983 │ 0.568848 │ 0.632418 │ -├───────────────────────┼─────────────┼────────────┼───────────┤ -│ trafilatura_fallback │ 0.781724 │ 0.557774 │ 0.651028 │ -╘═══════════════════════╧═════════════╧════════════╧═══════════╛ +╒══════════════════════╤═════════════╤════════════╤═══════════╕ +│ func │ prec_mean │ rec_mean │ f1_mean │ +╞══════════════════════╪═════════════╪════════════╪═══════════╡ +│ magic_html │ 0.752323 │ 0.964762 │ 0.845401 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ trafilatura │ 0.711983 │ 0.568848 │ 0.632418 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ trafilatura_fallback │ 0.781724 │ 0.557774 │ 0.651028 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ readability-lxml │ 0.55441 │ 0.228667 │ 0.323788 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ newspaper3k │ 0.716421 │ 0.19569 │ 0.307411 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ goose3 │ 0.551646 │ 0.147048 │ 0.2322 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ justext │ 0.542019 │ 0.204443 │ 0.296899 │ +├──────────────────────┼─────────────┼────────────┼───────────┤ +│ gne │ 0.846939 │ 0.125834 │ 0.219114 │ +╘══════════════════════╧═════════════╧════════════╧═══════════╛ '''.strip()) diff --git a/common_html_extractor/__init__.py b/magic_html/__init__.py similarity index 77% rename from common_html_extractor/__init__.py rename to magic_html/__init__.py index d7292aa..fbaccfc 100644 --- a/common_html_extractor/__init__.py +++ b/magic_html/__init__.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- from urllib.parse import urlparse - -from common_html_extractor.extractors import * +from magic_html.extractors.article_extractor import ArticleExtractor 
+from magic_html.extractors.weixin_extractor import WeixinExtractor +from magic_html.extractors.forum_extractor import ForumExtractor class GeneralExtractor: diff --git a/common_html_extractor/config.py b/magic_html/config.py similarity index 100% rename from common_html_extractor/config.py rename to magic_html/config.py diff --git a/magic_html/extractors/__init__.py b/magic_html/extractors/__init__.py new file mode 100644 index 0000000..380474e --- /dev/null +++ b/magic_html/extractors/__init__.py @@ -0,0 +1 @@ +# -*- coding:utf-8 -*- diff --git a/magic_html/extractors/article_extractor.py b/magic_html/extractors/article_extractor.py new file mode 100644 index 0000000..53b9331 --- /dev/null +++ b/magic_html/extractors/article_extractor.py @@ -0,0 +1,43 @@ +# -*- coding:utf-8 -*- + +from magic_html.utils import * +from magic_html.extractors.base_extractor import BaseExtractor +from magic_html.extractors.title_extractor import TitleExtractor + + +class ArticleExtractor(BaseExtractor): + def __init__(self) -> None: + super().__init__() + + def extract(self, html="", base_url="") -> dict: + html = html.replace(" ", " ").replace(" ", " ") + tree = load_html(html) + if tree is None: + raise ValueError + + title = TitleExtractor().process(tree) + + # base_url + base_href = tree.xpath("//base/@href") + + if base_href and "http" in base_href[0]: + base_url = base_href[0] + + # 标签转换, 增加数学标签处理 + format_tree = self.convert_tags(tree, base_url=base_url) + + # 删除script style等标签及其内容 + normal_tree = self.clean_tags(format_tree) + + subtree, xp_num, drop_list = self.xp_1_5(normal_tree) + if xp_num == "others": + subtree, drop_list = self.prune_unwanted_sections(normal_tree) + body_html = self.get_content_html(subtree, xp_num, base_url) + + return { + "xp_num": xp_num, + "drop_list": drop_list, + "html": body_html, + "title": title, + "base_url": base_url, + } diff --git a/common_html_extractor/extractors.py b/magic_html/extractors/base_extractor.py similarity index 77% rename from 
common_html_extractor/extractors.py rename to magic_html/extractors/base_extractor.py index bb82310..6fabed4 100644 --- a/common_html_extractor/extractors.py +++ b/magic_html/extractors/base_extractor.py @@ -4,12 +4,10 @@ from collections import defaultdict from copy import deepcopy from urllib.parse import unquote, urljoin - from lxml.etree import Comment, strip_elements - -from common_html_extractor.config import * -from common_html_extractor.readability_plus import Document as DocumentPlus -from common_html_extractor.utils import * +from magic_html.config import * +from magic_html.readability_plus import Document as DocumentPlus +from magic_html.utils import * class BaseExtractor: @@ -842,292 +840,3 @@ def prune_unwanted_sections(self, tree): or drop_list_3_2 or drop_list_3_3, ) - - -class TitleExtractor: - def extract_by_meta(self, element: HtmlElement): - for xpath in METAS: - title = element.xpath(xpath) - if title: - return "".join(title) - - def extract_by_title(self, element: HtmlElement): - return "".join(element.xpath("//title//text()")).strip() - - def extract_by_hs(self, element: HtmlElement): - hs = element.xpath("//h1//text()|//h2//text()|//h3//text()") - return hs or [] - - def extract_by_h(self, element: HtmlElement): - for xpath in ["//h1", "//h2", "//h3"]: - children = element.xpath(xpath) - if not children: - continue - child = children[0] - texts = child.xpath("./text()") - if texts and len(texts): - return texts[0].strip() - - def process(self, element: HtmlElement): - title_extracted_by_meta = self.extract_by_meta(element) - if title_extracted_by_meta: - return title_extracted_by_meta - title_extracted_by_h = self.extract_by_h(element) - title_extracted_by_hs = self.extract_by_hs(element) - title_extracted_by_title = self.extract_by_title(element) - title_extracted_by_hs = sorted( - title_extracted_by_hs, - key=lambda x: similarity2(x, title_extracted_by_title), - reverse=True, - ) - if title_extracted_by_hs: - return 
lcs_of_2(title_extracted_by_hs[0], title_extracted_by_title) - - if title_extracted_by_title: - return title_extracted_by_title - - return title_extracted_by_h - - -class ArticleExtractor(BaseExtractor): - def __init__(self) -> None: - super().__init__() - - def extract(self, html="", base_url="") -> dict: - html = html.replace(" ", " ").replace(" ", " ") - tree = load_html(html) - if tree is None: - raise ValueError - - title = TitleExtractor().process(tree) - - # base_url - base_href = tree.xpath("//base/@href") - - if base_href and "http" in base_href[0]: - base_url = base_href[0] - - # 标签转换, 增加数学标签处理 - format_tree = self.convert_tags(tree, base_url=base_url) - - # 删除script style等标签及其内容 - normal_tree = self.clean_tags(format_tree) - - subtree, xp_num, drop_list = self.xp_1_5(normal_tree) - if xp_num == "others": - subtree, drop_list = self.prune_unwanted_sections(normal_tree) - body_html = self.get_content_html(subtree, xp_num, base_url) - - return { - "xp_num": xp_num, - "drop_list": drop_list, - "html": body_html, - "title": title, - "base_url": base_url, - } - - -class WeixinExtractor(BaseExtractor): - def __init__(self) -> None: - super().__init__() - - def extract(self, html="", base_url="") -> dict: - html = html.replace(" ", " ") - tree = load_html(html) - if tree is None: - raise ValueError - - # 获取title - title = TitleExtractor().process(tree) - - # base_url - base_href = tree.xpath("//base/@href") - - if base_href and "http" in base_href[0]: - base_url = base_href[0] - - # 文章区域 - body_tree = tree.xpath('.//*[@id="img-content"]')[0] - - # 去除 js , style, comment - for script in body_tree.xpath(".//script"): - self.remove_node(script) - for style in body_tree.xpath(".//style"): - self.remove_node(style) - for comment in body_tree.xpath(".//comment()"): - self.remove_node(comment) - - # 删除所有的公众号介绍 - for mp in body_tree.xpath('.//div[@id="meta_content"]'): - self.remove_node(mp) - for mp in body_tree.xpath('.//div[@id="js_tags"]'): - self.remove_node(mp) - 
for mp in body_tree.xpath('.//div[@class="original_area_primary"]'): - self.remove_node(mp) - # 隐藏的封禁 介绍 - for mp in body_tree.xpath('.//section[@class="wx_profile_card_inner"]'): - self.remove_node(mp) - # 特殊的wx卡片介绍 - for mp in body_tree.xpath( - ".//section[contains(@class, 'wx_profile_msg_inner')]" - ): - self.remove_node(mp) - - # 针对杂乱内容进行去除 - all_raga = body_tree.xpath( - ".//*[contains(@style, 'color: rgba(255, 255, 255, 0)')] | .//*[contains(@style, 'color: rgba(255 255 255 0)')]" - ) - - for mp in all_raga: - flag_have_color_rgb, detail_style = self.ensure_have_color_rgb( - mp.attrib["style"] - ) - - if not flag_have_color_rgb: - continue - self.remove_node(mp) - - for img in body_tree.xpath(".//img"): - - if "data-src" not in img.attrib: - continue - - try: - img.set("src", img.attrib["data-src"]) - except Exception as e: - continue - - for h1 in body_tree.xpath(".//h1"): - if not h1.text: - continue - h1.text = h1.text.replace("\n", "").strip() - - body_html = tostring(body_tree, encoding=str) - - return { - "xp_num": "weixin", - "drop_list": False, - "html": body_html, - "title": title, - "base_url": base_url - } - - @staticmethod - def ensure_have_color_rgb(htmlstr): - pattern = r"(? 
None: - super().__init__() - - def extract(self, html="", base_url="") -> dict: - self.need_comment = True - html = html.replace(" ", " ").replace(" ", " ") - tree = load_html(html) - if tree is None: - raise ValueError - - # 获取title - title = TitleExtractor().process(tree) - - # base_url - base_href = tree.xpath("//base/@href") - - if base_href and "http" in base_href[0]: - base_url = base_href[0] - self.generate_unique_id(tree) - - format_tree = self.convert_tags(tree, base_url=base_url) - - normal_tree = self.clean_tags(format_tree) - - subtree, xp_num, drop_list = self.xp_1_5(normal_tree) - if xp_num == "others": - subtree, drop_list = self.prune_unwanted_sections(normal_tree) - body_html = self.get_content_html(subtree, xp_num, base_url) - - # 论坛等独有 - body_html_tree = fromstring(body_html) - try: - body_tree = body_html_tree.body - except: - body_tree = Element("body") - body_tree.extend(body_html_tree) - main_ids = body_tree.xpath(f"./*/@{Unique_ID}") - - for main_id in main_ids: - main_tree = normal_tree.xpath( - f".//*[@{Unique_ID}={main_id}]" - ) - if main_tree: - self.remove_node(main_tree[0]) - if not main_ids: - main_ids = [-1] - - if xp_num != "others": - normal_tree, _ = self.prune_unwanted_sections(normal_tree) - for c_xpath in Forum_XPATH: - while normal_tree.xpath(c_xpath): - x = normal_tree.xpath(c_xpath)[0] - self.remove_node(x) - if ( - "header" in x.attrib.get("class", "").lower() - or "header" in x.attrib.get("id", "").lower() - ): - continue - try: - if int(x.attrib.get(Unique_ID, "0")) > int( - main_ids[-1] - ): - body_tree.append(x) - else: - prefix_div = Element("div") - suffix_div = Element("div") - need_prefix = False - need_suffix = False - while x.xpath( - f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]" - ): - tmp_x = x.xpath( - f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]" - )[0] - self.remove_node(tmp_x) - suffix_div.append(tmp_x) - need_suffix = True - while x.xpath( - f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]" - 
class ForumExtractor(BaseExtractor):
    """Extractor for forum-style pages.

    Runs the shared ``BaseExtractor`` pipeline to find the main content and
    then re-attaches the nodes matched by ``Forum_XPATH`` around it, using the
    numeric ``Unique_ID`` attribute to decide whether a node goes before or
    after the main content.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="") -> dict:
        """Extract the main content and the ``Forum_XPATH`` nodes of a page.

        Args:
            html: Raw HTML source of the page.
            base_url: URL of the page; replaced by the document's own
                ``<base href>`` when that value contains ``"http"``.

        Returns:
            A dict with the keys ``xp_num``, ``drop_list``, ``html``
            (serialized body with the ``Unique_ID`` attributes stripped),
            ``title`` and ``base_url``.

        Raises:
            ValueError: If ``load_html`` returns ``None`` for the input.
        """
        self.need_comment = True
        # NOTE(review): both search strings reached this review as bare
        # whitespace (the entities appear to have been decoded in transit);
        # "&nbsp;" / "&#160;" are a reconstruction -- confirm against the repo.
        html = html.replace("&nbsp;", " ").replace("&#160;", " ")
        tree = load_html(html)
        if tree is None:
            raise ValueError("html could not be parsed")

        # Extract the title.
        title = TitleExtractor().process(tree)

        # Prefer the base URL declared by the document itself.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]
        self.generate_unique_id(tree)

        format_tree = self.convert_tags(tree, base_url=base_url)
        normal_tree = self.clean_tags(format_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)

        # Forum-specific handling starts here.
        body_html_tree = fromstring(body_html)
        try:
            body_tree = body_html_tree.body
        except Exception:
            # No usable <body>: wrap the parsed content in a fresh one.
            # (Was a bare ``except:``, which also trapped KeyboardInterrupt.)
            body_tree = Element("body")
            body_tree.extend(body_html_tree)
        main_ids = body_tree.xpath(f"./*/@{Unique_ID}")

        # Drop the already-extracted main nodes from the working tree so the
        # Forum_XPATH queries below cannot pick them up a second time.
        for main_id in main_ids:
            main_tree = normal_tree.xpath(f".//*[@{Unique_ID}={main_id}]")
            if main_tree:
                self.remove_node(main_tree[0])
        if not main_ids:
            main_ids = [-1]

        if xp_num != "others":
            normal_tree, _ = self.prune_unwanted_sections(normal_tree)
        for c_xpath in Forum_XPATH:
            while True:
                # Re-query after every removal, but only once per iteration.
                matches = normal_tree.xpath(c_xpath)
                if not matches:
                    break
                node = matches[0]
                self.remove_node(node)
                if (
                    "header" in node.attrib.get("class", "").lower()
                    or "header" in node.attrib.get("id", "").lower()
                ):
                    continue
                try:
                    self._attach_forum_node(node, body_tree, int(main_ids[-1]))
                except Exception:
                    # Deliberately best-effort: a node that cannot be placed
                    # is dropped instead of failing the whole extraction.
                    continue

        # Raw f-string: "\d" in a normal string literal is an invalid escape.
        body_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
        }

    def _attach_forum_node(self, node, body_tree, last_main_id) -> None:
        """Place *node* (or its descendants) around the main content.

        A node whose ``Unique_ID`` is greater than *last_main_id* is appended
        to *body_tree* as a whole. Otherwise its descendants with a greater id
        are collected into a ``<div>`` appended to the body, and those with a
        smaller id into a ``<div>`` inserted at the front.
        """
        if int(node.attrib.get(Unique_ID, "0")) > last_main_id:
            body_tree.append(node)
            return

        prefix_div = Element("div")
        suffix_div = Element("div")
        need_suffix = self._move_matches(
            node, f".//*[number(@{Unique_ID}) > {last_main_id}]", suffix_div
        )
        need_prefix = self._move_matches(
            node, f".//*[number(@{Unique_ID}) < {last_main_id}]", prefix_div
        )
        if need_prefix:
            body_tree.insert(0, prefix_div)
        if need_suffix:
            body_tree.append(suffix_div)

    def _move_matches(self, node, query, target) -> bool:
        """Move every descendant of *node* matching *query* into *target*.

        The query is re-evaluated after each move and only the first match is
        taken each time. Returns True if at least one node was moved.
        """
        moved = False
        while True:
            matches = node.xpath(query)
            if not matches:
                return moved
            self.remove_node(matches[0])
            target.append(matches[0])
            moved = True
class TitleExtractor:
    """Pick a page title from meta tags, the ``<title>`` tag and headings."""

    def extract_by_meta(self, element: HtmlElement):
        """Return the first non-blank title produced by the ``METAS`` XPaths.

        The expressions in ``METAS`` are tried in order; the strings matched
        by one expression are concatenated and stripped. A whitespace-only
        match is skipped so that later expressions and the other strategies
        still get a chance. Returns ``None`` when nothing usable is found.
        """
        for xpath in METAS:
            title = "".join(element.xpath(xpath)).strip()
            if title:
                return title
        return None

    def extract_by_title(self, element: HtmlElement):
        """Return the stripped text of the ``<title>`` tag ("" if absent)."""
        return "".join(element.xpath("//title//text()")).strip()

    def extract_by_hs(self, element: HtmlElement):
        """Return every text node found under any h1/h2/h3 (possibly empty)."""
        hs = element.xpath("//h1//text()|//h2//text()|//h3//text()")
        return hs or []

    def extract_by_h(self, element: HtmlElement):
        """Return the first non-blank direct text of the first h1, h2 or h3.

        Heading levels are tried in order and only the first element of each
        level is inspected. A heading whose direct text nodes are all blank
        no longer ends the search with an empty string; the next level is
        tried instead. Returns ``None`` when no heading yields text.
        """
        for xpath in ("//h1", "//h2", "//h3"):
            children = element.xpath(xpath)
            if not children:
                continue
            for text in children[0].xpath("./text()"):
                text = text.strip()
                if text:
                    return text
        return None

    def process(self, element: HtmlElement):
        """Return the best title for *element*.

        Order of preference:
        1. a title found through ``METAS``;
        2. ``lcs_of_2`` of the ``<title>`` text and the heading text that
           ``similarity2`` scores highest against it;
        3. the ``<title>`` text itself;
        4. the first heading text (may be ``None``).
        """
        title_extracted_by_meta = self.extract_by_meta(element)
        if title_extracted_by_meta:
            return title_extracted_by_meta

        title_extracted_by_hs = self.extract_by_hs(element)
        title_extracted_by_title = self.extract_by_title(element)
        if title_extracted_by_hs:
            # max() returns the first of several equally scored candidates,
            # the same element a stable descending sort would put first.
            best_heading = max(
                title_extracted_by_hs,
                key=lambda x: similarity2(x, title_extracted_by_title),
            )
            common = lcs_of_2(best_heading, title_extracted_by_title)
            # Only accept a non-empty overlap; otherwise fall through to the
            # weaker strategies instead of returning an empty title.
            if common:
                return common

        if title_extracted_by_title:
            return title_extracted_by_title

        # The heading fallback is computed only when actually needed.
        return self.extract_by_h(element)
def extract(self, html="", base_url="") -> dict:
    """Extract the article body of a WeChat (weixin) article page.

    The element with ``id="img-content"`` is taken as the article. Scripts,
    styles, comments, account-profile blocks and fully transparent text are
    removed from it, and lazy-loaded images get their ``data-src`` copied
    into ``src``.

    Args:
        html: Raw HTML source of the page.
        base_url: URL of the page; replaced by the document's own
            ``<base href>`` when that value contains ``"http"``.

    Returns:
        A dict with the keys ``xp_num`` (always ``"weixin"``), ``drop_list``
        (always ``False``), ``html``, ``title`` and ``base_url``.

    Raises:
        ValueError: If the HTML cannot be loaded or contains no element with
            ``id="img-content"``.
    """
    # NOTE(review): the search string reached this review as bare whitespace
    # (the entity appears to have been decoded in transit); "&nbsp;" is a
    # reconstruction -- confirm against the repository.
    html = html.replace("&nbsp;", " ")
    tree = load_html(html)
    if tree is None:
        raise ValueError("html could not be parsed")

    # Extract the title.
    title = TitleExtractor().process(tree)

    # Prefer the base URL declared by the document itself.
    base_href = tree.xpath("//base/@href")
    if base_href and "http" in base_href[0]:
        base_url = base_href[0]

    # Article region. An explicit emptiness check replaces the former bare
    # ``except:`` so unrelated errors are no longer disguised as ValueError.
    candidates = tree.xpath('.//*[@id="img-content"]')
    if not candidates:
        raise ValueError('no element with id="img-content" found')
    body_tree = candidates[0]

    # Removed in this order; each query runs after the previous removals.
    removable = (
        # scripts, styles and comments
        ".//script",
        ".//style",
        ".//comment()",
        # official-account introduction blocks
        './/div[@id="meta_content"]',
        './/div[@id="js_tags"]',
        './/div[@class="original_area_primary"]',
        # profile card shown for hidden / banned accounts
        './/section[@class="wx_profile_card_inner"]',
        # special wx profile card
        ".//section[contains(@class, 'wx_profile_msg_inner')]",
    )
    for query in removable:
        for node in body_tree.xpath(query):
            self.remove_node(node)

    # Clutter: elements styled with a fully transparent white text colour.
    all_raga = body_tree.xpath(
        ".//*[contains(@style, 'color: rgba(255, 255, 255, 0)')] | .//*[contains(@style, 'color: rgba(255 255 255 0)')]"
    )
    for mp in all_raga:
        # Only the flag (first item of the returned pair) is needed here.
        flag_have_color_rgb, _ = self.ensure_have_color_rgb(mp.attrib["style"])
        if flag_have_color_rgb:
            self.remove_node(mp)

    # Lazy-loaded images keep the real address in ``data-src``.
    for img in body_tree.xpath(".//img"):
        data_src = img.attrib.get("data-src")
        if data_src is None:
            continue
        try:
            img.set("src", data_src)
        except Exception:
            # Best-effort: leave an image untouched if the copy fails.
            continue

    # Collapse line breaks in the direct text of every h1.
    for h1 in body_tree.xpath(".//h1"):
        if not h1.text:
            continue
        h1.text = h1.text.replace("\n", "").strip()

    body_html = tostring(body_tree, encoding=str)

    return {
        "xp_num": "weixin",
        "drop_list": False,
        "html": body_html,
        "title": title,
        "base_url": base_url,
    }
+ f"Invalid version tag {version}. Expected format is magic_html--released." ) except Exception as e: print(e) @@ -41,10 +41,10 @@ def get_version(): requires = parse_requirements("requirements.txt") setup( - name="common_html_extractor", + name="magic_html", version=get_version(), - packages=["common_html_extractor"], - package_data={"common_html_extractor": ["mmltex/*.xsl"]}, + packages=["magic_html", "magic_html.extractors"], + package_data={"magic_html": ["mmltex/*.xsl"]}, install_requires=requires, python_requires=">=3.8", zip_safe=False, diff --git a/tests/eval-requirements.txt b/tests/eval-requirements.txt deleted file mode 100644 index cfd6553..0000000 --- a/tests/eval-requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -beautifulsoup4 -jieba -ltp -numpy -rouge_score -tabulate -trafilatura \ No newline at end of file