refactor: Refactor project directory
zp committed May 7, 2024
1 parent a562b43 commit e74b8b1
Showing 51 changed files with 535 additions and 342 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -1,3 +1,4 @@
tests/** linguist-detectable=false
+benchmark/** linguist-detectable=false
*.xsl linguist-documentation=true

23 changes: 15 additions & 8 deletions README.md
@@ -1,12 +1,12 @@
-# common-html-extractor - Universal HTML Data Extractor
+# magic-html - Universal HTML Data Extractor

-Welcome to common-html-extractor, a Python library designed to simplify extracting the main content area from HTML.
+Welcome to magic-html, a Python library designed to simplify extracting the main content area from HTML.



## Project Description

-common-html-extractor provides a set of tools that make it easy to extract the main content area from HTML. Whether you are working with complex HTML structures or simple web pages, this library aims to offer a convenient and efficient interface for your HTML extraction needs.
+magic-html provides a set of tools that make it easy to extract the main content area from HTML. Whether you are working with complex HTML structures or simple web pages, this library aims to offer a convenient and efficient interface for your HTML extraction needs.



@@ -22,12 +22,12 @@
## Usage

```python
-from common_html_extractor import GeneralExtractor
+from magic_html import GeneralExtractor

# Initialize the extractor
extractor = GeneralExtractor()

-# http://example.com/
+url = "http://example.com/"
html = """
<!doctype html>
@@ -51,8 +51,15 @@
</html>
"""

-# Extract data from the HTML
-data = extractor.extract(html)
+# Extract data from article-type HTML
+data = extractor.extract(html, base_url=url)

+# Extract data from forum-type HTML
+# data = extractor.extract(html, base_url=url, html_type="forum")

+# Extract data from WeChat-article HTML
+# data = extractor.extract(html, base_url=url, html_type="weixin")

print(data)
```
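For reference, here is a sketch of the dictionary shape `extract()` returns. The keys come from the `ArticleExtractor` added later in this commit; the values below are placeholders, not guaranteed output:

```python
# Illustrative shape of `data` for the example above; keys come from
# ArticleExtractor.extract() later in this commit, values are placeholders.
data = {
    "xp_num": "others",                # which xpath rule set matched
    "drop_list": [],                   # nodes pruned during extraction
    "html": "<body>...</body>",        # extracted main-content HTML
    "title": "...",                    # extracted page title
    "base_url": "http://example.com/",
}
```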

@@ -67,5 +74,5 @@ print(data)
## Acknowledgements

- [trafilatura](https://github.com/adbar/trafilatura)
-- [python-readability](https://github.com/buriy/python-readability)
+- [readability-lxml](https://github.com/buriy/python-readability)

1 change: 1 addition & 0 deletions benchmark/__init__.py
@@ -0,0 +1 @@
# -*- coding:utf-8 -*-
File renamed without changes.
File renamed without changes.
12 changes: 12 additions & 0 deletions benchmark/eval-requirements.txt
@@ -0,0 +1,12 @@
beautifulsoup4
jieba
ltp
numpy
rouge_score
tabulate
trafilatura
readability-lxml
newspaper3k
goose3
justext
gne
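To reproduce the comparison, these benchmark dependencies can be installed with `pip install -r benchmark/eval-requirements.txt`.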
102 changes: 90 additions & 12 deletions tests/evaluate_articles.py → benchmark/evaluate_articles.py
@@ -96,8 +96,8 @@ def get_content_text(html: str) -> str:
    global_datas.append(v)


-def run_common_html_extractor(name):
-    from common_html_extractor import GeneralExtractor
+def run_magic_html(name):
+    from magic_html import GeneralExtractor

    datas = deepcopy(global_datas)
    extractor = GeneralExtractor()
@@ -133,11 +133,79 @@ def run_trafilatura_fallback(name):
    evaluate_result(datas)


def run_readability_lxml(name):
    from readability import Document

    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = get_content_text(Document(x["html"]).summary())
    global_info["func"].append(name)
    evaluate_result(datas)


def run_newspaper3k(name):
    from newspaper import fulltext

    datas = deepcopy(global_datas)
    for x in datas:
        try:
            x["extract_content"] = fulltext(x["html"])
        except Exception:
            # newspaper3k raises on pages it cannot parse; score them as empty
            x["extract_content"] = ""
    global_info["func"].append(name)
    evaluate_result(datas)


def run_goose3(name):
    from goose3 import Goose

    g = Goose()
    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = g.extract(raw_html=x["html"]).cleaned_text
    global_info["func"].append(name)
    evaluate_result(datas)


def run_justext(name):
    import justext

    datas = deepcopy(global_datas)
    for x in datas:
        paragraphs = justext.justext(
            x["html"], justext.get_stoplist("German"),
            length_low=50, length_high=200,
            stopwords_low=0.1, stopwords_high=0.2,
            max_link_density=0.2, max_heading_distance=200,
            no_headings=True,
        )
        valid = [
            paragraph.text
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        ]
        x["extract_content"] = " ".join(valid)
    global_info["func"].append(name)
    evaluate_result(datas)


def run_gne(name):
    from gne import GeneralNewsExtractor

    extractor = GeneralNewsExtractor()
    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = extractor.extract(x["html"])["content"]
    global_info["func"].append(name)
    evaluate_result(datas)


# Methods to compare; customize as needed
all_funcs = {
-    "common_html_extractor": run_common_html_extractor,
+    "magic_html": run_magic_html,
    "trafilatura": run_trafilatura,
    "trafilatura_fallback": run_trafilatura_fallback,
    "readability-lxml": run_readability_lxml,
    "newspaper3k": run_newspaper3k,
    "goose3": run_goose3,
    "justext": run_justext,
    "gne": run_gne,
}

for k, v in all_funcs.items():
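The loop body is collapsed in this diff. Since every `run_*` function takes a display name and registers itself in `global_info`, a plausible driver simply calls each function with its key; the sketch below is an assumption, not the collapsed code:

```python
# Hypothetical driver sketch; the actual loop body is collapsed above.
# Each benchmark function receives its display name and appends its
# scores to the shared global_info table.
for k, v in all_funcs.items():
    v(k)
```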
@@ -149,14 +217,24 @@ def run_trafilatura_fallback(name):
print("Benchmark results")
print(
"""
╒═══════════════════════╤═════════════╤════════════╤═══════════╕
│ func │ prec_mean │ rec_mean │ f1_mean │
╞═══════════════════════╪═════════════╪════════════╪═══════════╡
│ common_html_extractor │ 0.955549 │ 0.993721 │ 0.974261 │
├───────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura │ 0.887413 │ 0.84243 │ 0.864336 │
├───────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura_fallback │ 0.893388 │ 0.93853 │ 0.915403 │
╘═══════════════════════╧═════════════╧════════════╧═══════════╛
╒══════════════════════╤═════════════╤════════════╤═══════════╕
│ func │ prec_mean │ rec_mean │ f1_mean │
╞══════════════════════╪═════════════╪════════════╪═══════════╡
│ magic_html │ 0.955549 │ 0.993721 │ 0.974261 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura │ 0.887413 │ 0.84243 │ 0.864336 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura_fallback │ 0.893388 │ 0.93853 │ 0.915403 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ readability-lxml │ 0.860823 │ 0.885361 │ 0.87292 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ newspaper3k │ 0.527753 │ 0.551633 │ 0.539429 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ goose3 │ 0.543127 │ 0.531672 │ 0.537338 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ justext │ 0.484097 │ 0.334175 │ 0.395402 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ gne │ 0.869631 │ 0.859659 │ 0.864617 │
╘══════════════════════╧═════════════╧════════════╧═══════════╛
""".strip()
)
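For reference, the f1_mean column is consistent with the standard harmonic mean of precision and recall, F1 = 2·P·R / (P + R): for magic_html, 2 × 0.955549 × 0.993721 / (0.955549 + 0.993721) ≈ 0.974261, matching the table.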
102 changes: 90 additions & 12 deletions tests/evaluate_forums.py → benchmark/evaluate_forums.py
@@ -96,8 +96,8 @@ def get_content_text(html: str) -> str:
    global_datas.append(v)


-def run_common_html_extractor(name):
-    from common_html_extractor import GeneralExtractor
+def run_magic_html(name):
+    from magic_html import GeneralExtractor

    datas = deepcopy(global_datas)
    extractor = GeneralExtractor()
@@ -135,11 +135,79 @@ def run_trafilatura_fallback(name):
    evaluate_result(datas)


def run_readability_lxml(name):
    from readability import Document

    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = get_content_text(Document(x["html"]).summary())
    global_info["func"].append(name)
    evaluate_result(datas)


def run_newspaper3k(name):
    from newspaper import fulltext

    datas = deepcopy(global_datas)
    for x in datas:
        try:
            x["extract_content"] = fulltext(x["html"])
        except Exception:
            # newspaper3k raises on pages it cannot parse; score them as empty
            x["extract_content"] = ""
    global_info["func"].append(name)
    evaluate_result(datas)


def run_goose3(name):
    from goose3 import Goose

    g = Goose()
    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = g.extract(raw_html=x["html"]).cleaned_text
    global_info["func"].append(name)
    evaluate_result(datas)


def run_justext(name):
    import justext

    datas = deepcopy(global_datas)
    for x in datas:
        paragraphs = justext.justext(
            x["html"], justext.get_stoplist("German"),
            length_low=50, length_high=200,
            stopwords_low=0.1, stopwords_high=0.2,
            max_link_density=0.2, max_heading_distance=200,
            no_headings=True,
        )
        valid = [
            paragraph.text
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        ]
        x["extract_content"] = " ".join(valid)
    global_info["func"].append(name)
    evaluate_result(datas)


def run_gne(name):
    from gne import GeneralNewsExtractor

    extractor = GeneralNewsExtractor()
    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = extractor.extract(x["html"])["content"]
    global_info["func"].append(name)
    evaluate_result(datas)


# Methods to compare; customize as needed
all_funcs = {
-    "common_html_extractor": run_common_html_extractor,
+    "magic_html": run_magic_html,
    "trafilatura": run_trafilatura,
    "trafilatura_fallback": run_trafilatura_fallback,
    "readability-lxml": run_readability_lxml,
    "newspaper3k": run_newspaper3k,
    "goose3": run_goose3,
    "justext": run_justext,
    "gne": run_gne,
}

for k, v in all_funcs.items():
@@ -150,13 +218,23 @@ def run_trafilatura_fallback(name):
print(tabulate(global_info, headers="keys", tablefmt="fancy_grid"))
print("基准结果")
print('''
╒═══════════════════════╤═════════════╤════════════╤═══════════╕
│ func │ prec_mean │ rec_mean │ f1_mean │
╞═══════════════════════╪═════════════╪════════════╪═══════════╡
│ common_html_extractor │ 0.752323 │ 0.964762 │ 0.845401 │
├───────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura │ 0.711983 │ 0.568848 │ 0.632418 │
├───────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura_fallback │ 0.781724 │ 0.557774 │ 0.651028 │
╘═══════════════════════╧═════════════╧════════════╧═══════════╛
╒══════════════════════╤═════════════╤════════════╤═══════════╕
│ func │ prec_mean │ rec_mean │ f1_mean │
╞══════════════════════╪═════════════╪════════════╪═══════════╡
│ magic_html │ 0.752323 │ 0.964762 │ 0.845401 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura │ 0.711983 │ 0.568848 │ 0.632418 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura_fallback │ 0.781724 │ 0.557774 │ 0.651028 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ readability-lxml │ 0.55441 │ 0.228667 │ 0.323788 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ newspaper3k │ 0.716421 │ 0.19569 │ 0.307411 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ goose3 │ 0.551646 │ 0.147048 │ 0.2322 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ justext │ 0.542019 │ 0.204443 │ 0.296899 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ gne │ 0.846939 │ 0.125834 │ 0.219114 │
╘══════════════════════╧═════════════╧════════════╧═══════════╛
'''.strip())
5 changes: 3 additions & 2 deletions common_html_extractor/__init__.py → magic_html/__init__.py
@@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-

from urllib.parse import urlparse

-from common_html_extractor.extractors import *
+from magic_html.extractors.article_extractor import ArticleExtractor
+from magic_html.extractors.weixin_extractor import WeixinExtractor
+from magic_html.extractors.forum_extractor import ForumExtractor


class GeneralExtractor:
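The `GeneralExtractor` body is collapsed in this diff. Given the three extractor imports above and the `html_type` values shown in the README, a hypothetical dispatch could look like the following; this is assumed routing, not the actual collapsed implementation:

```python
# Hypothetical dispatch sketch; the real GeneralExtractor body is collapsed
# in this diff. The html_type values ("forum", "weixin") are taken from the
# README example; the "article" default is an assumption.
from magic_html.extractors.article_extractor import ArticleExtractor
from magic_html.extractors.forum_extractor import ForumExtractor
from magic_html.extractors.weixin_extractor import WeixinExtractor

class GeneralExtractor:
    def extract(self, html="", base_url="", html_type="article") -> dict:
        if html_type == "forum":
            return ForumExtractor().extract(html=html, base_url=base_url)
        if html_type == "weixin":
            return WeixinExtractor().extract(html=html, base_url=base_url)
        return ArticleExtractor().extract(html=html, base_url=base_url)
```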
File renamed without changes.
1 change: 1 addition & 0 deletions magic_html/extractors/__init__.py
@@ -0,0 +1 @@
# -*- coding:utf-8 -*-
43 changes: 43 additions & 0 deletions magic_html/extractors/article_extractor.py
@@ -0,0 +1,43 @@
# -*- coding:utf-8 -*-

from magic_html.utils import *
from magic_html.extractors.base_extractor import BaseExtractor
from magic_html.extractors.title_extractor import TitleExtractor


class ArticleExtractor(BaseExtractor):
    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="") -> dict:
        html = html.replace("&nbsp;", " ").replace("&#160;", " ")
        tree = load_html(html)
        if tree is None:
            raise ValueError

        title = TitleExtractor().process(tree)

        # base_url
        base_href = tree.xpath("//base/@href")

        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        # Convert tags; includes handling for math tags
        format_tree = self.convert_tags(tree, base_url=base_url)

        # Remove script, style, and similar tags along with their contents
        normal_tree = self.clean_tags(format_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
        }
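A minimal usage sketch for the new extractor, assuming the package from this commit is importable; the input HTML and URL are placeholders:

```python
# Minimal usage sketch; ArticleExtractor.extract() returns the dict
# constructed at the end of the method above.
from magic_html.extractors.article_extractor import ArticleExtractor

extractor = ArticleExtractor()
result = extractor.extract(html="<html><body><p>hello</p></body></html>",
                           base_url="http://example.com/")
print(result["title"], result["html"])
```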
(Remaining changed files not shown.)
