refactor: Refactor project directory
zp committed May 7, 2024
1 parent a562b43 commit e74b8b1
Showing 51 changed files with 535 additions and 342 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -1,3 +1,4 @@
tests/** linguist-detectable=false
+benchmark/** linguist-detectable=false
*.xsl linguist-documentation=true

23 changes: 15 additions & 8 deletions README.md
@@ -1,12 +1,12 @@
-# common-html-extractor - Universal HTML Data Extractor
+# magic-html - Universal HTML Data Extractor

-Welcome to common-html-extractor, a Python library designed to simplify extracting the main content area from HTML.
+Welcome to magic-html, a Python library designed to simplify extracting the main content area from HTML.



## Project Description

-common-html-extractor provides a set of tools that make it easy to extract the main content area from HTML. Whether you are working with complex HTML structures or simple web pages, this library aims to offer a convenient and efficient interface for your HTML extraction needs.
+magic-html provides a set of tools that make it easy to extract the main content area from HTML. Whether you are working with complex HTML structures or simple web pages, this library aims to offer a convenient and efficient interface for your HTML extraction needs.



@@ -22,12 +22,12 @@
## Usage

```python
-from common_html_extractor import GeneralExtractor
+from magic_html import GeneralExtractor

# Initialize the extractor
extractor = GeneralExtractor()

-# http://example.com/
+url = "http://example.com/"
html = """
<!doctype html>
@@ -51,8 +51,15 @@
</html>
"""

-# Extract data from the HTML
-data = extractor.extract(html)
+# Extract data from article-type HTML
+data = extractor.extract(html, base_url=url)

+# Extract data from forum-type HTML
+# data = extractor.extract(html, base_url=url, html_type="forum")

+# Extract data from WeChat-article HTML
+# data = extractor.extract(html, base_url=url, html_type="weixin")

print(data)
```
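For reference, here is a sketch of the dictionary shape `extract()` returns. The keys come from the `ArticleExtractor` added later in this commit; the values below are placeholders, not guaranteed output:

```python
# Illustrative shape of `data` for the example above; keys come from
# ArticleExtractor.extract() later in this commit, values are placeholders.
data = {
    "xp_num": "others",                # which xpath rule set matched
    "drop_list": [],                   # nodes pruned during extraction
    "html": "<body>...</body>",        # extracted main-content HTML
    "title": "...",                    # extracted page title
    "base_url": "http://example.com/",
}
```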

@@ -67,5 +74,5 @@ print(data)
## Acknowledgements

- [trafilatura](https://github.com/adbar/trafilatura)
-- [python-readability](https://github.com/buriy/python-readability)
+- [readability-lxml](https://github.com/buriy/python-readability)

1 change: 1 addition & 0 deletions benchmark/__init__.py
@@ -0,0 +1 @@
# -*- coding:utf-8 -*-
File renamed without changes.
File renamed without changes.
12 changes: 12 additions & 0 deletions benchmark/eval-requirements.txt
@@ -0,0 +1,12 @@
beautifulsoup4
jieba
ltp
numpy
rouge_score
tabulate
trafilatura
readability-lxml
newspaper3k
goose3
justext
gne
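To reproduce the comparison, these benchmark dependencies can be installed with `pip install -r benchmark/eval-requirements.txt`.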
102 changes: 90 additions & 12 deletions tests/evaluate_articles.py → benchmark/evaluate_articles.py
@@ -96,8 +96,8 @@ def get_content_text(html: str) -> str:
    global_datas.append(v)


-def run_common_html_extractor(name):
-    from common_html_extractor import GeneralExtractor
+def run_magic_html(name):
+    from magic_html import GeneralExtractor

    datas = deepcopy(global_datas)
    extractor = GeneralExtractor()
@@ -133,11 +133,79 @@ def run_trafilatura_fallback(name):
    evaluate_result(datas)


def run_readability_lxml(name):
    from readability import Document

    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = get_content_text(Document(x["html"]).summary())
    global_info["func"].append(name)
    evaluate_result(datas)


def run_newspaper3k(name):
    from newspaper import fulltext

    datas = deepcopy(global_datas)
    for x in datas:
        try:
            x["extract_content"] = fulltext(x["html"])
        except Exception:
            # newspaper3k raises on pages it cannot parse; score them as empty
            x["extract_content"] = ""
    global_info["func"].append(name)
    evaluate_result(datas)


def run_goose3(name):
    from goose3 import Goose

    g = Goose()
    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = g.extract(raw_html=x["html"]).cleaned_text
    global_info["func"].append(name)
    evaluate_result(datas)


def run_justext(name):
    import justext

    datas = deepcopy(global_datas)
    for x in datas:
        paragraphs = justext.justext(
            x["html"], justext.get_stoplist("German"),
            length_low=50, length_high=200,
            stopwords_low=0.1, stopwords_high=0.2,
            max_link_density=0.2, max_heading_distance=200,
            no_headings=True,
        )
        valid = [
            paragraph.text
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        ]
        x["extract_content"] = " ".join(valid)
    global_info["func"].append(name)
    evaluate_result(datas)


def run_gne(name):
    from gne import GeneralNewsExtractor

    extractor = GeneralNewsExtractor()
    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = extractor.extract(x["html"])["content"]
    global_info["func"].append(name)
    evaluate_result(datas)


# Methods to compare; customize as needed
all_funcs = {
-    "common_html_extractor": run_common_html_extractor,
+    "magic_html": run_magic_html,
    "trafilatura": run_trafilatura,
    "trafilatura_fallback": run_trafilatura_fallback,
    "readability-lxml": run_readability_lxml,
    "newspaper3k": run_newspaper3k,
    "goose3": run_goose3,
    "justext": run_justext,
    "gne": run_gne,
}

for k, v in all_funcs.items():
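The loop body is collapsed in this diff. Since every `run_*` function takes a display name and registers itself in `global_info`, a plausible driver simply calls each function with its key; the sketch below is an assumption, not the collapsed code:

```python
# Hypothetical driver sketch; the actual loop body is collapsed above.
# Each benchmark function receives its display name and appends its
# scores to the shared global_info table.
for k, v in all_funcs.items():
    v(k)
```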
@@ -149,14 +217,24 @@ def run_trafilatura_fallback(name):
print("Benchmark results")
print(
"""
╒═══════════════════════╤═════════════╤════════════╤═══════════╕
│ func │ prec_mean │ rec_mean │ f1_mean │
╞═══════════════════════╪═════════════╪════════════╪═══════════╡
│ common_html_extractor │ 0.955549 │ 0.993721 │ 0.974261 │
├───────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura │ 0.887413 │ 0.84243 │ 0.864336 │
├───────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura_fallback │ 0.893388 │ 0.93853 │ 0.915403 │
╘═══════════════════════╧═════════════╧════════════╧═══════════╛
╒══════════════════════╤═════════════╤════════════╤═══════════╕
│ func │ prec_mean │ rec_mean │ f1_mean │
╞══════════════════════╪═════════════╪════════════╪═══════════╡
│ magic_html │ 0.955549 │ 0.993721 │ 0.974261 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura │ 0.887413 │ 0.84243 │ 0.864336 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura_fallback │ 0.893388 │ 0.93853 │ 0.915403 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ readability-lxml │ 0.860823 │ 0.885361 │ 0.87292 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ newspaper3k │ 0.527753 │ 0.551633 │ 0.539429 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ goose3 │ 0.543127 │ 0.531672 │ 0.537338 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ justext │ 0.484097 │ 0.334175 │ 0.395402 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ gne │ 0.869631 │ 0.859659 │ 0.864617 │
╘══════════════════════╧═════════════╧════════════╧═══════════╛
""".strip()
)
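For reference, the f1_mean column is consistent with the standard harmonic mean of precision and recall, F1 = 2·P·R / (P + R): for magic_html, 2 × 0.955549 × 0.993721 / (0.955549 + 0.993721) ≈ 0.974261, matching the table.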
102 changes: 90 additions & 12 deletions tests/evaluate_forums.py → benchmark/evaluate_forums.py
@@ -96,8 +96,8 @@ def get_content_text(html: str) -> str:
    global_datas.append(v)


-def run_common_html_extractor(name):
-    from common_html_extractor import GeneralExtractor
+def run_magic_html(name):
+    from magic_html import GeneralExtractor

    datas = deepcopy(global_datas)
    extractor = GeneralExtractor()
@@ -135,11 +135,79 @@ def run_trafilatura_fallback(name):
    evaluate_result(datas)


def run_readability_lxml(name):
    from readability import Document

    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = get_content_text(Document(x["html"]).summary())
    global_info["func"].append(name)
    evaluate_result(datas)


def run_newspaper3k(name):
    from newspaper import fulltext

    datas = deepcopy(global_datas)
    for x in datas:
        try:
            x["extract_content"] = fulltext(x["html"])
        except Exception:
            # newspaper3k raises on pages it cannot parse; score them as empty
            x["extract_content"] = ""
    global_info["func"].append(name)
    evaluate_result(datas)


def run_goose3(name):
    from goose3 import Goose

    g = Goose()
    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = g.extract(raw_html=x["html"]).cleaned_text
    global_info["func"].append(name)
    evaluate_result(datas)


def run_justext(name):
    import justext

    datas = deepcopy(global_datas)
    for x in datas:
        paragraphs = justext.justext(
            x["html"], justext.get_stoplist("German"),
            length_low=50, length_high=200,
            stopwords_low=0.1, stopwords_high=0.2,
            max_link_density=0.2, max_heading_distance=200,
            no_headings=True,
        )
        valid = [
            paragraph.text
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        ]
        x["extract_content"] = " ".join(valid)
    global_info["func"].append(name)
    evaluate_result(datas)


def run_gne(name):
    from gne import GeneralNewsExtractor

    extractor = GeneralNewsExtractor()
    datas = deepcopy(global_datas)
    for x in datas:
        x["extract_content"] = extractor.extract(x["html"])["content"]
    global_info["func"].append(name)
    evaluate_result(datas)


# Methods to compare; customize as needed
all_funcs = {
-    "common_html_extractor": run_common_html_extractor,
+    "magic_html": run_magic_html,
    "trafilatura": run_trafilatura,
    "trafilatura_fallback": run_trafilatura_fallback,
    "readability-lxml": run_readability_lxml,
    "newspaper3k": run_newspaper3k,
    "goose3": run_goose3,
    "justext": run_justext,
    "gne": run_gne,
}

for k, v in all_funcs.items():
@@ -150,13 +218,23 @@ def run_trafilatura_fallback(name):
print(tabulate(global_info, headers="keys", tablefmt="fancy_grid"))
print("基准结果")
print('''
╒═══════════════════════╤═════════════╤════════════╤═══════════╕
│ func │ prec_mean │ rec_mean │ f1_mean │
╞═══════════════════════╪═════════════╪════════════╪═══════════╡
│ common_html_extractor │ 0.752323 │ 0.964762 │ 0.845401 │
├───────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura │ 0.711983 │ 0.568848 │ 0.632418 │
├───────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura_fallback │ 0.781724 │ 0.557774 │ 0.651028 │
╘═══════════════════════╧═════════════╧════════════╧═══════════╛
╒══════════════════════╤═════════════╤════════════╤═══════════╕
│ func │ prec_mean │ rec_mean │ f1_mean │
╞══════════════════════╪═════════════╪════════════╪═══════════╡
│ magic_html │ 0.752323 │ 0.964762 │ 0.845401 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura │ 0.711983 │ 0.568848 │ 0.632418 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ trafilatura_fallback │ 0.781724 │ 0.557774 │ 0.651028 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ readability-lxml │ 0.55441 │ 0.228667 │ 0.323788 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ newspaper3k │ 0.716421 │ 0.19569 │ 0.307411 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ goose3 │ 0.551646 │ 0.147048 │ 0.2322 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ justext │ 0.542019 │ 0.204443 │ 0.296899 │
├──────────────────────┼─────────────┼────────────┼───────────┤
│ gne │ 0.846939 │ 0.125834 │ 0.219114 │
╘══════════════════════╧═════════════╧════════════╧═══════════╛
'''.strip())
5 changes: 3 additions & 2 deletions common_html_extractor/__init__.py → magic_html/__init__.py
@@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-

from urllib.parse import urlparse

-from common_html_extractor.extractors import *
+from magic_html.extractors.article_extractor import ArticleExtractor
+from magic_html.extractors.weixin_extractor import WeixinExtractor
+from magic_html.extractors.forum_extractor import ForumExtractor


class GeneralExtractor:
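The `GeneralExtractor` body is collapsed in this diff. Given the three extractor imports above and the `html_type` values shown in the README, a hypothetical dispatch could look like the following; this is assumed routing, not the actual collapsed implementation:

```python
# Hypothetical dispatch sketch; the real GeneralExtractor body is collapsed
# in this diff. The html_type values ("forum", "weixin") are taken from the
# README example; the "article" default is an assumption.
from magic_html.extractors.article_extractor import ArticleExtractor
from magic_html.extractors.forum_extractor import ForumExtractor
from magic_html.extractors.weixin_extractor import WeixinExtractor

class GeneralExtractor:
    def extract(self, html="", base_url="", html_type="article") -> dict:
        if html_type == "forum":
            return ForumExtractor().extract(html=html, base_url=base_url)
        if html_type == "weixin":
            return WeixinExtractor().extract(html=html, base_url=base_url)
        return ArticleExtractor().extract(html=html, base_url=base_url)
```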
File renamed without changes.
1 change: 1 addition & 0 deletions magic_html/extractors/__init__.py
@@ -0,0 +1 @@
# -*- coding:utf-8 -*-
43 changes: 43 additions & 0 deletions magic_html/extractors/article_extractor.py
@@ -0,0 +1,43 @@
# -*- coding:utf-8 -*-

from magic_html.utils import *
from magic_html.extractors.base_extractor import BaseExtractor
from magic_html.extractors.title_extractor import TitleExtractor


class ArticleExtractor(BaseExtractor):
    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="") -> dict:
        html = html.replace("&nbsp;", " ").replace("&#160;", " ")
        tree = load_html(html)
        if tree is None:
            raise ValueError

        title = TitleExtractor().process(tree)

        # base_url
        base_href = tree.xpath("//base/@href")

        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        # Convert tags; includes handling for math tags
        format_tree = self.convert_tags(tree, base_url=base_url)

        # Remove script, style, and similar tags along with their contents
        normal_tree = self.clean_tags(format_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
        }
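A minimal usage sketch for the new extractor, assuming the package from this commit is importable; the input HTML and URL are placeholders:

```python
# Minimal usage sketch; ArticleExtractor.extract() returns the dict
# constructed at the end of the method above.
from magic_html.extractors.article_extractor import ArticleExtractor

extractor = ArticleExtractor()
result = extractor.extract(html="<html><body><p>hello</p></body></html>",
                           base_url="http://example.com/")
print(result["title"], result["html"])
```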
(Remaining changed files not shown.)
