-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit cee6e3c
Showing
14 changed files
with
1,237 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
HELP.md | ||
target/ | ||
!.mvn/wrapper/maven-wrapper.jar | ||
!**/src/main/** | ||
!**/src/test/** | ||
|
||
### STS ### | ||
.apt_generated | ||
.classpath | ||
.factorypath | ||
.project | ||
.settings | ||
.springBeans | ||
.sts4-cache | ||
|
||
### IntelliJ IDEA ### | ||
.idea | ||
*.iws | ||
*.iml | ||
*.ipr | ||
|
||
### NetBeans ### | ||
/nbproject/private/ | ||
/nbbuild/ | ||
/dist/ | ||
/nbdist/ | ||
/.nb-gradle/ | ||
build/ | ||
|
||
### VS Code ### | ||
.vscode/ | ||
|
||
|
||
### proj ### | ||
.mvn/ | ||
logs/ | ||
data/ | ||
hanlp-data/ | ||
|
||
*.sqlite3 | ||
*.sqlite3-journal | ||
/conll_output.txt | ||
/filterRegex.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
-- Sentence table: one row per parsed sentence | ||
create table if not exists parsed_sent | ||
( | ||
id integer not null primary key autoincrement, -- primary key | ||
file_name varchar(255) not null default '', -- file name | ||
sent text not null default '', -- sentence text | ||
file_offset int not null default 0, -- offset within the file of the sentence's first character | ||
line_no int not null default 0, -- line number of the sentence within the file | ||
parser_name varchar(50) not null default 'hanlp-NeuralNetworkDependencyParser', -- parser name | ||
create_at datetime not null default (datetime('now', 'localtime')) -- creation time (local time, per the 'localtime' modifier) | ||
); | ||
|
||
-- Index on parsed_sent.create_at. | ||
-- NOTE: SQLite index names are unique per database, not per table, so the name idx_create_at | ||
-- is owned by this index once it has been created. | ||
create index if not exists idx_create_at on parsed_sent (create_at); | ||
|
||
-- Parameterized insert template for parsed_sent (five bound values, in column-list order). | ||
-- id and create_at are not listed, so id is auto-assigned and create_at takes its column default. | ||
insert into parsed_sent(file_name, sent, file_offset, line_no, parser_name) | ||
values (?, ?, ?, ?, ?); | ||
|
||
-- Word table: one row per word (token) of a parsed sentence | ||
create table if not exists parsed_word | ||
( | ||
id integer not null primary key autoincrement, -- primary key | ||
sent_id integer not null default 0, -- sentence id | ||
word varchar(255) not null default '', -- word | ||
word_no int not null default 0, -- index of the word within the sentence, starting at 1; 0 is the root node of the syntax tree, -1 is a blank node | ||
lemma varchar(255) not null default '', -- lemma or stem of the word (or punctuation mark); for Chinese this column is the same as FORM | ||
cpostag varchar(50) not null default '', -- part of speech of the word (coarse-grained) | ||
postag varchar(50) not null default '', -- part of speech of the word (fine-grained) | ||
head_no int not null default 0, -- index of this word's head word | ||
deprel varchar(50) not null default '', -- dependency relation between this word and its head word | ||
conllword_name varchar(50) not null default '', -- equivalent string | ||
create_at datetime not null default (datetime('now', 'localtime')) -- creation time (local time, per the 'localtime' modifier) | ||
); | ||
|
||
-- Secondary indexes on parsed_word. | ||
-- SQLite index names are unique per database, not per table. An index name that is already | ||
-- used for another table makes "create index if not exists" a silent no-op, so every name | ||
-- below must be distinct from the index names used for the other tables. | ||
create index if not exists idx_sent_word_id on parsed_word (sent_id, word_no); | ||
create index if not exists idx_word on parsed_word (word); | ||
create index if not exists idx_cpostag on parsed_word (cpostag); | ||
create index if not exists idx_postag on parsed_word (postag); | ||
create index if not exists idx_deprel on parsed_word (deprel); | ||
create index if not exists idx_sent_head_no on parsed_word (sent_id, head_no); | ||
-- Table-qualified name: the bare name idx_create_at is already used for parsed_sent. | ||
create index if not exists idx_parsed_word_create_at on parsed_word (create_at); | ||
|
||
-- Parameterized insert template for parsed_word (ten bound values, in column-list order). | ||
-- id is not listed and is auto-assigned; create_at is bound explicitly instead of using its default. | ||
insert into parsed_word(sent_id, word, word_no, lemma, cpostag, postag, head_no, deprel, conllword_name, create_at) | ||
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?); | ||
|
||
-- CFG table: one row per (left word, right word) pair linked by a dependency relation | ||
create table if not exists parsed_cfg_pair | ||
( | ||
id integer not null primary key autoincrement, -- primary key | ||
sent_id integer not null default 0, -- sentence id | ||
deprel varchar(255) not null default '', -- dependency relation between the word and its head word | ||
left_cpostag varchar(50) not null default '', -- part of speech of the left word (coarse-grained) | ||
right_cpostag varchar(50) not null default '', -- part of speech of the right word (coarse-grained) | ||
left_word varchar(255) not null default '', -- left word | ||
right_word varchar(255) not null default '', -- right word | ||
left_postag varchar(50) not null default '', -- part of speech of the left word (fine-grained) | ||
right_postag varchar(50) not null default '', -- part of speech of the right word (fine-grained) | ||
left_word_no int not null default 0, -- index of the left word | ||
right_word_no int not null default 0, -- index of the right word | ||
create_at datetime not null default (datetime('now', 'localtime')) -- creation time (local time, per the 'localtime' modifier) | ||
); | ||
|
||
-- Secondary indexes on parsed_cfg_pair. | ||
-- SQLite index names are unique per database, not per table. An index name that is already | ||
-- used for another table makes "create index if not exists" a silent no-op, so every name | ||
-- below must be distinct from the index names used for the other tables. | ||
create index if not exists idx_sent_id_word_no_lr on parsed_cfg_pair (sent_id, left_word_no, right_word_no); | ||
create index if not exists idx_sent_id_word_no_rl on parsed_cfg_pair (sent_id, right_word_no, left_word_no); | ||
create index if not exists idx_deprel_cpostag on parsed_cfg_pair (deprel, left_cpostag, right_cpostag); | ||
create index if not exists idx_deprel_postag on parsed_cfg_pair (deprel, left_postag, right_postag); | ||
-- Table-qualified name: the bare name idx_create_at is already used for parsed_sent. | ||
create index if not exists idx_parsed_cfg_pair_create_at on parsed_cfg_pair (create_at); | ||
|
||
-- Parameterized insert template for parsed_cfg_pair (eleven bound values, in column-list order). | ||
-- id is not listed and is auto-assigned; create_at is bound explicitly instead of using its default. | ||
insert into parsed_cfg_pair(sent_id, deprel, left_cpostag, right_cpostag, left_word, right_word, left_postag, right_postag, left_word_no, right_word_no, create_at) | ||
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
|
||
hanlp1.x文档 | ||
https://github.com/hankcs/HanLP/tree/1.x | ||
|
||
pos代号释义 | ||
http://www.hankcs.com/nlp/part-of-speech-tagging.html#h2-8 | ||
|
||
短语结构树 vs. 依存句法树 | ||
https://blog.csdn.net/qq_43428310/article/details/107290398 | ||
|
||
南大 DependencyViewer (不开源) | ||
http://nlp.nju.edu.cn/tanggc/tools/DependencyViewer.html | ||
|
||
web版 coNLL 可视化(go/js/html, 开源) | ||
https://urd2.let.rug.nl/~kleiweg/conllu/ | ||
https://github.com/rug-compling/conllu-viewer | ||
|
||
hanLP 在线演示 | ||
https://hanlp.hankcs.com/?sentence=%E5%BE%90%E5%85%88%E7%94%9F%E8%BF%98%E5%85%B7%E4%BD%93%E5%B8%AE%E5%8A%A9%E4%BB%96%E7%A1%AE%E5%AE%9A%E4%BA%86%E6%8A%8A%E7%94%BB%E9%9B%84%E9%B9%B0%E3%80%81%E6%9D%BE%E9%BC%A0%E5%92%8C%E9%BA%BB%E9%9B%80%E4%BD%9C%E4%B8%BA%E4%B8%BB%E6%94%BB%E7%9B%AE%E6%A0%87%E3%80%82 | ||
|
||
sqlite demo | ||
https://m.runoob.com/sqlite/sqlite-java.html?ivk_sa=1024320u | ||
|
||
sqlite 类型 | ||
https://www.sqlite.org/datatype3.html | ||
sqlite 建表 | ||
https://www.sqlite.org/lang_createtable.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
http://www.hankcs.com/nlp/parsing/neural-network-based-dependency-parser.html | ||
|
||
由于训练的时候使用的是Chinese Dependency Treebank 1.0,所以原始的标签是英文的,在Parser中,被按照下表进行了转换: | ||
|
||
|
||
| Tag | 关系 | Description | Example | | ||
| --- | ---------- | ------------------------- | -------------------------- | | ||
| SBV | 主谓关系 | subject-verb | 我送她一束花 (我 <– 送) | | ||
| VOB | 动宾关系 | 直接宾语,verb-object | 我送她一束花 (送 –> 花) | | ||
| IOB | 间宾关系 | 间接宾语,indirect-object | 我送她一束花 (送 –> 她) | | ||
| FOB | 前置宾语 | 前置宾语,fronting-object | 他什么书都读 (书 <– 读) | | ||
| DBL | 兼语 | double | 他请我吃饭 (请 –> 我) | | ||
| ATT | 定中关系 | attribute | 红苹果 (红 <– 苹果) | | ||
| ADV | 状中结构 | adverbial | 非常美丽 (非常 <– 美丽) | | ||
| CMP | 动补结构 | complement | 做完了作业 (做 –> 完) | | ||
| COO | 并列关系 | coordinate | 大山和大海 (大山 –> 大海) | | ||
| POB | 介宾关系 | preposition-object | 在贸易区内 (在 –> 内) | | ||
| LAD | 左附加关系 | left adjunct | 大山和大海 (和 <– 大海) | | ||
| RAD | 右附加关系 | right adjunct | 孩子们 (孩子 –> 们) | | ||
| IS | 独立结构 | independent structure | 两个单句在结构上彼此独立 | | ||
| WP | 标点符号 | punctuation | 标点符号 | | ||
| HED | 核心关系 | head | 指整个句子的核心 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
https://www.hankcs.com/nlp/part-of-speech-tagging.html | ||
|
||
HanLP使用的HMM词性标注模型训练自2014年人民日报切分语料,随后增加了少量98年人民日报中独有的词语。所以,HanLP词性标注集兼容《ICTPOS3.0汉语词性标记集》,并且兼容《现代汉语语料库加工规范——词语切分与词性标注》 | ||
|
||
|
||
HanLP词性标注集 | ||
|
||
a 形容词 | ||
ad 副形词 | ||
ag 形容词性语素 | ||
al 形容词性惯用语 | ||
an 名形词 | ||
b 区别词 | ||
begin 仅用于始##始 | ||
bg 区别语素 | ||
bl 区别词性惯用语 | ||
c 连词 | ||
cc 并列连词 | ||
d 副词 | ||
dg 辄,俱,复之类的副词 | ||
dl 连语 | ||
e 叹词 | ||
end 仅用于终##终 | ||
f 方位词 | ||
g 学术词汇 | ||
gb 生物相关词汇 | ||
gbc 生物类别 | ||
gc 化学相关词汇 | ||
gg 地理地质相关词汇 | ||
gi 计算机相关词汇 | ||
gm 数学相关词汇 | ||
gp 物理相关词汇 | ||
h 前缀 | ||
i 成语 | ||
j 简称略语 | ||
k 后缀 | ||
l 习用语 | ||
m 数词 | ||
mg 数语素 | ||
Mg 甲乙丙丁之类的数词 | ||
mq 数量词 | ||
n 名词 | ||
nb 生物名 | ||
nba 动物名 | ||
nbc 动物纲目 | ||
nbp 植物名 | ||
nf 食品,比如“薯片” | ||
ng 名词性语素 | ||
nh 医药疾病等健康相关名词 | ||
nhd 疾病 | ||
nhm 药品 | ||
ni 机构相关(不是独立机构名) | ||
nic 下属机构 | ||
nis 机构后缀 | ||
nit 教育相关机构 | ||
nl 名词性惯用语 | ||
nm 物品名 | ||
nmc 化学品名 | ||
nn 工作相关名词 | ||
nnd 职业 | ||
nnt 职务职称 | ||
nr 人名 | ||
nr1 复姓 | ||
nr2 蒙古姓名 | ||
nrf 音译人名 | ||
nrj 日语人名 | ||
ns 地名 | ||
nsf 音译地名 | ||
nt 机构团体名 | ||
ntc 公司名 | ||
ntcb 银行 | ||
ntcf 工厂 | ||
ntch 酒店宾馆 | ||
nth 医院 | ||
nto 政府机构 | ||
nts 中小学 | ||
ntu 大学 | ||
nx 字母专名 | ||
nz 其他专名 | ||
o 拟声词 | ||
p 介词 | ||
pba 介词“把” | ||
pbei 介词“被” | ||
q 量词 | ||
qg 量词语素 | ||
qt 时量词 | ||
qv 动量词 | ||
r 代词 | ||
rg 代词性语素 | ||
Rg 古汉语代词性语素 | ||
rr 人称代词 | ||
ry 疑问代词 | ||
rys 处所疑问代词 | ||
ryt 时间疑问代词 | ||
ryv 谓词性疑问代词 | ||
rz 指示代词 | ||
rzs 处所指示代词 | ||
rzt 时间指示代词 | ||
rzv 谓词性指示代词 | ||
s 处所词 | ||
t 时间词 | ||
tg 时间词性语素 | ||
u 助词 | ||
ud 助词 | ||
ude1 的 底 | ||
ude2 地 | ||
ude3 得 | ||
udeng 等 等等 云云 | ||
udh 的话 | ||
ug 过 | ||
uguo 过 | ||
uj 助词 | ||
ul 连词 | ||
ule 了 喽 | ||
ulian 连 (“连小学生都会”) | ||
uls 来讲 来说 而言 说来 | ||
usuo 所 | ||
uv 连词 | ||
uyy 一样 一般 似的 般 | ||
uz 着 | ||
uzhe 着 | ||
uzhi 之 | ||
v 动词 | ||
vd 副动词 | ||
vf 趋向动词 | ||
vg 动词性语素 | ||
vi 不及物动词(内动词) | ||
vl 动词性惯用语 | ||
vn 名动词 | ||
vshi 动词“是” | ||
vx 形式动词 | ||
vyou 动词“有” | ||
w 标点符号 | ||
wb 百分号千分号,全角:% ‰ 半角:% | ||
wd 逗号,全角:, 半角:, | ||
wf 分号,全角:; 半角: ; | ||
wh 单位符号,全角:¥ $ £ ° ℃ 半角:$ | ||
wj 句号,全角:。 | ||
wky 右括号,全角:) 〕 ] } 》 】 〗 〉 半角: ) ] } > | ||
wkz 左括号,全角:( 〔 [ { 《 【 〖 〈 半角:( [ { < | ||
wm 冒号,全角:: 半角: : | ||
wn 顿号,全角:、 | ||
wp 破折号,全角:—— -- ——- 半角:— —- | ||
ws 省略号,全角:…… … | ||
wt 叹号,全角:! | ||
ww 问号,全角:? | ||
wyy 右引号,全角:” ’ 』 | ||
wyz 左引号,全角:“ ‘ 『 | ||
x 字符串 | ||
xu 网址URL | ||
xx 非语素字 | ||
y 语气词(delete yg) | ||
yg 语气语素 | ||
z 状态词 | ||
zg 状态词 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
## 原始描述 | ||
|
||
具体需求: -分词、词性标注、词频统计(主要是提取动词周围的名词进行统计); -短语结构分析,统计CFG规则频次 文件大小txt100k左右,不用重新训练模型啥的,用现成的即可 | ||
|
||
## 需求拆解 | ||
|
||
分词,词性标注,依存句法 词频统计 CFG规则 | ||
|
||
## 输出格式 | ||
|
||
句子表: 文件,句子,首字母偏移量,解析器 词语表: 句子id,词语,词性,上位词 分叉表: 关系,左词语,右词语,左词id,右词id,关系代号,左词性,右词性 | ||
|
||
## 运行分析 | ||
|
||
java -jar target/nlp-parser-1.0-SNAPSHOT-jar-with-dependencies.jar -r filterRegex.txt -c conll_output.txt -d nlpparsed.sqlite3 data/lz-data/shentiyundongxunlian.txt data/lz-data/tushouxunlian.txt data/lz-data/yujia.txt | ||
|
||
## 统计脚本 | ||
|
||
词频统计 | ||
|
||
```sql | ||
.mode tabs | ||
|
||
.output word_frequency.txt | ||
-- Verb collocation frequency: pairs every word (child) with its head word (parent) in the | ||
-- same sentence, keeps the noun->verb and verb->noun pairs, and counts them per file. | ||
select child.deprel | ||
, child.cpostag | ||
, parent.cpostag | ||
, child.word | ||
, parent.word | ||
, src.file_name | ||
-- (to break the counts down per sentence, add src.sent here and to the group by) | ||
, count(*) as num | ||
from parsed_word as child | ||
inner join parsed_word as parent | ||
on parent.sent_id = child.sent_id | ||
and parent.word_no = child.head_no | ||
inner join parsed_sent as src | ||
on src.id = child.sent_id | ||
where (child.cpostag = 'n' and parent.cpostag = 'v') | ||
or (child.cpostag = 'v' and parent.cpostag = 'n') | ||
group by child.deprel, child.cpostag, parent.cpostag, child.word, parent.word, src.file_name | ||
order by num desc | ||
; | ||
|
||
.output cfg_frequency.txt | ||
-- CFG rule frequency: number of occurrences of each (deprel, left_cpostag, right_cpostag) | ||
-- rule per file, restricted to the dependency relations listed in the where clause. | ||
select c.deprel | ||
, c.left_cpostag | ||
, c.right_cpostag | ||
, s.file_name | ||
-- , c.left_word | ||
-- , c.right_word | ||
-- s.sent is not a grouping column; selected bare, SQLite would return the sentence of an | ||
-- arbitrary row of the group. min() yields one deterministic example sentence instead. | ||
, min(s.sent) as sent | ||
, count(1) as num | ||
from parsed_cfg_pair c | ||
join parsed_sent s on c.sent_id = s.id | ||
where c.deprel in ('主谓关系', '动宾关系', '间宾关系', '前置宾语', '动补结构', '兼语', '介宾关系') | ||
group by c.deprel, c.left_cpostag, c.right_cpostag, s.file_name | ||
order by num desc | ||
; | ||
|
||
.output stdout | ||
.mode list | ||
``` |
Oops, something went wrong.