完成开发

chuangfengwang · Feb 22, 2022 · cee6e3c · cee6e3c
commit cee6e3c
Show file tree

Hide file tree

Showing 14 changed files with 1,237 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,43 @@
+HELP.md
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**
+!**/src/test/**
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+
+### VS Code ###
+.vscode/
+
+
+### proj ###
+.mvn/
+logs/
+data/
+hanlp-data/
+
+*.sqlite3
+*.sqlite3-journal
+/conll_output.txt
+/filterRegex.txt
diff --git a/doc/ddl.sql b/doc/ddl.sql
@@ -0,0 +1,69 @@
+-- Sentence table: one row per sentence read from an input file.
+create table if not exists parsed_sent
+(
+    id          integer      not null primary key autoincrement,                     -- primary key
+    file_name   varchar(255) not null default '',                                    -- source file name
+    sent        text         not null default '',                                    -- sentence text
+    file_offset int          not null default 0,                                     -- offset of the sentence's first character within the file
+    line_no     int          not null default 0,                                     -- line number of the sentence within the file
+    parser_name varchar(50)  not null default 'hanlp-NeuralNetworkDependencyParser', -- parser name
+    create_at   datetime     not null default (datetime('now', 'localtime'))         -- creation time (local time)
+);
+
+-- SQLite index names are unique per database, not per table, so the name is
+-- qualified with the table name to avoid colliding with the create_at indexes
+-- declared for the other tables in this file.
+create index if not exists idx_parsed_sent_create_at on parsed_sent (create_at);
+
+-- Parameterized insert; id is auto-assigned and create_at uses its default.
+insert into parsed_sent(file_name, sent, file_offset, line_no, parser_name)
+values (?, ?, ?, ?, ?);
+
+-- Word table: one row per token of a parsed sentence (CoNLL-style fields).
+create table if not exists parsed_word
+(
+    id             integer      not null primary key autoincrement,             -- primary key
+    sent_id        integer      not null default 0,                             -- sentence id
+    word           varchar(255) not null default '',                            -- word
+    word_no        int          not null default 0,                             -- position of the word in the sentence, starting at 1; 0 is the root node of the syntax tree, -1 is a blank node
+    lemma          varchar(255) not null default '',                            -- lemma or stem of the word (or punctuation); for Chinese this equals FORM
+    cpostag        varchar(50)  not null default '',                            -- part of speech of the word (coarse-grained)
+    postag         varchar(50)  not null default '',                            -- part of speech of the word (fine-grained)
+    head_no        int          not null default 0,                             -- position number of this word's head word
+    deprel         varchar(50)  not null default '',                            -- dependency relation between this word and its head word
+    conllword_name varchar(50)  not null default '',                            -- equivalent string
+    create_at      datetime     not null default (datetime('now', 'localtime')) -- creation time (local time)
+);
+
+create index if not exists idx_sent_word_id on parsed_word (sent_id, word_no);
+create index if not exists idx_word on parsed_word (word);
+create index if not exists idx_cpostag on parsed_word (cpostag);
+create index if not exists idx_postag on parsed_word (postag);
+create index if not exists idx_deprel on parsed_word (deprel);
+create index if not exists idx_sent_head_no on parsed_word (sent_id, head_no);
+-- SQLite index names are unique per database, not per table. The generic name
+-- idx_create_at is already taken by the parsed_sent index, so with
+-- "if not exists" this statement used to be skipped silently and the index
+-- was never built. The table-qualified name fixes that.
+create index if not exists idx_parsed_word_create_at on parsed_word (create_at);
+
+-- Parameterized insert; create_at is supplied explicitly by the caller here.
+insert into parsed_word(sent_id, word, word_no, lemma, cpostag, postag, head_no, deprel, conllword_name, create_at)
+values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
+
+-- CFG pair table: one row per dependency pair (left word / right word) of a sentence.
+create table if not exists parsed_cfg_pair
+(
+    id            integer      not null primary key autoincrement,             -- primary key
+    sent_id       integer      not null default 0,                             -- sentence id
+    deprel        varchar(255) not null default '',                            -- dependency relation between the word and its head word
+    left_cpostag  varchar(50)  not null default '',                            -- part of speech of the left word (coarse-grained)
+    right_cpostag varchar(50)  not null default '',                            -- part of speech of the right word (coarse-grained)
+    left_word     varchar(255) not null default '',                            -- left word
+    right_word    varchar(255) not null default '',                            -- right word
+    left_postag   varchar(50)  not null default '',                            -- part of speech of the left word (fine-grained)
+    right_postag  varchar(50)  not null default '',                            -- part of speech of the right word (fine-grained)
+    left_word_no  int          not null default 0,                             -- position number of the left word
+    right_word_no int          not null default 0,                             -- position number of the right word
+    create_at     datetime     not null default (datetime('now', 'localtime')) -- creation time (local time)
+);
+
+create index if not exists idx_sent_id_word_no_lr on parsed_cfg_pair (sent_id, left_word_no, right_word_no);
+create index if not exists idx_sent_id_word_no_rl on parsed_cfg_pair (sent_id, right_word_no, left_word_no);
+create index if not exists idx_deprel_cpostag on parsed_cfg_pair (deprel, left_cpostag, right_cpostag);
+create index if not exists idx_deprel_postag on parsed_cfg_pair (deprel, left_postag, right_postag);
+-- SQLite index names are unique per database, not per table. The generic name
+-- idx_create_at is already used earlier in this file, so with "if not exists"
+-- this statement used to be skipped silently and the index was never built.
+-- The table-qualified name fixes that.
+create index if not exists idx_parsed_cfg_pair_create_at on parsed_cfg_pair (create_at);
+
+-- Parameterized insert; create_at is supplied explicitly by the caller here.
+insert into parsed_cfg_pair(sent_id, deprel, left_cpostag, right_cpostag, left_word, right_word, left_postag, right_postag, left_word_no, right_word_no, create_at)
+values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
diff --git a/doc/reference.md b/doc/reference.md
@@ -0,0 +1,27 @@
+
+hanlp1.x文档
+https://github.com/hankcs/HanLP/tree/1.x
+
+pos代号释义
+http://www.hankcs.com/nlp/part-of-speech-tagging.html#h2-8
+
+短语结构树 vs. 依存句法树
+https://blog.csdn.net/qq_43428310/article/details/107290398
+
+南大 DependencyViewer (不开源)
+http://nlp.nju.edu.cn/tanggc/tools/DependencyViewer.html
+
+web版 coNLL 可视化(go/js/html, 开源)
+https://urd2.let.rug.nl/~kleiweg/conllu/
+https://github.com/rug-compling/conllu-viewer
+
+hanLP 在线演示
+https://hanlp.hankcs.com/?sentence=%E5%BE%90%E5%85%88%E7%94%9F%E8%BF%98%E5%85%B7%E4%BD%93%E5%B8%AE%E5%8A%A9%E4%BB%96%E7%A1%AE%E5%AE%9A%E4%BA%86%E6%8A%8A%E7%94%BB%E9%9B%84%E9%B9%B0%E3%80%81%E6%9D%BE%E9%BC%A0%E5%92%8C%E9%BA%BB%E9%9B%80%E4%BD%9C%E4%B8%BA%E4%B8%BB%E6%94%BB%E7%9B%AE%E6%A0%87%E3%80%82
+
+sqlite demo
+https://m.runoob.com/sqlite/sqlite-java.html?ivk_sa=1024320u
+
+sqlite 类型
+https://www.sqlite.org/datatype3.html
+sqlite 建表
+https://www.sqlite.org/lang_createtable.html
diff --git a/doc/依存关系对照表.md b/doc/依存关系对照表.md
@@ -0,0 +1,22 @@
+http://www.hankcs.com/nlp/parsing/neural-network-based-dependency-parser.html
+
+由于训练的时候使用的是Chinese Dependency Treebank 1.0，所以原始的标签是英文的，在Parser中，被按照下表进行了转换：
+
+
+| Tag | 关系       | Description               | Example                    |
+| --- | ---------- | ------------------------- | -------------------------- |
+| SBV | 主谓关系   | subject-verb              | 我送她一束花 (我 <– 送)   |
+| VOB | 动宾关系   | 直接宾语，verb-object     | 我送她一束花 (送 –> 花)   |
+| IOB | 间宾关系   | 间接宾语，indirect-object | 我送她一束花 (送 –> 她)   |
+| FOB | 前置宾语   | 前置宾语，fronting-object | 他什么书都读 (书 <– 读)   |
+| DBL | 兼语       | double                    | 他请我吃饭 (请 –> 我)     |
+| ATT | 定中关系   | attribute                 | 红苹果 (红 <– 苹果)       |
+| ADV | 状中结构   | adverbial                 | 非常美丽 (非常 <– 美丽)   |
+| CMP | 动补结构   | complement                | 做完了作业 (做 –> 完)     |
+| COO | 并列关系   | coordinate                | 大山和大海 (大山 –> 大海) |
+| POB | 介宾关系   | preposition-object        | 在贸易区内 (在 –> 内)     |
+| LAD | 左附加关系 | left adjunct              | 大山和大海 (和 <– 大海)   |
+| RAD | 右附加关系 | right adjunct             | 孩子们 (孩子 –> 们)       |
+| IS  | 独立结构   | independent structure     | 两个单句在结构上彼此独立   |
+| WP  | 标点符号   | punctuation               | 标点符号                   |
+| HED | 核心关系   | head                      | 指整个句子的核心           |
diff --git a/doc/词性标注对照表.md b/doc/词性标注对照表.md
@@ -0,0 +1,155 @@
+https://www.hankcs.com/nlp/part-of-speech-tagging.html
+
+HanLP使用的HMM词性标注模型训练自2014年人民日报切分语料，随后增加了少量98年人民日报中独有的词语。所以，HanLP词性标注集兼容《ICTPOS3.0汉语词性标记集》，并且兼容《现代汉语语料库加工规范——词语切分与词性标注》
+
+
+HanLP词性标注集
+
+a	形容词	
+ad	副形词	
+ag	形容词性语素	
+al	形容词性惯用语	
+an	名形词	
+b	区别词	
+begin	仅用于始##始	
+bg	区别语素	
+bl	区别词性惯用语	
+c	连词	
+cc	并列连词	
+d	副词	
+dg	辄,俱,复之类的副词	
+dl	连语	
+e	叹词	
+end	仅用于终##终	
+f	方位词	
+g	学术词汇	
+gb	生物相关词汇	
+gbc	生物类别	
+gc	化学相关词汇	
+gg	地理地质相关词汇	
+gi	计算机相关词汇	
+gm	数学相关词汇	
+gp	物理相关词汇	
+h	前缀	
+i	成语	
+j	简称略语	
+k	后缀	
+l	习用语	
+m	数词	
+mg	数语素	
+Mg	甲乙丙丁之类的数词	
+mq	数量词	
+n	名词	
+nb	生物名	
+nba	动物名	
+nbc	动物纲目	
+nbp	植物名	
+nf	食品，比如“薯片”	
+ng	名词性语素	
+nh	医药疾病等健康相关名词	
+nhd	疾病	
+nhm	药品	
+ni	机构相关（不是独立机构名）	
+nic	下属机构	
+nis	机构后缀	
+nit	教育相关机构	
+nl	名词性惯用语	
+nm	物品名	
+nmc	化学品名	
+nn	工作相关名词	
+nnd	职业	
+nnt	职务职称	
+nr	人名	
+nr1	复姓	
+nr2	蒙古姓名	
+nrf	音译人名	
+nrj	日语人名	
+ns	地名	
+nsf	音译地名	
+nt	机构团体名	
+ntc	公司名	
+ntcb	银行	
+ntcf	工厂	
+ntch	酒店宾馆	
+nth	医院	
+nto	政府机构	
+nts	中小学	
+ntu	大学	
+nx	字母专名	
+nz	其他专名	
+o	拟声词	
+p	介词	
+pba	介词“把”	
+pbei	介词“被”	
+q	量词	
+qg	量词语素	
+qt	时量词	
+qv	动量词	
+r	代词	
+rg	代词性语素	
+Rg	古汉语代词性语素	
+rr	人称代词	
+ry	疑问代词	
+rys	处所疑问代词	
+ryt	时间疑问代词	
+ryv	谓词性疑问代词	
+rz	指示代词	
+rzs	处所指示代词	
+rzt	时间指示代词	
+rzv	谓词性指示代词	
+s	处所词	
+t	时间词	
+tg	时间词性语素	
+u	助词	
+ud	助词	
+ude1	的 底	
+ude2	地	
+ude3	得	
+udeng	等 等等 云云	
+udh	的话	
+ug	过	
+uguo	过	
+uj	助词	
+ul	连词	
+ule	了 喽	
+ulian	连 （“连小学生都会”）	
+uls	来讲 来说 而言 说来	
+usuo	所	
+uv	连词	
+uyy	一样 一般 似的 般	
+uz	着	
+uzhe	着	
+uzhi	之	
+v	动词	
+vd	副动词	
+vf	趋向动词	
+vg	动词性语素	
+vi	不及物动词（内动词）	
+vl	动词性惯用语	
+vn	名动词	
+vshi	动词“是”	
+vx	形式动词	
+vyou	动词“有”	
+w	标点符号	
+wb	百分号千分号，全角：％ ‰   半角：%	
+wd	逗号，全角：， 半角：,	
+wf	分号，全角：； 半角： ;	
+wh	单位符号，全角：￥ ＄ ￡  °  ℃  半角：$	
+wj	句号，全角：。	
+wky	右括号，全角：） 〕  ］ ｝ 》  】 〗 〉 半角： ) ] } >	
+wkz	左括号，全角：（ 〔  ［  ｛  《 【  〖 〈   半角：( [ { <	
+wm	冒号，全角：： 半角： :	
+wn	顿号，全角：、	
+wp	破折号，全角：——   －－   ——－   半角：—  —-	
+ws	省略号，全角：……  …	
+wt	叹号，全角：！	
+ww	问号，全角：？	
+wyy	右引号，全角：” ’ 』	
+wyz	左引号，全角：“ ‘ 『	
+x	字符串	
+xu	网址URL	
+xx	非语素字	
+y	语气词(delete yg)	
+yg	语气语素	
+z	状态词	
+zg	状态词
diff --git a/doc/需求描述.md b/doc/需求描述.md
@@ -0,0 +1,63 @@
+## 原始描述
+
+具体需求： -分词、词性标注、词频统计（主要是提取动词周围的名词进行统计）； -短语结构分析，统计CFG规则频次 文件大小txt100k左右，不用重新训练模型啥的，用现成的即可
+
+## 需求拆解
+
+分词,词性标注,依存句法 词频统计 CFG规则
+
+## 输出格式
+
+句子表: 文件,句子,首字母偏移量,解析器 词语表: 句子id,词语,词性,上位词 分叉表: 关系,左词语,右词语,左词id,右词id,关系代号,左词性,右词性
+
+## 运行分析
+
+java -jar target/nlp-parser-1.0-SNAPSHOT-jar-with-dependencies.jar -r filterRegex.txt -c conll_output.txt -d nlpparsed.sqlite3 data/lz-data/shentiyundongxunlian.txt data/lz-data/tushouxunlian.txt data/lz-data/yujia.txt
+
+## 统计脚本
+
+词频统计
+
+```sql
+.mode tabs
+
+.output word_frequency.txt
+-- Frequency of noun/verb collocations: every word (dep) is paired with its
+-- head word (head) from the same sentence; only noun->verb and verb->noun
+-- pairs are kept and counted per relation, POS pair, word pair and file.
+-- Optional breakdown: add src.sent to both the select list and the group by.
+select
+    dep.deprel,
+    dep.cpostag,
+    head.cpostag,
+    dep.word,
+    head.word,
+    src.file_name,
+    count(*) as num
+from parsed_word as dep
+inner join parsed_word as head
+    on dep.sent_id = head.sent_id
+   and dep.head_no = head.word_no
+inner join parsed_sent as src
+    on dep.sent_id = src.id
+where (dep.cpostag = 'n' and head.cpostag = 'v')
+   or (dep.cpostag = 'v' and head.cpostag = 'n')
+group by
+    dep.deprel,
+    dep.cpostag,
+    head.cpostag,
+    dep.word,
+    head.word,
+    src.file_name
+order by num desc
+;
+
+.output cfg_frequency.txt
+-- CFG rule frequency: counts dependency pairs per relation, coarse POS pair
+-- and file. The relation filter keeps subject-verb, verb-object,
+-- indirect-object, fronting-object, complement, double (pivotal) and
+-- preposition-object relations.
+select c.deprel
+     , c.left_cpostag
+     , c.right_cpostag
+     , s.file_name
+     -- s.sent is not part of the grouping key; a bare reference would return
+     -- the sentence of an arbitrary row of the group. min() keeps one example
+     -- sentence per group and makes the choice deterministic.
+     , min(s.sent) as sample_sent
+     , count(1) as num
+from parsed_cfg_pair c
+         join parsed_sent s on c.sent_id = s.id
+where c.deprel in ('主谓关系', '动宾关系', '间宾关系', '前置宾语', '动补结构', '兼语', '介宾关系')
+group by c.deprel, c.left_cpostag, c.right_cpostag, s.file_name
+-- tie-breakers keep the row order stable between runs
+order by num desc, c.deprel, c.left_cpostag, c.right_cpostag, s.file_name
+;
+
+.output stdout
+.mode list
+```