diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..151e1ec
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,43 @@
+HELP.md
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**
+!**/src/test/**
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+
+### VS Code ###
+.vscode/
+
+
+### proj ###
+.mvn/
+logs/
+data/
+hanlp-data/
+
+*.sqlite3
+*.sqlite3-journal
+/conll_output.txt
+/filterRegex.txt
diff --git a/doc/ddl.sql b/doc/ddl.sql
new file mode 100644
index 0000000..21e1d1f
--- /dev/null
+++ b/doc/ddl.sql
@@ -0,0 +1,69 @@
+-- sentence table
+create table if not exists parsed_sent
+(
+    id          integer      not null primary key autoincrement,                     -- primary key
+    file_name   varchar(255) not null default '',                                    -- source file name
+    sent        text         not null default '',                                    -- sentence text
+    file_offset int          not null default 0,                                     -- offset of the sentence's first character in the file
+    line_no     int          not null default 0,                                     -- line number of the sentence in the file
+    parser_name varchar(50)  not null default 'hanlp-NeuralNetworkDependencyParser', -- parser name
+    create_at   datetime     not null default (datetime('now', 'localtime'))         -- creation time
+);
+
+-- note: SQLite index names share a single namespace per database, so each table gets a distinct name
+create index if not exists idx_sent_create_at on parsed_sent (create_at);
+
+-- parameterized insert template (bound and executed from Java; not runnable as-is)
+insert into parsed_sent(file_name, sent, file_offset, line_no, parser_name)
+values (?, ?, ?, ?, ?);
+
+-- word table
+create table if not exists parsed_word
+(
+    id             integer      not null primary key autoincrement,             -- primary key
+    sent_id        integer      not null default 0,                             -- sentence id
+    word           varchar(255) not null default '',                            -- word
+    word_no        int          not null default 0,                             -- word index within the sentence, starting at 1; 0 is the root of the parse tree, -1 is a blank node
+    lemma          varchar(255) not null default '',                            -- lemma or stem of the word (or punctuation); for Chinese this equals FORM
+    cpostag        varchar(50)  not null default '',                            -- coarse-grained part of speech
+    postag         varchar(50)  not null default '',                            -- fine-grained part of speech
+    head_no        int          not null default 0,                             -- index of this word's head word
+    deprel         varchar(50)  not null default '',                            -- dependency relation between the word and its head
+    conllword_name varchar(50)  not null default '',                            -- equivalent string
+    create_at      datetime     not null default (datetime('now', 'localtime')) -- creation time
+);
+
+create index if not exists idx_sent_word_id on parsed_word (sent_id, word_no);
+create index if not exists idx_word on parsed_word (word);
+create index if not exists idx_cpostag on parsed_word (cpostag);
+create index if not exists idx_postag on parsed_word (postag);
+create index if not exists idx_deprel on parsed_word (deprel);
+create index if not exists idx_sent_head_no on parsed_word (sent_id, head_no);
+create index if not exists idx_word_create_at on parsed_word (create_at);
+
+-- parameterized insert template (bound and executed from Java; not runnable as-is)
+insert into parsed_word(sent_id, word, word_no, lemma, cpostag, postag, head_no, deprel, conllword_name, create_at)
+values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
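+
+-- Illustrative query, not part of the schema: reconstruct the dependency arcs of one
+-- sentence by self-joining parsed_word on head_no = word_no. The sent_id value 1 is
+-- just an assumed example.
+select w.word, w.deprel, h.word as head_word
+from parsed_word w
+         left join parsed_word h on h.sent_id = w.sent_id and h.word_no = w.head_no
+where w.sent_id = 1;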
+
+-- CFG pair table
+create table if not exists parsed_cfg_pair
+(
+    id            integer      not null primary key autoincrement,             -- primary key
+    sent_id       integer      not null default 0,                             -- sentence id
+    deprel        varchar(255) not null default '',                            -- dependency relation between the word and its head
+    left_cpostag  varchar(50)  not null default '',                            -- coarse-grained POS of the left word
+    right_cpostag varchar(50)  not null default '',                            -- coarse-grained POS of the right word
+    left_word     varchar(255) not null default '',                            -- left word
+    right_word    varchar(255) not null default '',                            -- right word
+    left_postag   varchar(50)  not null default '',                            -- fine-grained POS of the left word
+    right_postag  varchar(50)  not null default '',                            -- fine-grained POS of the right word
+    left_word_no  int          not null default 0,                             -- index of the left word
+    right_word_no int          not null default 0,                             -- index of the right word
+    create_at     datetime     not null default (datetime('now', 'localtime')) -- creation time
+);
+
+create index if not exists idx_sent_id_word_no_lr on parsed_cfg_pair (sent_id, left_word_no, right_word_no);
+create index if not exists idx_sent_id_word_no_rl on parsed_cfg_pair (sent_id, right_word_no, left_word_no);
+create index if not exists idx_deprel_cpostag on parsed_cfg_pair (deprel, left_cpostag, right_cpostag);
+create index if not exists idx_deprel_postag on parsed_cfg_pair (deprel, left_postag, right_postag);
+create index if not exists idx_cfg_create_at on parsed_cfg_pair (create_at);
+
+-- parameterized insert template (bound and executed from Java; not runnable as-is)
+insert into parsed_cfg_pair(sent_id, deprel, left_cpostag, right_cpostag, left_word, right_word, left_postag, right_postag, left_word_no, right_word_no, create_at)
+values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
\ No newline at end of file
diff --git a/doc/reference.md b/doc/reference.md
new file mode 100644
index 0000000..c24ee28
--- /dev/null
+++ b/doc/reference.md
@@ -0,0 +1,27 @@
+
+HanLP 1.x documentation
+https://github.com/hankcs/HanLP/tree/1.x
+
+POS tag meanings
+http://www.hankcs.com/nlp/part-of-speech-tagging.html#h2-8
+
+Phrase-structure trees vs. dependency trees
+https://blog.csdn.net/qq_43428310/article/details/107290398
+
+NJU DependencyViewer (closed source)
+http://nlp.nju.edu.cn/tanggc/tools/DependencyViewer.html
+
+Web-based CoNLL visualization (Go/JS/HTML, open source)
+https://urd2.let.rug.nl/~kleiweg/conllu/
+https://github.com/rug-compling/conllu-viewer
+
+HanLP online demo
+https://hanlp.hankcs.com/?sentence=%E5%BE%90%E5%85%88%E7%94%9F%E8%BF%98%E5%85%B7%E4%BD%93%E5%B8%AE%E5%8A%A9%E4%BB%96%E7%A1%AE%E5%AE%9A%E4%BA%86%E6%8A%8A%E7%94%BB%E9%9B%84%E9%B9%B0%E3%80%81%E6%9D%BE%E9%BC%A0%E5%92%8C%E9%BA%BB%E9%9B%80%E4%BD%9C%E4%B8%BA%E4%B8%BB%E6%94%BB%E7%9B%AE%E6%A0%87%E3%80%82
+
+SQLite Java demo
+https://m.runoob.com/sqlite/sqlite-java.html?ivk_sa=1024320u
+
+SQLite data types
+https://www.sqlite.org/datatype3.html
+SQLite CREATE TABLE
+https://www.sqlite.org/lang_createtable.html
diff --git "a/doc/\344\276\235\345\255\230\345\205\263\347\263\273\345\257\271\347\205\247\350\241\250.md" "b/doc/\344\276\235\345\255\230\345\205\263\347\263\273\345\257\271\347\205\247\350\241\250.md"
new file mode 100644
index 0000000..3119e91
--- /dev/null
+++ "b/doc/\344\276\235\345\255\230\345\205\263\347\263\273\345\257\271\347\205\247\350\241\250.md"
@@ -0,0 +1,22 @@
+http://www.hankcs.com/nlp/parsing/neural-network-based-dependency-parser.html
+
+Because the model was trained on Chinese Dependency Treebank 1.0, the raw labels are English; inside the Parser they are converted according to the table below (the Chinese labels in the second column are the literal `deprel` values stored in the database):
+
+
+| Tag | Relation (stored label) | Description | Example |
+| --- | ----------------------- | ----------- | ------- |
+| SBV | 主谓关系 | subject-verb | 我送她一束花 (我 <– 送) |
+| VOB | 动宾关系 | verb-object (direct object) | 我送她一束花 (送 –> 花) |
+| IOB | 间宾关系 | indirect-object | 我送她一束花 (送 –> 她) |
+| FOB | 前置宾语 | fronting-object | 他什么书都读 (书 <– 读) |
+| DBL | 兼语 | double | 他请我吃饭 (请 –> 我) |
+| ATT | 定中关系 | attribute | 红苹果 (红 <– 苹果) |
+| ADV | 状中结构 | adverbial | 非常美丽 (非常 <– 美丽) |
+| CMP | 动补结构 | complement | 做完了作业 (做 –> 完) |
+| COO | 并列关系 | coordinate | 大山和大海 (大山 –> 大海) |
+| POB | 介宾关系 | preposition-object | 在贸易区内 (在 –> 内) |
+| LAD | 左附加关系 | left adjunct | 大山和大海 (和 <– 大海) |
+| RAD | 右附加关系 | right adjunct | 孩子们 (孩子 –> 们) |
+| IS | 独立结构 | independent structure | two clauses structurally independent of each other |
+| WP | 标点符号 | punctuation | punctuation |
+| HED | 核心关系 | head | the core of the whole sentence |
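+
+These labels are exactly what `CoNLLWord.DEPREL` contains after parsing. A minimal sketch against the HanLP 1.x portable API (the sentence is an arbitrary example):
+
+```java
+import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence;
+import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord;
+import com.hankcs.hanlp.dependency.nnparser.NeuralNetworkDependencyParser;
+
+public class DeprelDemo {
+    public static void main(String[] args) {
+        CoNLLSentence sentence = new NeuralNetworkDependencyParser().parse("我送她一束花");
+        for (CoNLLWord word : sentence) {
+            // each word points at its head; the sentence head points at the virtual ROOT node
+            System.out.printf("%s --(%s)--> %s%n", word.LEMMA, word.DEPREL, word.HEAD.LEMMA);
+        }
+    }
+}
+```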
diff --git "a/doc/\350\257\215\346\200\247\346\240\207\346\263\250\345\257\271\347\205\247\350\241\250.md" "b/doc/\350\257\215\346\200\247\346\240\207\346\263\250\345\257\271\347\205\247\350\241\250.md"
new file mode 100644
index 0000000..cbf8c85
--- /dev/null
+++ "b/doc/\350\257\215\346\200\247\346\240\207\346\263\250\345\257\271\347\205\247\350\241\250.md"
@@ -0,0 +1,155 @@
+https://www.hankcs.com/nlp/part-of-speech-tagging.html
+
+HanLP's HMM POS-tagging model was trained on the 2014 People's Daily segmented corpus, later extended with a small number of words unique to the 1998 People's Daily corpus. The HanLP tag set is therefore compatible with the ICTPOS3.0 Chinese POS tag set, as well as with the corpus-processing specification 《现代汉语语料库加工规范——词语切分与词性标注》.
+
+
+HanLP POS tag set
+
+a adjective
+ad adjective used adverbially
+ag adjectival morpheme
+al adjectival set phrase
+an adjective with nominal function
+b distinguishing (non-predicate) word
+begin used only for the 始##始 start marker
+bg distinguishing morpheme
+bl distinguishing-word set phrase
+c conjunction
+cc coordinating conjunction
+d adverb
+dg adverbial morpheme such as 辄, 俱, 复
+dl adverbial set phrase
+e interjection
+end used only for the 终##终 end marker
+f locative/direction word
+g academic term
+gb biology term
+gbc biological category
+gc chemistry term
+gg geography/geology term
+gi computing term
+gm mathematics term
+gp physics term
+h prefix
+i idiom (chengyu)
+j abbreviation
+k suffix
+l common expression
+m numeral
+mg numeral morpheme
+Mg numerals such as 甲乙丙丁
+mq numeral-classifier compound
+n noun
+nb biological name
+nba animal name
+nbc animal order/family
+nbp plant name
+nf food, e.g. 薯片
+ng nominal morpheme
+nh health-related noun (medicine, disease, etc.)
+nhd disease
+nhm drug/medicine
+ni organization-related (not a standalone organization name)
+nic subordinate organization
+nis organization suffix
+nit educational institution
+nl nominal set phrase
+nm object/product name
+nmc chemical name
+nn work-related noun
+nnd occupation
+nnt job title
+nr person name
+nr1 compound (two-character) surname
+nr2 Mongolian name
+nrf transliterated person name
+nrj Japanese person name
+ns place name
+nsf transliterated place name
+nt organization name
+ntc company name
+ntcb bank
+ntcf factory
+ntch hotel
+nth hospital
+nto government agency
+nts primary/secondary school
+ntu university
+nx alphabetic proper noun
+nz other proper noun
+o onomatopoeia
+p preposition
+pba the preposition 把
+pbei the preposition 被
+q classifier (measure word)
+qg classifier morpheme
+qt time classifier
+qv verbal classifier
+r pronoun
+rg pronominal morpheme
+Rg classical-Chinese pronominal morpheme
+rr personal pronoun
+ry interrogative pronoun
+rys locative interrogative pronoun
+ryt temporal interrogative pronoun
+ryv predicative interrogative pronoun
+rz demonstrative pronoun
+rzs locative demonstrative pronoun
+rzt temporal demonstrative pronoun
+rzv predicative demonstrative pronoun
+s place word
+t time word
+tg time-word morpheme
+u particle
+ud particle
+ude1 的 底
+ude2 地
+ude3 得
+udeng 等 等等 云云
+udh 的话
+ug 过
+uguo 过
+uj particle
+ul conjunction-like particle
+ule 了 喽
+ulian 连 (as in 连小学生都会)
+uls 来讲 来说 而言 说来
+usuo 所
+uv conjunction-like particle
+uyy 一样 一般 似的 般
+uz 着
+uzhe 着
+uzhi 之
+v verb
+vd verb used adverbially
+vf directional verb
+vg verbal morpheme
+vi intransitive verb
+vl verbal set phrase
+vn verb with nominal function
+vshi the verb 是
+vx formal (dummy) verb
+vyou the verb 有
+w punctuation
+wb percent/permille sign, full-width: % ‰ half-width: %
+wd comma, full-width: , half-width: ,
+wf semicolon, full-width: ; half-width: ;
+wh unit symbol, full-width: ¥ $ £ ° ℃ half-width: $
+wj period, full-width: 。
+wky right bracket, full-width: ) 〕 ] } 》 】 〗 〉 half-width: ) ] { >
+wkz left bracket, full-width: ( 〔 [ { 《 【 〖 〈 half-width: ( [ { <
+wm colon, full-width: : half-width: :
+wn enumeration comma, full-width: 、
+wp dash, full-width: —— -- ——- half-width: — —-
+ws ellipsis, full-width: …… …
+wt exclamation mark, full-width: !
+ww question mark, full-width: ?
+wyy closing quotation mark, full-width: ” ’ 』
+wyz opening quotation mark, full-width: “ ‘ 『
+x string
+xu URL
+xx non-morpheme character
+y modal particle (delete yg)
+yg modal morpheme
+z stative word
+zg stative word
\ No newline at end of file
diff --git "a/doc/\351\234\200\346\261\202\346\217\217\350\277\260.md" "b/doc/\351\234\200\346\261\202\346\217\217\350\277\260.md"
new file mode 100644
index 0000000..318fb22
--- /dev/null
+++ "b/doc/\351\234\200\346\261\202\346\217\217\350\277\260.md"
@@ -0,0 +1,63 @@
+## Original request
+
+Specific requirements:
+- word segmentation, POS tagging, word-frequency statistics (mainly extracting and counting the nouns around verbs);
+- phrase-structure analysis, counting CFG rule frequencies.
+
+Input is plain text of roughly 100 KB; no model retraining needed, off-the-shelf models are fine.
+
+## Requirement breakdown
+
+- word segmentation, POS tagging, dependency parsing
+- word-frequency statistics
+- CFG rules
+
+## Output format
+
+- sentence table: file, sentence, offset of first character, parser
+- word table: sentence id, word, POS, head word
+- pair table: relation, left word, right word, left word id, right word id, relation code, left POS, right POS
+
+## Running the analysis
+
+java -jar target/nlp-parser-1.0-SNAPSHOT-jar-with-dependencies.jar -r filterRegex.txt -c conll_output.txt -d nlpparsed.sqlite3 data/lz-data/shentiyundongxunlian.txt data/lz-data/tushouxunlian.txt data/lz-data/yujia.txt
+
+## Statistics script
+
+Word-frequency statistics
+
+```sql
+.mode tabs
+
+.output word_frequency.txt
+-- frequency of verb collocations
+select w1.deprel
+     , w1.cpostag
+     , w2.cpostag
+     , w1.word
+     , w2.word
+     , s.file_name
+--     , s.sent
+     , count(1) num
+from parsed_word w1
+         join parsed_word w2 on w1.sent_id = w2.sent_id and w1.head_no = w2.word_no
+         join parsed_sent s on w1.sent_id = s.id
+where (w1.cpostag = 'n' and w2.cpostag = 'v')
+   or (w1.cpostag = 'v' and w2.cpostag = 'n')
+group by w1.deprel, w1.cpostag, w2.cpostag, w1.word, w2.word, s.file_name
+order by num desc
+;
+
+.output cfg_frequency.txt
+-- CFG rule frequencies
+select c.deprel
+     , c.left_cpostag
+     , c.right_cpostag
+     , s.file_name
+--     , c.left_word
+--     , c.right_word
+     , s.sent -- not in the group by: SQLite then returns one sample sentence per group
+     , count(1) as num
+from parsed_cfg_pair c
+         join parsed_sent s on c.sent_id = s.id
+where c.deprel in ('主谓关系', '动宾关系', '间宾关系', '前置宾语', '动补结构', '兼语', '介宾关系')
+group by c.deprel, c.left_cpostag, c.right_cpostag, s.file_name
+order by num desc
+;
+
+.output stdout
+.mode list
+```
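+
+The script above uses sqlite3 shell dot-commands (`.mode`, `.output`), so assuming it is saved as `stats.sql`, it can be run with `sqlite3 nlpparsed.sqlite3 < stats.sql`.
+
+The `-r filterRegex.txt` option expects one Java regex per line (blank lines are skipped). The patterns below are purely illustrative assumptions, not the project's actual filter file:
+
+```
+^\s*[0-9]+\s*$
+[□■◆●]+
+```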
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..6796973
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>org.example</groupId>
+    <artifactId>nlp-parser</artifactId>
+    <version>1.0-SNAPSHOT</version>
+
+    <properties>
+        <maven.compiler.source>8</maven.compiler.source>
+        <maven.compiler.target>8</maven.compiler.target>
+    </properties>
+
+    <dependencies>
+        <!-- HanLP portable: segmentation, POS tagging, dependency parsing -->
+        <dependency>
+            <groupId>com.hankcs</groupId>
+            <artifactId>hanlp</artifactId>
+            <version>portable-1.8.3</version>
+        </dependency>
+
+        <!-- SQLite JDBC driver -->
+        <dependency>
+            <groupId>org.xerial</groupId>
+            <artifactId>sqlite-jdbc</artifactId>
+            <version>3.36.0.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>1.18.22</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>com.beust</groupId>
+            <artifactId>jcommander</artifactId>
+            <version>1.82</version>
+        </dependency>
+
+        <!-- command-line option parsing -->
+        <dependency>
+            <groupId>commons-cli</groupId>
+            <artifactId>commons-cli</artifactId>
+            <version>1.5.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.google.guava</groupId>
+            <artifactId>guava</artifactId>
+            <version>22.0</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <!-- build an executable jar-with-dependencies -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <version>3.3.0</version>
+                <configuration>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                    <archive>
+                        <manifest>
+                            <mainClass>com.ligongku.nlpparser.DependencyParser</mainClass>
+                        </manifest>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>make-assembly</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
\ No newline at end of file
diff --git a/src/main/java/com/ligongku/nlpparser/DependencyParser.java b/src/main/java/com/ligongku/nlpparser/DependencyParser.java
new file mode 100644
index 0000000..122a5db
--- /dev/null
+++ b/src/main/java/com/ligongku/nlpparser/DependencyParser.java
@@ -0,0 +1,275 @@
+package com.ligongku.nlpparser;
+
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Strings;
+import com.google.common.collect.Lists;
+import com.google.common.io.CharSink;
+import com.google.common.io.FileWriteMode;
+import com.google.common.io.Files;
+import com.google.common.io.LineProcessor;
+import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence;
+import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord;
+
+import com.hankcs.hanlp.dependency.nnparser.NeuralNetworkDependencyParser;
+import com.hankcs.hanlp.utility.SentencesUtil;
+import com.ligongku.nlpparser.dao.NlpParserSqliteDao;
+import com.ligongku.nlpparser.model.ParsedCfgPair;
+import com.ligongku.nlpparser.model.ParsedSent;
+import com.ligongku.nlpparser.model.ParsedWord;
+import org.apache.commons.cli.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.time.LocalDateTime;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class DependencyParser {
+
+    private NlpParserSqliteDao nlpParserSqliteDao;
+
+    private NeuralNetworkDependencyParser neuralNetworkDependencyParser = new NeuralNetworkDependencyParser();
+
+    private List<Pattern> filterPatternList = Lists.newLinkedList();
+
+    private CharSink sink;
+
+
+    public void parseFile(String filePath) {
+        File file = new File(filePath);
+        try {
+            Files.asCharSource(file, Charsets.UTF_8).readLines(new LineProcessor<Integer>() {
+                private int lineNum = 1;
+                private int fileOffset = 0;
+
+                @Override
+                public boolean processLine(String line) throws IOException {
+                    parseLine(line, fileOffset, lineNum, filePath);
+                    if (lineNum % 100 == 0) {
+                        System.out.println(String.format("finished file: %s , lineNum %d", filePath, lineNum));
+                    }
+
+                    // +1 for the line separator consumed by readLines (assumes single-character \n endings)
+                    fileOffset += line.length() + 1;
+                    ++lineNum;
+                    return true;
+                }
+
+                @Override
+                public Integer getResult() {
+                    return lineNum;
+                }
+            });
+        } catch (IOException e) {
+            System.err.println(e.getMessage());
+        }
+    }
+
+    // strip unwanted fragments from a line using the configured regex list
+    public String filterLine(String line) {
+        for (Pattern pattern : filterPatternList) {
+            Matcher matcher = pattern.matcher(line);
+            line = matcher.replaceAll("");
+        }
+        return line;
+    }
+
+    public void parseLine(String line, int fileOffset, int lineNo, String fileName) {
+        line = filterLine(line);
+        if (Strings.isNullOrEmpty(line)) {
+            return;
+        }
+
+        List<String> sentList = SentencesUtil.toSentenceList(line, false);
+        LocalDateTime now = LocalDateTime.now();
+        for (int idx = 0; idx < sentList.size(); ++idx) {
+            String sent = sentList.get(idx);
+            ParsedSent parsedSent = new ParsedSent();
+            parsedSent.setFileName(fileName);
+            parsedSent.setSent(sent);
+            parsedSent.setFileOffset(fileOffset);
+            parsedSent.setLineNo(lineNo);
+            parsedSent.setParserName(neuralNetworkDependencyParser.getClass().getSimpleName());
+            parsedSent.setCreateAt(now);
+
+            Long sentId = nlpParserSqliteDao.insertSent(parsedSent);
+            parsedSent.setId(sentId);
+
+            // System.out.println("\n\n" + parsedSent);
+            parseSentence(sent, sentId, fileName);
+
+            // advance the offset so each sentence records its own start position
+            // (offsets are relative to the filtered line when a filter pattern matched)
+            fileOffset += sent.length();
+        }
+    }
+
+    public void parseSentence(String sent, Long sentId, String fileName) {
+        List<ParsedWord> wordList = Lists.newLinkedList();
+        CoNLLSentence sentence = neuralNetworkDependencyParser.parse(sent);
+        LocalDateTime now = LocalDateTime.now();
+        for (CoNLLWord word : sentence) {
+            ParsedWord parsedWord = new ParsedWord();
+            parsedWord.setSentId(sentId);
+            parsedWord.setWord(word.LEMMA);
+            parsedWord.setWordId(word.ID);
+            parsedWord.setLemma(word.LEMMA);
+            parsedWord.setCpostag(word.CPOSTAG);
+            parsedWord.setPostag(word.POSTAG);
+            parsedWord.setHeadNo(word.HEAD.ID);
+            parsedWord.setDeprel(word.DEPREL);
+            parsedWord.setConllwordName(word.NAME);
+            parsedWord.setCreateAt(now);
+
+            // System.out.printf("%s/%s --(%s)--> %s/%s\n", word.LEMMA, word.POSTAG, word.DEPREL, word.HEAD.LEMMA, word.HEAD.POSTAG);
+            wordList.add(parsedWord);
+        }
+        parseCfgList(sentence, sentId);
+        appendOutputConll(sentence, sent, sentId, fileName);
+        nlpParserSqliteDao.insertWordList(wordList);
+    }
+
+    public void parseCfgList(CoNLLSentence coNLLSentence, Long sentId) {
+        LocalDateTime now = LocalDateTime.now();
+        List<ParsedCfgPair> cfgPairList = Lists.newLinkedList();
+        for (CoNLLWord word : coNLLSentence) {
+            if (word != CoNLLWord.ROOT) {
+                ParsedCfgPair cfgPair = new ParsedCfgPair();
+                cfgPair.setSentId(sentId);
+                cfgPair.setDeprel(word.DEPREL);
+                cfgPair.setLeftCpostag(word.CPOSTAG);
+                cfgPair.setRightCpostag(word.HEAD.CPOSTAG);
+                cfgPair.setLeftWord(word.LEMMA);
+                cfgPair.setRightWord(word.HEAD.LEMMA);
+                cfgPair.setLeftPostag(word.POSTAG);
+                cfgPair.setRightPostag(word.HEAD.POSTAG);
+                cfgPair.setLeftWordNo(word.ID);
+                cfgPair.setRightWordNo(word.HEAD.ID);
+                cfgPair.setCreateAt(now);
+
+                cfgPairList.add(cfgPair);
+            }
+        }
+        nlpParserSqliteDao.insertCfgList(cfgPairList);
+    }
+
+    public void appendOutputConll(CoNLLSentence coNLLSentence, String sent, Long sentId, String fileName) {
+        String result = printToConllFormat(coNLLSentence, sent, sentId, fileName);
+        try {
+            sink.write(result);
+        } catch (IOException e) {
+            System.err.println(e.getMessage());
+        }
+    }
+
+    public String printToConllFormat(CoNLLSentence coNLLSentence, String sent, Long sentId, String fileName) {
+        String result = String.format("# fileName = %s\n# sent_id = %d\n# text = %s\n%s\n",
+                fileName, sentId, sent, coNLLSentence.toString());
+        // System.out.print(result);
+        return result;
+    }
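+
+    /*
+     * Illustrative output of printToConllFormat for one sentence; all values are
+     * made-up examples, and the body lines are whatever CoNLLSentence.toString()
+     * emits (tab-separated CoNLL columns):
+     *
+     * # fileName = data/lz-data/yujia.txt
+     * # sent_id = 42
+     * # text = 我送她一束花
+     * 1    我    我    r    r    _    2    主谓关系    _    _
+     * ...
+     */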
+
+    public void close() {
+        nlpParserSqliteDao.initIndex();
+        nlpParserSqliteDao.close();
+    }
+
+    private void initLineFilterPatternList(String filterRegexFilePath) {
+        File file = new File(filterRegexFilePath);
+        try {
+            Files.asCharSource(file, Charsets.UTF_8).readLines(new LineProcessor<Integer>() {
+                private int lineNum = 1;
+
+                @Override
+                public boolean processLine(String line) throws IOException {
+                    line = line.trim();
+                    if (Strings.isNullOrEmpty(line)) {
+                        return true;
+                    }
+                    Pattern pattern = Pattern.compile(line);
+                    filterPatternList.add(pattern);
+
+                    ++lineNum;
+                    return true;
+                }
+
+                @Override
+                public Integer getResult() {
+                    return lineNum;
+                }
+            });
+        } catch (IOException e) {
+            System.err.println(e.getMessage());
+        }
+    }
+
+    public DependencyParser(String conllOutputFilePath, String filterRegexFilePath, String dbPath) {
+        this.sink = Files.asCharSink(new File(conllOutputFilePath), Charsets.UTF_8, FileWriteMode.APPEND);
+        this.nlpParserSqliteDao = new NlpParserSqliteDao(dbPath);
+        initLineFilterPatternList(filterRegexFilePath);
+    }
+
+    public static void printHelpMsg(Options options) {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp("usage: java -jar <name>.jar [options] files...\n"
+                + "model files must be placed under ./hanlp-data/ ", options);
+    }
+
+    public static void main(String[] args) throws IOException {
+        Option opt1 = Option.builder("r").longOpt("regex").argName("filterRegex").hasArg()
+                .desc("path of the regex file used to filter unwanted characters. Default: ./filterRegex.txt").required(false).numberOfArgs(1).build();
+        Option opt2 = Option.builder("c").longOpt("conll").argName("conllFile").hasArg()
+                .desc("output path of the intermediate CoNLL file. Default: ./conll_output.txt").required(false).numberOfArgs(1).build();
+        Option opt3 = Option.builder("d").longOpt("database").argName("dbFile").hasArg()
+                .desc("path of the database the results are written to. Default: ./nlpparsed.sqlite3").required(false).numberOfArgs(1).build();
+        Option opt4 = Option.builder("h").longOpt("help").hasArg(false)
+                .desc("show this help").required(false).build();
+
+        Options options = new Options();
+        options.addOption(opt1);
+        options.addOption(opt2);
+        options.addOption(opt3);
+        options.addOption(opt4);
+
+        CommandLine cli = null;
+        CommandLineParser cliParser = new DefaultParser();
+
+        String outputFilePath;
+        String filterPattern;
+        String dbPath;
+        List<String> inputFileList;
+        try {
+            cli = cliParser.parse(options, args);
+
+            outputFilePath = cli.getOptionValue("c", "conll_output.txt");
+            filterPattern = cli.getOptionValue("r", "filterRegex.txt");
+            dbPath = cli.getOptionValue("d", "nlpparsed.sqlite3");
+            inputFileList = cli.getArgList();
+            if (cli.hasOption("h")) {
+                printHelpMsg(options);
+                return;
+            }
+        } catch (ParseException | NullPointerException e) {
+            // on parse failure, print the help message via HelpFormatter
+            printHelpMsg(options);
+            e.printStackTrace();
+            return;
+        }
+        DependencyParser parser = new DependencyParser(outputFilePath, filterPattern, dbPath);
+        for (String inputFile : inputFileList) {
+            parser.parseFile(inputFile);
+        }
+        parser.close();
+
+
+//        String outputFilePath = Joiner.on(File.separator).join("data", "conll_output.txt");
+//        String filterPattern = Joiner.on(File.separator).join("data", "filterRegex.txt");
+//        DependencyParser parser = new DependencyParser(outputFilePath, filterPattern);
+//
+//        parser.parseFile("data/lz-data/shentiyundongxunlian.txt");
+//        parser.parseFile("data/lz-data/tushouxunlian.txt");
+//        parser.parseFile("data/lz-data/yujia.txt");
+//
+//        parser.close();
+    }
+}
diff --git a/src/main/java/com/ligongku/nlpparser/dao/NlpParserSqliteDao.java b/src/main/java/com/ligongku/nlpparser/dao/NlpParserSqliteDao.java
new file mode 100644
index 0000000..74f3540
--- /dev/null
+++ b/src/main/java/com/ligongku/nlpparser/dao/NlpParserSqliteDao.java
@@ -0,0 +1,276 @@
+package com.ligongku.nlpparser.dao;
+
+import com.ligongku.nlpparser.model.ParsedCfgPair;
+import com.ligongku.nlpparser.model.ParsedSent;
+import com.ligongku.nlpparser.model.ParsedWord;
+import com.ligongku.nlpparser.util.FileUtil;
+
+import java.sql.*;
+import java.util.List;
+
+public class NlpParserSqliteDao {
+
+    private Connection connection = null;
+
+    private String dbFilePath;
+
+    // create the tables if they do not exist yet
+    public void initTables() {
+        beginTransaction();
+
+        String createTable = "";
+
+        // sentence scope table: sentence + parser
+        createTable = "create table if not exists parsed_sent\n" +
+                "(\n" +
+                "    id          integer      not null primary key autoincrement,                     -- primary key\n" +
+                "    file_name   varchar(255) not null default '',                                    -- source file name\n" +
+                "    sent        text         not null default '',                                    -- sentence text\n" +
+                "    file_offset int          not null default 0,                                     -- offset of the sentence's first character in the file\n" +
+                "    line_no     int          not null default 0,                                     -- line number of the sentence in the file\n" +
+                "    parser_name varchar(50)  not null default 'hanlp-NeuralNetworkDependencyParser', -- parser name\n" +
+                "    create_at   datetime     not null default (datetime('now', 'localtime'))         -- creation time\n" +
+                ");";
+        update(createTable);
+
+        // parse result table
+        createTable = "create table if not exists parsed_word\n" +
+                "(\n" +
+                "    id             integer      not null primary key autoincrement,             -- primary key\n" +
+                "    sent_id        integer      not null default 0,                             -- sentence id\n" +
+                "    word           varchar(255) not null default '',                            -- word\n" +
+                "    word_no        int          not null default 0,                             -- word index within the sentence, starting at 1; 0 is the root of the parse tree, -1 is a blank node\n" +
+                "    lemma          varchar(255) not null default '',                            -- lemma or stem of the word (or punctuation); for Chinese this equals FORM\n" +
+                "    cpostag        varchar(50)  not null default '',                            -- coarse-grained part of speech\n" +
+                "    postag         varchar(50)  not null default '',                            -- fine-grained part of speech\n" +
+                "    head_no        int          not null default 0,                             -- index of this word's head word\n" +
+                "    deprel         varchar(50)  not null default '',                            -- dependency relation between the word and its head\n" +
+                "    conllword_name varchar(50)  not null default '',                            -- equivalent string\n" +
+                "    create_at      datetime     not null default (datetime('now', 'localtime')) -- creation time\n" +
+                ");";
+        update(createTable);
+
+        // CFG rule table
+        createTable = "create table if not exists parsed_cfg_pair\n" +
+                "(\n" +
+                "    id            integer      not null primary key autoincrement,             -- primary key\n" +
+                "    sent_id       integer      not null default 0,                             -- sentence id\n" +
+                "    deprel        varchar(255) not null default '',                            -- dependency relation between the word and its head\n" +
+                "    left_cpostag  varchar(50)  not null default '',                            -- coarse-grained POS of the left word\n" +
+                "    right_cpostag varchar(50)  not null default '',                            -- coarse-grained POS of the right word\n" +
+                "    left_word     varchar(255) not null default '',                            -- left word\n" +
+                "    right_word    varchar(255) not null default '',                            -- right word\n" +
+                "    left_postag   varchar(50)  not null default '',                            -- fine-grained POS of the left word\n" +
+                "    right_postag  varchar(50)  not null default '',                            -- fine-grained POS of the right word\n" +
+                "    left_word_no  int          not null default 0,                             -- index of the left word\n" +
+                "    right_word_no int          not null default 0,                             -- index of the right word\n" +
+                "    create_at     datetime     not null default (datetime('now', 'localtime')) -- creation time\n" +
+                ");";
+        update(createTable);
+
+        commitTransaction();
+    }
+
+    // create indexes. Two fixes over the first draft: index names are global per SQLite
+    // database (not per table), so each create_at index gets a table prefix; and each
+    // statement is executed separately, because running a multi-statement string through
+    // a single PreparedStatement is unreliable with sqlite-jdbc.
+    public void initIndex() {
+        beginTransaction();
+
+        // sentence table
+        update("create index if not exists idx_sent_create_at on parsed_sent (create_at);");
+
+        // word table
+        update("create index if not exists idx_sent_word_id on parsed_word (sent_id, word_no);");
+        update("create index if not exists idx_word on parsed_word (word);");
+        update("create index if not exists idx_cpostag on parsed_word (cpostag);");
+        update("create index if not exists idx_postag on parsed_word (postag);");
+        update("create index if not exists idx_deprel on parsed_word (deprel);");
+        update("create index if not exists idx_sent_head_no on parsed_word (sent_id, head_no);");
+        update("create index if not exists idx_word_create_at on parsed_word (create_at);");
+
+        // CFG pair table
+        update("create index if not exists idx_sent_id_word_no_lr on parsed_cfg_pair (sent_id, left_word_no, right_word_no);");
+        update("create index if not exists idx_sent_id_word_no_rl on parsed_cfg_pair (sent_id, right_word_no, left_word_no);");
+        update("create index if not exists idx_deprel_cpostag on parsed_cfg_pair (deprel, left_cpostag, right_cpostag);");
+        update("create index if not exists idx_deprel_postag on parsed_cfg_pair (deprel, left_postag, right_postag);");
"create index if not exists idx_create_at on parsed_cfg_pair (create_at);"; + update(createIndex); + + commitTransaction(); + } + + // 插入一个句子 + public Long insertSent(ParsedSent sent) { + String sql = "insert into parsed_sent(file_name, sent, file_offset, line_no, parser_name, create_at)\n" + + "values (?, ?, ?, ?, ?, ?);"; + PreparedStatement ps = null; + try { + beginTransaction(); + ps = connection.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS); + ps.setString(1, sent.getFileName()); + ps.setString(2, sent.getSent()); + ps.setInt(3, sent.getFileOffset()); + ps.setInt(4, sent.getLineNo()); + ps.setString(5, sent.getParserName()); + // ps.setDate(6, new java.sql.Date(Date.from(sent.getCreateAt().atZone(ZoneId.systemDefault()).toInstant()).getTime())); + ps.setDate(6, java.sql.Date.valueOf(sent.getCreateAt().toLocalDate())); + ps.executeUpdate(); + ResultSet rs = ps.getGeneratedKeys(); + long id = 0; + if (rs.next()) { + id = rs.getLong(1); + } + commitTransaction(); + return id; + } catch (SQLException e) { + System.err.println(e.getMessage()); + throw new RuntimeException("DB error when insert into parsed_sent"); + } + } + + // 插入一个句子的依存分析结果 + public int insertWordList(List wordList) { + + String sql = "insert into parsed_word(sent_id,word,word_no,lemma,cpostag,postag,head_no,deprel,conllword_name,create_at)\n" + + "values (?,?,?,?,?,?,?,?,?,?);"; + try { + beginTransaction(); + + PreparedStatement ps = connection.prepareStatement(sql); + for (ParsedWord word : wordList) { + ps.setLong(1, word.getSentId()); + ps.setString(2, word.getWord()); + ps.setInt(3, word.getWordId()); + ps.setString(4, word.getLemma()); + ps.setString(5, word.getCpostag()); + ps.setString(6, word.getPostag()); + ps.setInt(7, word.getHeadNo()); + ps.setString(8, word.getDeprel()); + ps.setString(9, word.getConllwordName()); + ps.setDate(10, java.sql.Date.valueOf(word.getCreateAt().toLocalDate())); + ps.addBatch(); + } + + ps.executeBatch(); + commitTransaction(); + } catch (SQLException e) { + System.err.println(e.getMessage()); + try { + connection.rollback(); + } catch (SQLException ex) { + System.err.println(e.getMessage()); + } + return -1; + } + + return 0; + } + + // 插入一批三元组 + public int insertCfgList(List cfgPairList) { + + String sql = "insert into parsed_cfg_pair(sent_id, deprel, left_cpostag, right_cpostag, left_word, right_word, left_postag, right_postag, left_word_no, right_word_no, create_at)\n" + + "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"; + try { + beginTransaction(); + + PreparedStatement ps = connection.prepareStatement(sql); + for (ParsedCfgPair word : cfgPairList) { + ps.setLong(1, word.getSentId()); + ps.setString(2, word.getDeprel()); + ps.setString(3, word.getLeftCpostag()); + ps.setString(4, word.getRightCpostag()); + ps.setString(5, word.getLeftWord()); + ps.setString(6, word.getRightWord()); + ps.setString(7, word.getLeftPostag()); + ps.setString(8, word.getRightPostag()); + ps.setInt(9, word.getLeftWordNo()); + ps.setInt(10, word.getRightWordNo()); + ps.setDate(11, java.sql.Date.valueOf(word.getCreateAt().toLocalDate())); + ps.addBatch(); + } + + ps.executeBatch(); + commitTransaction(); + } catch (SQLException e) { + System.err.println(e.getMessage()); + try { + connection.rollback(); + } catch (SQLException ex) { + System.err.println(e.getMessage()); + } + return -1; + } + + return 0; + } + + // 执行更新 sql + public void update(String sql) { + try { + PreparedStatement ps = connection.prepareStatement(sql); + ps.setQueryTimeout(0); + ps.executeUpdate(); + } catch (SQLException e) 
+
+    // execute a single update/DDL statement
+    public void update(String sql) {
+        try {
+            PreparedStatement ps = connection.prepareStatement(sql);
+            ps.setQueryTimeout(0);
+            ps.executeUpdate();
+        } catch (SQLException e) {
+            System.err.println(e.getMessage());
+            throw new RuntimeException(e.getMessage());
+        }
+    }
+
+    // begin a transaction
+    public void beginTransaction() {
+        try {
+            connection.setAutoCommit(false);
+        } catch (SQLException e) {
+            System.err.println(e.getMessage());
+        }
+    }
+
+    // commit the current transaction
+    public void commitTransaction() {
+        try {
+            connection.commit();
+        } catch (SQLException e) {
+            System.err.println(e.getMessage());
+        }
+    }
+
+    // initialize the schema
+    public void init() {
+        initTables();
+    }
+
+    // open the connection
+    public NlpParserSqliteDao(String dbFile) {
+        this.dbFilePath = dbFile;
+
+        // create the database file if it does not exist
+        FileUtil.makeFileIfNotExist(this.dbFilePath);
+
+        String url = "jdbc:sqlite:" + dbFile;
+        try {
+            connection = DriverManager.getConnection(url);
+            // initialize the schema
+            init();
+        } catch (SQLException e) {
+            System.err.println(e.getMessage());
+            throw new RuntimeException("Error when open sqlite DB:" + this.dbFilePath);
+        }
+    }
+
+    // close the connection
+    public void close() {
+        if (connection != null) {
+            try {
+                connection.close();
+            } catch (SQLException e) {
+                System.err.println(e.getMessage());
+                throw new RuntimeException("Error when close sqlite DB:" + this.dbFilePath);
+            }
+        }
+    }
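+
+    // Optional tuning sketch, not wired in anywhere: WAL journaling and relaxed fsync
+    // are standard SQLite pragmas that can speed up bulk loads like this one. Call it
+    // right after the connection is opened if desired.
+    private void applyBulkLoadPragmas() throws SQLException {
+        try (Statement st = connection.createStatement()) {
+            st.execute("PRAGMA journal_mode=WAL;");
+            st.execute("PRAGMA synchronous=NORMAL;");
+        }
+    }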
+
+}
diff --git a/src/main/java/com/ligongku/nlpparser/model/ParsedCfgPair.java b/src/main/java/com/ligongku/nlpparser/model/ParsedCfgPair.java
new file mode 100644
index 0000000..a73637e
--- /dev/null
+++ b/src/main/java/com/ligongku/nlpparser/model/ParsedCfgPair.java
@@ -0,0 +1,59 @@
+package com.ligongku.nlpparser.model;
+
+import lombok.Data;
+
+import java.time.LocalDateTime;
+
+/**
+ * Pack: com.ligongku.nlpparser.model
+ * File: ParsedCfgPair
+ * Desc:
+ * User: chuangfengwang
+ * CreateTime: 2022-02-22 15:42
+ */
+@Data
+public class ParsedCfgPair {
+    private Long id;
+    private Long sentId;
+    private String deprel;
+    private String leftCpostag;
+    private String rightCpostag;
+    private String leftWord;
+    private String rightWord;
+    private String leftPostag;
+    private String rightPostag;
+    private Integer leftWordNo;
+    private Integer rightWordNo;
+    private LocalDateTime createAt;
+
+    @Override
+    public String toString() {
+        final StringBuilder sb = new StringBuilder("{");
+        sb.append("\"id\":")
+                .append(id);
+        sb.append(",\"sentId\":")
+                .append(sentId);
+        sb.append(",\"deprel\":\"")
+                .append(deprel).append('\"');
+        sb.append(",\"leftCpostag\":\"")
+                .append(leftCpostag).append('\"');
+        sb.append(",\"rightCpostag\":\"")
+                .append(rightCpostag).append('\"');
+        sb.append(",\"leftWord\":\"")
+                .append(leftWord).append('\"');
+        sb.append(",\"rightWord\":\"")
+                .append(rightWord).append('\"');
+        sb.append(",\"leftPostag\":\"")
+                .append(leftPostag).append('\"');
+        sb.append(",\"rightPostag\":\"")
+                .append(rightPostag).append('\"');
+        sb.append(",\"leftWordNo\":")
+                .append(leftWordNo);
+        sb.append(",\"rightWordNo\":")
+                .append(rightWordNo);
+        sb.append(",\"createAt\":")
+                .append(createAt);
+        sb.append('}');
+        return sb.toString();
+    }
+}
diff --git a/src/main/java/com/ligongku/nlpparser/model/ParsedSent.java b/src/main/java/com/ligongku/nlpparser/model/ParsedSent.java
new file mode 100644
index 0000000..e6d522f
--- /dev/null
+++ b/src/main/java/com/ligongku/nlpparser/model/ParsedSent.java
@@ -0,0 +1,37 @@
+package com.ligongku.nlpparser.model;
+
+import lombok.Data;
+
+import java.time.LocalDateTime;
+
+@Data
+public class ParsedSent {
+    private Long id;
+    private String fileName;
+    private String sent;
+    private Integer lineNo;
+    private Integer fileOffset;
+    private String parserName;
+    private LocalDateTime createAt;
+
+    @Override
+    public String toString() {
+        final StringBuilder sb = new StringBuilder("{");
+        sb.append("\"id\":")
+                .append(id);
+        sb.append(",\"fileName\":\"")
+                .append(fileName).append('\"');
+        sb.append(",\"sent\":\"")
+                .append(sent).append('\"');
+        sb.append(",\"lineNo\":")
+                .append(lineNo);
+        sb.append(",\"fileOffset\":")
+                .append(fileOffset);
+        sb.append(",\"parserName\":\"")
+                .append(parserName).append('\"');
+        sb.append(",\"createAt\":")
+                .append(createAt);
+        sb.append('}');
+        return sb.toString();
+    }
+}
diff --git a/src/main/java/com/ligongku/nlpparser/model/ParsedWord.java b/src/main/java/com/ligongku/nlpparser/model/ParsedWord.java
new file mode 100644
index 0000000..b625bd0
--- /dev/null
+++ b/src/main/java/com/ligongku/nlpparser/model/ParsedWord.java
@@ -0,0 +1,49 @@
+package com.ligongku.nlpparser.model;
+
+import lombok.Data;
+
+import java.time.LocalDateTime;
+
+@Data
+public class ParsedWord {
+    private Long id;
+    private Long sentId;
+    private String word;
+    private Integer wordId;
+    private String lemma;
+    private String cpostag;
+    private String postag;
+    private Integer headNo;
+    private String deprel;
+    private String conllwordName;
+    private LocalDateTime createAt;
+
+    @Override
+    public String toString() {
+        final StringBuilder sb = new StringBuilder("{");
+        sb.append("\"id\":")
+                .append(id);
+        sb.append(",\"sentId\":")
+                .append(sentId);
+        sb.append(",\"word\":\"")
+                .append(word).append('\"');
+        sb.append(",\"wordId\":")
+                .append(wordId);
+        sb.append(",\"lemma\":\"")
+                .append(lemma).append('\"');
+        sb.append(",\"cpostag\":\"")
+                .append(cpostag).append('\"');
+        sb.append(",\"postag\":\"")
+                .append(postag).append('\"');
+        sb.append(",\"headNo\":")
+                .append(headNo);
+        sb.append(",\"deprel\":\"")
+                .append(deprel).append('\"');
+        sb.append(",\"conllwordName\":\"")
+                .append(conllwordName).append('\"');
+        sb.append(",\"createAt\":")
+                .append(createAt);
+        sb.append('}');
+        return sb.toString();
+    }
+}
diff --git a/src/main/java/com/ligongku/nlpparser/util/FileUtil.java b/src/main/java/com/ligongku/nlpparser/util/FileUtil.java
new file mode 100644
index 0000000..8ce6c0a
--- /dev/null
+++ b/src/main/java/com/ligongku/nlpparser/util/FileUtil.java
@@ -0,0 +1,36 @@
+package com.ligongku.nlpparser.util;
+
+import com.google.common.base.Joiner;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Pack: com.ligongku.nlpparser.util
+ * File: FileUtil
+ * Desc:
+ * User: chuangfengwang
+ * CreateTime: 2022-02-22 04:45
+ */
+public class FileUtil {
+    public static void makeFileIfNotExist(String path) {
+        if (!(path.startsWith(".") || path.startsWith("/") || path.contains(":"))) {
+            // relative path: prepend ./
+            path = Joiner.on(File.separator).join(".", path);
+        }
+        File file = new File(path);
+        // ensure the parent directory exists
+        File parentFile = file.getParentFile();
+        if (parentFile != null && !parentFile.exists()) {
+            parentFile.mkdirs();
+        }
+        // create the file if it does not exist
+        if (!file.exists()) {
+            try {
+                file.createNewFile();
+            } catch (IOException e) {
+                System.err.println(e.getMessage());
+            }
+        }
+    }
+}
diff --git a/src/main/resources/hanlp.properties b/src/main/resources/hanlp.properties
new file mode 100755
index 0000000..a92abd1
--- /dev/null
+++ b/src/main/resources/hanlp.properties
@@ -0,0 +1,39 @@
+#Root directory for every path in this file; root + relative path = full path (relative roots are supported, see https://github.com/hankcs/HanLP/pull/254)
+#Windows users: always use / as the path separator
+root=./hanlp-data/
+
+#That is the only part that must be edited; uncomment and edit the entries below as needed.
+
+#core dictionary path
+#CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
+#bigram dictionary path
+#BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
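+#Illustrative on-disk layout assumed by root=./hanlp-data/ (the standard HanLP 1.x data
+#package unpacked there; exact file names depend on the data-package version):
+#  ./hanlp-data/data/dictionary/...
+#  ./hanlp-data/data/model/...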
+#Custom dictionary paths, separated by ;. A leading space means the same directory as the previous entry; the form "filename POS" makes that POS the dictionary's default. Priority decreases left to right.
+#All dictionaries are UTF-8, one word per line, in the form [word] [POS-A] [freq-A] [POS-B] [freq-B] ...; omitting the POS uses the dictionary's default.
+CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns; lz-人体部位名词词表.txt n; lz-动词词表.txt v;data/dictionary/person/nrf.txt nrf;
+#stopword dictionary path
+#CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
+#synonym dictionary path
+#CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
+#person-name dictionary path
+#PersonDictionaryPath=data/dictionary/person/nr.txt
+#person-name transition-matrix path
+#PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
+#traditional/simplified Chinese dictionary root
+#tcDictionaryRoot=data/dictionary/tc
+#HMM segmentation model
+#HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
+#whether segmentation output shows POS tags
+#ShowTermNature=true
+#IO adapter: implement com.hankcs.hanlp.corpus.io.IIOAdapter to run HanLP on other platforms (Hadoop, Redis, etc.)
+#The default adapter below is based on the ordinary file system.
+#IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
+#perceptron lexical analyzer
+#PerceptronCWSModelPath=data/model/perceptron/pku1998/cws.bin
+#PerceptronPOSModelPath=data/model/perceptron/pku1998/pos.bin
+#PerceptronNERModelPath=data/model/perceptron/pku1998/ner.bin
+#CRF lexical analyzer
+#CRFCWSModelPath=data/model/crf/pku199801/cws.txt
+#CRFPOSModelPath=data/model/crf/pku199801/pos.txt
+#CRFNERModelPath=data/model/crf/pku199801/ner.txt
+#For more options, see https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L59 and add them here
\ No newline at end of file