diff --git a/.gitignore b/.gitignore
index 7bbc71c..27e3dde 100644
--- a/.gitignore
+++ b/.gitignore
@@ -99,3 +99,8 @@ ENV/
 
 # mypy
 .mypy_cache/
+*.jar
+*.zip
+pyhanlp/static/data
+.idea/
+pyhanlp/static/hanlp.properties
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..f993898
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,3 @@
+include README.md
+include setup.cfg
+include pyhanlp/static/hanlp.properties.in
\ No newline at end of file
diff --git a/README.md b/README.md
index bf04e99..1abc254 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,140 @@
-# pyhanlp
-Python interfaces for HanLP: Han Language Processing
+# pyhanlp: Python interfaces for HanLP
+
+Python interface for HanLP, with automatic download and upgrade of the HanLP jar and data packages.
+
+## Installation
+
+```
+pip3 install pyhanlp
+```
+
+## Command line
+
+Run `hanlp --help` for the latest usage manual.
+
+### Chinese word segmentation
+
+Use `hanlp segment` to enter interactive segmentation mode; type a sentence and press Enter, and HanLP prints the segmented result:
+
+```
+$ hanlp segment
+商品和服务
+商品/n 和/cc 服务/vn
+当下雨天地面积水分外严重
+当/p 下雨天/n 地面/n 积水/n 分外/d 严重/a
+龚学平等领导说,邓颖超生前杜绝超生
+龚学平/nr 等/udeng 领导/n 说/v ,/w 邓颖超/nr 生前/t 杜绝/v 超生/vi
+```
+
+Input and output can also be redirected, e.g. to and from files:
+
+```
+$ hanlp segment <<< '欢迎新老师生前来就餐'
+欢迎/v 新/a 老/a 师生/n 前来/vi 就餐/vi
+```
+
+### Dependency parsing
+
+The command is `hanlp parse`; it likewise supports interactive mode and redirection:
+
+```
+hanlp parse <<< '徐先生还具体帮助他确定了把画雄鹰、松鼠 和麻雀作为主攻目标。'
+1 徐先生 徐先生 nh nr _ 4 主谓关系 _ _
+2 还 还 d d _ 4 状中结构 _ _
+3 具体 具体 a a _ 4 状中结构 _ _
+4 帮助 帮助 v v _ 0 核心关系 _ _
+5 他 他 r rr _ 4 兼语 _ _
+6 确定 确定 v v _ 4 动宾关系 _ _
+7 了 了 u ule _ 6 右附加关系 _ _
+8 把 把 p pba _ 9 状中结构 _ _
+9 画 画 v v _ 6 动宾关系 _ _
+10 雄鹰 雄鹰 n n _ 9 动宾关系 _ _
+11 、 、 wp w _ 12 标点符号 _ _
+12 松鼠 松鼠 n n _ 10 并列关系 _ _
+13 wp w _ 9 标点符号 _ _
+14 和 和 c cc _ 15 左附加关系 _ _
+15 麻雀 麻雀 n n _ 16 主谓关系 _ _
+16 作为 作为 p p _ 9 并列关系 _ _
+17 主攻 主攻 v vn _ 18 定中关系 _ _
+18 目标 目标 n n _ 16 动宾关系 _ _
+19 。 。 wp w _ 4 标点符号 _ _
+```
+
+### Upgrading
+
+Run `hanlp update` to upgrade HanLP to the latest version. The command queries the latest release on GitHub, then downloads and installs it automatically.
+
+## API
+
+Common interfaces are exposed through the utility class `HanLP`:
+
+```python
+from pyhanlp import *
+
+print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
+testCases = [
+    "商品和服务",
+    "结婚的和尚未结婚的确实在干扰分词啊",
+    "买水果然后来世博园最后去世博会",
+    "中国的首都是北京",
+    "欢迎新老师生前来就餐",
+    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
+    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"]
+for sentence in testCases: print(HanLP.segment(sentence))
+# Keyword extraction
+document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
+           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
+           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
+           "严格地进行水资源论证和取水许可的批准。"
+print(HanLP.extractKeyword(document, 2))
+# Automatic summarization
+print(HanLP.extractSummary(document, 3))
+# Dependency parsing
+print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
+```
+
+### More features
+
+Additional features include, but are not limited to:
+
+- Custom dictionaries
+- Ultra-fast dictionary-based segmentation
+- Index-oriented segmentation
+- CRF segmentation
+- Perceptron lexical analysis
+- Traditional Chinese conversion (Taiwan and Hong Kong variants)
+- Keyword extraction and automatic summarization
+- Text classification and sentiment analysis
+
+See the [HanLP main project documentation](https://github.com/hankcs/HanLP) for details. Calling lower-level APIs requires following the Java conventions and importing a deeper class path with `JClass`. Take the perceptron lexical analyzer as an example: the class lives at [`com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer`](https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/model/perceptron/PerceptronLexicalAnalyzer.java), so first obtain the class with `JClass` and then call it:
+
+```python
+PerceptronLexicalAnalyzer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
+analyzer = PerceptronLexicalAnalyzer()
+print(analyzer.analyze("上海华安工业(集团)公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观"))
+```
+
+Output:
+
+```
+[上海/ns 华安/nz 工业/n (/w 集团/n )/w 公司/n]/nt 董事长/n 谭旭光/nr 和/c 秘书/n 胡花蕊/nr 来到/v [美国/ns 纽约/ns 现代/t 艺术/n 博物馆/n]/ns 参观/v
+```
+
+If you use a class frequently, you are welcome to add it to `pyhanlp/__init__.py` and submit a pull request. Thank you!
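+
+The same `JClass` pattern applies to any other class you need. As a minimal sketch (based on the main project's custom-dictionary demo; `com.hankcs.hanlp.dictionary.CustomDictionary` is the class path in HanLP), words can also be added to the user dictionary at runtime:
+
+```python
+CustomDictionary = JClass('com.hankcs.hanlp.dictionary.CustomDictionary')
+CustomDictionary.add('攻城狮')  # add a word with the default tag and frequency
+CustomDictionary.insert('白富美', 'nz 1024')  # add a word with an explicit tag and frequency
+print(HanLP.segment('攻城狮逆袭单身狗,迎娶白富美'))
+```
+
+Words added this way take effect for the current process only; for persistent entries, edit the dictionary files referenced by `CustomDictionaryPath` in `hanlp.properties`.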
+
+## Sharing data with other projects
+
+HanLP is highly customizable, and every model and dictionary can be replaced freely. If you want to share one copy of the data with another project, simply copy that project's `hanlp.properties` configuration file into pyhanlp's installation directory. The installation directory on your machine can be found with `hanlp --version`.
+
+You can also load another configuration file temporarily via `--config`:
+
+```
+hanlp segment --config path/to/another/hanlp.properties
+```
+
+## License
+
+Apache License 2.0
diff --git a/pyhanlp/__init__.py b/pyhanlp/__init__.py
new file mode 100644
index 0000000..70a32f1
--- /dev/null
+++ b/pyhanlp/__init__.py
@@ -0,0 +1,15 @@
+# -*- coding:utf-8 -*-
+# Author:hankcs
+# Date: 2018-03-18 19:49
+import os
+from jpype import *
+
+from pyhanlp.static import HANLP_JAR_PATH, STATIC_ROOT
+
+# Start the JVM with the HanLP jar and the static resource directory on the class path
+startJVM(getDefaultJVMPath(), "-Djava.class.path={}{}{}".format(HANLP_JAR_PATH, os.pathsep, STATIC_ROOT), "-Xms1g",
+         "-Xmx1g")
+
+# Frequently used API classes
+HanLP = JClass('com.hankcs.hanlp.HanLP')  # the HanLP utility class
+PerceptronLexicalAnalyzer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
diff --git a/pyhanlp/main.py b/pyhanlp/main.py
new file mode 100644
index 0000000..89cf9f7
--- /dev/null
+++ b/pyhanlp/main.py
@@ -0,0 +1,65 @@
+# -*- coding:utf-8 -*-
+# Author:hankcs
+# Date: 2018-03-19 01:05
+import argparse
+import os
+import sys
+
+from jpype import JClass
+
+from pyhanlp import HanLP
+from pyhanlp.static import eprint, PATH_CONFIG, update_hanlp, HANLP_JAR_VERSION, HANLP_JAR_PATH, HANLP_DATA_PATH, \
+    hanlp_installed_data_version
+
+
+def main():
+    if len(sys.argv) == 1:
+        sys.argv.append('--help')
+
+    arg_parser = argparse.ArgumentParser(description='HanLP: Han Language Processing v{}'.format(HANLP_JAR_VERSION))
+    arg_parser.add_argument('-v', '--version', required=False, action='store_true',
+                            help='show installed versions of HanLP')
+    task_parser = arg_parser.add_subparsers(dest="task", help='which task to perform?')
+    segment_parser = task_parser.add_parser(name='segment', help='word segmentation')
+    parse_parser = task_parser.add_parser(name='parse', help='dependency parsing')
+    update_parser = task_parser.add_parser(name='update', help='update jar and data of HanLP')
+
+    def add_args(p):
+        p.add_argument("--config", default=PATH_CONFIG,
+                       help='path to hanlp.properties')
+
+    add_args(segment_parser)
+    add_args(parse_parser)
+    args = arg_parser.parse_args()
+
+    def die(msg):
+        eprint(msg)
+        sys.exit(1)
+
+    # 'update' and a bare '-v' define no --config option, hence getattr
+    config = getattr(args, 'config', None)
+    if config:
+        if os.path.isfile(config):
+            JClass('com.hankcs.hanlp.utility.Predefine').HANLP_PROPERTIES_PATH = config
+        else:
+            die('Can\'t find config file {}'.format(config))
+
+    if args.version:
+        print('jar {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
+        data_version = hanlp_installed_data_version()
+        print('data {}: {}'.format(data_version if data_version else 'custom', HANLP_DATA_PATH))
+
+    if args.task == 'segment':
+        for line in sys.stdin:
+            line = line.strip()
+            print(' '.join(term.toString() for term in HanLP.segment(line)))
+    elif args.task == 'parse':
+        for line in sys.stdin:
+            line = line.strip()
+            print(HanLP.parseDependency(line))
+    elif args.task == 'update':
+        update_hanlp()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pyhanlp/static/__init__.py b/pyhanlp/static/__init__.py
new file mode 100644
index 0000000..f5d1598
--- /dev/null
+++ b/pyhanlp/static/__init__.py
@@ -0,0 +1,247 @@
+# -*- coding:utf-8 -*-
+# Author:hankcs
+# Date: 2018-03-18 20:05
+import errno
+import glob
+import json
+import os
+import re
+import sys
+import time
+import urllib.request
+import zipfile
+from shutil import copyfile
+
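+# This module manages pyhanlp's static resources: on first import it locates
+# (and, if necessary, downloads) the HanLP jar and data package, and keeps
+# hanlp.properties pointed at the correct data root. The version constants
+# below are populated lazily by the helpers that follow.
+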
+STATIC_ROOT = os.path.dirname(os.path.realpath(__file__))
+PATH_CONFIG = os.path.join(STATIC_ROOT, 'hanlp.properties')
+HANLP_DATA_PATH = os.path.join(STATIC_ROOT, 'data')
+PATH_DATA_VERSION = os.path.join(HANLP_DATA_PATH, 'version.txt')
+HANLP_JAR_VERSION = None
+HANLP_DATA_VERSION = None
+HANLP_RELEASES = None
+
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+
+def remove_file(filename):
+    try:
+        os.remove(filename)
+    except OSError as e:
+        if e.errno != errno.ENOENT:  # ENOENT means the file is already gone
+            raise  # re-raise if a different error occurred
+
+
+def hanlp_latest_version():
+    return hanlp_releases()[0]
+
+
+def hanlp_releases(cache=True):
+    global HANLP_RELEASES
+    if cache and HANLP_RELEASES:
+        return HANLP_RELEASES
+    content = urllib.request.urlopen("https://api.github.com/repos/hankcs/HanLP/releases").read()
+    content = json.loads(content)
+    meta = []
+    for r in content:
+        jar_version = r['tag_name']
+        if jar_version.startswith('v'):
+            jar_version = jar_version[1:]
+        m = re.search(r'\[(data-for-.*?\.zip)\]\((.*?)\)', r['body'])
+        data_version, data_url = None, None
+        if m and len(m.groups()) == 2:
+            data_version = m.group(1)[len('data-for-'):-len('.zip')]
+            data_url = m.group(2)
+        meta.append((jar_version, data_version, data_url))
+
+    HANLP_RELEASES = meta
+    return meta
+
+
+def hanlp_installed_jar_versions():
+    versions = []
+    for jar in glob.glob(os.path.join(STATIC_ROOT, 'hanlp-portable-*.jar')):
+        versions.append(os.path.basename(jar)[len('hanlp-portable-'):-len('.jar')])
+
+    versions = sorted(versions, reverse=True)
+    if versions:
+        global HANLP_JAR_VERSION
+        HANLP_JAR_VERSION = versions[0]
+    return versions
+
+
+def hanlp_installed_data_version():
+    try:
+        with open(os.path.join(HANLP_DATA_PATH, 'version.txt')) as f:
+            global HANLP_DATA_VERSION
+            HANLP_DATA_VERSION = f.readlines()[0].strip()
+            return HANLP_DATA_VERSION
+    except FileNotFoundError:
+        return None
+
+
+def hanlp_installed_data_path():
+    root = read_config()
+    if os.path.isdir(root):
+        global HANLP_DATA_PATH
+        if root == STATIC_ROOT:
+            if not os.path.isdir(HANLP_DATA_PATH):
+                return None
+        HANLP_DATA_PATH = os.path.join(root, 'data')
+        return HANLP_DATA_PATH
+
+    return None
+
+
+def download(url, path):
+    opener = urllib.request.build_opener()
+    opener.addheaders = [('User-agent',
+                          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36')]
+    urllib.request.install_opener(opener)
+    print('Downloading {} to {}'.format(url, path))
+    tmp_path = '{}.downloading'.format(path)
+    try:
+        def reporthook(count, block_size, total_size):
+            global start_time
+            if count == 0:
+                start_time = time.time()
+                return
+            duration = time.time() - start_time
+            progress_size = int(count * block_size)
+            speed = int(progress_size / (1024 * duration))
+            ratio = count * block_size / total_size
+            percent = ratio * 100
+            eta = duration / ratio * (1 - ratio)
+            minutes = eta / 60
+            seconds = eta % 60
+            sys.stdout.write("\r%.2f%%, %d MB, %d KB/s, ETA %d min %d s" %
+                             (percent, progress_size / (1024 * 1024), speed, minutes, seconds))
+            sys.stdout.flush()
+
+        urllib.request.urlretrieve(url, tmp_path, reporthook)
+        print()
+    except Exception:
+        try:
+            os.system('wget {} -O {}'.format(url, tmp_path))
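+            # The wget fallback above handles environments where urlretrieve
+            # fails (e.g. behind some proxies); it assumes wget is on PATH and
+            # writes to tmp_path so the rename below still applies.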
+        except Exception:
+            eprint('Failed to download {}'.format(url))
+            return False
+    os.rename(tmp_path, path)
+    return True
+
+
+def install_hanlp_jar(version=None):
+    if version is None:
+        version = hanlp_latest_version()[0]
+    url = 'https://search.maven.org/remotecontent?filepath=com/hankcs/hanlp/portable-{}/hanlp-portable-{}.jar'.format(
+        version, version)
+    download(url, hanlp_jar_path(version))
+    global HANLP_JAR_VERSION
+    HANLP_JAR_VERSION = version
+
+
+def update_hanlp():
+    if update_hanlp_jar():
+        print('HanLP jar has been updated to the latest version {}'.format(HANLP_JAR_VERSION))
+    else:
+        print('HanLP jar is already the latest version {}'.format(HANLP_JAR_VERSION))
+
+    root = read_config()
+    if root == STATIC_ROOT:
+        if install_hanlp_data(HANLP_JAR_VERSION):
+            print('HanLP data has been updated to the latest version {}'.format(HANLP_DATA_VERSION))
+        else:
+            print('HanLP data is already the latest version {}'.format(HANLP_DATA_VERSION))
+
+
+def update_hanlp_jar():
+    if hanlp_releases()[0][0] in hanlp_installed_jar_versions():
+        return False
+    install_hanlp_jar()
+    uninstall_hanlp_jar()
+    return True
+
+
+def install_hanlp_data(the_jar_version):
+    for jar_version, data_version, data_url in hanlp_releases():
+        if jar_version == the_jar_version:
+            if data_version == hanlp_installed_data_version():
+                return False
+            data_zip = 'data-for-{}.zip'.format(data_version)
+            data_zip = os.path.join(STATIC_ROOT, data_zip)
+            download(data_url, data_zip)
+            with zipfile.ZipFile(data_zip, "r") as zip_ref:
+                zip_ref.extractall(STATIC_ROOT)
+            os.remove(data_zip)
+            write_config(root=STATIC_ROOT)
+            with open(PATH_DATA_VERSION, 'w') as f:
+                f.write(data_version)
+            return True
+
+
+def write_config(root=None):
+    content = []
+    with open(PATH_CONFIG) as f:
+        for line in f:
+            if root:
+                if line.startswith('root'):
+                    line = 'root={}{}'.format(root, os.linesep)
+            content.append(line)
+    with open(PATH_CONFIG, 'w') as f:
+        f.writelines(content)
+
+
+def read_config():
+    root = None
+    if not os.path.isfile(PATH_CONFIG):
+        copyfile(PATH_CONFIG + '.in', PATH_CONFIG)
+    with open(PATH_CONFIG) as f:
+        for line in f:
+            if line.startswith('root'):
+                root = line.strip().split('=')[1]
+    return root
+
+
+def hanlp_jar_path(version):
+    return os.path.join(STATIC_ROOT, 'hanlp-portable-{}.jar'.format(version))
+
+
+def uninstall_hanlp_jar(version='old'):
+    if version == 'old':
+        vs = hanlp_installed_jar_versions()
+        if len(vs) > 1:
+            for v in vs[1:]:  # keep the newest jar, remove the rest
+                remove_file(hanlp_jar_path(v))
+    else:
+        remove_file(hanlp_jar_path(version))
+
+
+if not hanlp_installed_jar_versions():
+    install_hanlp_jar()
+
+if not hanlp_installed_data_path():
+    install_hanlp_data(HANLP_JAR_VERSION)
+
+HANLP_JAR_PATH = hanlp_jar_path(HANLP_JAR_VERSION)
diff --git a/pyhanlp/static/hanlp.properties.in b/pyhanlp/static/hanlp.properties.in
new file mode 100755
index 0000000..dfc61f7
--- /dev/null
+++ b/pyhanlp/static/hanlp.properties.in
@@ -0,0 +1,29 @@
+#Root directory for all paths in this file; root + relative path = full path. Point root at an absolute path to let pyhanlp share a single copy of data with HanLP
+#Windows users: always use / as the path separator
+root=none
+#Core dictionary path
+CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
+#Bigram dictionary path
+BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
+#Stopword dictionary path
+CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
+#Synonym dictionary path
+CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
+#Person-name dictionary path
+PersonDictionaryPath=data/dictionary/person/nr.txt
+#Person-name transition-matrix path
+PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
+#Root directory of the traditional/simplified Chinese dictionaries
+tcDictionaryRoot=data/dictionary/tc
+#Custom dictionary paths, separated by ;. A leading space means the file lives in the same directory as the previous one. The form "filename tag" makes that tag the dictionary's default part of speech. Priority decreases from left to right.
+#Note that data/dictionary/custom/CustomDictionary.txt is a high-quality lexicon; please do not delete it. All dictionaries use UTF-8 encoding.
+CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf;
+#CRF segmentation model path
+CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt
+#HMM segmentation model
+HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
+#Whether segmentation results show part-of-speech tags
+ShowTermNature=true
+#IO adapter; implement com.hankcs.hanlp.corpus.io.IIOAdapter to run HanLP on other platforms (Hadoop, Redis, etc.)
+#The default adapter below is based on the ordinary file system.
+#IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..3c6e79c
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[bdist_wheel]
+universal=1
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e99114a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,38 @@
+# -*- coding:utf-8 -*-
+# Author:hankcs
+# Date: 2018-03-11 20:54
+from os.path import abspath, join, dirname
+
+from setuptools import find_packages, setup
+
+
+this_dir = abspath(dirname(__file__))
+with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
+    long_description = file.read()
+
+setup(
+    name='pyhanlp',
+    version='0.0.8',
+    description='Python wrapper for HanLP: Han Language Processing',
+    long_description=long_description,
+    url='https://github.com/hankcs/pyhanlp',
+    author='hankcs',
+    author_email='hankcshe@gmail.com',
+    license='Apache License 2.0',
+    classifiers=[
+        'Intended Audience :: Developers',
+        'Topic :: Utilities',
+        'Natural Language :: English',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python :: 3',
+    ],
+    keywords='Natural Language Processing',
+    packages=find_packages(exclude=['docs', 'tests*']),
+    include_package_data=True,
+    install_requires=['jpype1'],
+    entry_points={
+        'console_scripts': [
+            'hanlp=pyhanlp.main:main',
+        ],
+    },
+)
diff --git a/tests/test_hanlp.py b/tests/test_hanlp.py
new file mode 100644
index 0000000..22c6f51
--- /dev/null
+++ b/tests/test_hanlp.py
@@ -0,0 +1,44 @@
+# -*- coding:utf-8 -*-
+# Author:hankcs
+# Date: 2018-03-18 21:07
+from pyhanlp import *
+
+print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
+testCases = [
+    "商品和服务",
+    "结婚的和尚未结婚的确实在干扰分词啊",
+    "买水果然后来世博园最后去世博会",
+    "中国的首都是北京",
+    "欢迎新老师生前来就餐",
+    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
+    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"]
+for sentence in testCases: print(HanLP.segment(sentence))
+# Keyword extraction
+document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
+           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
+           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
+           "严格地进行水资源论证和取水许可的批准。"
+print(HanLP.extractKeyword(document, 2))
+# Automatic summarization
+print(HanLP.extractSummary(document, 3))
+# Dependency parsing
+print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
+# Lower-level APIs require importing deeper class paths with JClass, following Java conventions
+analyzer = PerceptronLexicalAnalyzer()
+
+print(analyzer.analyze("上海华安工业(集团)公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观"))
+
+# Every model makes mistakes, especially one trained on a corpus as dated as 1998
+print(analyzer.analyze("总统普京与特朗普通电话讨论太空探索技术公司"))
+# Online learning is supported
+analyzer.learn("与/c 特朗普/nr 通/v 电话/n 讨论/v [太空/s 探索/vn 技术/n 公司/n]/nt")
+# The new knowledge has been picked up
+print(analyzer.analyze("总统普京与特朗普通电话讨论太空探索技术公司"))
+# And it generalizes to similar contexts
+print(analyzer.analyze("主席和特朗普通电话"))
+
+# Generalization relies on flexible statistics rather than rigid rules
+print(analyzer.analyze("我在浙江金华出生"))
+analyzer.learn("在/p 浙江/ns 金华/ns 出生/v")
+print(analyzer.analyze("我在四川金华出生"))
+print(analyzer.analyze("我的名字叫金华"))
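+
+# A couple of extra checks (a minimal sketch; convertToTraditionalChinese and
+# convertToSimplifiedChinese are methods of the same HanLP utility class):
+print(HanLP.convertToTraditionalChinese("用笔记本电脑写程序"))
+print(HanLP.convertToSimplifiedChinese("「以後等妳當上皇后,就能買士多啤梨慶祝了」"))
\ No newline at end of file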