Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Mar 23, 2018
2 parents 3d55813 + a8abebb commit cf61b9c
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 5 deletions.
27 changes: 23 additions & 4 deletions pyhanlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
# Build the JVM class-path option from the HanLP jar path and the static root,
# joined with the platform's path separator.
JAVA_JAR_CLASSPATH = "-Djava.class.path=%s%s%s" % (
HANLP_JAR_PATH, os.pathsep, STATIC_ROOT)
# Echo the resulting option when verbose mode is enabled.
if HANLP_VERBOSE: print("设置 JAVA_JAR_CLASSPATH [%s]" % JAVA_JAR_CLASSPATH)

# 启动JVM
startJVM(
getDefaultJVMPath(),
Expand All @@ -74,7 +75,25 @@
"-Xmx%s" %
HANLP_JVM_XMX)

# API list
HanLP = JClass('com.hankcs.hanlp.HanLP') # HanLP utility class
PerceptronLexicalAnalyzer = JClass(
'com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
'''
API列表
use attachThreadToJVM to fix multi-thread issues: https://github.com/hankcs/pyhanlp/issues/7
'''
def attach_jvm_to_thread():
    """Attach the calling thread to the JVM unless it is already attached.

    Returns None when the thread is already attached; otherwise returns
    whatever attachThreadToJVM() returns.
    """
    if isThreadAttachedToJVM():
        return None
    return attachThreadToJVM()

class AttachJVMWrapper(object):
    """Proxy around a Java class (or an instance of it) obtained via JClass.

    Every attribute lookup that falls through to ``__getattr__`` first calls
    ``attach_jvm_to_thread()`` and then forwards the lookup to the wrapped
    object, so the wrapper can be used from threads other than the one that
    started the JVM.
    """

    def __init__(self, class_name, is_construct=False):
        """Wrap the Java class named *class_name*.

        :param class_name: fully qualified Java class name passed to JClass.
        :param is_construct: when true, call the class with no arguments and
            wrap the resulting instance; otherwise wrap the class itself.
        """
        java_class = JClass(class_name)
        self.proxy = java_class() if is_construct else java_class

    def __getattr__(self, attr):
        """Attach the current thread to the JVM, then delegate to the proxy.

        :param attr: name of the attribute being looked up.
        :raises AttributeError: if ``proxy`` itself has not been set yet.
        """
        # __getattr__ only runs after normal lookup has failed, so a request
        # for 'proxy' here means it was never assigned (for example the object
        # was created without __init__). Reading self.proxy below would then
        # re-enter this method forever.
        if attr == 'proxy':
            raise AttributeError(attr)
        attach_jvm_to_thread()
        return getattr(self.proxy, attr)

# Module-level API objects. Each one is an AttachJVMWrapper around the named
# Java class, so attribute access on them goes through the wrapper.
CustomDictionary = AttachJVMWrapper('com.hankcs.hanlp.dictionary.CustomDictionary')
HanLP = AttachJVMWrapper('com.hankcs.hanlp.HanLP')
# The second argument (is_construct) is True here: the wrapper holds a
# no-argument instance of the analyzer rather than the class itself.
PerceptronLexicalAnalyzer = AttachJVMWrapper('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer', True)
17 changes: 16 additions & 1 deletion tests/test_hanlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_parsedependency(self):

def test_analyze(self):
logging.info("test_analyze")
analyzer = PerceptronLexicalAnalyzer()
analyzer = PerceptronLexicalAnalyzer
print(analyzer.analyze("上海华安工业(集团)公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观"))
# 任何模型总会有失误,特别是98年这种陈旧的语料库
print(analyzer.analyze("总统普京与特朗普通电话讨论太空探索技术公司"))
Expand Down Expand Up @@ -86,6 +86,21 @@ def test_segment(self):
"随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"]
for sentence in testCases: print(HanLP.segment(sentence))


def test_custom_dict(self):
    """Segment one sentence before and after registering custom words.

    The token count is checked with ``assertEqual`` rather than a bare
    ``assert`` so the check is not stripped under ``python -O`` and a
    failure reports the actual count alongside the message.
    """
    logging.info("test_custom_dict")
    # Add dynamically
    text = "攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰"
    self.assertEqual(len(HanLP.segment(text)), 12, "添加自定义词汇前,分词结果预期")

    # Force insertion
    CustomDictionary.add("攻城狮")
    CustomDictionary.insert("白富美", "nz 1024")
    CustomDictionary.add("单身狗", "nz 1024 n 1")
    CustomDictionary.get("单身狗")
    # Same sentence as above; only the dictionary contents have changed.
    self.assertEqual(len(HanLP.segment(text)), 10, "添加自定义词汇后,分词结果预期")

def test():
    """Run this module's test cases through unittest's main program."""
    unittest.main()

Expand Down
104 changes: 104 additions & 0 deletions tests/test_multithread.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#===============================================================================
#
# Copyright (c) 2017 <> All Rights Reserved
#
#
# File: /Users/hain/ai/pyhanlp/tests/test_multithread.py
# Author: Hai Liang Wang
# Date: 2018-03-23:17:18:30
#
#===============================================================================

"""
"""
from __future__ import print_function
from __future__ import division

__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
__author__ = "Hai Liang Wang"
__date__ = "2018-03-23:17:18:30"


import os
import sys
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(curdir, os.path.pardir))

# Python 2 only: reload sys and make UTF-8 the default string encoding.
if sys.version_info < (3,):
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Get ENV
ENVIRON = os.environ.copy()

from absl import flags #absl-py
from absl import logging #absl-py

FLAGS = flags.FLAGS

import threading
import time
from pyhanlp import HanLP

class MyThread(threading.Thread):
    """Worker thread that calls print_time while holding a shared lock.

    :param thread_id: stored as ``threadID``.
    :param name: thread name; also the label used in printed output.
    :param counter: stored as ``counter`` and passed to print_time as its
        ``delay`` argument.
    :param lock: lock held for the duration of the print_time call, so
        threads sharing it run that section one at a time.
    """

    def __init__(self, thread_id, name, counter, lock):
        threading.Thread.__init__(self)
        self.threadID = thread_id
        self.name = name
        self.counter = counter
        self.lock = lock

    def run(self):
        """Print a start banner, then run print_time (3 iterations) under the lock."""
        print("Starting " + self.name)
        # ``with`` releases the lock even when print_time raises. A bare
        # acquire()/release() pair would leave it held and block every other
        # thread waiting on the same lock.
        with self.lock:
            print_time(self.name, self.counter, 3)

def print_time(thread_name, delay, counter):
    """Print a timestamped HanLP segmentation result ``counter`` times.

    :param thread_name: label printed at the start of each line.
    :param delay: unused; kept for caller compatibility (the sleep that
        consumed it is disabled).
    :param counter: number of lines to print; zero or a negative value
        prints nothing.
    """
    # Compare against zero instead of relying on truthiness: a negative
    # counter would otherwise never become falsy and the loop would not end.
    while counter > 0:
        print("%s: %s, seg: %s" % (thread_name, time.ctime(time.time()), HanLP.segment("攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰")))
        counter -= 1

import unittest

# run testcase: python /Users/hain/ai/pyhanlp/tests/test_multithread.py Test.testExample
class Test(unittest.TestCase):
    """Multi-thread test case: two MyThread workers sharing one lock."""

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_multithread(self):
        """Start two workers, wait for both to finish, printing progress markers."""
        logging.info("test_multithread")

        shared_lock = threading.Lock()

        # Both workers are constructed before either one is started.
        workers = [
            MyThread(1, "Thread-1", 1, shared_lock),
            MyThread(2, "Thread-2", 2, shared_lock),
        ]

        for worker in workers:
            worker.start()

        print('waiting to finish the thread')

        for worker in workers:
            worker.join()

        print("Exiting Main Thread")

def test():
    """Run this module's test cases through unittest's main program."""
    unittest.main()


if __name__ == '__main__':
    # Parse flags from an explicit argv so the verbosity is fixed at 1.
    # Levels as noted by the original author: DEBUG 1; INFO 0; WARNING -1.
    FLAGS([__file__, '--verbosity', '1'])
    test()

0 comments on commit cf61b9c

Please sign in to comment.