diff --git a/README.md b/README.md index c29fedd..500ad4a 100644 --- a/README.md +++ b/README.md @@ -7,16 +7,22 @@ pinyin.py Example: from pinyin import PinYin - - test = PinYin() - test.load_word() - test.hanzi2pinyin(string='钓鱼岛是中国的') + test = PinYin() Out: test.hanzi2pinyin(string='钓鱼岛是中国的') - ['diao', 'yu', 'dao', 'shi', 'zhong', 'guo', 'de'] + ['diao', 'yu', 'dao', 'shi', 'zhong', 'guo', 'de'] test.hanzi2pinyin_split(string='钓鱼岛是中国的', split="-") diao-yu-dao-shi-zhong-guo-de + test.hanzi2pinyin(string='hello world 123') + out: [u'helloworld123'] + test.hanzi2pinyin_split(string='hello world 123', split="_") + out: helloworld123 + + test.hanzi2pinyin(string='hello 中国 123') + out: [u'hello', 'zhong', 'guo', u'123'] + test.hanzi2pinyin_split(string='hello 中国 123', split="_") + out: hello_zhong_guo_123 diff --git a/pinyin.py b/pinyin.py index 938b727..f659894 100644 --- a/pinyin.py +++ b/pinyin.py @@ -16,7 +16,7 @@ class PinYin(object): def __init__(self, dict_file='word.data'): self.word_dict = {} self.dict_file = dict_file - + self.load_word() def load_word(self): if not os.path.exists(self.dict_file): @@ -31,19 +31,30 @@ def load_word(self): line = f_line.split(' ') self.word_dict[line[0]] = line[1] - def hanzi2pinyin(self, string=""): result = [] + alnum = [] + if not isinstance(string, unicode): string = string.decode("utf-8") - - for char in string: + + for char in string.replace(' ', ''): key = '%X' % ord(char) - result.append(self.word_dict.get(key, char).split()[0][:-1].lower()) + word = self.word_dict.get(key, char).split()[0] + if len(word) == 1: + # 拼音都有声调 长度大于1 + alnum.append(word) + else: + if alnum: + words = ''.join(alnum) + alnum = [] + result.append(words) + result.append(word[:-1].lower()) + if alnum: + result.append(''.join(alnum)) return result - def hanzi2pinyin_split(self, string="", split=""): result = self.hanzi2pinyin(string=string) if split == "": @@ -54,8 +65,15 @@ def hanzi2pinyin_split(self, string="", split=""): if __name__ == "__main__": test = PinYin() - test.load_word() string = "钓鱼岛是中国的" print "in: %s" % string print "out: %s" % str(test.hanzi2pinyin(string=string)) - print "out: %s" % test.hanzi2pinyin_split(string=string, split="-") + print "out: %s" % test.hanzi2pinyin_split(string=string, split="_") + string = "hello world 123" + print "in: %s" % string + print "out: %s" % str(test.hanzi2pinyin(string=string)) + print "out: %s" % test.hanzi2pinyin_split(string=string, split="_") + string = "hello 中国 123" + print "in: %s" % string + print "out: %s" % str(test.hanzi2pinyin(string=string)) + print "out: %s" % test.hanzi2pinyin_split(string=string, split="_") diff --git a/setup.py b/setup.py index 40d5de1..7842f85 100644 --- a/setup.py +++ b/setup.py @@ -27,5 +27,4 @@ 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules' ] - ) diff --git a/word.data b/word.data index e8bc443..1884208 100644 --- a/word.data +++ b/word.data @@ -1,3 +1,65 @@ +0x30 0 +0x31 1 +0x32 2 +0x33 3 +0x34 4 +0x35 5 +0x36 6 +0x37 7 +0x38 8 +0x39 9 +0x41 A +0x42 B +0x43 C +0x44 D +0x45 E +0x46 F +0x47 G +0x48 H +0x49 I +0x4a J +0x4b K +0x4c L +0x4d M +0x4e N +0x4f O +0x50 P +0x51 Q +0x52 R +0x53 S +0x54 T +0x55 U +0x56 V +0x57 W +0x58 X +0x59 Y +0x5a Z +0x61 a +0x62 b +0x63 c +0x64 d +0x65 e +0x66 f +0x67 g +0x68 h +0x69 i +0x6a j +0x6b k +0x6c l +0x6d m +0x6e n +0x6f o +0x70 p +0x71 q +0x72 r +0x73 s +0x74 t +0x75 u +0x76 v +0x77 w +0x78 x +0x79 y +0x7a z 3400 QIU1 3401 TIAN3 TIAN4 3404 KUA4