-
Notifications
You must be signed in to change notification settings - Fork 29
/
scel2txt.py
171 lines (133 loc) · 5.41 KB
/
scel2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
搜狗细胞词库转鼠须管(Rime)词库
搜狗的 scel 词库是按照一定格式保存的 Unicode 编码文件,其中每两个字节表示一个字符(中文汉字或者英文字母),主要两部分:
1. 全局拼音表,在文件中的偏移值是 0x1540+4, 格式为 (py_idx, py_len, py_str)
- py_idx: 两个字节的整数,代表这个拼音的索引
- py_len: 两个字节的整数,拼音的字节长度
- py_str: 当前的拼音,每个字符两个字节,总长 py_len
2. 汉语词组表,在文件中的偏移值是 0x2628 或 0x26c4, 格式为 (word_count, py_idx_count, py_idx_data, (word_len, word_str, ext_len, ext){word_count}),其中 (word_len, word, ext_len, ext){word_count} 一共重复 word_count 次, 表示拼音的相同的词一共有 word_count 个
- word_count: 两个字节的整数,同音词数量
- py_idx_count: 两个字节的整数,拼音的索引个数
- py_idx_data: 两个字节表示一个整数,每个整数代表一个拼音的索引,拼音索引数
- word_len:两个字节的整数,代表中文词组字节数长度
- word_str: 汉语词组,每个中文汉字两个字节,总长度 word_len
- ext_len: 两个字节的整数,可能代表扩展信息的长度,好像都是 10
- ext: 扩展信息,一共 10 个字节,前两个字节是一个整数(不知道是不是词频),后八个字节全是 0,ext_len 和 ext 一共 12 个字节
参考资料
1. https://raw.githubusercontent.com/archerhu/scel2mmseg/master/scel2mmseg.py
2. https://raw.githubusercontent.com/xwzhong/small-program/master/scel-to-txt/scel2txt.py
"""
import struct
import os
import sys
def read_utf16_str(f, offset=-1, len=2):
if offset >= 0:
f.seek(offset)
string = f.read(len)
return string.decode('UTF-16LE')
def read_uint16(f):
return struct.unpack('<H', f.read(2))[0]
def get_hz_offset(f):
mask = f.read(128)[4]
if mask == 0x44:
return 0x2628
elif mask == 0x45:
return 0x26c4
else:
print("不支持的文件类型(无法获取汉语词组的偏移量)")
sys.exit(1)
def get_dict_meta(f):
title = read_utf16_str(f, 0x130, 0x338 - 0x130)
category = read_utf16_str(f, 0x338, 0x540 - 0x338)
desc = read_utf16_str(f, 0x540, 0xd40 - 0x540)
samples = read_utf16_str(f, 0xd40, 0x1540 - 0xd40)
return title, category, desc, samples
def get_py_map(f):
py_map = {}
f.seek(0x1540+4)
while True:
py_idx = read_uint16(f)
py_len = read_uint16(f)
py_str = read_utf16_str(f, -1, py_len)
if py_idx not in py_map:
py_map[py_idx] = py_str
# 如果拼音为 zuo,说明是最后一个了
if py_str == 'zuo':
break
return py_map
def get_records(f, file_size, hz_offset, py_map):
f.seek(hz_offset)
records = []
while f.tell() != file_size:
word_count = read_uint16(f)
py_idx_count = int(read_uint16(f) / 2)
py_set = []
for i in range(py_idx_count):
py_idx = read_uint16(f)
if (py_map.get(py_idx, None) == None):
return records
py_set.append(py_map[py_idx])
py_str = " ".join(py_set)
for i in range(word_count):
word_len = read_uint16(f)
word_str = read_utf16_str(f, -1, word_len)
# 跳过 ext_len 和 ext 共 12 个字节
f.read(12)
records.append((py_str, word_str))
return records
def get_words_from_sogou_cell_dict(fname):
with open(fname, 'rb') as f:
hz_offset = get_hz_offset(f)
(title, category, desc, samples) = get_dict_meta(f)
#print("title: %s\ncategory: %s\ndesc: %s\nsamples: %s" %
# (title, category, desc, samples))
py_map = get_py_map(f)
file_size = os.path.getsize(fname)
words = get_records(f, file_size, hz_offset, py_map)
return words
def save(records, f):
records_translated = list(map(lambda x: "%s\t%s" % (
x[1], x[0]), records))
f.write("\n".join(records_translated))
return records_translated
def main():
# 将要转换的词库添加在 scel 目录下
scel_files = list(filter(lambda x: x.endswith('.scel'), [
i for i in os.listdir("./scel")]))
dict_file = "luna_pinyin.sogou.dict.yaml"
dict_file_content = []
dict_file_header = """# Rime dictionary
# encoding: utf-8
#
# Sogou Pinyin Dict - 搜狗细胞词库
#
# https://pinyin.sogou.com/dict/
#
# 包括:
#
%s
#
---
name: luna_pinyin.sogou
version: "1.0"
sort: by_weight
use_preset_vocabulary: true
...
"""
sougo_dict_name_list = list(
map(lambda x: "# * %s" % x.replace(".scel", ""), scel_files))
dict_file_content.append(dict_file_header % "\n".join(sougo_dict_name_list))
if not os.path.exists("./out"):
os.mkdir("./out")
for scel_file in scel_files:
records = get_words_from_sogou_cell_dict(
os.path.join("./scel", scel_file))
print("%s: %s 个词" % (scel_file, len(records)))
with open(os.path.join("./out", scel_file.replace(".scel", ".txt")), "w") as fout:
dict_file_content.extend(save(records, fout))
print("-"*80)
print("合并后 %s: %s 个词" % (dict_file, len(dict_file_content) - 1))
with open(os.path.join("./out", dict_file), "w") as dictfout:
dictfout.write("\n".join(dict_file_content))
if __name__ == "__main__":
main()