# generate_tokenized_sentences.py (forked from kakaobrain/kortok)
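"""Pre-tokenize the modoo-translation corpora with every tokenizer found under
--resources, writing one output directory per tokenizer.

Example invocation (the values shown are the script defaults):

    python generate_tokenized_sentences.py \
        --resources resources \
        --output dataset/wiki-0420/tokenized
"""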
import os
from argparse import ArgumentParser
from functools import partial
from multiprocessing import Process
from pathlib import Path

from tqdm import tqdm

from tokenizer import (
    BaseTokenizer,
    CharTokenizer,
    JamoTokenizer,
    MeCabSentencePieceTokenizer,
    MeCabTokenizer,
    SentencePieceTokenizer,
    WordTokenizer,
)
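
# NOTE: machine-specific path to the mecab-ko dictionary; adjust for your own
# environment.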
DICT_PATH = "/home/n5/chanwoo/utils/mecab-ko/lib/mecab/dic/mecab-ko-dic"

KO_CORPUSES = [
    Path("dataset/modoo-translation/processed/train.ko"),
    Path("dataset/modoo-translation/processed/validation.ko"),
    Path("dataset/modoo-translation/processed/test.ko"),
]
EN_CORPUSES = [
    Path("dataset/modoo-translation/processed/train.en"),
    Path("dataset/modoo-translation/processed/validation.en"),
    Path("dataset/modoo-translation/processed/test.en"),
]
OUTPUT_PATH = "dataset/wiki-0420/tokenized"
RESOURCES = "resources"
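
# Each subdirectory of the resources directory is expected to hold one trained
# tokenizer: mecab-*/tok.json for MeCab configs, and sp-*/, en_sp-*/ and
# mecab_sp-*/tok.model for SentencePiece models.
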
def tokenize_sentences(input_file, output_file, tokenizer: BaseTokenizer, pbar=False):
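    """Tokenize each line of `input_file` and write the space-joined tokens,
    one sentence per line, to `output_file` (parent directories are created)."""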
    input_file = Path(input_file)
    output_file = Path(output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(input_file, "r") as f:
        lines = f.readlines()
    with open(output_file, "w") as f:
        for line in tqdm(lines) if pbar else lines:
            tokenized = tokenizer.tokenize(line)
            tokenized = " ".join(tokenized)
            f.write(tokenized + "\n")
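
# Each (input_file, output_file, tokenizer) job below is bound with
# functools.partial and launched in its own multiprocessing.Process; the parent
# starts all jobs first and joins them at the end.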

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--resources", type=str, default=RESOURCES)
    parser.add_argument("--output", type=str, default=OUTPUT_PATH)
    args = parser.parse_args()

    processes = []
    resources = os.listdir(args.resources)
    output_path = Path(args.output)
    os.makedirs(output_path, exist_ok=True)
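
    # CharTokenizer: character-level tokenization, written to char-2k/.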
    tokenizer = CharTokenizer()
    for input_file in KO_CORPUSES:
        output_file = f"{output_path}/char-2k/{input_file.name}"
        task = partial(tokenize_sentences, input_file, output_file, tokenizer)
        process = Process(target=task)
        process.start()
        processes.append(process)
        print(f"CharTokenizer: {input_file.name} is processing...")

    # JamoTokenizer: decomposes Hangul into jamo, written to jamo-200/.
    tokenizer = JamoTokenizer()
    for input_file in KO_CORPUSES:
        output_file = f"{output_path}/jamo-200/{input_file.name}"
        task = partial(tokenize_sentences, input_file, output_file, tokenizer)
        process = Process(target=task)
        process.start()
        processes.append(process)
        print(f"JamoTokenizer: {input_file.name} is processing...")

    # MeCabTokenizer: one tokenizer per mecab-* resource, configured by that
    # resource's tok.json.
    mecab_resources = [r for r in resources if r.startswith("mecab-")]
    for mecab_resource in mecab_resources:
        tokenizer = MeCabTokenizer(mecab_path=DICT_PATH, config_path=f"{args.resources}/{mecab_resource}/tok.json")
        for input_file in KO_CORPUSES:
            output_file = f"{output_path}/{mecab_resource}/{input_file.name}"
            task = partial(tokenize_sentences, input_file, output_file, tokenizer)
            process = Process(target=task)
            process.start()
            processes.append(process)
            print(f"MeCabTokenizer: {input_file.name} is processing with {mecab_resource}...")

    # SentencePieceTokenizer: Korean corpora, one model per sp-* resource.
    sp_resources = [r for r in resources if r.startswith("sp-")]
    for sp_resource in sp_resources:
        tokenizer = SentencePieceTokenizer(model_path=f"{args.resources}/{sp_resource}/tok.model")
        for input_file in KO_CORPUSES:
            output_file = f"{output_path}/{sp_resource}/{input_file.name}"
            task = partial(tokenize_sentences, input_file, output_file, tokenizer)
            process = Process(target=task)
            process.start()
            processes.append(process)
            print(f"SentencePieceTokenizer: {input_file.name} is processing with {sp_resource}...")

    # SentencePieceTokenizer (English): English corpora, one model per en_sp-* resource.
    en_sp_resources = [r for r in resources if r.startswith("en_sp-")]
    for en_sp_resource in en_sp_resources:
        tokenizer = SentencePieceTokenizer(model_path=f"{args.resources}/{en_sp_resource}/tok.model")
        for input_file in EN_CORPUSES:
            output_file = f"{output_path}/{en_sp_resource}/{input_file.name}"
            task = partial(tokenize_sentences, input_file, output_file, tokenizer)
            process = Process(target=task)
            process.start()
            processes.append(process)
            print(f"SentencePieceTokenizer: {input_file.name} is processing with {en_sp_resource}...")

    # MeCabSentencePieceTokenizer: two-stage tokenization (MeCab morpheme split,
    # then SentencePiece). Note: the MeCab stage always uses the mecab-16k config.
    mecab_sp_resources = [r for r in resources if r.startswith("mecab_sp-")]
    for mecab_sp_resource in mecab_sp_resources:
        tokenizer = MeCabSentencePieceTokenizer(
            mecab=MeCabTokenizer(mecab_path=DICT_PATH, config_path=f"{args.resources}/mecab-16k/tok.json"),
            sp=SentencePieceTokenizer(model_path=f"{args.resources}/{mecab_sp_resource}/tok.model"),
        )
        for input_file in KO_CORPUSES:
            output_file = f"{output_path}/{mecab_sp_resource}/{input_file.name}"
            task = partial(tokenize_sentences, input_file, output_file, tokenizer)
            process = Process(target=task)
            process.start()
            processes.append(process)
            print(f"MeCabSentencePieceTokenizer: {input_file.name} is processing with {mecab_sp_resource}...")
    print("=== Special group: serial run ===")

    # WordTokenizer: word-level tokenization, written to word-64k/.
    print("WordTokenizer")
    tokenizer = WordTokenizer()
    for input_file in KO_CORPUSES:
        output_file = f"{output_path}/word-64k/{input_file.name}"
        task = partial(tokenize_sentences, input_file, output_file, tokenizer)
        task(pbar=True)
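
    # join() blocks until each forked tokenization job has exited.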
print("=== Waiting for all processes to finish ===")
for process in tqdm(processes):
process.join()
print("=== All processes finished ===")