forked from undertheseanlp/word_tokenize
-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_tokenize.py
42 lines (36 loc) · 1.37 KB
/
word_tokenize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import argparse
import os
from os.path import abspath
from util.crf.word_tokenize import word_tokenize
# Command-line interface: tokenize a literal string (--text) and/or a file
# pair (--fin/--fout), optionally with a model loaded from --model.
parser = argparse.ArgumentParser("word_tokenize.py")

# Options for tokenizing a string given directly on the command line.
text_group = parser.add_argument_group(
    "The following arguments are mandatory for text option"
)
text_group.add_argument("--text", help="text to predict", metavar="TEXT")

# Options for tokenizing a file; the entry-point block requires both.
file_group = parser.add_argument_group(
    "The following arguments are mandatory for file option"
)
file_group.add_argument("--fin", help="file input")
file_group.add_argument("--fout", help="file output")

parser.add_argument("--model", help="path to load model")

# NOTE: parsing happens at module level, so importing this file also
# consumes sys.argv.
args = parser.parse_args()
if __name__ == '__main__':
    # Script entry point: tokenize the --text string, the --fin/--fout file
    # pair, or both, passing the optional --model path to word_tokenize.
    if not (args.text or args.fin):
        # Nothing to tokenize was supplied, so show usage. Execution continues
        # so that a lone --fout still reaches the pairing check below.
        parser.print_help()

    # Resolve the model path once; None is handed to word_tokenize unchanged
    # (presumably selecting its default model -- confirm in util.crf).
    model = abspath(args.model) if args.model else None

    if args.text:
        print(word_tokenize(args.text, format="text", model_path=model))

    if args.fin or args.fout:
        if not (args.fout and args.fin):
            parser.error("Options --fin and --fout must be set together")

        # Open the output in "w" mode so any previous content is replaced.
        # The earlier code called the non-existent os.rm() inside a bare
        # except and then opened in append mode, so re-running the script
        # silently appended to stale output. The context manager also
        # guarantees both handles are flushed and closed.
        # UTF-8 is pinned so results do not depend on the platform locale.
        with open(args.fin, encoding="utf-8") as source, \
                open(args.fout, "w", encoding="utf-8") as sink:
            for line in source:
                tokenized = word_tokenize(
                    line.strip(), format="text", model_path=model
                )
                sink.write(tokenized + "\n")