segment_sentences.py
import statistics
import nltk
import multiprocessing
from collections import defaultdict
from itertools import islice
import time
import os

def segment_sentences(worker_input):
    """Segment one worker's share of articles into sentences and write them to shard files.

    `worker_input` is a tuple (articles, output_names, rank, segmenter): a dict mapping
    article ids to article text, the shard filenames this worker owns, the worker rank,
    and the sentence segmenter to use.
    """
    articles, output_names, rank, segmenter = worker_input
    print('process {} needs to segment {} articles'.format(rank, len(articles)))
    sentences = {}
    for i, article in enumerate(articles):
        sentences[article] = segmenter.segment_string(articles[article])
        if i % 100 == 0:
            print('process {} finished article {}'.format(rank, i))
    print('process {} finished segmentation'.format(rank))
    # return_dict.update(sentences)

    # Balance the shards: aim for roughly the same number of sentences per output file.
    total_length = 0
    for article_id in sentences:
        total_length += len(sentences[article_id])
    ideal_length = total_length // len(output_names) + 1
    output_list = defaultdict(list)
    output_index = 0
    current_length = 0
    for article_id in sentences:
        current_length += len(sentences[article_id])
        output_list[output_names[output_index]].append(article_id)
        if current_length >= ideal_length:
            output_index += 1
            current_length = 0

    # Write one sentence per line, with a blank line between articles.
    print('process {} starts to write to disk'.format(rank))
    for output_name in output_names:
        with open(output_name, mode='w', newline='\n') as f:
            for article_id in output_list[output_name]:
                for line in sentences[article_id]:
                    f.write(line + '\n')
                f.write('\n')
    print('process {} finished writing to disk'.format(rank))


class NLTKSegmenter:
    def __init__(self):
        pass

    def segment_string(self, article):
        # Split an article into sentences with NLTK's Punkt sentence tokenizer.
        return nltk.tokenize.sent_tokenize(article)


class Sharding:
    def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards,
                 fraction_test_set, segmenting_num_worker):
        assert len(input_files) > 0, 'The input file list must contain at least one file.'
        assert n_training_shards > 0, 'There must be at least one training shard.'
        assert n_test_shards > 0, 'There must be at least one test shard.'

        self.n_training_shards = n_training_shards
        self.n_test_shards = n_test_shards
        self.fraction_test_set = fraction_test_set
        self.segmenting_num_worker = segmenting_num_worker

        self.input_files = input_files
        self.output_name_prefix = output_name_prefix
        self.output_training_identifier = '_training'
        self.output_test_identifier = '_test'
        self.output_file_extension = '.txt'

        self.articles = {}    # key: integer identifier, value: article text
        self.sentences = {}   # key: integer identifier, value: list of sentences
        self.output_training_files = {}  # key: filename, value: list of articles to go into file
        self.output_test_files = {}      # key: filename, value: list of articles to go into file

        self.init_output_files()

    # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
    def load_articles(self):
        print('Start: Loading Articles')
        global_article_count = 0
        for input_file in self.input_files:
            print('input file:', input_file)
            with open(input_file, mode='r', newline='\n') as f:
                for i, line in enumerate(f):
                    if line.strip():
                        self.articles[global_article_count] = line.rstrip()
                        global_article_count += 1
        print('End: Loading Articles: There are', len(self.articles), 'articles.')

    def segment_articles_into_sentences(self):
        print('Start: Sentence Segmentation')
        segmenter = NLTKSegmenter()
        if len(self.articles) == 0:
            self.load_articles()
        assert len(self.articles) != 0, 'Please check that input files are present and contain data.'

        # Split the article dict and the training shard names into per-worker chunks.
        def chunks(data, names, size=len(self.articles), name_size=1):
            it = iter(data)
            it_name = iter(names)
            for i in range(0, len(data), size):
                yield ({k: data[k] for k in islice(it, size)},
                       [p for p in islice(it_name, name_size)])

        n_processes = self.segmenting_num_worker  # worker processes in addition to the main process (total = n_processes + 1)
        pool = multiprocessing.Pool(n_processes)
        rank = 0
        args = []
        for item, name_item in chunks(self.articles, self.output_training_files,
                                      (len(self.articles) - 1) // n_processes + 1,
                                      (len(self.output_training_files) - 1) // n_processes + 1):
            args.append((item, name_item, rank, segmenter))
            rank += 1
        pool.map(segment_sentences, args)
        pool.close()
        pool.join()

    def init_output_files(self):
        print('Start: Init Output Files')
        assert len(self.output_training_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
        assert len(self.output_test_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'

        for i in range(self.n_training_shards):
            name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
            self.output_training_files[name] = []

        for i in range(self.n_test_shards):
            name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
            self.output_test_files[name] = []

        print('End: Init Output Files')


# Make sure the Punkt sentence tokenizer model is available before segmentation runs.
nltk.download('punkt')
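
# A minimal usage sketch (not part of the original file): the input path, prefix,
# shard counts, and worker count below are hypothetical placeholders, included only
# to show how the Sharding class above can be driven end to end. It loads articles
# (one per line), then segments the training shards in parallel worker processes.
if __name__ == '__main__':
    sharding = Sharding(
        input_files=['articles_one_per_line.txt'],  # hypothetical input file
        output_name_prefix='shard',                 # yields shard_training_0.txt, shard_test_0.txt, ...
        n_training_shards=4,
        n_test_shards=1,
        fraction_test_set=0.1,
        segmenting_num_worker=2,
    )
    sharding.load_articles()
    sharding.segment_articles_into_sentences()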