This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Fixed #1490 and #1520 (#1579)
akshatgui authored Aug 24, 2021
1 parent 83780ab commit 5d4bc9e
Showing 3 changed files with 218 additions and 263 deletions.
11 changes: 5 additions & 6 deletions scripts/processing/__main__.py
@@ -2,8 +2,7 @@
import textwrap

from . import (
    clean_tok_para_corpus,
    clean_tok_mono_corpus,
    clean_tok_corpus,
    learn_subword,
    apply_subword
)
@@ -24,13 +23,13 @@ def cli_main():
                             'Choices are {}.'.format(SUBCOMMANDS))
    args, other_args = parser.parse_known_args()
    if args.command == 'clean_tok_para_corpus':
        parser = clean_tok_para_corpus.get_parser()
        parser = clean_tok_corpus.get_parser_para()
        sub_args = parser.parse_args(other_args)
        clean_tok_para_corpus.main(sub_args)
        clean_tok_corpus.main_para(sub_args)
    elif args.command == 'clean_tok_mono_corpus':
        parser = clean_tok_mono_corpus.get_parser()
        parser = clean_tok_corpus.get_parser_mono()
        sub_args = parser.parse_args(other_args)
        clean_tok_mono_corpus.main(sub_args)
        clean_tok_corpus.main_mono(sub_args)
    elif args.command == 'learn_subword':
        parser = learn_subword.get_parser()
        sub_args = parser.parse_args(other_args)
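
For context, a minimal sketch of how the re-routed 'clean_tok_mono_corpus' subcommand above could be exercised programmatically. This is an illustration only: it assumes the repository root is on the Python path so that the package is importable as scripts.processing, and the file names and flag values are made up; the flags themselves (--corpus, --lang, --tokenizer, --max-num-words) are the ones defined by get_parser_mono further down in this diff.

# Hypothetical programmatic use of the merged clean_tok_corpus module,
# mirroring what the 'clean_tok_mono_corpus' branch of cli_main() dispatches to.
from scripts.processing import clean_tok_corpus

parser = clean_tok_corpus.get_parser_mono()
args = parser.parse_args(['--corpus', 'train.raw.en',   # illustrative input file
                          '--lang', 'en',
                          '--tokenizer', 'moses',
                          '--max-num-words', '250'])
clean_tok_corpus.main_mono(args)
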
@@ -26,6 +26,7 @@ def get_tokenizer(tokenizer, lang=None):
raise NotImplementedError



def check_both_latin1(src_sentence: str, tgt_sentence: str) -> bool:
    """Check whether both sentences of the pair can be encoded in latin1
@@ -48,6 +49,28 @@ def check_both_latin1(src_sentence: str, tgt_sentence: str) -> bool:
        return True



def check_latin1(sentence: str) -> bool:
    """Check whether the sentence can be encoded in latin1

    This is used in
    https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/scripts/filter_dataset.py
    The idea is to filter out the sentences with rare unicode glyphs.

    Returns
    -------
    ret
        Whether the sentence is latin1-encodable
    """
    try:
        sentence.encode('latin1')
    except UnicodeEncodeError:
        return False
    else:
        return True
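
As a quick illustration of this filter (a hedged sketch; the example strings are arbitrary):

# check_latin1 keeps sentences that fit into latin1 and rejects the rest.
assert check_latin1('Bonjour, ça va ?') is True    # latin1-encodable, kept
assert check_latin1('こんにちは世界') is False       # rare glyphs, filtered out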


def get_line_byte_start(corpus_path: str) -> np.ndarray:
    """Get the start position of each line in terms of bytes so that we can use seek + read to
    load an arbitrary line.
@@ -240,7 +263,137 @@ def chunk_iterator(step=10):
        return filtered_line_count


def get_parser():


class MonoCorpusProcessor:
    """Process the sentences of a monolingual corpus.

    This largely recovers the functionality of 'clean-corpus-n.perl' in mosesdecoder.
    The difference is that it is customizable with pure python.

    By default, we will perform the following pre-processing pipeline.
    Each stage could be turned on/off and specialized based on the input arguments.
    Also, you may directly revise the code and write your own processing script.

    1. Normalize the sentence
    2. Pre-filter
    3. Tokenize the sentence
    4. Filter the sentence based on different rules
       4.1 Remove sentences where not `min_num_words <= len(tokens) <= max_num_words`
       4.2 Optionally remove sentences that cannot be encoded in latin1
           (when `discard_non_latin1` is set)
    """
    def __init__(self, lang: str,
                 normalize: bool = True,
                 tokenizer: Union[str, BaseTokenizer] = 'whitespace',
                 min_num_words: Optional[int] = None,
                 max_num_words: Optional[int] = None,
                 discard_non_latin1: bool = False):
        self._lang = lang
        if normalize:
            self._normalizer = MosesNormalizer(lang=lang)
        self._tokenizer = get_tokenizer(tokenizer, lang)
        self._min_num_words = min_num_words
        self._max_num_words = max_num_words
        self._discard_non_latin1 = discard_non_latin1

    def process_chunk(self, args):
        path, chunk_start, chunk_size = args
        processed_lines = []
        with open(path, 'rb') as in_f:
            # Read chunk
            in_f.seek(chunk_start)
            lines = in_f.read(chunk_size)
            lines = lines.splitlines()
            unfiltered_line_num = len(lines)
            for line in lines:
                line = line.decode('utf-8').strip()
                # 1. Normalize
                line = self._normalizer(line)
                # 2. Filter after normalization.
                if self._discard_non_latin1:
                    if not check_latin1(line):
                        continue
                # 3. Tokenize the sentence
                tokens = self._tokenizer.encode(line)
                # 4. Filter after tokenization. Filter with multiple rules
                if len(tokens) == 0:
                    continue
                if self._max_num_words is not None:
                    if len(tokens) > self._max_num_words:
                        continue
                if self._min_num_words is not None:
                    if len(tokens) < self._min_num_words:
                        continue
                processed_lines.append(' '.join(tokens))
        return processed_lines, unfiltered_line_num

    def process_mono_corpus(self,
                            corpus_paths: List[str],
                            out_path: str,
                            chunk_size: int = 1024 * 1024,
                            num_process: int = 8) -> int:
        """Preprocess the mono corpus

        Parameters
        ----------
        corpus_paths
            Corpus paths
        out_path
            Write the results to the output path
        chunk_size
            Approximately split the corpus files into multiple chunks
        num_process
            The number of processes

        Returns
        -------
        line_count
            The number of lines in the final filtered file
        """
        start = time.time()
        total_line_count = 0
        filtered_line_count = 0

        def chunk_iterator(step=10):
            for path in corpus_paths:
                line_pos = get_line_byte_start(path)
                line_size = line_pos[1:] - line_pos[:-1]
                num_lines = line_pos.shape[0] - 1
                budget = chunk_size
                chunk_start = 0
                cur_chunk_size = 0
                for i in range(0, num_lines, step):
                    line_batch_num = min(num_lines - i, step)
                    batch_line_size = line_size[i:(i + line_batch_num)].sum()
                    budget -= batch_line_size
                    cur_chunk_size += batch_line_size
                    if budget <= 0 or i + step >= num_lines:
                        yield path, chunk_start, cur_chunk_size
                        chunk_start += cur_chunk_size
                        cur_chunk_size = 0
                        budget = chunk_size

        with open(out_path, 'w', encoding='utf-8', newline='\n') as out_f:
            with multiprocessing.Pool(num_process) as pool:
                for i, (processed_lines, unfiltered_line_num) in \
                        enumerate(pool.imap(self.process_chunk, chunk_iterator())):
                    out_f.write('\n'.join(processed_lines) + '\n')
                    filtered_line_count += len(processed_lines)
                    total_line_count += unfiltered_line_num
                    if (i + 1) % 100 == 0:
                        print('Chunk {}, #Lines Processed: {}, Filtered: {}, Remain: {}'
                              .format(i + 1, total_line_count,
                                      total_line_count - filtered_line_count,
                                      filtered_line_count))
        end = time.time()
        print('Done, #Lines {}/{}, Time spent {}'.format(filtered_line_count,
                                                         total_line_count,
                                                         end - start))
        return filtered_line_count
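
For reference, a minimal usage sketch of the MonoCorpusProcessor defined above. The file names and parameter values are illustrative, and it assumes 'moses' is an accepted value for the tokenizer argument (it is the default of the --tokenizer flag below):

# Hypothetical example: clean and tokenize a monolingual English corpus.
processor = MonoCorpusProcessor(lang='en',
                                tokenizer='moses',
                                min_num_words=1,
                                max_num_words=250,
                                discard_non_latin1=True)
kept = processor.process_mono_corpus(corpus_paths=['train.raw.en'],
                                     out_path='train.tok.en',
                                     num_process=4)
print('Lines kept after filtering:', kept)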


def get_parser_para():
    parser = argparse.ArgumentParser(
        description='Clean parallel corpus used in machine translation.')
    parser.add_argument('--src-corpus', type=str, nargs='+', required=True)
@@ -268,7 +421,30 @@ def get_parser():
    return parser


def main(args):

def get_parser_mono():
    parser = argparse.ArgumentParser(
        description='Clean mono corpus used in machine translation.')
    parser.add_argument('--corpus', type=str, nargs='+', required=True)
    parser.add_argument('--lang', type=str, required=True)
    parser.add_argument('--save-path', type=str, default=None,
                        help='Path to save the cleaned and tokenized corpus. If not set, '
                             'the default is "corpus.tok.{lang}"')
    parser.add_argument('--tokenizer', type=str, default='moses')
    parser.add_argument('--min-num-words', type=int, default=None)
    parser.add_argument('--max-num-words', type=int, default=None)
    parser.add_argument('--discard-non-latin1', action='store_true',
                        help='Whether to discard the sentence if it cannot be '
                             'encoded into latin1.')
    parser.add_argument('--num-process', type=int, default=os.cpu_count(),
                        help='number of processes')
    parser.add_argument('--overwrite', action='store_true')

    return parser



def main_para(args):
    src_lang, tgt_lang = args.src_lang, args.tgt_lang
    corpus_processor = ParallelCorpusProcessor(src_lang=src_lang,
                                               tgt_lang=tgt_lang,
@@ -303,11 +479,43 @@ def main(args):
            num_process=args.num_process)



def main_mono(args):
    corpus_processor = MonoCorpusProcessor(lang=args.lang,
                                           tokenizer=args.tokenizer,
                                           min_num_words=args.min_num_words,
                                           max_num_words=args.max_num_words,
                                           discard_non_latin1=args.discard_non_latin1)
    print('Clean the mono corpus:')
    print(' {}: {}'.format(args.lang, args.corpus))
    if args.save_path is None:
        save_path = 'corpus.tok.{}'.format(args.lang)
    else:
        save_path = args.save_path
    print('Save to {} -> {} \n'.format(args.lang, save_path))
    if os.path.exists(save_path) and not args.overwrite:
        warnings.warn('{} exists, skip. If you need to overwrite this file, '
                      'rerun the script with --overwrite.'.format(save_path))
    else:
        corpus_processor.process_mono_corpus(
            corpus_paths=args.corpus,
            out_path=save_path,
            num_process=args.num_process)


def cli_main():
    parser = get_parser()
    args = parser.parse_args()
    main(args)

    # Try the parallel-corpus arguments first; if they cannot be parsed
    # (argparse raises SystemExit, which the bare except also catches),
    # fall back to the mono-corpus arguments.
    try:
        parser_para = get_parser_para()
        args_para = parser_para.parse_args()
        main_para(args_para)
    except:
        parser_mono = get_parser_mono()
        args_mono = parser_mono.parse_args()
        main_mono(args_mono)


if __name__ == '__main__':
    cli_main()

