-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean.py
32 lines (28 loc) · 1002 Bytes
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import argparse
import io
import os
import re
# Command-line interface: one required option, --path, naming the input text file.
parser = argparse.ArgumentParser(
    description="flags for training tokenizer",
)
parser.add_argument(
    "--path",
    type=str,
    required=True,
    help="path to .txt file",
)
# Every ASCII punctuation character except "." and "?", which are kept so that
# sentence boundaries survive cleaning. The backslash is listed explicitly.
_PUNCTUATION_TO_DROP = "-!\"#$%&'()*+,/:;<=>@[\\]^_`{|}~"
_DROP_PUNCTUATION = str.maketrans("", "", _PUNCTUATION_TO_DROP)


def _clean_text(text):
    """Return *text* lowercased, without punctuation, one sentence per line."""
    text = text.lower().translate(_DROP_PUNCTUATION)
    # split() + join collapses every whitespace run (newlines included) into a
    # single space and trims both ends, so no separate strip/replace is needed.
    text = " ".join(text.split())
    # add back new line for tokenizer: break after each period that is
    # followed by a space (the space before the newline is kept).
    return text.replace(". ", ". \n")


def clean(path):
    """Clean the text file at *path* and write the result next to it.

    The text is lowercased, stripped of all ASCII punctuation except periods
    and question marks, reduced to single spaces, and then broken onto a new
    line after every ". ". The result is written to "<path without its final
    extension>-cleaned.txt"; an existing file of that name is overwritten.

    Args:
        path: Path to the input text file, read as UTF-8.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as source:
        cleaned = _clean_text(source.read())

    # splitext removes only the final extension of the last path component, so
    # dots in directory names ("./data", "../x", "corpus.v1/") are preserved.
    out_path = "{}-cleaned.txt".format(os.path.splitext(path)[0])
    with open(out_path, "w", encoding="utf-8") as new_file:
        new_file.write(cleaned)
if __name__ == "__main__":
    # Script entry point: read --path from the command line and clean that file.
    clean(parser.parse_args().path)