-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_regEx.py
72 lines (42 loc) · 1.35 KB
/
test_regEx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from collections import defaultdict
import re
import nltk
from nltk.tokenize import word_tokenize
# Dictionary-check report: tokenise a text, count every token, and list the
# tokens that do not appear in a reference word list.
#
# Inputs, read from the current working directory:
#   text-test.txt                - the text to analyse
#   Fr-dictionary_Upper_Low.txt  - the reference word list
# NOTE(review): both files are opened with the platform default encoding;
# confirm the intended encoding and pass it explicitly if it is fixed.
with open('text-test.txt') as f:
    freq = word_tokenize(f.read())
with open('Fr-dictionary_Upper_Low.txt') as fr:
    dic = word_tokenize(fr.read())

# Tokens that are never reported as errors. `match` anchors at the start of
# the token, so a token is skipped when it begins with a punctuation mark,
# one or more letter+apostrophe prefixes, a hyphenated word, or digits.
# The typographic-apostrophe branch used to read `w’` (a literal "w"), so it
# could never match a general letter; it now mirrors the `\w'` branch.
pat = re.compile(r"[.,:;?!'%-]|\b(\w'|\w’)+\b|\w+(?:-\w+)+|\d+")
reg = list(filter(pat.match, freq))

# Tokens that start with an ASCII capital letter (e.g. proper nouns): either
# one capital followed by lower-case letters, or a run of capitals.
patt = re.compile(r"^[A-Z][a-z]+\b|^[A-Z]+\b")
c_n = list(filter(patt.match, freq))

# Build the lookup sets once: testing membership against the original lists
# inside the per-token loop made the whole pass quadratic.
skip_tokens = set(reg)
capitalised_tokens = set(c_n)
known_words = set(dic)

Cap_nouns = []
errors = []
d = defaultdict(int)   # occurrences of every token in the text
d2 = defaultdict(int)  # occurrences of tokens missing from the dictionary
d3 = defaultdict(int)  # occurrences of capitalised tokens

for w in freq:
    d[w] += 1
    if w in skip_tokens:
        continue
    if w in capitalised_tokens:
        Cap_nouns.append(w)
    elif w not in known_words:
        errors.append(w)

# Full frequency list, least frequent first (ties keep first-seen order).
for w in sorted(d, key=d.get):
    print('word:', w, d[w])

display_errors = input('Display the list of items not found in Dict?" |Y/N: ')
# Only an exact 'Y' fills d2; any other answer leaves it empty, so the report
# loop below prints nothing.
if display_errors == 'Y':
    for x in errors:
        d2[x] += 1
for x in sorted(d2, key=d2.get):
    print('Error:', x, d2[x])

display_Cap = input('Display list of Capitalised words not found in Dict?" |Y/N: ')
if display_Cap == 'Y':
    for y in Cap_nouns:
        d3[y] += 1
# Capitalised tokens are collected regardless of the dictionary, so the
# dictionary filter is applied here, at print time.
for y in sorted(d3, key=d3.get):
    if y not in known_words:
        print('Capitalised word:', y, d3[y])