-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_percentage.py
68 lines (52 loc) · 2.16 KB
/
text_percentage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize,wordpunct_tokenize
import re
import os
import sys
from pathlib import Path
# Matches a token that begins with one capital followed by lowercase letters,
# or with a run of capitals, ending at a word boundary.
patt_ep = re.compile(r"^[A-Z][a-z]+\b|^[A-Z]+\b")


def read_tokens(path):
    """Read the UTF-8 text file at *path* and tokenize it.

    Args:
        path: Location of the file to read.

    Returns:
        The list produced by ``wordpunct_tokenize`` on the file contents.

    Raises:
        FileNotFoundError: If *path* does not exist.
        IsADirectoryError: If *path* is a directory.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return wordpunct_tokenize(handle.read())


def classify_tokens(tokens, dictionary, punctuation):
    """Sort *tokens* into recognised text, capitalised words and errors.

    Each token is tested in this order: present in *dictionary*, present in
    *punctuation*, matching ``patt_ep`` (capitalised), otherwise an error.

    Args:
        tokens: Tokens of the text under inspection.
        dictionary: Iterable of known word tokens.
        punctuation: Iterable of known punctuation tokens.

    Returns:
        A tuple ``(text, errors, cap_nouns, named_entities)`` of lists, each
        keeping the tokens in their original order (duplicates included).
    """
    # Build the lookup sets once so each membership test is O(1).
    known_words = set(dictionary)
    known_punctuation = set(punctuation)

    text = []
    errors = []
    cap_nouns = []
    for token in tokens:
        if token in known_words or token in known_punctuation:
            text.append(token)
        elif patt_ep.match(token):
            cap_nouns.append(token)
        else:
            errors.append(token)

    # NOTE(review): a token only reaches cap_nouns after failing the
    # dictionary test above, so this filter currently keeps every entry.
    # Kept to preserve the original output; confirm the intended rule.
    named_entities = [token for token in cap_nouns if token not in known_words]
    return text, errors, cap_nouns, named_entities


def percentage(part, whole):
    """Return ``100 * part / whole`` as a float, or ``0.0`` if *whole* is 0."""
    if not whole:
        return 0.0
    return 100 * float(part) / float(whole)


if __name__ == "__main__":
    # Keep prompting until all three files can be opened and tokenized.
    while True:
        try:
            file3 = Path(input('\nPlease, enter your file path: '))
            dic_to_open_ep = Path(input('\nPlease, enter your dictionary path: '))
            punc_path_ep = Path(input('\nPlease, enter your punctuation dictionary path: '))
            dic_ep = read_tokens(dic_to_open_ep)
            # Bind the punctuation tokens with "=": the previous annotation-only
            # statement left this name pointing at the Path object.
            punc_dic_ep = read_tokens(punc_path_ep)
            file_e_p = read_tokens(file3)
            break
        except FileNotFoundError:
            print("\nFile not found. Better try again")
        except IsADirectoryError:
            print("\nIncorrect Directory path.Try again")

    text_ep, errors_ep, Cap_nouns_ep, named_entities_ep = classify_tokens(
        file_e_p, dic_ep, punc_dic_ep
    )

    print('\n\nYourfile contains a total of', len(text_ep), 'tokens')
    print('\n\nYour file contains a total of', len(errors_ep), 'tokens not found in dictionary')
    print('\n\nYour file contains a total of', len(Cap_nouns_ep), 'capitalised words')
    print('\n\nYour file contains a total of', len(named_entities_ep), 'proper nouns')

    # NOTE(review): the denominator is the count of recognised tokens
    # (dictionary + punctuation hits), not of all tokens - confirm intended.
    recognised_total = len(text_ep)
    print('\n\nThe error percentage of this file is: ', percentage(len(errors_ep), recognised_total))
    print('\n\nThe percentage of capitalised words is: ', percentage(len(Cap_nouns_ep), recognised_total))
    print('\n\nThe percentage of named entities is: ', percentage(len(named_entities_ep), recognised_total))
    print('\n\n', errors_ep)
    print('\n\n', Cap_nouns_ep)
    print('\n\n', named_entities_ep)