-
Notifications
You must be signed in to change notification settings - Fork 1
/
verbs.py
executable file
·63 lines (52 loc) · 1.56 KB
/
verbs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python2.7
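'''
Print a histogram of the verb phrases found in a document.

See README.md for setup.

Usage:
    $ ./verbs.py $DOCUMENT
    $ ./verbs.py -          (read the document from stdin)

Works with PDF, UTF-8, and ASCII documents.
Outputs a tab-delimited histogram of verb phrase counts (count, then phrase),
most frequent first.
'''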
from pattern.en import parse
from pattern.en import pprint
from pattern.en import parsetree
import operator
import sys
import re
import magic
import pdftotext
# Command-line driver: load one document, pull out its verb-phrase chunks and
# print a "count<TAB>phrase" histogram, most frequent phrase first.

# Exactly one argument is required: a path, or "-" for stdin.
if len(sys.argv) != 2:
    print("Please specify filename or - for stdin")
    # sys.exit() instead of the bare exit() helper, which is injected by the
    # `site` module and is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(1)

filename = sys.argv[1]
input_text = None  # stays None when the input type is not recognised

if filename == '-':
    # stdin is handed to the parser as read, without ASCII filtering.
    input_text = sys.stdin.read()
else:
    # The libmagic description string decides how the file is read.
    file_type = magic.from_file(filename)
    if 'PDF document' in file_type:
        with open(filename, 'rb') as f:
            # Join the per-page strings, then drop every non-ASCII character.
            input_text = "".join(pdftotext.PDF(f)).encode('ascii', 'ignore').decode('utf-8')
    elif ('UTF-8 Unicode text' in file_type
          or 'UTF-8 text' in file_type
          or 'ASCII text' in file_type):
        # NOTE(review): 'UTF-8 text' is accepted because newer libmagic
        # releases appear to word the description as
        # "Unicode text, UTF-8 text" -- confirm against the deployed version.
        #
        # Read raw bytes and decode explicitly. Calling .encode() on the
        # result of a text-mode read() raises UnicodeDecodeError under
        # Python 2 as soon as the file holds a non-ASCII byte (implicit strict
        # ASCII decode), and depends on the locale encoding under Python 3.
        with open(filename, 'rb') as f:
            raw = f.read()
        # Same normalisation as before: keep only the ASCII characters.
        input_text = raw.decode('utf-8', 'ignore').encode('ascii', 'ignore').decode('ascii')
    else:
        print("Input doesn't appear to be either a PDF or UTF-8 or ASCII text.")

if input_text is None:
    # Unrecognised input is reported and skipped with a success status.
    print("No text read from input, skipping.")
    sys.exit(0)

# Tally every verb-phrase ('VP') chunk, case-insensitively.
hist = {}
for sentence in parsetree(input_text):
    for chunk in sentence.chunks:
        if chunk.type == 'VP':
            phrase = chunk.string.lower()
            hist[phrase] = hist.get(phrase, 0) + 1

# Ascending sort walked backwards gives highest count first, and keeps
# equal-count phrases in the same relative order as sorting then reversing.
ranked = sorted(hist.items(), key=operator.itemgetter(1))
for phrase, count in reversed(ranked):
    print("%u\t%s" % (count, phrase))