pykw.py
#!/usr/bin/env python
# encoding: utf-8
import argparse
import csv
import re
import json
import time
import mechanize
import sys
from bs4 import BeautifulSoup, Comment
from topia.termextract import tag
from topia.termextract import extract
csv.register_dialect('custom', delimiter='\t', doublequote=True, escapechar=None,
                     quotechar='"', quoting=csv.QUOTE_MINIMAL, skipinitialspace=False)
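# Input rows are read with the tab-delimited dialect above, and only the
# first cell of each row is fetched (see getContent), so a plain file with
# one URL per line works, e.g.:
#   http://example.com/page-one
#   http://example.com/page-two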
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', action='store', dest='file', help='Path to the input file containing urls')
parser.add_argument('-c', '--content', action='store', dest='content', help='Selector for the main content area')
parser.add_argument('-o', '--output', action='store', dest='output', help='Name of the output file')
parser.add_argument('-l', '--length', action='store', dest='length', help='Minimum string length of the tags to return')
opts = parser.parse_args()
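# Example invocation (the file names and CSS selector are illustrative):
#   python pykw.py -i urls.txt -c '#main' -o keywords.csv -l 4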

def getFormat():
    # Split the output name into [basename, extension], defaulting to csv
    format = opts.output.rsplit('.', 1)
    if len(format) < 2:
        format.append('csv')
    elif not (format[1] == 'json' or format[1] == 'csv'):
        print "Please enter a valid file type for the format parameter. Acceptable values are 'json' or 'csv'"
        sys.exit()
    return format
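# e.g. -o keywords.json yields ['keywords', 'json'], while a bare
# -o keywords falls back to ['keywords', 'csv']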

def getContent():
    with open(opts.file) as urls:
        data = csv.reader(urls, dialect='custom')
        br = mechanize.Browser()
        br.set_handle_robots(False)
        scrape = []
        print "=== Sit tight - Fetching content"
        for url in data:
            print " - Fetching " + url[0]
            response = br.open(url[0])
            soup = BeautifulSoup(response.read())
            # Remove inline scripts, styles and comments completely
            for script in soup("script"):
                script.extract()
            for style in soup("style"):
                style.extract()
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()
            if opts.content:
                content = soup.select(opts.content)
            else:
                raise Exception('A required argument is missing. The content area must be specified.')
            # Start with the page title, then fold each matched element down to plain text
            text = soup.title.string
            for s in content:
                s = str(s).decode('ascii', 'ignore')
                s = ''.join(BeautifulSoup(s).findAll(text=True))
                pat = re.compile(r'\s+')
                s = pat.sub(' ', s).strip()
                s = re.sub(r'<[^>]*?>', '', s)  # strip remaining tag contents
                text = text + ': ' + s
            text = text.strip()
            scrape.append(text)
            time.sleep(2)  # throttle requests to be courteous
        return scrape

def analyzeKeywords():
    content = getContent()
    tagger = tag.Tagger()
    tagger.initialize()
    extractor = extract.TermExtractor(tagger)
    allterms = []
    print "=== Analyzing keywords"
    for s in content:
        try:
            # Sort the extracted term tuples by their third element (strength)
            terms = sorted(extractor(s), key=lambda t: t[2])
            allterms.extend(terms)
        except Exception:
            continue
    # Keep only terms that meet the optional minimum length
    termlist = []
    for term in allterms:
        if opts.length:
            if len(term[0]) >= int(opts.length):
                termlist.append(term[0].lower())
        else:
            termlist.append(term[0].lower())
    # Tally occurrences of each unique term
    keywords = {}
    for i in set(termlist):
        keywords[i] = termlist.count(i)
    return keywords

def generateExport():
    keywords = analyzeKeywords()
    format = getFormat()
    if format[1] == 'json':
        f = open(format[0] + '.' + format[1], 'w')
        f.write(json.dumps(keywords, sort_keys=True, indent=4))
        f.close()
    elif format[1] == 'csv':
        csvfile = open(format[0] + '.' + format[1], 'wb')
        c = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        c.writerow(['Keyword', 'Count'])
        for term in keywords.items():
            try:
                c.writerow(term)
            except Exception:
                continue
        csvfile.close()

if __name__ == '__main__':
    generateExport()