web_scraper.py
# This is a simple script that finds the 20 most frequent words
# in an English Wikipedia article, along with each word's
# frequency percentage, and saves that list to a CSV file.
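# Usage (derived from the argument handling below):
#   python web_scraper.py <search term> [any second argument enables stop-word filtering]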
# csv reads and writes CSV files
import csv
# Beautiful Soup is a Python library
# for pulling data out of HTML and XML files
from bs4 import BeautifulSoup
# Requests is one of the most downloaded
# Python packages of all time:
# an HTTP library for pulling, pushing, and authenticating
import requests
# re provides regular expression operations
import re
# The operator module exports a set of efficient functions
# corresponding to the intrinsic operators in Python
# (comparison, addition, greater than, less than, ...)
import operator
# json parses and formats JSON
import json
# The tabulate module provides just one function, tabulate,
# which takes a list of lists or another
# tabular data type as the first argument
# and outputs a nicely formatted plain-text table
from tabulate import tabulate
# sys exposes system calls and command-line arguments
import sys
# stop_words provides lists of common stop words
from stop_words import get_stop_words
# get the words from a page
def getWordList(url):
    word_list = []
    # raw data
    source_code = requests.get(url)
    # convert to text
    plain_text = source_code.text
    # parse with the lxml parser
    soup = BeautifulSoup(plain_text, 'lxml')
    # find the words in paragraph tags
    for text in soup.findAll('p'):
        if text.text is None:
            continue
        # content
        content = text.text
        # lowercase and split into a list
        words = content.lower().split()
        # for each word
        for word in words:
            # remove non-letter characters
            cleaned_word = clean_word(word)
            # if there is still something there
            if len(cleaned_word) > 0:
                # add it to our list
                word_list.append(cleaned_word)
    return word_list
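# A rough sketch of the expected result (actual words depend on the live page):
#   getWordList("https://en.wikipedia.org/wiki/Python") -> ['python', 'is', 'a', ...]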
# clean a word with a regex
def clean_word(word):
    cleaned_word = re.sub('[^A-Za-z]+', '', word)
    return cleaned_word
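# For example: clean_word("don't!") returns "dont", and clean_word("123") returns "".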
def createFrequencyTable(word_list):
    # word count
    word_count = {}
    for word in word_list:
        # index is the word
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    return word_count
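# For example: createFrequencyTable(["a", "b", "a"]) returns {"a": 2, "b": 1}.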
# remove stop words
def remove_stop_words(frequency_list):
    stop_words = get_stop_words('en')
    temp_list = []
    for key, value in frequency_list:
        if key not in stop_words:
            temp_list.append([key, value])
    return temp_list
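# A sketch of the expected behaviour (assuming "the" is in the library's English
# stop-word list): remove_stop_words([("the", 10), ("python", 3)]) returns [["python", 3]].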
# access the Wikipedia API in JSON format and query it for data;
# the search endpoint returns a list of matching pages
wikipedia_api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
wikipedia_link = "https://en.wikipedia.org/wiki/"
# if no search term was given, exit with an error
if len(sys.argv) < 2:
    print("Enter a valid search term")
    sys.exit()
# get the search term
string_query = sys.argv[1]
# whether to remove stop words or not
if len(sys.argv) > 2:
    search_mode = True
else:
    search_mode = False
# create our URL by concatenating the API link and the query
url = wikipedia_api_link + string_query
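# For example, searching for "python" produces:
# https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch=python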
# a try-except block is a simple way to deal with exceptions,
# which is useful for HTTP requests
try:
    # use requests to retrieve raw data from the wiki API URL we just constructed
    response = requests.get(url)
    # parse that data as a JSON dictionary
    data = json.loads(response.content.decode("utf-8"))
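    # For a typical query the decoded dictionary is shaped roughly like
    # {"query": {"search": [{"title": "...", ...}, ...]}}, hence the lookups below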
    # page title of the first search result;
    # this is the page a web browser would show
    wikipedia_page_tag = data['query']['search'][0]['title']
    # get the actual wiki page based on the retrieved title
    url = wikipedia_link + wikipedia_page_tag
    # get the list of words from that page
    page_word_list = getWordList(url)
    # create a dictionary of word counts
    page_word_count = createFrequencyTable(page_word_list)
    # sort the table by frequency count
    sorted_word_frequency_list = sorted(page_word_count.items(), key=operator.itemgetter(1), reverse=True)
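    # sorted_word_frequency_list is now a list of (word, count) tuples,
    # e.g. roughly [("the", 120), ("of", 80), ...] before stop-word removal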
    # remove stop words if the user asked for it
    if search_mode:
        sorted_word_frequency_list = remove_stop_words(sorted_word_frequency_list)
    # sum the total words to calculate frequencies
    total_words_sum = 0
    for key, value in sorted_word_frequency_list:
        total_words_sum = total_words_sum + value
    # just keep the top 20 words
    if len(sorted_word_frequency_list) > 20:
        sorted_word_frequency_list = sorted_word_frequency_list[:20]
    # create our final list, which contains word, frequency (word count), and percentage
    final_list = []
    for key, value in sorted_word_frequency_list:
        percentage_value = float(value * 100) / total_words_sum
        final_list.append([key, value, round(percentage_value, 4)])
    # headers for the table
    print_headers = ['Word', 'Frequency', 'Frequency Percentage']
    # print the table with tabulate
    print(tabulate(final_list, headers=print_headers, tablefmt='orgtbl'))
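    # The 'orgtbl' format prints something roughly like:
    # | Word   |   Frequency |   Frequency Percentage |
    # |--------+-------------+------------------------|
    # | python |         150 |                 5.1234 |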
    # save the results to a CSV file
    with open("save.csv", "a") as file:
        writer = csv.writer(file)
        # write the column names
        writer.writerow(['Word', 'Frequency', 'Frequency Percentage'])
        # write the rows from final_list
        for row in final_list:
            writer.writerow(row)
# catch a timeout in case the server doesn't respond
except requests.exceptions.Timeout:
    print("The server didn't respond. Please try again later.")