import os
import random
import re
import sys

DAMPING = 0.85
SAMPLES = 10000

def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print("PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")

def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a set of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages

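# Illustration (hypothetical corpus, not part of this project's files): for a
# directory containing 1.html, 2.html, and 3.html, where 1.html links to
# 2.html, 2.html links to 1.html and 3.html, and 3.html has no links at all,
# crawl would return something like:
#
#     {"1.html": {"2.html"}, "2.html": {"1.html", "3.html"}, "3.html": set()}
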
def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.
    With probability `damping_factor`, choose at random one of the links
    on `page`. With probability `1 - damping_factor`, choose a page at
    random from all pages in the corpus.
    """
    # Initialize the probability distribution to an empty dictionary
    prob_dist = {}
    # Total number of pages in the corpus
    total_pages_count = len(corpus)
    # Number of pages linked to by the current page
    linked_pages_count = len(corpus[page])

    # If the current page has no links, distribute equally over all pages
    if linked_pages_count == 0:
        random_page_prob = 1 / total_pages_count
        linked_page_prob = 0
    else:
        random_page_prob = (1 - damping_factor) / total_pages_count
        linked_page_prob = damping_factor / linked_pages_count

    # Iterate through each page in the corpus and calculate its probability
    for pg in corpus:
        # Default to the random-jump probability
        prob = random_page_prob
        # If pg is among the links on the current page,
        # add the linked-page probability as well
        if pg in corpus[page]:
            prob = prob + linked_page_prob
        prob_dist[pg] = prob

    return prob_dist

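# Worked example (using the hypothetical corpus above): for page "2.html"
# and damping_factor = 0.85, total_pages_count is 3 and linked_pages_count
# is 2, so random_page_prob = 0.15 / 3 = 0.05 and
# linked_page_prob = 0.85 / 2 = 0.425. The returned distribution would be
#
#     {"1.html": 0.475, "2.html": 0.05, "3.html": 0.475}
#
# which sums to 1, as required.
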
def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    # Initialize the sample rank to an empty dictionary;
    # this will hold the visit count (and later the PageRank) for each page
    sample_rank = {}
    # Get the list of all pages in the corpus
    page_list = list(corpus)
    # Select the first page uniformly at random
    selected_page = random.choice(page_list)

    # Draw the requested number of samples
    for _ in range(n):
        # Get the probability distribution of the selected page
        prob_dist = transition_model(corpus, selected_page, damping_factor)
        # Pick a random number between 0 and 1
        rand_num = random.random()
        # Running upper bound of the cumulative distribution
        range_limit = 0
        # Pick the next page based on the distribution by partitioning
        # [0, 1] into consecutive ranges whose widths are the pages'
        # probabilities, then selecting the page whose range contains
        # rand_num. For example, for the distribution <0.5, 0.2, 0.3>
        # the ranges are:
        #     rand_num <= 0.5
        #     rand_num <= 0.7  (0.5 + 0.2)
        #     rand_num <= 1.0  (0.5 + 0.2 + 0.3)
        for pg in page_list:
            # Extend the range limit by this page's probability
            range_limit = range_limit + prob_dist[pg]
            # If rand_num falls within this page's range, select it
            if rand_num <= range_limit:
                selected_page = pg
                # Increment the visit count of the selected page
                if selected_page in sample_rank:
                    sample_rank[selected_page] = sample_rank[selected_page] + 1
                else:
                    sample_rank[selected_page] = 1
                break

    # Normalize visit counts so the PageRank values sum to 1
    for page in sample_rank:
        sample_rank[page] = sample_rank[page] / n

    return sample_rank

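# Note: the cumulative-range selection above is a hand-rolled weighted draw.
# A sketch of an equivalent step using the standard library's random.choices
# (available since Python 3.6); the visit counting would then follow the draw:
#
#     selected_page = random.choices(
#         page_list,
#         weights=[prob_dist[pg] for pg in page_list],
#         k=1,
#     )[0]
#     sample_rank[selected_page] = sample_rank.get(selected_page, 0) + 1
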
def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    # Convergence threshold: iterate until no rank changes by more than this
    accuracy = 0.001
    # Initialize the iterate rank to an empty dictionary;
    # this will hold the PageRank for each page in the corpus
    iterate_rank = {}
    # Get the list of all pages in the corpus
    page_list = list(corpus)
    page_count = len(page_list)

    # Assign an equal initial probability to every page in the corpus
    prob = 1 / page_count
    for page in page_list:
        iterate_rank[page] = prob

    # Compute the random-jump probability
    random_prob = (1 - damping_factor) / page_count

    continue_iteration = True
    while continue_iteration:
        new_iterate_rank = {}
        # For each page, recompute its rank from its parents' current ranks
        for page in page_list:
            # Get all the parent pages that contain a link to this page
            parent_pages = get_parent_pages(corpus, page)
            linked_prob = 0
            # Iterate through each parent page and sum up its contribution
            for parent in parent_pages:
                # Number of links on the parent page
                no_of_links = len(corpus[parent])
                # If the parent has no links, treat it as linking to all
                # pages in the corpus, including itself
                if no_of_links == 0:
                    no_of_links = page_count
                # Add this parent's share of rank to the linked probability
                linked_prob = linked_prob + (iterate_rank[parent] / no_of_links)
            # Compute the new PageRank for this page
            new_iterate_rank[page] = random_prob + (damping_factor * linked_prob)

        # Check whether the deviation between the previous and new page
        # ranks is within the stated accuracy; if every page's deviation
        # is within it, the PageRank values have converged and iteration stops
        should_continue = False
        for page in page_list:
            if abs(iterate_rank[page] - new_iterate_rank[page]) > accuracy:
                should_continue = True
                break
        continue_iteration = should_continue

        # Carry the new ranks into the next iteration
        iterate_rank = new_iterate_rank.copy()

    return iterate_rank

def get_parent_pages(corpus, page):
    """
    Return a list of parent pages that contain a link to the given page.
    A page with no links is treated as linking to every page in the
    corpus, so it counts as a parent of every page.
    """
    parent_pages = []
    for pg, linked_pages in corpus.items():
        if page in linked_pages or len(linked_pages) == 0:
            parent_pages.append(pg)
    return parent_pages

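# Example (using the hypothetical corpus from the crawl illustration): since
# 3.html has no links, it is treated as a parent of every page, so
# get_parent_pages(corpus, "1.html") would return ["2.html", "3.html"].
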
if __name__ == "__main__":
    main()