-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpagerank.py
86 lines (65 loc) · 1.99 KB
/
pagerank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!../bin/python
#
# Script for getting the Google PageRank of a page
# Google Toolbar 3.0.x/4.0.x Pagerank Checksum Algorithm
#
# original from http://pagerank.gamesaga.net/
# this version was adapted from http://www.djangosnippets.org/snippets/221/
# by Corey Goldberg - 2010
#
# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
import urllib
def get_pagerank(url):
    """Return the Google Toolbar PageRank of *url* as a string.

    The URL is hashed with ``hash_url`` and ``check_hash`` and the resulting
    checksum is sent, together with the quoted URL, to Google's toolbar
    query endpoint.  The first nine characters of the stripped response are
    discarded (presumably a ``Rank_x:y:`` style prefix -- confirm against a
    live response) and the remainder is returned.

    NOTE(review): Google is reported to have retired the public Toolbar
    PageRank service, so this lookup may only ever yield ``'N/A'`` or
    ``'0'`` today -- confirm before relying on the result.

    Args:
        url: Address of the page to look up.

    Returns:
        The rank text taken from the response, ``'0'`` when the response
        carries no rank text, or ``'N/A'`` when the request fails.
    """
    # Function-scope imports keep this block usable on both Python 2 and
    # Python 3, where ``quote`` and ``urlopen`` live in different modules.
    from contextlib import closing
    try:
        from urllib.parse import quote
        from urllib.request import urlopen
    except ImportError:  # Python 2
        from urllib import quote, urlopen

    hsh = check_hash(hash_url(url))
    gurl = ('http://www.google.com/search?client=navclient-auto'
            '&features=Rank:&q=info:%s&ch=%s') % (quote(url), hsh)
    try:
        # closing() releases the connection even when read() raises.
        with closing(urlopen(gurl)) as response:
            body = response.read()
        # Python 3 hands back bytes; normalise to text so the slicing and
        # the empty-string comparison below behave as on Python 2.
        if not isinstance(body, str):
            body = body.decode('utf-8', 'replace')
        rank = body.strip()[9:]
    except Exception:
        # Deliberate best-effort: any failure is reported as "not available".
        rank = 'N/A'
    if rank == '':
        rank = '0'
    return rank
def int_str(string, integer, factor):
    """Fold the characters of *string* into a running integer hash.

    For every character the accumulator is multiplied by *factor*,
    truncated to 32 bits, and then increased by the character's ordinal.
    The final addition is not truncated, so the result can exceed 32 bits
    by a small amount.

    Args:
        string: Text to hash.
        integer: Initial accumulator value (seed).
        factor: Multiplier applied before each character is added.

    Returns:
        The accumulated integer; *integer* itself when *string* is empty.
    """
    for char in string:
        integer = ((integer * factor) & 0xFFFFFFFF) + ord(char)
    return integer
def hash_url(string):
    """Return the integer hash of *string* used for the PageRank checksum.

    Two ``int_str`` passes with different seeds and multipliers are
    computed.  The first is bit-folded and then combined with selected
    nibbles of the second.

    Args:
        string: URL text to hash.

    Returns:
        The combined hash as an int.
    """
    primary = int_str(string, 0x1505, 0x21)
    secondary = int_str(string, 0, 0x1003F)

    # Drop the two lowest bits, then run three folding passes.  Each pass
    # keeps the low bits selected by ``keep`` in place and takes the bits
    # selected by ``shifted`` from the value moved right by four.
    primary >>= 2
    folds = ((0x3FFFFC0, 0x3F), (0x3FFC00, 0x3FF), (0x3C000, 0x3FFF))
    for shifted, keep in folds:
        primary = ((primary >> 4) & shifted) | (primary & keep)

    low_half = ((primary & 0x3C0) << 4) | (primary & 0x3C)
    low_half = (low_half << 2) | (secondary & 0xF0F)

    high_half = ((primary & 0xFFFFC000) << 4) | (primary & 0x3C00)
    high_half = (high_half << 0xA) | (secondary & 0xF0F0000)

    return low_half | high_half
def check_hash(hash_int):
    """Return the checksum string for an integer URL hash.

    The decimal digits of *hash_int* are walked from right to left.  Every
    second digit (starting with the second from the right) is doubled and
    its own digits are added together.  The digit total is turned into a
    single check value, which is adjusted further when the number of digits
    is odd.

    Args:
        hash_int: Non-negative integer hash, e.g. the result of
            ``hash_url``.

    Returns:
        The string ``'7'`` followed by the check value and the decimal
        digits of *hash_int*.
    """
    hash_str = '%u' % (hash_int)

    digit_sum = 0
    for position, char in enumerate(reversed(hash_str)):
        digit = int(char)
        if position % 2 == 1:
            digit *= 2
            # Floor division keeps this an int on Python 3 as well; true
            # division would turn the checksum into a float.
            digit = digit // 10 + digit % 10
        digit_sum += digit

    check_byte = digit_sum % 10
    if check_byte != 0:
        check_byte = 10 - check_byte
        # Extra adjustment applied only for an odd number of digits.
        if len(hash_str) % 2 == 1:
            if check_byte % 2 == 1:
                check_byte += 9
            check_byte >>= 1

    return '7' + str(check_byte) + hash_str
if __name__ == '__main__':
    # Parenthesised form prints the same text on Python 2 and is a valid
    # function call on Python 3, where the bare print statement is a
    # syntax error.
    print(get_pagerank('http://www.google.com'))