crawl.py
# Python 2 crawler: walks pages under a start URL and queues .mp3 links
# whose URLs contain a given keyword.
import re
import urllib2
import urlparse
import urlQueue

# Raw strings keep the regex escapes literal.
# keywordregex captures the content of a <meta name="keywords"> tag;
# linkregex captures the href value of each anchor tag.
keywordregex = re.compile(r'<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
linkregex = re.compile(r'<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')

class Crawl(object):
    def __init__(self, _url, _meta):
        self.url = _url          # start URL; crawling stays under this prefix
        self.meta = _meta        # keyword that matched .mp3 links must contain
        self.tocrawl = set([self.url])  # frontier of URLs still to fetch
        self.crawled = set([])          # URLs already fetched
        #self.downloader = Downloader()
        self.DOWNLOAD_FLAG = False      # reserved for a downloader component
        self.downloadnext = False       # likewise unused in this file
    def start(self):
        while True:
            print 'tocrawl : ', len(self.tocrawl)
            print 'crawled : ', len(self.crawled)
            if len(self.crawled) > 1000:
                # stop after a fixed budget of fetched pages
                break
            try:
                crawling = self.tocrawl.pop()
            except KeyError:
                # frontier exhausted
                break
            url = urlparse.urlparse(crawling)
            try:
                response = urllib2.urlopen(crawling)
            except Exception:
                # skip pages that fail to download
                continue
            msg = response.read()
            # Meta keywords are extracted but not used beyond this point.
            keywordlist = keywordregex.findall(msg)
            if len(keywordlist) != 0:
                keywordlist = keywordlist[0].split(", ")
            links = linkregex.findall(msg)
            self.crawled.add(crawling)
            for link in links:
                # Turn relative, fragment, and schemeless links into absolute URLs.
                if link.startswith('/'):
                    link = 'http://' + url.netloc + link
                elif link.startswith('#'):
                    link = 'http://' + url.netloc + url.path + link
                elif not link.startswith('http'):
                    link = 'http://' + url.netloc + '/' + link
                # Queue .mp3 links whose URL contains the search keyword.
                if link.endswith('.mp3') and self.meta.lower() in link.lower():
                    print link, ' = true'
                    urlQueue.addUrl(link)
                # Only follow unseen links that stay under the start URL.
                if link not in self.crawled and link.startswith(self.url):
                    self.tocrawl.add(link.replace(' ', ''))
        print "that's all!"
if __name__ == '__main__':
    s = Crawl('http://mp3skull.com/', 'gangnam')
    s.start()
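
The urlQueue module imported at the top lives elsewhere in the repository and is not shown here. Below is a minimal sketch of what it might look like, assuming addUrl simply enqueues matched links for a separate downloader to consume; only the addUrl name comes from the call site in crawl.py, everything else (the FIFO, the getUrl helper) is an assumption.

import Queue

# Assumed implementation: a process-wide FIFO of matched .mp3 URLs.
_urls = Queue.Queue()

def addUrl(url):
    # Called by the crawler for every matching .mp3 link (name from crawl.py).
    _urls.put(url)

def getUrl():
    # Hypothetical consumer-side helper: block until a URL is available.
    return _urls.get()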