crawlMain.py
# -*- coding: utf-8 -*-
import re
import time
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0',
}

# Visit counters keyed by URL; both dicts double as "already seen" sets.
totalUrls = dict()
totalImageUrls = dict()

def getHtmlResponse(url):
    """Fetch url and return the Response, or None on any request failure."""
    try:
        r = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.ConnectionError:
        print('connection error while requesting url {0}.'.format(url))
        return None
    except Exception:
        print('exception while requesting url {0}.'.format(url))
        return None
    return r
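
# A minimal usage sketch, not part of the original script: the None return is
# what a caller would key retries on (three attempts here is an arbitrary
# illustration):
#
#   for attempt in range(3):
#       resp = getHtmlResponse('http://example.com/')
#       if resp is not None:
#           break
#       time.sleep(2)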

def writeContentToFile(filename, data, mode='w'):
    """Write data to filename; mode defaults to text write."""
    with open(filename, mode) as f:
        f.write(data)
    return True

def downImageFromUrls(urls, path):
    """Download every image URL in urls into path; non-image URLs are skipped."""
    while urls:
        url = urls.pop()
        name = url.split('/')[-1]
        # Only download files whose name ends with a known image extension.
        if re.match(r'.*\.(jpg|gif|png)$', name):
            time.sleep(0.1)  # be polite between downloads
            osfile = path + name
            try:
                # verify=False skips TLS certificate checks, as in the original.
                rel = requests.get(url, stream=True, verify=False)
                if rel.status_code == 200:
                    print('down image is {0}.'.format(url))
                    with open(osfile, 'wb') as f:
                        for chunk in rel.iter_content(1024):
                            f.write(chunk)
            except requests.exceptions.RequestException:
                print('requests exception when downloading image.', url)
                continue
            except Exception:
                print('ordinary exception', url)
                continue
            # print('the current url\'s status is', url, rel.status_code)
        else:
            # print('The url is filtered.', url)
            continue
    return True
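
# Filtering by filename extension misses images served without one. A hedged
# alternative sketch (not in the original) is to inspect the Content-Type
# response header instead:
#
#   rel = requests.get(url, stream=True)
#   if rel.headers.get('Content-Type', '').startswith('image/'):
#       ...write the chunks as above...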

def makeUrlFromHref(valueurl, currenturl, defaulturl=""):
    """
    Turn an <a href> value into an absolute URL, e.g.
    <a href="../../../"> or <a href="thread0806.php?fid=7">.
    """
    if re.match(r'https?://.+', valueurl):
        return valueurl
    # Relative href: resolve it against the page it came from,
    # unless an explicit base (defaulturl) is supplied.
    return urljoin(defaulturl or currenturl, valueurl)
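
# Illustrative resolutions (doctest-style, with hypothetical page URLs):
#
#   >>> makeUrlFromHref('http://example.com/a.jpg', 'http://example.com/index.html')
#   'http://example.com/a.jpg'
#   >>> makeUrlFromHref('thread0806.php?fid=7', 'http://example.com/bbs/index.html')
#   'http://example.com/bbs/thread0806.php?fid=7'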

def getHrefFromHtml(urls, currenturl, urlcontent):
    """Collect unseen links from urlcontent into urls, counting repeats in totalUrls."""
    global totalUrls
    soup_html = BeautifulSoup(urlcontent, 'lxml')
    for url_link in soup_html.find_all('a'):
        value = url_link.get('href')
        if value is None:
            continue
        new_url = makeUrlFromHref(value, currenturl)
        # Skip URLs that still contain a literal ".." path segment.
        if re.match(r'https?://.+\.\..*', new_url):
            continue
        m_count = totalUrls.get(new_url)
        if m_count:
            totalUrls[new_url] = m_count + 1
        else:
            urls.append(new_url)
            totalUrls[new_url] = 1
    return True
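
# The crawl as written follows every absolute link it finds. A common
# refinement, sketched here with urllib.parse and a hypothetical allowed_host
# value, is to stay on a single host:
#
#   from urllib.parse import urlparse
#   allowed_host = 'v.comicbus.com'
#   if urlparse(new_url).netloc != allowed_host:
#       continue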

def getImageSrcFromHtml(urls, text):
    """Collect absolute <img src> URLs from text into urls, deduplicated via totalImageUrls."""
    global totalImageUrls
    soup_html = BeautifulSoup(text, 'lxml')
    for img_src in soup_html.find_all('img'):
        value = img_src.get('src')
        if value is None:
            continue
        if re.match(r'https?://.+', value) and not totalImageUrls.get(value):
            print('get Image url, {}'.format(value))
            urls.append(value)
            totalImageUrls[value] = 1
    return True
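
# Many pages lazy-load images and put the real URL in a data-src attribute
# instead of src. A hedged extension sketch (the attribute name varies by
# site) would fall back to it:
#
#   value = img_src.get('src') or img_src.get('data-src')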

if __name__ == '__main__':
    print("hello webCrawl.")
    url = 'http://v.comicbus.com/online/comic-103.html?ch=1'
    img_path = '/home/beyondkoma/work/gitProject/webCrawl/images/'
    # createImageFromUrl('http://i4.tietuku.com/408da328c806fa52.jpg', img_path)
    init_urls = deque()
    init_urls.append(url)
    url_num = 1
    while len(init_urls) > 0:
        # Pop the next URL first, then fetch it; the response must be checked
        # for None before its encoding is set, otherwise a failed request
        # crashes here.
        url = init_urls.popleft()
        r = getHtmlResponse(url)
        if r is not None:
            r.encoding = 'big5'  # the target site serves Big5-encoded pages
            writeContentToFile(img_path + 'study.html', r.text)
            print('current url is {0}, the count is {1}.'.format(url, url_num))
            url_num += 1
            # getHrefFromHtml(init_urls, url, r.text)
            img_urls = []
            getImageSrcFromHtml(img_urls, r.text)
            downImageFromUrls(img_urls, img_path)
        else:
            time.sleep(2)
            continue
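
# The main loop above only processes the seed page; getHrefFromHtml is left
# commented out. A minimal sketch of a bounded breadth-first crawl, assuming
# the same seed URL and a page limit chosen purely for illustration:
#
#   max_pages = 50
#   while init_urls and url_num <= max_pages:
#       url = init_urls.popleft()
#       r = getHtmlResponse(url)
#       if r is None:
#           time.sleep(2)
#           continue
#       getHrefFromHtml(init_urls, url, r.text)  # enqueue newly found links
#       img_urls = []
#       getImageSrcFromHtml(img_urls, r.text)
#       downImageFromUrls(img_urls, img_path)
#       url_num += 1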