forked from Abhisheksinha1506/Collection-of-Useful-Scripts
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path4chan_Image Grabber.py
76 lines (60 loc) · 2.45 KB
/
4chan_Image Grabber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
'''Just a little script I wrote to grab all images from a 4chan thread. Gets the full-res image, handles existing filenames so I could run it later to get the newest images. Works like "python 4chan.py <url> <folder>" so "./4chan.py http://boards.4chan.org/wg/res/4836807 wp-nature" would get me a bunch of hi-res nature wallpapers.'''
import os
import sys
import urllib
import urllib2
import re
import time
# ---------------------------------------------------------------------------
# Script body: download every full-resolution image linked from a thread page.
#
# Usage: python 4chan.py <url> <folder>
#   <url>    address of the thread page to scan for image links
#   <folder> directory the images are saved into (created when missing)
#
# Files already present in <folder> are skipped, so the script can be re-run
# against the same thread to pick up only the images it does not have yet.
# ---------------------------------------------------------------------------

# Both positional arguments are required; bail out with a usage hint otherwise.
if len(sys.argv) < 3:
    print("Missing parameters.")
    print("Usage: python 4chan.py <url> <folder>")
    sys.exit()

threadurl = sys.argv[1]
subfolder = sys.argv[2]

# Raw strings keep the regex escapes (\., \w, \d) out of Python's own
# string-escape processing.
# exp_imgurl matches the host-relative part of a full-size image link;
# exp_picname matches just the "<digits>.<ext>" file name inside such a link.
exp_imgurl = re.compile(r'4chan\.org/\w+/src/\d+\.(?:jpg|gif|png|jpeg)')
exp_picname = re.compile(r'\d+\.(?:jpg|gif|png|jpeg)')

# The request is sent with a browser-like User-agent header.
ua = "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.9.1.4) Gecko/20091007 Firefox/3.5.4"
head = {'User-agent': ua}

print("Thread %s going to folder %s" % (threadurl, subfolder))
print("Fetching html...")
req = urllib2.Request(threadurl, None, head)

# Fetch the thread page, retrying exactly once on failure.  HTTPError is a
# subclass of URLError, so this single handler covers both.  (The previous
# version consulted a retry counter before it was ever assigned, which turned
# every failed request into a NameError instead of a retry.)
try:
    response = urllib2.urlopen(req)
except urllib2.URLError:
    print("Request failed")
    # A second failure propagates and aborts the script.
    response = urllib2.urlopen(req)

# Always release the connection, even if reading the body fails.
try:
    msg = response.read()
finally:
    response.close()

print("Received %d bytes" % len(msg))
imgurls = exp_imgurl.findall(msg)
print("Found %d images" % len(imgurls))

if not os.path.exists(subfolder):
    print("Folder %s does not exist. Creating..." % subfolder)
    os.makedirs(subfolder)
else:
    print("Folder %s exists. I will just put all files in there." % subfolder)

# The same link can occur several times in the page; de-duplicate once and
# sort so the download order and the "i/total" progress are reproducible.
unique_urls = sorted(set(imgurls))
totalnumber = len(unique_urls)

for i, img in enumerate(unique_urls):
    source = "http://images." + img
    # exp_imgurl guarantees the link ends in "<digits>.<ext>", so this matches.
    filename = exp_picname.search(source).group(0)
    destination = os.path.join(subfolder, filename)

    if os.path.isfile(destination):
        print("File %s exists. Skipping..." % filename)
        continue

    print("Downloading %d/%d: %s" % (i + 1, totalnumber, source))
    try:
        urllib.urlretrieve(source, destination)
        # Short pause between downloads; the original author left its purpose
        # unexplained -- presumably to avoid hammering the server.
        time.sleep(0.25)
    except urllib.ContentTooShortError:
        # The transfer was cut short: wait a moment and try once more.
        print("Image download failed, retrying...")
        time.sleep(1)
        try:
            urllib.urlretrieve(source, destination)
        except urllib.ContentTooShortError:
            # Do not leave a truncated file behind -- the "exists" check above
            # would otherwise skip this image on every later run.
            if os.path.isfile(destination):
                os.remove(destination)
            raise
        time.sleep(0.5)  # slightly longer pause after a retry (see note above)

print("Aaaaaaand we are done. See you next time.")