forked from mithro/numato-opsis-crowdfunding-campaign
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
116 lines (97 loc) · 3.63 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/python
import os
import time
import bs4
import urllib2
from pprint import pprint
def download_page(url):
retry = 0
while retry < 5:
try:
print "Downloading", url
return bs4.BeautifulSoup(urllib2.urlopen(url).read().decode('utf-8'), "lxml")
break
except urllib2.HTTPError, e:
print "Failed to get", repr(url), "retrying"
retry += 1
except:
print "Failed to get", repr(url)
raise
else:
raise IOError("Failed to get %r", url)
# Scrape the live Crowd Supply campaign page for the Numato Opsis.
url = "https://www.crowdsupply.com/numato-lab/opsis"
page = download_page(url)
project = page.find('section', attrs={'class':'section-project'})
# Collect the whitespace-normalised text of each "fact" widget; the
# unpack below fixes their expected order: time remaining, percent
# funded, pledge count.
facts = [" ".join(fact.text.split()).strip() for fact in project.findAll(attrs={'class': 'fact'})]
left, percent_funded, pledges = facts
pledged = project.find(attrs={'class': 'project-pledged'}).text.strip()
goal = project.find(attrs={'class': 'project-goal'}).text.strip()
#ends = page.find(attrs={'class': 'project-remind-me'}).find('p').text.split(' on ')[-1]
# Convert the scraped strings into numbers, e.g. "$1,234 raised" -> 1234.
# NOTE(review): the split()/slice indices assume the page's current
# text layout (currency symbol first, thousands commas, trailing "%")
# -- verify against the live markup before changing.
data={
    'url': url,
    'time': time.time(),
    'pledged': int(pledged.split()[0][1:].replace(',', '')),
    'goal': int(goal.split()[1][1:].replace(',', '')),
    'percent_funded': int(percent_funded.split()[0][:-1]),
    'pledges': int(pledges.split()[0]),
    'left': left,
#    'ends': ends,
}
import json
# Persist the scraped snapshot.  A context manager guarantees the file
# is flushed and closed (the original used the py2-only file() builtin
# and leaked the handle).
with open('data.json', 'w') as json_file:
    json.dump(data, json_file)
# Pull out the campaign's own progress widget so it can be embedded in
# the badge verbatim via %(project_box)s below.
project_box = page.find('div', attrs={'class': 'project-block'})
print "%(pledges)i pledges - $%(pledged)i of $%(goal)i (%(percent_funded)s%%) - Preorder available!" % data
# Build the stand-alone badge page.  NB: this rebinds `page` from the
# parsed soup to a plain (utf-8 encoded) string, filled from locals().
# NOTE(review): the "[email protected]" img src looks like an
# email-protection/scraping artifact, not a real filename -- confirm
# against the original source.
page = """\
<html>
<head>
<meta charset="UTF-8">
<link rel="stylesheet" type="text/css" href="style.css">
</head>
<body>
<div class='container'>
<div class='crowdsupply-box'>
<div class='image'>
<a href="https://www.crowdsupply.com/numato-lab/opsis" target="_top">
<img src="https://www.crowdsupply.com/img/35c2/hdmi2usb-1-1_jpg_project-tile.jpg">
</a>
</div>
<div class="message">
The <a href="http://hdmi2usb.tv/numato-opsis" target="_top">Numato Opsis</a>, the first open hardware for the <a href="http://hdmi2usb.tv" target="_top">HDMI2USB.tv</a> firmware,<br>
<a href="https://www.crowdsupply.com/numato-lab/opsis" target="_top">
Pre-order now on
<img src="https://www.crowdsupply.com/_teal/images/[email protected]" style="padding: 2px; height: 2em; vertical-align: middle;">
</a>
</div>
%(project_box)s
<div class='end'></div>
</div>
</div>
</body>
""".encode('utf-8') % locals()
# Post-process the embedded Crowd Supply markup: wrap the pledge figure
# in a .project-funds div and close it before the factoids block.
page = page.replace('<p class="project-pledged">','<div class="project-funds"><p class="project-pledged">')
page = page.replace('<div class="factoids">', '</div><div class="factoids">')
# Replace the canned tweet text on the share button with our own.
page = page.replace("text=Check+out+this+Crowd+Supply+project", "text=Support+on+Crowd+Supply+the+@numatolab+Opsis+board,+a+new+open+video+platform!")
# Write the badge page out; a context manager guarantees the handle is
# closed even on error (the original used the py2-only file() builtin
# and never closed it).
with open('badge.html', 'w') as badge_file:
    badge_file.write(page)
import subprocess
create_image = True
try:
subprocess.check_output('wkhtmltoimage --version', stderr=subprocess.STDOUT, shell=True)
subprocess.check_output('convert --version', stderr=subprocess.STDOUT, shell=True)
except subprocess.CalledProcessError:
print "Not generating image badge."
create_image = False
if create_image:
image = page.replace("</body>", '<link rel="stylesheet" type="text/css" href="image.css"></body>')
f = file('image.html', 'w')
f.write(image)
f.close()
del f
subprocess.check_call('wkhtmltoimage image.html badge-temp.png', shell=True)
subprocess.check_call('convert badge-temp.png -trim +repage badge.png', shell=True)
os.unlink('image.html')
os.unlink('badge-temp.png')
# Optionally persist the snapshot when running on the ScraperWiki
# platform, keyed on (url, time); anywhere else the missing module
# makes this a silent no-op.
try:
    import scraperwiki
    scraperwiki.sqlite.save(unique_keys=['url', 'time'], data=data)
except ImportError:
    pass