curacao_companies.py
# OpenCorporates: Curacao Chamber of Commerce company register scraper
import scraperwiki
import urllib
import urllib2
import string
import lxml.html
import time
import re
# One-off setup (run once before the first scrape) to seed the
# runtime_info table with the starting search string:
#scraperwiki.sqlite.save(unique_keys=["search_blob"], data={"search_blob": 'aa'}, table_name="runtime_info")
#exit()
url = 'http://www.curacao-chamber.an/info/registry/companyselect.asp'
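# The registry's results page contains this phrase when a search returns no matches.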
regex = re.compile("There is no company")
def succ(word=''):
    ''' Takes a string and returns the next successive value. '''
    parts = [string.ascii_lowercase, string.ascii_uppercase, string.digits, string.punctuation]
    last = ['']
    # if none of the characters are in 'a-z', 'A-Z', '0-9', or
    # punctuation, fall back to the printable-symbol range
    if not any(ch for ch in word for part in parts if ch in part):
        parts = [string.printable[
            string.printable.index('!'):string.printable.index(' ')+1]]
    for index, ch in enumerate(word[::-1]):
        for part in parts:
            if ch in part:
                last = part
                ndx = part.index(ch)+1
                complete = True
                # incrementing the last character of its range (e.g. '9')
                # overflows and must carry into the next position;
                # otherwise the increment is final and we return
                if ndx >= len(part):
                    complete = False
                    ndx = 0
                word = word[:(index+1)*-1]+part[ndx]+word[len(word)-index:]
                if complete:
                    return word
    # every position overflowed, so grow the string by one character
    return last[0] + word
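# Worked examples (traced by hand): succ('aa') == 'ab',
# succ('az') == 'ba', succ('zz') == 'aaa'; so the two-letter sweep
# below visits every pair from 'aa' to 'zz' before hitting 'aaa'.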
def process(html):
    ''' Parse one results page and save each company record. '''
    no_results = regex.findall(html)
    if not no_results:
        root = lxml.html.fromstring(html)
        # result rows alternate between the "row" and "altrow"
        # classes; skip the first (header) row
        for tr in root.xpath('//tr[@class="row"] | //tr[@class="altrow"]')[1:]:
            record = {}
            record['url'] = 'http://www.curacao-chamber.an/info/registry/' + tr[0][0].get('href')
            record['tradename'] = tr[0][0].text
            record['registration_number'] = tr[1].text
            record['official_name'] = tr[2].text
            record['address'] = tr[3].text
            record['date_time'] = time.strftime("%a %b %e %T %z %Y", time.gmtime())
            scraperwiki.sqlite.save(['registration_number'], data=record, table_name='curacao-companies')
    else:
        print 'No results. Onwards!'
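# The row handling above implies result markup roughly like the
# following (inferred from the XPath and the saved fields, not from a
# captured page):
#   <tr class="row">
#     <td><a href="...">Trade name</a></td>
#     <td>Registration number</td>
#     <td>Official name</td>
#     <td>Address</td>
#   </tr>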
def do_punctuation():
    ''' Run the punctuation and digit searches, then reset the cursor. '''
    punctuation = ['"', "'", '.', '#', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for p in punctuation:
        get(p)
    # reset the saved cursor so the next run restarts the sweep at 'aa'
    scraperwiki.sqlite.execute("drop table if exists runtime_info")
    scraperwiki.sqlite.save(unique_keys=["search_blob"], data={"search_blob": 'aa'}, table_name="runtime_info")
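# Note the ordering: this function runs only after the alphabetic sweep
# reaches 'aaa', and dropping and re-seeding runtime_info above restarts
# the whole cycle from 'aa' on the following run.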
def get(search_string):
    ''' POST a company-name search to the registry and process the result page. '''
    values = {'name': '%' + str(search_string) + '%%',
              'companyid': '',
              'source': '0',
              'languageabbrev': 'ENG',
              'searchbut': 'Search'}
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req)
    html = response.read()
    process(html)
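# Usage: get('aa') requests every company whose name contains "aa"; the
# surrounding '%' characters appear to act as SQL-style wildcards in the
# registry's search form (an inference from the query shape, not
# documented behaviour).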
# Resume from the search string saved at the end of the previous run.
selection_statement = '* from runtime_info'
rows = scraperwiki.sqlite.select(selection_statement)
search_string = rows[0]['search_blob']
#search_string = 'zx'
if search_string == 'aaa':
    print "Let's do punctuation and numbers"
    do_punctuation()
else:
    while search_string != 'aaa':
        print 'Processing string: ', search_string
        get(search_string)
        search_string = succ(search_string)
        # save the cursor so an interrupted run can resume where it left off
        update_statement = 'update runtime_info SET search_blob=' + '"' + search_string + '"'
        scraperwiki.sqlite.execute(update_statement)
        scraperwiki.sqlite.commit()
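# Hardening sketch: the UPDATE above is built by string concatenation,
# which is safe here only because succ() yields plain letters. If this
# build of the scraperwiki library accepts bound parameters (an
# assumption, not verified against its API), something like
#   scraperwiki.sqlite.execute('update runtime_info set search_blob = ?', [search_string])
# would be more robust against quoting problems.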