utah_corporations.py
# -*- coding: utf-8 -*-
# opencorporates Utah - USA
import re
import string
import urllib2

import lxml.html
import scraperwiki

########## succ() adapted from: http://www.python-forum.org/pythonforum/viewtopic.php?f=2&t=21333
def succ(word=''):
    '''Takes a string and returns the next value in sequence, e.g. "aab" after "aaa".'''
    parts = [string.ascii_lowercase, string.ascii_uppercase, string.digits]
    last = ['']
    # if none of the characters are in 'A-Z', 'a-z' or '0-9',
    # then fall back to the printable symbols instead
    if not any(ch for ch in word for part in parts if ch in part):
        parts = [string.printable[
            string.printable.index('!'):string.printable.index(' ') + 1]]
    for index, ch in enumerate(word[::-1]):
        for part in parts:
            if ch in part:
                last = part
                ndx = part.index(ch) + 1
                complete = True
                # if the character does not overflow (as 9 + 1 would),
                # the word is complete and can be returned immediately
                if ndx >= len(part):
                    complete = False
                    ndx = 0
                word = word[:(index + 1) * -1] + part[ndx] + word[len(word) - index:]
                if complete:
                    return word
    # every character overflowed: prepend a fresh first character
    return last[0] + word
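# Illustrative behaviour of succ() (these values follow from the code above,
# not from the original script):
#   succ('aaa') -> 'aab'
#   succ('aaz') -> 'aba'
#   succ('zzz') -> 'aaaa'   # this is why the main loop below stops at 'aaaa'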
#################

id_number_regex = re.compile(r".*=(\d.*)")

# first page with all results (%25 is the URL-encoded % wildcard):
# starturl = 'https://secure.utah.gov/bes/action/searchresults?name=%25&type=beginning&pageNo='
starturl = 'https://secure.utah.gov/bes/action/searchresults?name='
# example result pages:
# https://secure.utah.gov/bes/action/searchresults?track=3&name=%25&pageNo=100
# https://secure.utah.gov/bes/action/searchresults?name=%25&pageNo=300

# One-time runtime_info setup -- uncomment for the very first run:
# record = {}
# record['lastPageNo'] = '0'
# record['lastSequence'] = 'aaa'
# scraperwiki.sqlite.save(['lastSequence'], data=record, table_name='runtime_info')
# exit()
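# The runtime_info table acts as a crash checkpoint: a single row records the
# last name prefix ('lastSequence') and last result page ('lastPageNo') that
# were completed, so an interrupted run resumes where it left off instead of
# starting over from 'aaa'.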
def scrape(lastSeq, lastPageNo):
    print '*** Now doing sequence: ' + lastSeq + ' and result page: ' + str(lastPageNo)
    try:
        # note: the original passed '20' as urlopen's second (POST data)
        # argument; a 20-second timeout is almost certainly what was meant
        resp = urllib2.urlopen(starturl + str(lastSeq) + '&type=beginning&pageNo=' + str(lastPageNo), timeout=20)
        html = resp.read()
    except urllib2.HTTPError, error:
        print 'The server could not fulfill the request.'
        print 'Error code: ', error.code
        return
    except urllib2.URLError, error:
        print 'We failed to reach a server.'
        print 'Reason: ', error.reason
        return
    root = lxml.html.fromstring(html)
    for tr in root.cssselect("div.entityRow"):
        record = {}
        record['name'] = tr[0].text_content()
        record['status'] = tr[1][0].text_content()
        record['type'] = tr[1][1].text_content()
        record['city'] = tr[1][2].text_content()
        detail_url = tr[2][0].get('href')
        record['detail_url'] = 'https://secure.utah.gov' + detail_url
        id_number = id_number_regex.findall(detail_url)
        record['id_number'] = id_number[0]
        # 'name' + 'id_number' form the unique key, so re-runs update
        # existing rows instead of duplicating them
        scraperwiki.sqlite.save(['name', 'id_number'], data=record, table_name='us_utah_corporate_entities', verbose=0)
    print 'processed result page for sequence: ', lastSeq, ' and result page ', str(lastPageNo)
    update_statement = 'UPDATE runtime_info SET lastPageNo=' + str(lastPageNo) + ' WHERE lastSequence="' + lastSeq + '"'
    scraperwiki.sqlite.execute(update_statement)
    scraperwiki.sqlite.commit()
    # a ">>" link means there is a further result page; the rooted XPath
    # avoids the expensive // operator
    if root.xpath('/html/body/div/div/form/div/div/a[contains(text(), ">>")]'):
        next_int = int(lastPageNo) + 1
        # time.sleep(2)  # uncomment (and import time) to back off if the server returns 500 errors
        scrape(lastSeq, str(next_int))
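# Design note: scrape() recurses once per result page, so a prefix with many
# thousands of pages could exceed CPython's default recursion limit (~1000).
# An iterative rewrite of the pagination step would look roughly like this
# sketch ('fetch_and_store' and 'has_next_page' are hypothetical helpers
# wrapping the fetch/save body and the ">>" XPath test above):
#
#   pageNo = int(lastPageNo)
#   while True:
#       root = fetch_and_store(lastSeq, pageNo)
#       if not has_next_page(root):
#           break
#       pageNo += 1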
selection_statement = '* from runtime_info'
last_sequence = scraperwiki.sqlite.select(selection_statement)
for last in last_sequence:
    last_seq = last['lastSequence']
    last_PageNo = last['lastPageNo']
print 'Last sequence done: "', last_seq, '" and last result page done ', last_PageNo

while last_seq != 'aaaa':
    # print 'not done yet:', last_seq
    scrape(str(last_seq), int(last_PageNo) + 1)
    # the original used 'SET a=0 AND b=...', which SQLite parses as a boolean
    # expression assigned to lastPageNo; a comma is what was intended
    update_statement = 'UPDATE runtime_info SET lastPageNo=0, lastSequence="' + last_seq + '"'
    scraperwiki.sqlite.execute(update_statement)
    scraperwiki.sqlite.commit()
    last_PageNo = 0  # later sequences start from page 1, not the resumed page
    last_seq = succ(last_seq)

# restart from the beginning once every three-letter sequence is done
update_statement = 'UPDATE runtime_info SET lastPageNo=0, lastSequence="aaa"'
scraperwiki.sqlite.execute(update_statement)
scraperwiki.sqlite.commit()
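# An aside, not part of the original scraper: if the installed scraperwiki
# library supports '?' parameter binding in execute() (the dumptruck-backed
# versions do), the string-built UPDATE above could be written more safely as:
#
#   scraperwiki.sqlite.execute(
#       'UPDATE runtime_info SET lastPageNo=?, lastSequence=?', [0, 'aaa'])
#   scraperwiki.sqlite.commit()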