corporations-usa-georgia.py
# -*- coding: utf-8 -*-
# OpenCorporates scraper: Georgia, USA
import re
import urllib2

import lxml.html
import scraperwiki
# First page:
#   http://corp.sos.state.ga.us/corp/soskb/searchresults.asp?Words=Starting&searchstr=*&FormName=CorpNameSearch
# Second page:
#   http://corp.sos.state.ga.us/corp/soskb/searchresults.asp?Words=Starting&searchstr=*&TopRec=0&FormName=CorpNameSearch&Pos=Next+50+%3E%3E
# Third page:
#   http://corp.sos.state.ga.us/corp/soskb/searchresults.asp?Words=Starting&searchstr=*&TopRec=50&FormName=CorpNameSearch&Pos=Next+50+%3E%3E
# TopRec increases by 50 for each subsequent page.
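# A minimal sketch of the pagination scheme above, assuming TopRec simply
# advances by 50 per page. The helper name build_search_url is illustrative
# and not part of the original scraper; it is not called below.
import urllib

def build_search_url(top_rec):
    params = urllib.urlencode([
        ('Words', 'Starting'),
        ('searchstr', '*'),
        ('TopRec', top_rec),
        ('FormName', 'CorpNameSearch'),
        ('Pos', 'Next 50 >>'),  # urlencode yields Pos=Next+50+%3E%3E
    ])
    return 'http://corp.sos.state.ga.us/corp/soskb/searchresults.asp?' + params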
# One-time setup for the runtime_info resume table (run once, then leave
# commented out):
# record = {'lastTopRec': '0'}
# scraperwiki.sqlite.save(['lastTopRec'], data=record, table_name='runtime_info')
# exit()
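# A hedged alternative to the manual one-off above: initialize runtime_info
# only when the table does not exist yet. This assumes scraperwiki.sqlite.select
# raises when the table is missing; the function name init_runtime_info and
# the broad except are illustrative. It is not called below; the scraper
# still relies on the manual setup.
def init_runtime_info():
    try:
        scraperwiki.sqlite.select('* from runtime_info')
    except Exception:  # assumed: select raises if runtime_info is absent
        scraperwiki.sqlite.save(['lastTopRec'], data={'lastTopRec': '0'},
                                table_name='runtime_info')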
regex = re.compile("Next 50")  # the "Next 50" link only appears when another page exists

def scrape(last_int):
    if str(last_int) == '0':
        print 'get first page'
    # Note: the live URL uses searchstr=% rather than the searchstr=* shown
    # in the example URLs above.
    try:
        resp = urllib2.urlopen('http://corp.sos.state.ga.us/corp/soskb/searchresults.asp?Words=Starting&searchstr=%&TopRec=' + str(last_int) + '&FormName=CorpNameSearch&Pos=Next+50+%3E%3E')
        html = resp.read()
    except urllib2.HTTPError, error:
        print 'The server could not fulfill the request.'
        print 'Error code: ', error.code
        return  # without returning, html would be undefined below
    except urllib2.URLError, error:
        print 'We failed to reach a server.'
        print 'Reason: ', error.reason
        return
    root = lxml.html.fromstring(html)
    # Result rows sit in the innermost of three nested tables; the first
    # six rows are headers and navigation, so skip them.
    results = root.xpath('//table//table//table//tr')
    for tr in results[6:]:
        record = {}
        record['name'] = tr[0].text_content()
        record['detail_url'] = 'http://corp.sos.state.ga.us/corp/soskb/' + tr[0][0][0].get('href')
        record['control_no'] = tr[1].text_content()
        record['type'] = tr[2].text_content()
        record['status'] = tr[3].text_content()
        record['creation_date'] = tr[4].text_content()
        scraperwiki.sqlite.save(['name', 'control_no'], data=record, table_name='us_georgia_corporate_entities')
    # Record progress so an interrupted run can resume from this page.
    update_statement = 'update runtime_info SET lastTopRec=' + str(last_int)
    scraperwiki.sqlite.execute(update_statement)
    scraperwiki.sqlite.commit()
    if regex.findall(html):
        # A "Next 50" link is present, so there is another page.
        next_int = int(last_int) + 50
        print 'get results: ', next_int, '-', next_int + 50
        #time.sleep(2)  # uncomment (and import time) to back off if the server returns 500 errors
        scrape(next_int)
# Resume from the last TopRec recorded in runtime_info.
selection_statement = '* from runtime_info'
last_done = scraperwiki.sqlite.select(selection_statement)
for last in last_done:
    last_int = last['lastTopRec']
    print 'Last done: ', last_int
    scrape(last_int)
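# The recursive paging in scrape() adds one Python stack frame per 50-row
# page, so a long crawl can hit the interpreter's default recursion limit
# (roughly 1000 frames). A minimal iterative sketch under the same
# assumptions is shown here, commented out; fetch_page and save_rows are
# hypothetical helpers standing in for the fetch and row-saving code in
# scrape() above.
#
# def scrape_all(start_int):
#     top_rec = int(start_int)
#     while True:
#         html = fetch_page(top_rec)   # hypothetical: fetch one results page
#         save_rows(html)              # hypothetical: parse and save its rows
#         if not regex.findall(html):  # no "Next 50" link: last page reached
#             break
#         top_rec += 50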