trial_cr_company_numbers.py
# -*- coding: iso-8859-2 -*-
import scraperwiki
import mechanize
import lxml.html
import re
import itertools
import calendar,datetime
import time
# START AND END DATES FOR EACH MONTH FROM 1994, SAVED TO THE month_span TABLE. EACH MONTH IS SPLIT INTO 1-10, 11-19 AND 20-END OF MONTH.
# THAT SHOULD TAKE CARE OF THE LIMIT THE APPLICATION IMPOSES ON RECORDS RETURNED FROM A SEARCH (1000).
# IT ALSO SLOWS US DOWN ...
#years = range(1994, 2012)
#months = range(1, 13)
#for year in years:
#    for month in months:
#        monthdict = {}
#        lastday1 = calendar.monthrange(year, month)
#        lastday2 = datetime.date(year, month, lastday1[1])
#        lastday3 = datetime.datetime.strptime(str(lastday2), "%Y-%m-%d").strftime("%d.%m.%Y")
#        firstday1 = '01.' + str(month) + '.' + str(year)
#        lastday4 = '19.' + str(month) + '.' + str(year)
#        firstday2 = '20.' + str(month) + '.' + str(year)
#        firstday3 = '11.' + str(month) + '.' + str(year)
#        middleday = '10.' + str(month) + '.' + str(year)
#        monthdict['range'] = firstday1 + ' - ' + middleday
#        monthdict['scraped'] = '0'
#        scraperwiki.sqlite.save(['range'], data=monthdict, table_name='month_span')
#        monthdict = {}
#        monthdict['range'] = firstday2 + ' - ' + lastday3
#        monthdict['scraped'] = '0'
#        scraperwiki.sqlite.save(['range'], data=monthdict, table_name='month_span')
#        monthdict = {}
#        monthdict['range'] = firstday3 + ' - ' + lastday4
#        monthdict['scraped'] = '0'
#        scraperwiki.sqlite.save(['range'], data=monthdict, table_name='month_span')
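# For example, for January 1994 the three saved ranges come out as
# '01.1.1994 - 10.1.1994', '20.1.1994 - 31.01.1994' and '11.1.1994 - 19.1.1994'
# (note the month is only zero-padded in the strftime-formatted end-of-month date).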
# ONE-OFF CLEANUP: STRIP 'u stečaju' ('in bankruptcy') FROM CompanyName
#select = scraperwiki.sqlite.select("* from croatia_corporate_entities WHERE CompanyName LIKE '%u stečaju'")
#print len(select)
#for s in select:
#    companyname = s['CompanyName']
#    companyname2 = re.sub('u ste.*', '', companyname)
#    update_statement = 'update croatia_corporate_entities SET CompanyName="' + companyname2 + '" WHERE CompanyName LIKE "' + companyname + '"'
#    scraperwiki.sqlite.execute(update_statement)
#scraperwiki.sqlite.commit()
#exit()
# ONE-OFF CLEANUP: CLEAR THE CompanyContact COLUMN FOR THE SELECTED ROWS
#for s in select:
#    companynumber = s['CompanyNumber']
#    CompanyContact = s['RegistryUrl']
#    update_statement = 'update croatia_corporate_entities SET CompanyContact="" WHERE companynumber=' + '"' + companynumber + '"'
#    scraperwiki.sqlite.execute(update_statement)
#    scraperwiki.sqlite.commit()
#exit()
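# Advanced-search endpoint of the biznet.hr register ('HgkWeb' appears to be the
# Croatian Chamber of Economy (HGK) web application; 'advsearch' is its advanced search form).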
url = 'http://www1.biznet.hr/HgkWeb/do/advsearch'
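# Each result row is an HTML <tr>; judging by how the cells are read below, the expected
# column layout is roughly: [1] company number, [2] company name (with a link to the
# registry page), [3] county, [4] address, [5] telephone, [6] fax, [7] e-mail, [8] contact.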
def process_and_save(results):
    for tr in results:
        record = {}
        CompanyName = ' '.join(tr[2][0].text_content().split())
        # Strip the 'u stečaju' ('in bankruptcy') suffix from the company name
        if re.search('u ste.*', CompanyName):
            CompanyName = re.sub('u ste.*', '', CompanyName)
        CompanyNumber = ' '.join(tr[1].text_content().split())
        RegistryUrl = ''.join(tr[2][0].get('href').split())
        RegistryUrl = 'http://www1.biznet.hr' + RegistryUrl
        CompanyCounty = ' '.join(tr[3].text_content().split())
        CompanyAddress = ' '.join(tr[4].text_content().split())
        CompanyTelephone = ' '.join(tr[5].text_content().split())
        CompanyTelefax = ''.join(tr[6].text_content().split())
        CompanyEmail = ' '.join(tr[7].text_content().split())
        CompanyContact = ' '.join(tr[8].text_content().split())
        if CompanyContact == '- -, Bez podataka':  # 'Bez podataka' = 'no data'
            CompanyContact = ''
        record['CompanyName'] = CompanyName
        record['CompanyNumber'] = CompanyNumber
        record['RegistryUrl'] = RegistryUrl
        record['CompanyCounty'] = CompanyCounty
        record['CompanyAddress'] = CompanyAddress
        record['CompanyTelephone'] = CompanyTelephone
        record['CompanyTelefax'] = CompanyTelefax
        record['CompanyEmail'] = CompanyEmail
        record['CompanyContact'] = CompanyContact
        record['date_time'] = time.strftime("%a %b %e %T %z %Y", time.gmtime())
        #print record
        scraperwiki.sqlite.save(['CompanyNumber'], data=record, table_name='croatia_corporate_entities')
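# scrape_result() handles one search response: it bails out when the page reports no
# matches, skips ranges that exceed the 1000-record cap, saves the rows on the current
# page, and then recurses by following the "Next >>" pagination link with the shared
# mechanize browser 'br' defined below.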
def scrape_result(response, date_range, page):
    response = response.decode('iso-8859-2')  # THIS IS NEEDED FOR LXML TO PROPERLY REPRESENT UNICODE - TOOK ME AGES TO FIND THIS OUT ...
    root = lxml.html.fromstring(response)
    print "Processing page: " + str(page)
    if re.search('No entity found matching requested criteria', response) is not None:
        print 'No results. Onwards!'
        #update_statement = 'update month_span SET scraped=1 WHERE range=' + '"' + date_range + '"'
        #scraperwiki.sqlite.execute(update_statement)
        #scraperwiki.sqlite.commit()
        return
    numbers = root.xpath('//td[contains(@class,"def_srednji_tekst")][1]/text()[7]')[0]
    check = re.findall(r'\d+', numbers)
    print numbers
    # Skip result sets of more than 1000 records (the application only returns the first 1000)
    if int(check[0]) > 999:
        print "Too many results (" + str(check[0]) + "). Skipping for now. Will deal with at the end."
        return
    results = root.xpath('//tr[contains(@bgcolor,"#efefef")]/. | //tr[contains(@bgcolor,"#ffffff")]/.')
    process_and_save(results)
    next_page_url = root.xpath('//td[contains(@class,"naglaseni_tekst")]/a[text()="Next >> "]/@href')
    if next_page_url:
        response1 = br.follow_link(text_regex=r"Next", nr=0)
        page = int(page) + 1
        response1 = response1.read()
        scrape_result(response1, date_range, page)
    else:
        print "Done with this section"
        #update_statement = 'update month_span SET scraped=1 WHERE range=' + '"' + date_range + '"'
        #scraperwiki.sqlite.execute(update_statement)
        #scraperwiki.sqlite.commit()
# ----------------------------------
# HERE IS THE BIG RUN START
#selection_statement = '* from month_span where scraped =0' # ' + '"' + 'all' + '"'
#todo = scraperwiki.sqlite.select(selection_statement)
#
#if len(todo) == 0:
#    print "Nothing to do"
#
#for item in todo:
#
#    start = item['range'].partition(' - ')[0]
#    end = item['range'].partition(' - ')[2]
#
#    br = mechanize.Browser()
#    br.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]
#    response = br.open(url)
#    response1 = br.follow_link(text_regex=r"poslovnih", nr=0)
#    response2 = br.follow_link(nr=1)
#    response2 = br.follow_link(nr=0)
#    br.select_form("advsearch")
#    br.form.set_all_readonly(False)
#    br.form['tvrtkeTip'] = ["1"]  # we want all companies
#    br.form['limit'] = '1000'  # 1000 seems to be the max number of results returned
#    br.form['datRegOd'] = start
#    br.form['datRegDo'] = end
#    print ' Fetching: ' + item['range']
#    response = br.submit().read()
#    page = '1'
#    scrape_result(response, item['range'], page)
# ------------------------
# DAILY RUN - FETCH LAST 30 DAYS
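# Rows are saved with CompanyNumber as the unique key, so re-scraping the last 30 days
# should update existing records in place rather than create duplicates.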
base = datetime.datetime.today()
dateList = [ base - datetime.timedelta(days=x) for x in range(0,30) ]
print dateList
start_date = dateList[29].strftime("%d.%m.%Y")
end_date = dateList[0].strftime("%d.%m.%Y")
#start_date = '15.09.2011'
#end_date = '01.10.2011'
date_range = start_date + ' - ' + end_date
br = mechanize.Browser()
br.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]
response = br.open(url)
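# Click through from the landing page to the advanced search form; the 'poslovnih' link text
# presumably belongs to the business-entity search ('poslovni subjekti' = business entities).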
response1 = br.follow_link(text_regex=r"poslovnih", nr=0)
response2 = br.follow_link(nr=1)
response2 = br.follow_link(nr=0)
br.select_form("advsearch")
br.form.set_all_readonly(False)
br.form['tvrtkeTip'] = ["1"] # we want all companies
br.form['limit'] = '1000' #1000 seems to be the max number of results returned
br.form['datRegOd'] = start_date
br.form['datRegDo'] = end_date
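# datRegOd / datRegDo are, presumably, the registration-date 'from' / 'to' fields of the form
# ('datum registracije od/do'), so only companies registered in the last 30 days are returned.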
print ' Fetching: ' + date_range
response = br.submit().read()
page = '1'
scrape_result(response, date_range, page)