# scraper.py -- forked from eccoilmoro/Albo_pretorio_Conselice
# Scrapes the online notice board (albo pretorio) of the Comune di Conselice
# and saves every published act into the ScraperWiki SQLite datastore.
import scraperwiki
import lxml.html
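
# SITEPAGE lists the acts for the 'conselice' entity; URL_ALLEGATI is the
# base URL that each attachment path extracted from a row is appended to.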
SITEPAGE = 'http://albopretorio.comune.lugo.ra.it/?ente=conselice'
URL_ALLEGATI = 'http://albopretorio.comune.lugo.ra.it/'
#ENTE_ALLEGATI = '1'  # 1 = Unione, 0 = Comune di Lugo


def is_date(row):
    # Currently unused helper: flags rows made of two <th> header cells.
    return len(row.cssselect('th')) == 2


def is_column_heading(row):
    # Currently unused; the 'Flug' marker looks left over from the scraper
    # this code was adapted from.
    return 'Flug' in row.text_content()
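

# parse_page walks every table on the listing page; each <tbody> row becomes
# one record, keyed on the act's register number from the first cell (or,
# for 'Altri Atti' rows without a number, a synthetic 'n<count>' key).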
def parse_page(url):
    root = lxml.html.parse(url).getroot()
    if root is not None:
        print("document tree parsed")
        for table in root.cssselect('table'):
            print("table: " + table.attrib.get('summary', ''))
            count = 0
            for row in table.cssselect('tbody tr'):
                print("row: " + (row[0].text or ''))
                try:
                    if len(row) > 1:
                        riga = {}
                        if table.attrib['summary'] == 'Altri Atti':
                            # 'Altri Atti' rows: document type in column 2,
                            # subject in column 3, publication dates in
                            # columns 4-5.
                            riga["tipodoc"] = row[2].text.replace(u'\xa0', ' ').strip()
                            if 'non' in row[0].text:
                                # No register number in the first cell:
                                # build a synthetic key instead.
                                riga["key"] = 'n' + str(count)
                                print(riga["key"])
                                count += 1
                            else:
                                riga["key"] = row[0].text
                            try:
                                riga["oggetto"] = row[3].text.encode('utf-8', 'ignore').replace('\r\n', ' ').replace('\t\t\t', ' ').strip()
                            except:
                                print("encoding error on subject, row skipped")
                                continue
                            riga["datapubbfrom"] = row[4].text_content().replace(u'\xa0', ' ').strip() or ''
                            riga["datapubbto"] = row[5].text_content().replace(u'\xa0', ' ').strip() or ''
                        else:
                            # Any other table: the summary attribute itself is
                            # the document type; subject and dates shift to
                            # columns 5-7.
                            riga["tipodoc"] = table.attrib['summary']
                            riga["key"] = row[0].text.replace('\r\n', ' ').replace('\t\t\t', ' ').strip()
                            try:
                                riga["oggetto"] = row[5].text.encode('utf-8', 'ignore').replace('\r\n', ' ').replace('\t\t\t', ' ').strip()
                            except:
                                print("encoding error on subject, row skipped")
                                continue
                            riga["datapubbfrom"] = row[6].text_content().replace(u'\xa0', ' ').strip() or ''
                            riga["datapubbto"] = row[7].text_content().replace(u'\xa0', ' ').strip() or ''
                        allegato = ''
                        if row.cssselect('img'):
                            img = row.cssselect('img')
                            if len(img[0].attrib['onclick']) > 1:
                                # Take everything between the fixed
                                # 24-character call prefix and the quote
                                # before the first comma.
                                index = img[0].attrib['onclick'].find(',')
                                allegato = img[0].attrib['onclick'][24:index - 1]
                            else:
                                print('empty onclick, no attachment path')
                        riga["URL_allegato"] = URL_ALLEGATI + allegato
                        print(riga)
                        scraperwiki.sqlite.save(unique_keys=["key"], data=riga)
                except:
                    print('error while parsing row, skipped')
# riga["id"] = row[2].text.encode('utf-8').replace("\xc2\xa0", " ") or ''
# riga["oggetto"] = row[3].text.encode('utf-8').replace("\xc2\xa0", " ") or ''
# riga["datapubbfrom"] = row[4].text or ''
# riga["datapubbto"] = row[5].text or ''
# riga["allegati"] = row[6].text or ''


def main():
    # Rebuild the datastore from scratch on every run.
    scraperwiki.sqlite.execute("drop table if exists swdata")
    scraperwiki.sqlite.commit()
    parse_page(SITEPAGE)


main()
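
# Records land in the default 'swdata' table; once a run completes they can
# be inspected with a plain SQL query, e.g. "select * from swdata".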