-
Notifications
You must be signed in to change notification settings - Fork 0
/
data2.py
114 lines (99 loc) · 3.15 KB
/
data2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from collections import Counter
from json import dumps
# Lookup tables built from candidacies.csv; both are keyed by each row's
# first column.
candidatn = {}  # key -> columns 3 and 4 joined by a space, quotes removed, trimmed
candidate = {}  # key -> column 1 (looked up in `election` when events are parsed)
with open('candidacies.csv') as src:
    next(src)  # drop the header row
    for row in src:
        # Plain comma split: a quoted field containing a comma would shift
        # the columns.
        cols = row.split(',')
        key = cols[0]
        candidatn[key] = ' '.join((cols[3], cols[4])).replace('"', '').strip()
        candidate[key] = cols[1]
# Map the first column of every elections.csv row to its second column.
election = {}
with open('elections.csv') as src:
    next(src)  # drop the header row
    for cols in (row.split(',') for row in src):
        # The value is stored exactly as split (no quote/whitespace cleanup).
        election[cols[0]] = cols[1]
# Map the first column of every tags.csv row to its second column, with
# surrounding spaces, newlines and double quotes removed from the value.
tags = {}
with open('tags.csv') as src:
    next(src)  # drop the header row
    for row in src:
        cols = row.split(',')
        label = cols[1].strip(' \n"')
        tags[cols[0]] = label
#print tags.values()
#for c in candidate:
# print candidatn[c],election[candidate[c]]
from collections import Counter
import re,time
# --- Patterns used to parse events.csv ------------------------------------
# All pattern literals are raw strings so that sequences such as "\d" reach
# the regex engine verbatim instead of relying on Python passing unknown
# string escapes through (newer interpreters warn about that).

# One captured identifier: exactly 24 lowercase hexadecimal characters.
reid = r'([0-9a-f]{24})'
# ISO-like timestamp "20YY-MM-DDTHH:MM:SSZ" (kept as a plain pattern string).
redate = r'(20\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ)'
# Four dot-separated groups of 1-3 digits; the events loop uses it with
# .match(), so it is anchored at the start of the string only.
reip = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
# An id wrapper with CSV-doubled quotes: { ""$oid"" : ""<24 hex>"" }
rcand = re.compile(r'{ ""\$oid"" : ""' + reid + r'"" }')
# A whole event line split into five comma-separated groups. Groups 1 and 4
# are either a quoted bracketed list ("[...]") or empty; groups 2, 3 and 5
# are free-form. The events loop unpacks them as cs, date, ip, tag, cling.
reevent = re.compile(r'^((?:"\[.*]")|),(.*),(.*),((?:"\[.*]")|),(.*$)')

# --- Tallies filled in while events.csv is parsed ---------------------------
error = Counter()    # rejected / flagged lines, keyed by reason
nbcand = Counter()   # number of ids in the first field -> line count
nbelec = Counter()   # election value of the first id -> line count
nbcling = Counter()  # last field of the line -> line count
badcand = Counter()  # first id of lines rejected for an unknown id
data = []            # one dict per accepted event line
import calendar  # timegm(): UTC struct_time -> epoch seconds

# Parse events.csv line by line. Every accepted line appends one dict to
# `data`; every rejected line bumps a reason counter in `error` and is skipped.
# NOTE(review): the original indentation of this loop was lost; the nesting
# below is reconstructed from the if/else/continue structure - confirm.
with open('events.csv') as f:
    next(f)  # skip the header row
    for l in f:
        m = reevent.search(l)
        if not m:
            error['pasmatch'] += 1
            #print 'notmatch',l,
            continue
        cs, date, ip, tag, cling = m.groups()

        # --- first field: list of candidacy ids ---------------------------
        cs = cs.strip('"[] ')
        if cs:
            # Search each item once and reuse the match objects.
            found = [rcand.search(item) for item in cs.split(',')]
            if any(hit is None for hit in found):
                error['badmatch cands'] += 1
                #print 'badmatch cands',l,
                continue
            # Real lists (not lazy map/filter objects), so indexing and
            # truth tests behave the same on Python 2 and Python 3.
            cs = [hit.group(1) for hit in found]
            if any(cid not in candidate for cid in cs):
                error['bad candidate'] += 1
                #print 'bad candidacie',l,
                badcand[cs[0]] += 1
                continue
            # Only the first id decides which election the line counts for.
            nbelec[election[candidate[cs[0]]]] += 1
        else:
            cs = []
        nbcand[len(cs)] += 1

        # --- fourth field: optional tag id --------------------------------
        tag = tag.strip('"[] ')
        if tag:
            hit = rcand.search(tag)
            if not hit:
                error['badmatch tag'] += 1
                #print 'badmatch tag',l,
                continue
            tag = hit.group(1)
            if tag not in tags:
                error['tag notin tags'] += 1
                # %-formatting prints the same text on Python 2 and 3.
                print('tag notin tags %s' % tag)
                continue
        else:
            # Counted but not rejected; `tag` is already the empty string.
            error['notag'] += 1

        # Raises ValueError (aborting the run) on a malformed timestamp.
        date = time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        nbcling[cling] += 1

        if ip:
            ip = ip.strip('"')
            if not reip.match(ip):
                # Unexpected address format: reported, but the line is kept.
                print(ip)

        data.append({
            'cs': cs,
            # The input is UTC ("Z"). timegm() converts it independently of
            # the machine's time zone, unlike the non-portable
            # strftime('%s'), which treats the value as local time.
            'date': calendar.timegm(date),
            'ip': ip,
            'tag': tag,
            'cling': cling,
        })
print('saving')
try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3 has no separate cPickle module

# Write each structure to its own pickle file. The files are opened in
# binary mode (required by pickle on Python 3 and safe on every platform)
# and closed deterministically by the `with` block.
for obj, path in (
    (data, 'data.pk'),
    (candidate, 'candidate.pk'),
    (candidatn, 'candidatn.pk'),
    (election, 'election.pk'),
    (tags, 'tags.pk'),
):
    with open(path, 'wb') as out:
        cPickle.dump(obj, out)

import pprint
# Print the tallies as plain dicts. `nbelec` is not part of this report.
for c in error, nbcand, nbcling, badcand:
    pprint.pprint(dict(c))