-
Notifications
You must be signed in to change notification settings - Fork 1
/
check_cvs_for_self_transactions_contractors.py
126 lines (80 loc) · 4.12 KB
/
check_cvs_for_self_transactions_contractors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
'''
Created by Julia Poncela, on Feb. 2016
'''
import datetime as dt
import csv
import pickle
import random
import sys
import datetime as dt
import Herfindahl_index
def _emit(*parts):
    '''Print parts separated by single spaces, like the Python 2 print statement.'''
    print(" ".join(str(part) for part in parts))


def _open_csv(filename):
    '''Open filename in the mode the csv module expects on the running Python.'''
    if sys.version_info[0] < 3:
        return open(filename, 'rb')
    return open(filename, 'r', newline='')


def _wait_for_enter():
    '''Block until the user presses Enter (raw_input on Python 2, input on Python 3).'''
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    read_line()


def main(path="../Data/95_05NYCgamentdata/", name0="fhistory_ALL.csv", pause=True):
    '''
    Count self-transactions (paying firm == paid firm) in the transaction file.

    The file is read twice. The first pass collects the set of firm ids seen
    as manufacturer / contractor, both over all rows and over the rows that
    are NOT self-transactions, and prints their sizes and overlaps. The second
    pass counts the self-transactions, and how many of them involve a firm
    that also appears as contractor (resp. manufacturer) in a non-self row.

    path:   directory prefix of the input file (concatenated as-is with name0).
    name0:  name of the csv file; its first line is skipped as a header.
            Columns: paidbyfi,paidforf,periodfr,periodto,adjgr,gross,net,caf,
            liqdmg,cafper,rateper,ratecode (only the first two are used).
    pause:  if True, wait for Enter between the two passes.

    Rows whose first two fields are not integers are skipped (ValueError).
    All results are printed; nothing is returned.
    '''
    filename = path + name0

    ################## first pass: who appears as manufacturer / contractor
    _emit("reading: ", filename, ".......")

    cont = 0
    # sets instead of lists: only distinct ids are ever needed, and the
    # membership tests of the second pass become O(1)
    unique_manuf = set()   # manufacturers seen in non-self transactions
    unique_contr = set()   # contractors seen in non-self transactions
    all_manuf = set()
    all_contr = set()

    with _open_csv(filename) as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)   # to skip the header
        for list_row in reader:
            cont += 1
            _emit(cont)
            try:
                manufacturer = int(list_row[0])   ## paidbyfirm
                contractor = int(list_row[1])     ## paidforfirm
            except ValueError:
                continue   # some lines are missing the contractor or manufacturer: skip

            all_manuf.add(manufacturer)
            all_contr.add(contractor)
            if manufacturer != contractor:
                unique_manuf.add(manufacturer)
                unique_contr.add(contractor)

    # the trailing numbers are the values noted by the author for the full dataset
    _emit("size of list unique manuf.:", len(unique_manuf), " id. contr:", len(unique_contr))   # 1417 , 7768
    _emit(" overlap between unique_manuf and unique_contr:", len(unique_manuf & unique_contr))   ### 496
    _emit("size of list all manuf.:", len(all_manuf), " id. contr:", len(all_contr))   ## 5747 , 10535
    _emit(" overlap between all manuf and all contr:", len(all_manuf & all_contr))   # 5695

    if pause:
        _wait_for_enter()

    ################## second pass: count the self-transactions
    _emit("reading: ", filename, ".......")

    num_self_trans_manuf_from_contr_list = 0
    num_self_trans_contr_from_manuf_list = 0
    cont_self_transactions = 0   # integer, like the other counters
    cont_transactions = 0
    cont = 0

    with _open_csv(filename) as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)   # to skip the header
        for list_row in reader:
            cont += 1
            _emit(cont)
            try:
                manufacturer = int(list_row[0])   ## paidbyfirm
                contractor = int(list_row[1])     ## paidforfirm
            except ValueError:
                continue   # some lines are missing the contractor or manufacturer: skip

            cont_transactions += 1
            if manufacturer == contractor:
                cont_self_transactions += 1
                # NOTE(review): the two tests below are taken to be nested under
                # the self-transaction test (as the counter names suggest) -- confirm.
                if manufacturer in unique_contr:
                    num_self_trans_manuf_from_contr_list += 1
                if contractor in unique_manuf:
                    num_self_trans_contr_from_manuf_list += 1

    _emit("cont:", cont, " cont trans:", cont_transactions,
          " # self tr.:", cont_self_transactions,
          " # self tr. contr from manuf. list:", num_self_trans_contr_from_manuf_list,
          " # self tr. manuf from contr list:", num_self_trans_manuf_from_contr_list)
#######################################
######################################
######################################
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported. No command-line arguments are read.
if __name__ == "__main__":
    main()