"""
Batch dbpedia country point files generator
\author Jordi Castells
\date 8 December 2016
A helper tool to split an already downloaded dbpedia csv file into different
smaller files, separed by the second column that is assumed to be a dbo:Country
resource.
csv file is expected :
name;country;wikipediaURL;x;y;WKT
"""
import os
import re
import sys
import csv
import argparse
import signal
import tempfile
import cSpinner
import cPlace
import cReport
import geom_functions as GEOM
# ###########################
#
# ARGUMENT PARSING
#
# ###########################
parser = argparse.ArgumentParser(description='Filter an input dataset into smaller files (by country)')
parser.add_argument('--outDir', default=None, dest='outDir', help='Directory where the per-country output files are stored')
parser.add_argument('--pointFile', default=None, dest='points', help='Input file with points to split into smaller files')
parser.add_argument('--nospinner', default=False, dest='nospinner', help='Deactivate live feedback via shell, for batch operations', action="store_true")
#parser.add_argument('--country', default=None, dest='countryFilter', help='Filter by country')
args = parser.parse_args()
def slugify(value):
    """
    Force a valid filename: strip the dbpedia resource prefix and drop any
    non-ASCII characters.
    """
    value = re.sub("http://dbpedia.org/resource/", "", value)
    return re.sub(r'[^\x00-\x7F]+', '', value)
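# Illustrative examples of what slugify produces:
#   slugify("http://dbpedia.org/resource/France")  -> "France"
#   slugify("http://dbpedia.org/resource/España")  -> "Espaa" (non-ASCII dropped)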
# ###########################
#
# INITIALIZATIONS
#
# ###########################
# Spinner
S = cSpinner.cSpinner()
if not args.nospinner:
    S.start()
csv.field_size_limit(sys.maxsize)
def gen_alpha_shape(cgalfile, alpha):
    """
    External execution of the alpha_shaper to generate a WKT alpha shape.
    Expects a CGAL file with lon lat coordinates whose first line is an
    integer with the total number of points to read.
    Returns the well-known text of the alpha shape polygons, one per line.
    """
    opt_alpha, wkt_polygons = GEOM.alpha_shape(cgalfile, alpha)
    return wkt_polygons.splitlines()
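# The return value is a list with one WKT polygon per line of the
# GEOM.alpha_shape output, e.g. (illustrative):
#   ["POLYGON((2.1 41.3, ...))", "POLYGON((...))"]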
def finish_program():
    S.stop()
    sys.exit(0)
def write_file_wkt_csv(wkt_polygons, fileh):
    """
    Write a file with wkt polygons, one per row
    """
    fileh.write("id;wkt\n")
    for i, polygon in enumerate(wkt_polygons):
        fileh.write("%s;%s\n" % (i, polygon))
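# Output example (values are illustrative):
#   id;wkt
#   0;POLYGON((2.1 41.3, ...))
#   1;POLYGON((2.4 41.5, ...))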
def write_file_cgal(places, fileh):
    """
    Writes a file to be read by the CGAL alpha_shape generator from a list
    of cPlace objects: the first line is the point count, followed by one
    "lon lat" pair per line.
    """
    fileh.write(str(len(places)) + "\n")
    for p in places:
        fileh.write(p.lon + " " + p.lat + "\n")
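# A file produced by write_file_cgal looks like this (illustrative values):
#   3
#   2.1734 41.3851
#   -3.7038 40.4168
#   -0.1276 51.5072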
def read_countries_from_csv(filename):
    """
    Collects the distinct dbo:Country resources from a CSV points file
    ("name;country;wikipediaURL;x;y;WKT").
    This is done as a separate pass because reading the whole file into
    memory at once can exhaust RAM on large datasets.
    """
    data = []
    total = 0
    failed = 0
    totalL = sum(1 for line in open(filename))
    with open(filename, 'rb') as csvfile:
        preader = csv.DictReader(csvfile, delimiter=';', quotechar='"')
        for row in preader:
            S.set_msg("Reading Countries: %s/%s. FAILED: %s" % (total, totalL, failed))
            total += 1
            if "http://dbpedia.org/resource/" not in row["country"]:
                failed += 1
                continue
            if row["country"] not in data:
                data.append(row["country"])
    return data
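# The returned list holds full resource URIs, e.g. (illustrative):
#   ["http://dbpedia.org/resource/France",
#    "http://dbpedia.org/resource/Japan"]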
def extract_countries_to_files(filename, outdir, countryNames):
    """
    Retrieves all the points from filename ("name;country;wikipediaURL;x;y;WKT")
    and splits them into one output file per country in countryNames, in a
    single pass over the input.
    """
    total = 0
    fileHandles = {}
    for countryName in countryNames:
        fileout = os.path.join(outdir, "%s_points.csv" % slugify(countryName))
        handle = open(fileout, 'wb')
        handle.write("name;country;wikipediaURL;x;y;WKT\n")
        pwriter = csv.writer(handle, delimiter=';', quotechar='"')
        fileHandles[countryName] = [handle, pwriter]
    with open(filename, 'rb') as infile:
        preader = csv.DictReader(infile, delimiter=';', quotechar='"')
        for row in preader:
            S.set_msg("Filtering & splitting %s" % total)
            total += 1
            if row["country"] in fileHandles:
                # index 0 is the file handle, index 1 is the csv writer
                fileHandles[row["country"]][1].writerow([row["name"], row["country"], row["wikipediaURL"], row["x"], row["y"], row["WKT"]])
    for name, handle in fileHandles.items():
        handle[0].close()
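# Design note: keeping one open handle per country lets the split happen in
# a single pass over the input instead of re-reading the file once per
# country. Very large country lists may hit the OS per-process open-file
# limit, though (commonly 1024 on Linux; see `ulimit -n`).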
def extract_country_to_file(filename, fileout, countryName):
    """
    Retrieves all the points from filename ("name;country;wikipediaURL;x;y;WKT")
    that match countryName and stores those points in fileout.
    """
    total = 0
    with open(filename, 'rb') as infile:
        with open(fileout, 'wb') as outfile:
            preader = csv.DictReader(infile, delimiter=';', quotechar='"')
            pwriter = csv.writer(outfile, delimiter=';', quotechar='"')
            for row in preader:
                S.set_msg("Filtering %s: %s" % (countryName, total))
                total += 1
                if countryName in row["country"]:
                    pwriter.writerow([row["name"], row["country"], row["wikipediaURL"], row["x"], row["y"], row["WKT"]])
def get_country_points(filename, countryName):
    """
    Retrieves all the points from filename ("name;country;wikipediaURL;x;y;WKT")
    that match countryName.
    """
    data = []
    total = 0
    with open(filename, 'rb') as csvfile:
        preader = csv.DictReader(csvfile, delimiter=';', quotechar='"')
        for row in preader:
            S.set_msg("Obtaining %s: %s" % (countryName, total))
            total += 1
            if countryName in row["country"]:
                data.append(row)
    return data
def read_points_csv(filename):
    """
    Reads a CSV points file ("name;country;wikipediaURL;x;y;WKT") and
    returns a list with one dictionary per row.
    """
    data = []
    with open(filename, 'rb') as csvfile:
        preader = csv.DictReader(csvfile, delimiter=';', quotechar='"')
        for row in preader:
            data.append(row)
    return data
def parse_input_to_place(dataList):
    """
    Parse the set of points read from a csv ("name;WKT;Country;Abstract")
    into a list of places @ref cPlace. Rows whose WKT does not contain a
    POINT are returned in a separate skipped list.
    """
    places = []
    skipped = []
    for row in dataList:
        wkt = row["WKT"]
        match = re.search(r".*POINT\(([0-9.-]*) ([0-9.-]*)\)", wkt)
        if not match:
            skipped.append(row)
        else:
            place = cPlace.cPlace(row["name"],
                                  match.group(2), match.group(1),
                                  row["Abstract"], row["Country"])
            places.append(place)
    return places, skipped
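# WKT points are "POINT(lon lat)", so match.group(1) is the longitude and
# match.group(2) the latitude; the call above hands latitude to cPlace
# first (an assumption about the cPlace signature, which lives in a
# separate module).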
# ###########################
#
# SIGNAL HANDLING
#
# ###########################
def kill_handler(signum, frame):
    print 'Kill Signal Received'
    finish_program()
signal.signal(signal.SIGINT, kill_handler)
# ###########################
#
# START
#
# ###########################
# Open Points file
try:
    if not os.path.exists(args.outDir):
        os.mkdir(args.outDir)

    S.set_msg("Parsing input")

    # Create a tempfile for CGAL from the input data
    tmpfile = tempfile.NamedTemporaryFile(prefix='vagueplace', delete=True)

    # Can't use this since it uses a lot of memory. That approach loaded
    # everything into RAM before processing; we have to do something slower
    # but less memory hungry.
    #datain = read_points_csv(args.points)

    # First get the countries, then split the input into one file per country
    S.set_msg("Generating country list")
    countries = read_countries_from_csv(args.points)
    extract_countries_to_files(args.points, args.outDir, countries)

    #for country in countries:
    #    # For each country split the output in different files that we will
    #    # read one by one to generate the output files
    #    fileout = os.path.join(args.outDir, "%s_points.csv" % slugify(country))
    #    extract_countries_to_files(args.points, fileout, country)
    #    print
    # Stop here; everything below is kept for reference but never runs
    sys.exit(0)
    #places, errors = parse_input_to_place(datain)

    # first the whole dataset
    #write_file_cgal(places, tmpfile)
    #if not os.path.exists(args.outDir):
    #    os.mkdir(args.outDir)
    #for alphaVal in args.alphas:
    #    S.set_msg("Shaping %s" % alphaVal)
    #    fileName = os.path.join(args.outDir, "alphaShape_%s.csv" % alphaVal)
    #    with open(fileName, 'wb') as fileh:
    #        polygons = gen_alpha_shape(tmpfile, alphaVal)
    #        write_file_wkt_csv(polygons, fileh)

    # Now country by country
    ###data = (p.country for p in places)
    ###data = set(data)
    ###for c in data:
    ###    cdata = []
    ###    country_points = (p for p in places if p.country == c)
    ###    for cp in country_points:
    ###        cdata.append(cp)
    ###    tmpfile = tempfile.NamedTemporaryFile(prefix='countryData', delete=True)
    ###    write_file_cgal(cdata, tmpfile)
    ###    for alphaVal in args.alphas:
    ###        S.set_msg("Shaping %s %s" % (c, alphaVal))
    ###        fileName = os.path.join(args.outDir, "alphaShape_%s_%s.csv" % (c, alphaVal))
    ###        with open(fileName, 'wb') as fileh:
    ###            polygons = gen_alpha_shape(tmpfile, alphaVal)
    ###            write_file_wkt_csv(polygons, fileh)
except Exception as e:
    print e
finally:
    finish_program()