-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgvspar.pl
executable file
·283 lines (224 loc) · 7.64 KB
/
gvspar.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#! /usr/bin/perl
# Previously: bin/env perl
# gvspar.pl: Controller for the Geocoordinate Validation Service (GVS).
# Author: Naim Matasci <[email protected]>
# Modified by: Brad Boyle <[email protected]>
#
###############################################################################
use strict;
use POSIX;
use Getopt::Long;
# Short application name: used for the master temp directory, the makeflow
# file name and the suffix of the default output file.
my $APPNAME = "gvs";

# Directory containing this script, obtained by stripping the trailing file
# name from $0. Falls back to the current directory when nothing is left.
my $binpath = $0;
$binpath =~ s/\/?\w+\.?\w*$//;
if ( !$binpath ) {
    $binpath = '.';
}

# Helper programs, expected in the same directory as this script
my $BINARY          = "$binpath/gvs.sh";
my $CONSOLIDATE_SCR = "$binpath/consolidator.pl";

# Master directory where all content saved
my $tmpfoldermaster = "/tmp/${APPNAME}/";

my $infile     = '';     # Input file
my $outfile    = '';     # Output file - optional
my $maxdist    = '';     # MAX_DIST parameter
my $maxdistrel = '';     # MAX_DIST_REL parameter
my $nbatch     = '';     # Number of batches
my $mf_opt     = '';     # makeflow options - optional
my $d          = 'c';    # Output file delimiter, currently only 'c' (csv)

my $usage =
    "Usage: $0 --in <file> --nbatch <n> [--out <file>] "
  . "[--md <int>] [--mdr <float>] [--opt <makeflow options>]\n";

# GetOptions returns false when the command line could not be parsed;
# stop instead of running with half-parsed options.
GetOptions(
    'in=s'     => \$infile,
    'out:s'    => \$outfile,
    'md=i'     => \$maxdist,
    'mdr=f'    => \$maxdistrel,
    'nbatch=i' => \$nbatch,
    'opt:s'    => \$mf_opt
) or die $usage;

# Both options are required; without a positive batch count the batch size
# computation in process() would divide by zero.
if ( $infile eq '' ) {
    die "No input file given (--in).\n$usage";
}
if ( $nbatch eq '' || $nbatch < 1 ) {
    die "The number of batches (--nbatch) must be a positive integer.\n$usage";
}

# The temporary folder needs to be in the /tmp directory
# (see the function _clean)
if ( !-d $tmpfoldermaster && !mkdir($tmpfoldermaster) ) {
    my $mkdir_error = $!;

    # A concurrent run may have created the folder in the meantime
    die "Cannot create temporary folder $tmpfoldermaster: $mkdir_error\n"
      unless -d $tmpfoldermaster;
}

# Pick a name for this run's working folder (timestamp + random number).
# If a folder with that name already exists, try another name.
my $tmpfolder;
do {
    $tmpfolder = $tmpfoldermaster . time() . int( rand(10000) );
} while ( -e $tmpfolder );

# If no output file name given
if ( !$outfile ) {
    $outfile = $infile;

    # Use the input file name w/o extension and append [appname]_scrubbed.csv
    $outfile =~ s/(?:\.\w+)?$/_${APPNAME}_scrubbed.csv/;
}

# Set maxdist parameter option.
# Compared as a string against the '' default so that an explicit 0 is
# still passed on to the application.
my $opt_maxdist = '';    # If omitted will use application default
if ( $maxdist ne '' ) {
    $opt_maxdist = "-d $maxdist";
}

# Set maxdistrel parameter option (same rule as for maxdist)
my $opt_maxdistrel = '';    # If omitted will use application default
if ( $maxdistrel ne '' ) {
    $opt_maxdistrel = "-r $maxdistrel";
}

# Let the magic begin
process( $infile, $nbatch, $tmpfolder, $outfile );
# Split the input file into batches, write the bookkeeping files and run
# makeflow on the generated workflow.
#
# Parameters:
#   $infile    - path of the input file, one record per line
#   $nbatch    - requested number of batches (positive integer)
#   $tmpfolder - working folder of this run (created by _write_out)
#   $outfile   - destination passed on to the makeflow control file
#
# Dies when $nbatch is not a positive integer, when the input file cannot be
# read or is empty, or when makeflow does not exit successfully.
sub process {
    my ( $infile, $nbatch, $tmpfolder, $outfile ) = @_;

    # Guard the division used to compute the batch size
    if ( !defined $nbatch || $nbatch !~ /^\d+$/ || $nbatch < 1 ) {
        die("The number of batches must be a positive integer.\n");
    }

    open( my $INL, '<', $infile )
      or die "Cannot open input file $infile: $!\n";

    # Get the number of records in the input file. Counting is done here
    # rather than with an external `wc -l` so that a last line without a
    # trailing newline is counted and the file name never reaches a shell.
    my $nlines = 0;
    while ( defined( my $skipped = <$INL> ) ) {
        $nlines++;
    }
    if ( $nlines == 0 ) { die("The input file $infile is empty.\n") }
    seek( $INL, 0, 0 )
      or die "Cannot rewind input file $infile: $!\n";

    # Calculate the expected size of the batches, given their number
    # and the number of records
    my $exp_g_size = ceil( $nlines / $nbatch );

    # Maps each distinct input line to its internal id ("batch.position")
    my %map;

    # Maps each input line containing a comma to the list of original ids
    # (input line numbers) on which it occurs, most recent first
    my %pids;

    # Indexer for the batch id
    my $batch_id = 0;

    # Indexer for the record id within a batch
    my $id = 0;

    # Line tracker; doubles as the original primary id of a record
    my $tot = 0;

    # The list of lat/long pairs forming a batch
    my @batch;

    while ( my $coords = <$INL> ) {
        $tot++;
        chomp $coords;

        # A coordinate that is present more than once in the list, but with
        # different primary id, will be processed only once.
        # All the associated primary ids will be returned.
        if ( $coords =~ m/,/ ) {
            unshift @{ $pids{$coords} }, $tot;
        }

        # Already queued in some batch: nothing more to do for this line
        next if exists $map{$coords};

        # Append coordinates to @batch
        push @batch, $coords;

        # Every record is assigned a unique internal id, combining its
        # batch id and position within the batch
        $map{$coords} = "$batch_id.$id";
        $id++;

        # We write a file every time we reach the predetermined batchsize
        if ( @batch >= $exp_g_size ) {
            _write_out( $batch_id, \@batch, $tmpfolder );
            @batch = ();
            $batch_id++;
            $id = 0;
        }
    }
    close $INL;

    # Flush whatever is left once the input is exhausted. Doing this after
    # the loop (instead of on the last line number) guarantees the final
    # partial batch is written even when the last line is a duplicate.
    if (@batch) {
        _write_out( $batch_id, \@batch, $tmpfolder );
        $batch_id++;
    }

    # Create mapping between the coordinates and the internal id
    _write_map( \%map, "$tmpfolder/map.tab" );
    if (%pids) {

        # Create mapping between the coordinates and the original ids
        _write_map( \%pids, "$tmpfolder/pids.tab", 1 );
    }

    # Write the makeflow control file
    _generate_mfconfig( $batch_id, $tmpfolder, $outfile );
    print "tmpfolder='$tmpfolder'\n";

    # $mf_opt is a free-form option string, so the command has to be handed
    # to the shell as a single string.
    my $makeflow_cmd = "makeflow $mf_opt $tmpfolder/${APPNAME}.flow";
    print "makeflow_cmd='$makeflow_cmd'\n";
    system($makeflow_cmd) == 0    #Run makeflow
      or die("makeflow failed (status $?): $makeflow_cmd\n");

    #_clean($tmpfolder); #Remove all temporary data
}
# Writes a mapping (hash reference) to a comma separated file, one hash
# entry per line. The order of the lines follows the hash iteration order.
#
# Parameters:
#   $map    - hash reference to write
#   $fn     - path of the file to create (an existing file is overwritten)
#   $invert - when false each line is "value,key"; when true each line is
#             "key,value", and a value that is an array reference is
#             written as its elements joined by commas
#
# Dies if the file cannot be opened or closed.
sub _write_map {
    my ( $map, $fn, $invert ) = @_;

    # Three-argument open: the file name is taken literally, whatever
    # characters it contains
    open my $MAP, '>', $fn or die "Cannot write map file $fn: $!\n";

    while ( my ( $key, $value ) = each %{$map} ) {

        # Decide which side goes first (depends which one is unique)
        my ( $first, $second ) = $invert ? ( $key, $value ) : ( $value, $key );

        if ( ref($second) eq 'ARRAY' ) {
            $second = join ',', @{$second};
        }
        print {$MAP} "$first,$second\n";
    }

    # Buffered write errors only surface when the handle is closed
    close $MAP or die "Cannot close map file $fn: $!\n";
    return;
}
# Writes the makeflow control file "$tmpfolder/${APPNAME}.flow".
#
# The file contains one rule per batch (running $BINARY on the batch input
# file), one rule calling the consolidation script to build
# "$tmpfolder/output.csv", and one rule copying that file to $outfile.
#
# Parameters:
#   $batch_id  - number of batches; rules are written for 0 .. $batch_id - 1
#   $tmpfolder - working folder holding the input/ and output files
#   $outfile   - final destination of the consolidated output
#
# Dies if the control file cannot be written.
sub _generate_mfconfig {
    my ( $batch_id, $tmpfolder, $outfile ) = @_;

    my $filelist = '';    #list of output files that will be produced
    my $cmd      = "APPBIN=$BINARY\n";

    # A 2-line instruction is written for every input file
    for my $i ( 0 .. $batch_id - 1 ) {
        my $batch_in  = "$tmpfolder/input/in_$i.txt";
        my $batch_out = "$tmpfolder/out_$i.txt";

        # Line 1: output and input files
        $cmd .= "${batch_out}: ${batch_in} \$APPBIN\n";

        # Line 2: command
        $cmd .=
"\t\$APPBIN -a -f ${batch_in} -o ${batch_out} $opt_maxdist $opt_maxdistrel \n\n";

        $filelist .= "${batch_out} ";
    }

    # Call to the consolidation script. The command line is introduced with
    # an explicit tab escape, exactly like the per-batch rules above.
    $cmd .= "$tmpfolder/output.csv: $CONSOLIDATE_SCR $tmpfolder $filelist\n"
      . "\t$CONSOLIDATE_SCR $tmpfolder $d\n\n";

    # Copy the consolidated output to the final destination
    $cmd .= "$outfile: $tmpfolder/output.csv\n"
      . "\tcp $tmpfolder/output.csv $outfile\n\n";

    # Write the file to the temporary folder
    my $flow_file = "$tmpfolder/${APPNAME}.flow";
    open my $FF, '>', $flow_file
      or die("Cannot write makeflow file $flow_file: $!\n");
    print {$FF} $cmd;

    # Buffered write errors only surface when the handle is closed
    close $FF
      or die("Cannot write makeflow file $flow_file: $!\n");
    return;
}
# Write a batch of coordinates to a file in the temporary folder.
#
# The batch is stored as "$tmpfolder/input/in_<batch_id>.txt", one record
# per line, with no newline after the last record. The working folder and
# its input subfolder are created when missing.
#
# Parameters (positional):
#   batch id  - number used in the file name
#   batch     - array reference holding the records
#   tmpfolder - working folder of this run
#
# Dies if a folder cannot be created or the file cannot be written.
sub _write_out {
    my ( $batch_id, $batch, $tmpfolder ) = @_;

    # Batch files are stored in subfolder input
    my $input_dir = "$tmpfolder/input";

    # Each level is checked on its own, so a working folder that already
    # exists without its input subfolder is completed rather than rejected.
    for my $dir ( $tmpfolder, $input_dir ) {
        next if -d $dir;
        mkdir $dir
          or die("Cannot create temporary folder $dir: $!\n");
    }

    my $batch_file = "$input_dir/in_$batch_id.txt";
    open( my $OF, '>', $batch_file )
      or die("Cannot write output file $batch_file: $!\n");
    print {$OF} join( "\n", @{$batch} );

    # Buffered write errors only surface when the handle is closed
    close $OF
      or die("Cannot write output file $batch_file: $!\n");
    return;
}
#Screen variant of the batch writer, for when no files need to be written
#(Unused). Prints one line per record to the currently selected output
#handle: "<batch id>.<position>", a tab, then the record.
sub _write_screen {
    my ( $batch_id, $batch_ref ) = @_;
    my @records  = @{$batch_ref};
    my $position = 0;
    foreach my $record (@records) {
        print "$batch_id.$position\t$record\n";
        $position++;
    }
}
#Remove temporary files
#The tempfolder needs to be in the /tmp directory
#
# Parameter: the folder to delete. A leading "/tmp" is stripped and the rest
# is always re-anchored below /tmp, so nothing outside /tmp can be named.
#
# Returns the status of the rm command as reported by system(), or -1
# (after a warning) when the request is refused. A request is refused when
# it would resolve to /tmp itself or contains a ".." path component.
sub _clean {
    my $td = shift;
    $td = '' unless defined $td;
    $td =~ s/^\/tmp//;   #This is a failsafe to avoid accidentally deleting other relevant files.

    my $names_subfolder = $td =~ m{\A/[^/]};
    my $has_parent_ref  = $td =~ m{(?:\A|/)\.\.(?:/|\z)};
    if ( !$names_subfolder || $has_parent_ref ) {
        warn "Refusing to remove '/tmp$td': not a folder below /tmp\n";
        return -1;
    }

    # List form: the path is handed to rm as a single argument, no shell
    return system( 'rm', '-rf', '--', "/tmp$td" );
}
#Placeholder hook, kept in case accepted names are to be treated
#differently: whatever it is called with, it answers 0.
sub is_accepted {
    my $accepted = 0;
    return $accepted;
}