-
Notifications
You must be signed in to change notification settings - Fork 6
/
ColdStartLib.pm
executable file
·6491 lines (5922 loc) · 249 KB
/
ColdStartLib.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl
use warnings;
use strict;
### BEGIN INCLUDE Header
use Carp;
use utf8;
use JSON;
use Encode;
### END INCLUDE Header
binmode(STDOUT, ":utf8");
#####################################################################################
# This library handles a variety of tasks needed to process TAC Cold Start
# submissions.
#
# Authors: James Mayfield, Shahzad Rajput
# Please send questions or comments to jamesmayfield "at" gmail "dot" com
### DO NOT INCLUDE
# FIXME: Use github issue tracking or something else?
### DO INCLUDE
#####################################################################################
my $version = "2017.2.0"; # (1) Code state at the release of scores
### BEGIN INCLUDE Switches
# I don't know where this script will be run, so pick a reasonable
# screen width for describing program usage (with the -help switch)
my $terminalWidth = 80;
### END INCLUDE Switches
### BEGIN INCLUDE Logger
#####################################################################################
# Reporting Problems
#####################################################################################
# The following is the default list of problems that can be checked
# for. A different list of problems can be specified as an argument to
# Logger->new(). WARNINGs can be corrected and do not prevent further
# processing. ERRORs permit further error checking, but processing
# does not proceed after that. FATAL_ERRORs cause immediate program
# termination when the error is reported.
my $problem_formats = <<'END_PROBLEM_FORMATS';
# Error Name Type Error Message
# ---------- ---- -------------
########## Provenance Errors
FAILED_LANG_INFERENCE WARNING Unable to infer language from DOCID %s. Using %s by default.
ILLEGAL_DOCID FATAL_ERROR DOCID %s is not a valid DOCID for this task
ILLEGAL_OFFSET ERROR %s is not a valid offset
ILLEGAL_OFFSET_IN_DOC ERROR %s is not a valid offset for DOCID %s
ILLEGAL_OFFSET_PAIR ERROR (%s, %s) is not a valid offset pair
ILLEGAL_OFFSET_PAIR_STRING ERROR %s is not a valid offset pair string
ILLEGAL_OFFSET_TRIPLE_STRING ERROR %s is not a valid docid/offset pair string
MISSING_FILLER_STRING_PROV ERROR Filler string missing in provencance: %s
MULTIPLE_DOCIDS_IN_PROV ERROR %s contains multiple DOCIDs
MULTIPLE_STRINGS_FOR_PROV ERROR Multiple strings provided for provenance: %s (%s)
TOO_MANY_PROVENANCE_TRIPLES WARNING Too many provenance triples (%d) provided; only the first %d will be used
TOO_MANY_PROVENANCE_TRIPLES_E ERROR Unexpected number of provenance triples: provided=(%d) expected=(%d)
TOO_MANY_CHARS WARNING Provenance contains too many characters; only the first %d will be used
TOO_MANY_TOTAL_CHARS ERROR All provenance strings contain a total of more than %d characters
UNEXPECTED_PROVENANCE ERROR Only PREDICATE_JUSTIFICATION is expected in the provenance: %s
UNEXPECTED_BASE_FILLER ERROR %s is not an allowed value for BASE_FILLER provenance for event assertions
TOO_MANY_PROVENANCES_IN_LIST ERROR Unexpected number of provenances in the list: %s. provided=(%d) expected=(%d)
########## Knowledge Base Errors
AMBIGUOUS_PREDICATE ERROR %s: ambiguous predicate
COLON_OMITTED WARNING Initial colon omitted from name of node %s
DUPLICATE_ASSERTION WARNING The same assertion is made more than once (%s)
ILLEGAL_CONFIDENCE_VALUE ERROR Illegal confidence value: %s
ILLEGAL_NODE_NAME ERROR Illegal node name: %s. (Accepted: :Entity..., :Event..., :String...; A dash '-' is not acceptable as part of the name)
ILLEGAL_NODE_TYPE ERROR Illegal node type: %s
ILLEGAL_LINK_SPECIFICATION WARNING Illegal link specification: %s
ILLEGAL_PREDICATE ERROR Illegal predicate: %s
ILLEGAL_PREDICATE_TYPE ERROR Illegal predicate type: %s
ILLEGAL_REALIS ERROR Illegal realis: %s
INACCURACTE_MENTION_STRING WARNING Mention (or filler) string '%s' not found at %s
INCOMPATIBLE_NODE_NAME ERROR Node name %s is not compatible with type %s
IMPROPER_CONFIDENCE_VALUE WARNING Confidence value in scientific format: %s
MISSING_CANONICAL WARNING Entity %s has no canonical mention in document %s
MISSING_CANONICAL_E ERROR Canonical mention of node %s in document %s required for inferring inverse
MISSING_MENTION WARNING Node %s has no mention in document %s
MISSING_MENTION_E ERROR %s: '%s' is not a mention of node %s
MISSING_DECIMAL_POINT WARNING Decimal point missing in confidence value: %s
MISSING_INVERSE WARNING No inverse relation asserted for %s(%s, %s)
MISSING_REALIS ERROR Realis is missing in assertion: %s
MISSING_RUNID ERROR The first line of the file does not contain a legal runid
MISSING_TYPEDEF ERROR No type asserted for Node %s
MULTIPLE_CANONICAL ERROR More than one canonical mention for Node %s in document %s
MULTIPLE_FILLS_ENTITY WARNING Node %s has multiple %s fills, but should be single-valued
MULTIPLE_LINKS WARNING More than one link from node %s to KB %s
MULTIPLE_MENTIONS_NO_CANONICAL ERROR Node %s has more than one named/nominal mention in document %s but has no canonical mention
MULTITYPED_ENTITY ERROR Node %s has more than one type: %s
NO_MENTIONS WARNING Node %s has no mentions
PREDICATE_ALIAS WARNING Use of %s predicate; %s replaced with %s
STRING_USED_FOR_ENTITY ERROR Expecting a node, but got string %s
SUBJECT_PREDICATE_MISMATCH ERROR Type of subject (%s) does not match type of predicate (%s)
UNASSERTED_MENTION WARNING Failed to assert that %s in document %s is also a mention
UNATTESTED_RELATION_ENTITY ERROR Relation %s uses node %s, but that node has no mentions in provenance %s
UNEXPECTED_REALIS ERROR Unexpected value of realis (expected %s, got %s)
UNQUOTED_STRING WARNING String %s not surrounded by double quotes
UNKNOWN_TYPE ERROR Cannot infer type for Node %s
########## Query File Errors
DUPLICATE_QUERY WARNING Queries %s and %s share entry point(s)
DUPLICATE_QUERY_ID WARNING Duplicate query ID %s
DUPLICATE_QUERY_FIELD WARNING Duplicate <%s> tag
MALFORMED_QUERY ERROR Malformed query %s
MISMATCHED_HOP_SUBTYPES WARNING In %s, range of %s does not match domain of %s
MISMATCHED_HOP_TYPES WARNING In %s, type of %s does not match domain of %s
MISMATCHED_TAGS WARNING <%s> tag closed with </%s>
MISSING_QUERY_FIELD ERROR Missing <%s> tag in query %s
NO_QUERIES_LOADED WARNING No queries found
POSSIBLE_DUPLICATE_QUERY WARNING Queries %s and %s are possibly duplicates, based on entrypoint %s
QUERY_WITHOUT_LOADED_PARENT ERROR Query %s has parent %s that was not loaded
UNKNOWN_QUERY_FIELD WARNING <%s> is not a recognized query field
UNLOADED_QUERY WARNING Query %s is not present in the query files; skipping it
########## Submission File/Assessment File Errors
BAD_QUERY WARNING Response for illegal query %s skipped
DISCARDED_DEPENDENT WARNING Following line has been discarded because all of its parents were discarded due to constraints on multiple justifications: %s
DISCARDED_DEPENDENT_DEPTH WARNING Following line has been discarded because all of its parents were discarded due to depth constraints: %s
DISCARDED_ENTRY WARNING Following line has been discarded due to constraints on multiple justifications: %s
DISCARDED_ENTRY_DEPTH WARNING Following line has been discarded due to depth constraints: %s
DUPLICATE_LINE WARNING Following line appears more than once in the submission therefore all copies but one will be removed: %s
EMPTY_FIELD WARNING Empty value for column %s
EMPTY_FILE WARNING Empty response or assessment file: %s
ILLEGAL_VALUE_TYPE ERROR Illegal value type: %s
MISMATCHED_RUNID FATAL_ERROR Multiple runids were used: %s
MULTIPLE_CORRECT_GROUND_TRUTH WARNING More than one correct choice for ground truth for query %s
MULTIPLE_DOCIDS_IN_RESPONSE ERROR Multiple DOCIDs used in response: %s
MULTIPLE_FILLS_SLOT WARNING Multiple responses given to single-valued slot %s
MULTIPLE_RUNIDS WARNING File contains multiple run IDs (%s, %s)
OFF_TASK_SLOT WARNING %s slot is not valid for task %s
SEMICOLON_IN_PROVENANCE_E ERROR A semicolon is used in the provenance %s
SEMICOLON_AS_SEPARATOR WARNING A semicolon is used as a triple separator in the provenance %s. The semicolon will be replaced with a comma.
UNEXPECTED_ASSESSMENT_ENTRY ERROR Child with no correct parent found in assessment file. %s %s
UNKNOWN_QUERY_ID ERROR Unknown query: %s
UNKNOWN_QUERY_ID_WARNING WARNING Unknown query: %s
UNKNOWN_RESPONSE_FILE_TYPE FATAL_ERROR %s is not a known response file type
UNKNOWN_SLOT_NAME ERROR Unknown slot name: %s
WRONG_SLOT_NAME WARNING Slot %s is not the requested slot for query %s (expected %s)
########## Multi-Use Errors
WRONG_NUM_ENTRIES ERROR Wrong number of entries on line (expected %d, got %d)
END_PROBLEM_FORMATS
#####################################################################################
# Logger
#####################################################################################
package Logger;
use Carp;
# Construct a Logger.
#
# Arguments:
#   $class        - class to bless the new object into
#   $formats      - optional problem-format table (see add_formats); the
#                   default table is used when this is false
#   $error_output - optional destination for error output (see
#                   set_error_output)
# Returns the new Logger.
sub new {
    my ($class, $formats, $error_output) = @_;
    my $self = bless {
        FORMATS        => {},
        PROBLEMS       => {},
        PROBLEM_COUNTS => {},
    }, $class;
    # The output destination is configured before the formats are loaded,
    # in the same order as always.
    $self->set_error_output($error_output);
    $self->add_formats($formats ? $formats : $problem_formats);
    return $self;
}
# Add additional error formats to an existing Logger.
#
# $formats is a multi-line string holding one problem definition per line:
#
#   PROBLEM_NAME  TYPE  printf-style message
#
# The three fields are separated by whitespace; everything after the second
# field is the message. Blank lines are skipped, as are comment lines, i.e.
# lines whose first non-blank character is '#'. A later definition of an
# already-known problem name replaces the earlier one.
sub add_formats {
    my ($self, $formats) = @_;
    # Nothing to add; avoids "uninitialized" warnings from split
    return unless defined $formats;
    foreach my $line (split(/\n/, $formats)) {
        next unless $line =~ /\S/;
        # Comment lines may be indented, so test for leading whitespace
        # (\s*) in front of the pound sign rather than non-whitespace
        next if $line =~ /^\s*#/;
        $line =~ s/^\s+//;
        my ($problem, $type, $format) = split(/\s+/, $line, 3);
        $self->{FORMATS}{$problem} = {TYPE => $type, FORMAT => $format};
    }
}
# Get the names of all problems of type WARNING (the ones that may be
# suppressed through the -ignore switch) as a single string, sorted by name
# and separated by ", ".
sub get_warning_names {
    my ($self) = @_;
    my $formats = $self->{FORMATS};
    my @warning_names;
    foreach my $name (sort keys %{$formats}) {
        push(@warning_names, $name) if $formats->{$name}{TYPE} eq 'WARNING';
    }
    return join(", ", @warning_names);
}
# Suppress reporting of the named warning from now on.
#
# Terminates the program via NIST_die if the name is unknown or if the
# problem is not of type WARNING. Each call bumps the problem's entry in
# IGNORE_WARNINGS.
sub ignore_warning {
    my ($self, $warning) = @_;
    my $definition = $self->{FORMATS}{$warning};
    if (!$definition) {
        $self->NIST_die("Unknown warning: $warning");
    }
    if ($definition->{TYPE} ne 'WARNING') {
        $self->NIST_die("$warning is a fatal error; cannot ignore it");
    }
    $self->{IGNORE_WARNINGS}{$warning}++;
}
# Suppress a problem of any type. This reuses the IGNORE_WARNINGS
# bookkeeping of ignore_warning but does not insist that the problem be a
# WARNING. Terminates via NIST_die if the name is unknown.
sub delete_error {
    my ($self, $error) = @_;
    if (!$self->{FORMATS}{$error}) {
        $self->NIST_die("Unknown error: $error");
    }
    $self->{IGNORE_WARNINGS}{$error}++;
}
# Report whether the named problem is currently being ignored. Returns the
# problem's IGNORE_WARNINGS entry (true once ignore_warning or delete_error
# has been called for it). Terminates via NIST_die if the name is unknown.
sub is_ignored {
    my ($self, $warning) = @_;
    my $known = $self->{FORMATS}{$warning};
    $self->NIST_die("Unknown error: $warning") if !$known;
    return $self->{IGNORE_WARNINGS}{$warning};
}
# Remember that a particular problem was encountered, for later reporting.
#
# Arguments:
#   $problem - name of the problem (a key of the format table)
#   ...      - values for the placeholders of the problem's message format
#   (last)   - the source of the problem: either a hash reference with
#              FILENAME and LINENUM entries, or a plain string
#
# Ignored problems are dropped silently. A problem whose name is not in the
# format table is treated as an INTERNAL_ERROR. FATAL_ERROR and
# INTERNAL_ERROR problems terminate the program immediately via NIST_die;
# all others are tallied by type and stored by problem, message and source.
sub record_problem {
    my ($self, $problem, @details) = @_;
    # The source always travels as the final argument
    my $source = pop(@details);
    # Warnings can be suppressed here; errors cannot
    return if $self->{IGNORE_WARNINGS}{$problem};
    my $definition = $self->{FORMATS}{$problem};
    $definition = {TYPE   => 'INTERNAL_ERROR',
                   FORMAT => "Unknown problem $problem: %s"}
        unless $definition;
    my ($severity, $template) = @{$definition}{qw(TYPE FORMAT)};
    $self->{PROBLEM_COUNTS}{$severity}++;
    # Use Encode to support Unicode.
    my $message = Encode::encode_utf8("$severity: " . sprintf($template, @details));
    my $where = $source;
    $where = "$source->{FILENAME} line $source->{LINENUM}" if ref $source;
    if ($severity eq 'FATAL_ERROR' || $severity eq 'INTERNAL_ERROR') {
        $self->NIST_die("$message\n$where");
    }
    $self->{PROBLEMS}{$problem}{$message}{$where}++;
}
# Send error output to a particular file or file handle.
#
# $output may be:
#   - false:                       STDERR is used
#   - a reference:                 taken to be an open handle, used as is
#   - 'stdout' / 'stderr'          (any letter case): the standard stream
#   - any other string:            a file name; the file must not exist yet
#                                  and is opened for UTF-8 output
# Terminates via NIST_die if the named file already exists or cannot be
# opened. A file opened here is flagged so close_error_output can close it.
sub set_error_output {
    my ($self, $output) = @_;
    my $handle;
    if (!$output) {
        $handle = *STDERR{IO};
    }
    elsif (ref $output) {
        $handle = $output;
    }
    else {
        my $name = lc $output;
        if ($name eq 'stdout') {
            $handle = *STDOUT{IO};
        }
        elsif ($name eq 'stderr') {
            $handle = *STDERR{IO};
        }
        else {
            $self->NIST_die("File $output already exists") if -e $output;
            open(my $outfile, ">:utf8", $output) or $self->NIST_die("Could not open $output: $!");
            $self->{OPENED_ERROR_OUTPUT} = 'true';
            $handle = $outfile;
        }
    }
    $self->{ERROR_OUTPUT} = $handle;
}
# Retrieve the handle that error output is currently being sent to
sub get_error_output {
    my ($self) = @_;
    return $self->{ERROR_OUTPUT};
}
# Close the error output, but only if set_error_output opened it itself
# (a handle supplied by the caller or a standard stream is left open)
sub close_error_output {
    my ($self) = @_;
    if ($self->{OPENED_ERROR_OUTPUT}) {
        close $self->{ERROR_OUTPUT};
    }
}
# Report all of the problems that have been aggregated to the selected error
# output.
#
# Each distinct message is printed once, followed by a blank line. Unless
# the (alphabetically first) source is 'NO_SOURCE', that source is shown in
# parentheses together with a count of the other places where the same
# message arose.
#
# Returns the list (number of errors, number of warnings).
sub report_all_problems {
    my ($self) = @_;
    my $out = $self->{ERROR_OUTPUT};
    my $problems = $self->{PROBLEMS};
    foreach my $problem (sort keys %{$problems}) {
        my $sources_of = $problems->{$problem};
        foreach my $message (sort keys %{$sources_of}) {
            my @places = sort keys %{$sources_of->{$message}};
            my $line = $message;
            if ($places[0] ne 'NO_SOURCE') {
                my $others = @places - 1;
                $line .= " ($places[0]";
                $line .= " and $others other place" if @places > 1;
                $line .= "s" if @places > 2;
                $line .= ")";
            }
            print $out "$line\n\n";
        }
    }
    # Return the number of errors and the number of warnings encountered
    return ($self->{PROBLEM_COUNTS}{ERROR} || 0, $self->{PROBLEM_COUNTS}{WARNING} || 0);
}
# Number of problems of type ERROR recorded so far (0 if none)
sub get_num_errors {
    my ($self) = @_;
    my $count = $self->{PROBLEM_COUNTS}{ERROR};
    return $count ? $count : 0;
}
# Number of problems of type WARNING recorded so far (0 if none)
sub get_num_warnings {
    my ($self) = @_;
    my $count = $self->{PROBLEM_COUNTS}{WARNING};
    return $count ? $count : 0;
}
# Look up the type (e.g. WARNING, ERROR, FATAL_ERROR) of the named problem.
# Returns undef if the problem name is not in the format table.
sub get_error_type {
    my ($self, $error_name) = @_;
    # Fetch the entry first instead of chaining the subscripts: chaining
    # would autovivify an empty entry for an unknown name, which would then
    # make that name look like a known problem to later lookups.
    my $definition = $self->{FORMATS}{$error_name};
    return $definition ? $definition->{TYPE} : undef;
}
# NIST submission scripts demand an error code of 255 on failure
my $NIST_error_code = 255;
### DO NOT INCLUDE
# FIXME: Inconsistency: sometimes NIST_die is called directly; other
# times record_problem is called with a FATAL_ERROR
### DO INCLUDE
# Print a stack trace and the given messages to the error output, then
# terminate the program with the exit code NIST submission scripts require.
#
# Arguments:
#   @messages - strings that are concatenated to form the final message
#
# This can be reached before an error output has been selected (for
# example while set_error_output is still validating its argument), so
# STDERR is used whenever no error output is available.
sub NIST_die {
    my ($self, @messages) = @_;
    my $outfile = (ref $self && $self->{ERROR_OUTPUT}) || *STDERR{IO};
    print $outfile "================================================================\n";
    print $outfile Carp::longmess();
    print $outfile "================================================================\n";
    print $outfile join("", @messages), " at (", join(":", caller), ")\n";
    exit $NIST_error_code;
}
### END INCLUDE Logger
### BEGIN INCLUDE Patterns
#####################################################################################
# Patterns
#####################################################################################
package main;
# Eliminate comments, ensuring that pound signs in the middle of
# strings are not treated as comment characters
# Here is the original slightly clearer syntax that unfortunately doesn't work with Perl 5.8
# s/^(
# (?:[^#"]*+ # Any number of chars that aren't double quote or pound sign
# (?:"(?:[^"\\]++|\\.)*+")? # Any number of double quoted strings
# )*+ # The pair of them repeated any number of times
# ) # Everything up to here is captured in $1
# (\s*\#.*)$/x; # Pound sign through the end of the line is not included in the replacement
# Perl 5.8-compatible form of the pattern above: atomic groups (?>...) take
# the place of the possessive quantifiers. On a match, $1 holds everything
# that precedes the comment and $2 holds the comment itself, including any
# whitespace in front of its pound sign.
our $comment_pattern = qr/
^(
(?>
(?:
(?>[^#"]*) # Any number of chars that aren't double quote or pound sign
(?:" # Beginning of double quoted string
(?> # Start a possessive match of the string body
(?:(?>[^"\\]+)|\\.)* # Possessively match any number of non-double quotes or match an escaped char
)" # Possessively match the above repeatedly, before the closing double quote
)? # There might or might not be a double quoted string
)* # The pair of them repeated any number of times
) # Possessively match everything before a pound sign that starts the comment
) # Everything up to here is captured in $1
(\s*\#.*)$/x; # Pound sign through the end of the line is not included in the replacement
### END INCLUDE Patterns
### BEGIN INCLUDE ProvenanceList
#####################################################################################
# ProvenanceList
#####################################################################################
package ProvenanceList;
# Create a new ProvenanceList object.
#
# Arguments:
#   $class   - class to bless the new object into
#   $logger  - Logger used to record problems
#   $where   - source location handed to the logger with each problem
#   $text    - optional provenance list text; when false, an empty list
#              holding only the logger and location is returned
#   $subject, $object, $verb
#            - parts of the assertion the provenance belongs to; passed on
#              to populate_from_text together with $text
sub new {
    my ($class, $logger, $where, $text, $subject, $object, $verb) = @_;
    my $self = bless {LOGGER => $logger, WHERE => $where}, $class;
    if ($text) {
        $self->{ORIGINAL_STRING} = $text;
        $self->populate_from_text($text, $subject, $object, $verb);
    }
    return $self;
}
# Fill this ProvenanceList from its textual form.
#
# $text is a semicolon-separated list of provenance entries; an entry that
# is the literal string NIL stands for "no provenance". $subject, $object
# and $verb describe the assertion the list belongs to and are used to
# validate the number of entries (see validate_list).
#
# Sets DOCID, FILLER_STRING, PREDICATE_JUSTIFICATION, BASE_FILLER and
# ADDITIONAL_JUSTIFICATION on the object; each of the latter four is a
# Provenance object or undef.
sub populate_from_text {
my ($self, $text, $subject, $object, $verb) = @_;
my $logger = $self->{LOGGER};
my $where = $self->{WHERE};
my ($filler_string,$predicate_justification,$base_filler,$additional_justification);
my @elements = split(";", $text);
# Problems with the number of entries are only recorded; processing continues
$self->validate_list($subject, $object, $verb, scalar @elements);
# When the object is a :String node, the first entry is the provenance of
# the filler string (at most one triple) and is removed from the list
if($object =~ /^:String/) {
$filler_string = $elements[0] eq 'NIL' ? undef : Provenance->new($logger, $where, 'PROVENANCETRIPLELIST+1', $elements[0]);
shift(@elements);
}
# The remaining entries are taken positionally. The suffix after the plus
# sign is the maximum number of triples Provenance->new accepts for that
# entry ('+' meaning no limit).
# NOTE(review): if more than three entries remain, $types[$_] is undefined
# for the extras and the extra objects are dropped by the list assignment.
my @types = qw(PROVENANCETRIPLELIST+3 PROVENANCETRIPLELIST+1 PROVENANCETRIPLELIST++);
($predicate_justification,$base_filler,$additional_justification) =
map {
$elements[$_] eq 'NIL' ?
undef : Provenance->new($logger, $where, $types[$_], $elements[$_])
} (0..$#elements);
# Collect the document ID of every provenance that is present
my @docids = map {$_->get_docid() if $_}
grep {defined $_}
($filler_string,$predicate_justification,$base_filler,$additional_justification);
my %docids = map{$_=>1 if $_} @docids;
# All entries must share exactly one document ID.
# NOTE(review): this is also reported when no entry yields a document ID
# at all (e.g. every entry is NIL), since the count is then 0 rather than 1.
unless (scalar keys %docids == 1) {
$logger->record_problem('MULTIPLE_DOCIDS_IN_PROV', $self->tooriginalstring(), $where);
}
$self->{DOCID} = $docids[0];
$self->{FILLER_STRING} = $filler_string;
$self->{PREDICATE_JUSTIFICATION} = $predicate_justification;
$self->{BASE_FILLER} = $base_filler;
$self->{ADDITIONAL_JUSTIFICATION} = $additional_justification;
}
# Check that a provenance list has the number of entries its assertion
# calls for, recording TOO_MANY_PROVENANCES_IN_LIST when it does not.
#
# Arguments:
#   $subject, $object, $verb - parts of the assertion
#   $count                   - number of entries found in the list
#
# Expected number of entries:
#   - mention assertions ($verb contains "mention"): exactly 1
#   - all others: 1, plus 2 if the subject or the object is an :Event node,
#     plus 1 if the object is a :String node
sub validate_list {
    my ($self, $subject, $object, $verb, $count) = @_;
    # Treat missing parts as empty so the pattern matches do not warn
    $subject = '' unless defined $subject;
    $object  = '' unless defined $object;
    $verb    = '' unless defined $verb;
    my $expected = 1;
    if ($verb !~ /mention/) {
        $expected += 2 if $subject =~ /^:Event/ || $object =~ /^:Event/;
        $expected += 1 if $object =~ /^:String/;
    }
    # The expected value reported is the one the count was actually tested
    # against, which for mentions is always 1
    $self->{LOGGER}->record_problem('TOO_MANY_PROVENANCES_IN_LIST', $self->{ORIGINAL_STRING}, $count, $expected, $self->{WHERE})
        if $count != $expected;
}
# Get the complete path of the file containing the document used in the
# provenance. Returns nothing if this list has no document ID, if no
# document table has been registered in $Provenance::docids, or if the
# document is not in that table.
sub get_docfile {
    my ($self) = @_;
    my $docid = $self->{DOCID};
    return unless $docid;
    my $known_docs = $Provenance::docids;
    return unless $known_docs;
    my $entry = $known_docs->{$docid};
    return unless $entry;
    return $entry->{FILE};
}
# Document ID shared by the entries of this list, or the string
# "NO DOCUMENT" when none has been set
sub get_docid {
    my ($self) = @_;
    return $self->{DOCID} ? $self->{DOCID} : "NO DOCUMENT";
}
# Number of triples held by each of the four provenance slots.
#
# Returns a flat list of name/count pairs, in the order FILLER_STRING,
# PREDICATE_JUSTIFICATION, BASE_FILLER, ADDITIONAL_JUSTIFICATION. A slot
# that is undefined or holds the string "NIL" counts as 0.
sub get_counts {
    my ($self) = @_;
    my @counts;
    foreach my $slot (qw(FILLER_STRING PREDICATE_JUSTIFICATION BASE_FILLER ADDITIONAL_JUSTIFICATION)) {
        my $provenance = $self->{$slot};
        my $absent = !defined($provenance) || $provenance eq "NIL";
        push(@counts, $slot, $absent ? 0 : scalar @{$provenance->{TRIPLES}});
    }
    return @counts;
}
# Start offset reported by the predicate justification, or 0 when this list
# has no predicate justification
sub get_start {
    my ($self) = @_;
    my $justification = $self->{PREDICATE_JUSTIFICATION};
    return 0 unless $justification;
    return scalar $justification->get_start();
}
# End offset reported by the predicate justification, or 0 when this list
# has no predicate justification
sub get_end {
    my ($self) = @_;
    my $justification = $self->{PREDICATE_JUSTIFICATION};
    return 0 unless $justification;
    return scalar $justification->get_end();
}
# This is used to get a consistent string representing the provenancelist.
#
# The result has the form
#   filler_string;predicate_justification;base_filler;additional_justification
# where each part is the tostring() of the corresponding provenance, or
# "NIL" if that provenance is absent. The string is built on the first call
# and cached; later calls return the cached value without rebuilding the
# four parts.
sub tostring {
    my ($self) = @_;
    return $self->{PROVENANCE_TOSTRING} if $self->{PROVENANCE_TOSTRING};
    my @parts = map { $self->{$_} ? scalar $self->{$_}->tostring() : "NIL" }
                qw(FILLER_STRING PREDICATE_JUSTIFICATION BASE_FILLER ADDITIONAL_JUSTIFICATION);
    $self->{PROVENANCE_TOSTRING} = join(";", @parts);
    return $self->{PROVENANCE_TOSTRING};
}
# tostring() normalizes provenance entry order; this returns the text the
# list was created from, or the empty string if it was created without text
sub tooriginalstring {
    my ($self) = @_;
    return $self->{ORIGINAL_STRING} ? $self->{ORIGINAL_STRING} : "";
}
## END INCLUDE ProvenanceList
### BEGIN INCLUDE Provenance
#####################################################################################
# Provenance
#####################################################################################
package Provenance;
# Bounds from "Task Description for English Slot Filling at TAC-KBP 2014"
my $max_chars_per_triple = 200;
my $max_total_chars = 800;
my $max_triples = 3;
{
our $docids;
# Register the table of known documents. The first argument is stored in
# $docids; check_triple and get_docfile index it by document ID and read
# the LENGTH and FILE entries of each document.
sub set_docids {
    my ($table) = @_;
    $docids = $table;
}
# Validate a particular docid/offset-pair entry.
#
# Arguments:
#   $logger - Logger used to record problems
#   $where  - source location handed to the logger with each problem
#   $docid, $start, $end - the entry to validate
#
# Returns the list ($start, $end) when the entry is usable; $end may have
# been reduced so the span does not exceed the per-triple character limit.
# Returns nothing when the document ID is NO_DOCUMENT or when any check
# produced a problem of type ERROR.
sub check_triple {
    my ($logger, $where, $docid, $start, $end) = @_;
    # If the offset triple is illegible, the document ID is set to
    # NO_DOCUMENT. Return failure, but don't report it (as the
    # underlying error has already been reported)
    return if $docid eq 'NO_DOCUMENT';

    # Severity of each problem found, keyed by the aspect that was checked
    my %severity;

    unless ($start =~ /^\d+$/) {
        $logger->record_problem('ILLEGAL_OFFSET', $start, $where);
        $severity{START} = $logger->get_error_type('ILLEGAL_OFFSET');
    }
    unless ($end =~ /^\d+$/) {
        $logger->record_problem('ILLEGAL_OFFSET', $end, $where);
        $severity{END} = $logger->get_error_type('ILLEGAL_OFFSET');
    }
    # Documents can only be checked once a document table is registered
    my $have_docids = defined $docids;
    if ($have_docids && !$docids->{$docid}) {
        $logger->record_problem('ILLEGAL_DOCID', $docid, $where);
        $severity{DOCID} = $logger->get_error_type('ILLEGAL_DOCID');
    }

    # The remaining checks only make sense for values that passed above
    my $start_usable = ($severity{START} || '') ne 'ERROR';
    my $end_usable   = ($severity{END}   || '') ne 'ERROR';
    my $docid_usable = ($severity{DOCID} || '') ne 'ERROR';

    if ($start_usable && $end_usable) {
        if ($end < $start) {
            $logger->record_problem('ILLEGAL_OFFSET_PAIR', $start, $end, $where);
            $severity{PAIR} = $logger->get_error_type('ILLEGAL_OFFSET_PAIR');
        }
        elsif ($end - $start + 1 > $max_chars_per_triple) {
            $logger->record_problem('TOO_MANY_CHARS', $max_chars_per_triple, $where);
            $severity{LENGTH} = $logger->get_error_type('TOO_MANY_CHARS');
            # Fix the problem by truncating
            $end = $start + $max_chars_per_triple - 1;
        }
    }
    if ($have_docids && $start_usable && $docid_usable
        && $start > $docids->{$docid}{LENGTH}) {
        $logger->record_problem('ILLEGAL_OFFSET_IN_DOC', $start, $docid, $where);
        $severity{START_OFFSET} = $logger->get_error_type('ILLEGAL_OFFSET_IN_DOC');
    }
    # Uses the possibly truncated end offset
    if ($have_docids && $end_usable && $docid_usable
        && $end > $docids->{$docid}{LENGTH}) {
        $logger->record_problem('ILLEGAL_OFFSET_IN_DOC', $end, $docid, $where);
        $severity{END_OFFSET} = $logger->get_error_type('ILLEGAL_OFFSET_IN_DOC');
    }

    # Any ERROR makes the entry unusable; WARNINGs do not
    return if grep { $_ eq 'ERROR' } values %severity;
    return ($start, $end);
}
}
# This is used to, among other things, get a consistent string
# representing the provenance for use in construction of a UUID.
#
# The result is a comma-separated list of "DOCID:START-END" entries, one
# per triple, all using this object's single document ID. Entries are
# ordered by START (numerically) and then by END (compared as strings).
# The string is cached on the object once it has been built.
sub tostring {
    my ($self) = @_;
    unless ($self->{PROVENANCE_TOSTRING}) {
        my @ordered = sort {$a->{START} <=> $b->{START} ||
                            $a->{END} cmp $b->{END}}
                      @{$self->{TRIPLES}};
        $self->{PROVENANCE_TOSTRING} =
            join(",", map {"$self->{DOCID}:$_->{START}-$_->{END}"} @ordered);
    }
    return $self->{PROVENANCE_TOSTRING};
}
# This is used to, among other things, get a short version
# for Event Argument output.
#
# Same as tostring() but without the document ID: a comma-separated list of
# "START-END" entries ordered by START (numerically) and then by END
# (compared as strings). The string is cached on the object.
sub toshortstring {
    my ($self) = @_;
    my $cached = $self->{PROVENANCE_TOSTRING_SHORT};
    return $cached if $cached;
    my @spans = map {"$_->{START}-$_->{END}"}
                sort {$a->{START} <=> $b->{START} ||
                      $a->{END} cmp $b->{END}}
                @{$self->{TRIPLES}};
    $self->{PROVENANCE_TOSTRING_SHORT} = join(",", @spans);
    return $self->{PROVENANCE_TOSTRING_SHORT};
}
# tostring() normalizes provenance entry order; this retains the order in
# which the triples were stored. The entries are rebuilt from the stored
# (validated) offsets as "DOCID:START-END", separated by commas.
sub tooriginalstring {
    my ($self) = @_;
    my @entries = map {"$self->{DOCID}:$_->{START}-$_->{END}"} @{$self->{TRIPLES}};
    return join(",", @entries);
}
# Create a new Provenance object.
#
# Arguments:
#   $class  - class to bless the new object into
#   $logger - Logger used to record problems
#   $where  - source location handed to the logger and stored with each
#             triple
#   $type   - how @values is to be interpreted:
#               EMPTY                 no provenance; @values is not used
#               DOCID_OFFSET_OFFSET   @values = (docid, start, end)
#               DOCID_OFFSETPAIRLIST  @values = (docid, "s1-e1,s2-e2,...")
#               PROVENANCETRIPLELIST  @values = ("docid:s1-e1,docid:s2-e2,...")
#             PROVENANCETRIPLELIST may be followed by a plus sign and one
#             character: a digit gives the maximum number of triples for
#             this call, and a second plus sign lifts the limit altogether.
#             Any other type yields an object without triples.
#   @values - see $type
#
# Returns the new object. For DOCID_OFFSETPAIRLIST nothing is returned as
# soon as one offset pair fails validation.
sub new {
    my ($class, $logger, $where, $type, @values) = @_;
    my $self = {LOGGER => $logger, TRIPLES => [], WHERE => $where};
    my $total = 0;
    # This is where we control custom max_triples. The limit applies to this
    # call only; the shared default is left alone so that the limit asked
    # for by one call cannot leak into later calls.
    my $triple_limit = $max_triples;
    if ($type =~ /PROVENANCETRIPLELIST\+(.)/) {
        $triple_limit = $1;
        $type = 'PROVENANCETRIPLELIST';
    }
    if ($type eq 'EMPTY') {
        # DO NOTHING
    }
    elsif ($type eq 'DOCID_OFFSET_OFFSET') {
        my ($docid, $start, $end) = @values;
        # check_triple returns an empty list on failure, making the
        # list assignment false
        if (($start, $end) = check_triple($logger, $where, $docid, $start, $end)) {
            $self->{DOCID} = $docid;
            push(@{$self->{TRIPLES}}, {START => $start,
                                       END   => $end,
                                       WHERE => $where});
            $total += $end - $start + 1;
        }
    }
    elsif ($type eq 'DOCID_OFFSETPAIRLIST') {
        my ($docid, $offset_pair_list) = @values;
        my $start;
        my $end;
        foreach my $pair (split(/,/, $offset_pair_list)) {
            unless (($start, $end) = $pair =~ /^\s*(\d+)-(\d+)\s*$/) {
                $logger->record_problem('ILLEGAL_OFFSET_PAIR_STRING', $pair, $where);
                $start = 0;
                $end = 0;
            }
            if (($start, $end) = check_triple($logger, $where, $docid, $start, $end)) {
                $self->{DOCID} = $docid unless $self->{DOCID};
                $logger->record_problem('MULTIPLE_DOCIDS_IN_PROV', $pair, $where)
                    if($docid ne $self->{DOCID});
                push(@{$self->{TRIPLES}}, {START => $start,
                                           END   => $end,
                                           WHERE => $where});
                $total += $end - $start + 1;
            }
            else {
                # One bad pair invalidates the whole provenance
                return;
            }
        }
    }
    elsif ($type eq 'PROVENANCETRIPLELIST') {
        my ($triple_list) = @values;
        # If a semicolon is present in the triple_list, determine if it's used as a separator (may be in addition to a comma);
        # Determine if it can be repaired; generate an ERROR or WARNING accordingly
        if($triple_list =~ /;/) {
            if($triple_list =~ qr/^(?:[^:;]+:\d+-\d+[,;]){0,3}[^:;]+:\d+-\d+$/) {
                # Generate a WARNING and repair
                $logger->record_problem('SEMICOLON_AS_SEPARATOR', $triple_list, $where);
                # Replace the semicolons with commas
                $triple_list =~ s/;/,/g;
            }
            else{
                # Cannot be repaired; generate an ERROR
                $logger->record_problem('SEMICOLON_IN_PROVENANCE_E', $triple_list, $where);
            }
        }
        my @triple_list = split(/,/, $triple_list);
        # This is where we handle unlimited triple list
        # specified using PROVENANCETRIPLELIST++
        if ($triple_limit ne "+" && @triple_list > $triple_limit) {
            $logger->record_problem('TOO_MANY_PROVENANCE_TRIPLES_E',
                                    scalar @triple_list, $triple_limit, $where);
            # Only the first $triple_limit triples are kept
            $#triple_list = $triple_limit - 1;
        }
        foreach my $triple (@triple_list) {
            my $docid;
            my $start;
            my $end;
            unless (($docid, $start, $end) = $triple =~ /^\s*([^:]+):(\d+)-(\d+)\s*$/) {
                $logger->record_problem('ILLEGAL_OFFSET_TRIPLE_STRING', $triple, $where);
                # check_triple quietly rejects this sentinel document ID
                $docid = 'NO_DOCUMENT';
                $start = 0;
                $end = 0;
            }
            if (($start, $end) = check_triple($logger, $where, $docid, $start, $end)) {
                # The first valid triple fixes the document ID of the object
                $self->{DOCID} = $docid unless $self->{DOCID};
                $logger->record_problem('MULTIPLE_DOCIDS_IN_PROV', $triple_list, $where)
                    if($docid ne $self->{DOCID});
                push(@{$self->{TRIPLES}}, {START => $start,
                                           END   => $end,
                                           WHERE => $where});
                $total += $end - $start + 1;
            }
        }
    }
    if ($total > $max_total_chars) {
        $logger->record_problem('TOO_MANY_TOTAL_CHARS', $max_total_chars, $where);
    }
    bless($self, $class);
    $self;
}
# Get the complete path of the file containing the document used in the
# provenance. Returns nothing if this provenance has no document ID, if no
# document table has been registered in $Provenance::docids, or if the
# document is not in that table.
sub get_docfile {
    my ($self) = @_;
    my $docid = $self->{DOCID};
    return unless $docid;
    my $known_docs = $Provenance::docids;
    return unless $known_docs;
    my $entry = $known_docs->{$docid};
    return unless $entry;
    return $entry->{FILE};
}
# Document ID of this provenance, or the string "NO DOCUMENT" when it holds
# no triples. The optional $num argument is accepted but has no effect,
# since all triples of a provenance share one document ID.
sub get_docid {
    my ($self, $num) = @_;
    return "NO DOCUMENT" if !@{$self->{TRIPLES}};
    return $self->{DOCID};
}
# Start offset of one triple of this provenance.
#
# Arguments:
#   $num - index of the triple (default 0)
#
# Returns 0 if the provenance holds no triples, and undef if there is no
# triple at index $num.
sub get_start {
    my ($self, $num) = @_;
    $num = 0 unless defined $num;
    return 0 unless @{$self->{TRIPLES}};
    # Fetch the triple before looking inside it: subscripting straight
    # through a missing element would create it and grow the triple list
    my $triple = $self->{TRIPLES}[$num];
    return $triple ? $triple->{START} : undef;
}
# End offset of one triple of this provenance.
#
# Arguments:
#   $num - index of the triple (default 0)
#
# Returns 0 if the provenance holds no triples, and undef if there is no
# triple at index $num.
sub get_end {
    my ($self, $num) = @_;
    $num = 0 unless defined $num;
    return 0 unless @{$self->{TRIPLES}};
    # Fetch the triple before looking inside it: subscripting straight
    # through a missing element would create it and grow the triple list
    my $triple = $self->{TRIPLES}[$num];
    return $triple ? $triple->{END} : undef;
}
# Number of triples held by this provenance
sub get_num_entries {
    my ($self) = @_;
    my $triples = $self->{TRIPLES};
    return scalar @{$triples};
}
### END INCLUDE Provenance
### BEGIN INCLUDE Query
#####################################################################################
# Query
#####################################################################################
package Query;
# PredicateSet shared by the code in this package. It is deliberately left
# undefined here; put() creates it the first time a SLOT<n> field is stored.
my $predicate_set;
# $predicate_set = PredicateSet->new($logger);
### DO NOT INCLUDE
# FIXME: We'd probably be better off using an existing SGML parser of some sort here
### DO INCLUDE
# This table indicates how to parse XML queries. Each key is a field (tag)
# name; the value describes how that field is handled:
# ORD indicates the output ordering of query fields (the 'single' and
# 'multiple' fields below reuse the same ORD numbers, so the ordering
# appears to be applied within each TYPE -- confirm against the code
# that consumes ORD)
# TYPE indicates whether a query may have only one ('single') or more than
# one ('multiple') of the field (some years allow multiple entrypoints
# in a query)
# YEARS indicates which TAC year(s) used that field (not currently used programmatically)
# REQUIRED flags an error if an attempt is made to output a query that lacks the field
# REWRITE changes the field name to the indicated name
my %tags = (
ENTRYPOINTS => {ORD => 0, TYPE => 'single'},
ENTTYPE => {ORD => 1, TYPE => 'single', YEARS => '2014:2015', REQUIRED => 'yes'},
NODEID => {ORD => 2, TYPE => 'single', YEARS => '2016'},
SLOT => {ORD => 3, TYPE => 'single', YEARS => '2014:2015'},
SLOT0 => {ORD => 4, TYPE => 'single', REQUIRED => 'yes'},
SLOT1 => {ORD => 5, TYPE => 'single', },
SLOT2 => {ORD => 6, TYPE => 'single', YEARS => '2012'},
NAME => {ORD => 1, TYPE => 'multiple', REQUIRED => 'yes'},
DOCID => {ORD => 2, TYPE => 'multiple', REQUIRED => 'yes'},
BEG => {ORD => 3, TYPE => 'multiple', REQUIRED => 'yes', REWRITE => 'START'},
END => {ORD => 4, TYPE => 'multiple', REQUIRED => 'yes'},
OFFSET => {ORD => 5, TYPE => 'multiple', YEARS => '2012:2013'},
);
# Languages known to this package, keyed by language name. NAME repeats the
# key; CODE is a three-letter tag. The CODE values (ENG, SPA, CMN) are the
# same tags that infer_language_from_documentid looks for in document IDs.
my %languages = (
ENGLISH => {
NAME => "ENGLISH",
CODE => "ENG",
},
SPANISH => {
NAME => "SPANISH",
CODE => "SPA",
},
CHINESE => {
NAME => "CHINESE",
CODE => "CMN",
},
);
# Split a full query ID string into its parts.
#
# Two shapes are recognized:
#   [<prefix>_]<10 hex digits>(_<12 hex digits>)*   generated/expanded IDs
#   [<prefix>_]<digits>                             LDC-style numeric IDs
#
# Returns the list ($base, $query_id, $level, $expanded, $prefix, @components):
#   $base        the initial component
#   $query_id    the last component (equal to $base when there are no suffixes)
#   $level       number of "_<12 hex>" suffixes (always 0 for LDC-style IDs)
#   $expanded    'true' for the hex form, undef for LDC-style IDs
#   $prefix      text before the initial component, or undef if absent
#   @components  the initial component followed by every suffix, in order
# Dies if $full matches neither shape.
sub parse_queryid {
  my ($full) = @_;
  my ($base, $query_id, $level, $expanded, $prefix, $initial, $remainder, @components);
  # The outer group around the repetition captures the whole run of suffixes.
  # Capturing the repeated group itself, as in "(_X{12})*", keeps only its
  # final repetition, which capped $level at 1 and dropped the intermediate
  # components of IDs nested more than one level deep.
  if (($prefix, $initial, $remainder) = $full =~ /^(?:(.+)_)?([0-9A-F]{10})((?:_[0-9A-F]{12})*)$/i) {
    my @suffixes = $remainder =~ /_([0-9A-F]{12})/gi;
    $level = scalar @suffixes;
    $query_id = $level ? $suffixes[-1] : $initial;
    $expanded = 'true';
    @components = ($initial, @suffixes);
    $base = $initial;
  }
  # If this function is invoked over LDC queryid
  elsif (($prefix, $initial) = $full =~ /^(?:(.+)_)?(\d+)$/i) {
    $level = 0;
    $query_id = $initial;
    push(@components, $query_id);
    $base = $query_id;
  }
  else {
    die "unexpected argument: \"$full\" sent to parse_queryid\n";
  }
### DO NOT INCLUDE
  # FIXME: Handle PSEUDO, etc., as used to be in &get_query_id_base
### DO INCLUDE
  # FIXME: Eventually, let's completely separate base from query_id (by eliminating the following line)
  #$query_id = "${base}_$query_id" if $base;
  ($base, $query_id, $level, $expanded, $prefix, @components);
}
# Store $value under the upper-cased $fieldname, with extra bookkeeping for
# some fields:
#   QUERY_ID  only the query-id component returned by Query::parse_queryid
#             is kept
#   SLOTS     $value is an array ref; SLOT is set to its first element and
#             each element is stored again through put("SLOT<n>", ...)
#   SLOT<n>   records the slot in SLOTS[n], updates LASTSLOT, looks the
#             predicate up in the shared PredicateSet, and stores
#             PREDICATES[n] and SLOT<n>_QUANTITY (plus SLOT and QUANTITY
#             when n is 0)
# Returns $value, or nothing when a SLOT<n> value does not name a known
# predicate (an UNKNOWN_SLOT_NAME problem is recorded in that case).
sub put {
  my ($self, $fieldname, $value) = @_;
  $fieldname = uc $fieldname;
  $self->{$fieldname} = $value;
### DO NOT INCLUDE
  # FIXME: Can generalize to more than two levels, set LEVEL
### DO INCLUDE
  if ($fieldname eq 'QUERY_ID') {
    my (undef, $query_id) = &Query::parse_queryid($value);
    $self->{QUERY_ID} = $query_id;
  }
  elsif ($fieldname eq 'SLOTS') {
    $self->{SLOT} = $value->[0];
    foreach my $num (0..$#{$value}) {
      $self->put("SLOT$num", $value->[$num]);
    }
    $self->{LASTSLOT} = &main::max($self->{LASTSLOT} || 0, $#{$value});
  }
  elsif ($fieldname =~ /^SLOT(\d+)$/) {
    my $level = $1;
    $self->{SLOTS}[$level] = $value;
    $self->{LASTSLOT} = &main::max($self->{LASTSLOT} || 0, $level);
    # Split the domain name from the slot name. The captures are taken from
    # the match's return list rather than from $1/$2: after a failed match
    # $1 would still hold the level digits captured by the SLOT(\d+) test
    # above and would be mistaken for the domain.
    my ($domain, $shortname) = $value =~ /^(.*?):(.*)$/;
### SPEEDUP
    $predicate_set = PredicateSet->new($self->{LOGGER}) unless $predicate_set;
### SPEEDUP
    # A value without a "domain:name" shape cannot name a predicate, so it
    # is reported as unknown without consulting the predicate set
    my @candidates = defined $shortname
      ? $predicate_set->lookup_predicate($shortname, $domain)
      : ();
    unless (@candidates) {
      $self->{LOGGER}->record_problem('UNKNOWN_SLOT_NAME', $value, 'NO_SOURCE');
      return;
    }
    if (@candidates > 1) {
### DO NOT INCLUDE
      # FIXME: I'm not convinced this can happen with fully qualified
      # predicate names; it probably dates back to the time when
      # predicate specifications were not guaranteed to be qualified
      # with the domain name.
### DO INCLUDE
      print STDERR "Warning: more than one candidate predicate for $shortname in domain $domain\n";
    }
    # When several predicates match, the first one is used
    $self->{PREDICATES}[$level] = $candidates[0];
    if ($level == 0) {
      $self->put('SLOT', $value);
      $self->put('QUANTITY', $candidates[0]{QUANTITY});
    }
    $self->put("${fieldname}_QUANTITY", $candidates[0]{QUANTITY});
  }
  $value;
}
# Fetch a field by name (case-insensitive). The pseudo-field FULL_QUERY_ID
# is not stored; it is computed on demand by get_full_queryid().
sub get {
  my ($self, $fieldname) = @_;
  my $key = uc($fieldname);
  if ($key eq 'FULL_QUERY_ID') {
    return $self->get_full_queryid();
  }
  return $self->{$key};
}
# Build the complete query ID by walking up the chain of parent queries.
# A query without a PARENTQUERY yields "<PREFIX>_<QUERY_ID>"; any other
# query yields its parent's full ID followed by "_" and its own QUERY_ID.
sub get_full_queryid {
  my ($self) = @_;
  my $parent = $self->{PARENTQUERY};
  if ($parent) {
    return $parent->get_full_queryid() . "_" . $self->{QUERY_ID};
  }
  return $self->{PREFIX} . "_" . $self->{QUERY_ID};
}
### DO NOT INCLUDE
# sub get_query_id_base {
# my ($query_id) = @_;
# my $result = $query_id;
# # Remove full UUIDs (from 2014)
# $result = $1 if $query_id =~ /^(.*?)_\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$/;
# $result = $1 if $query_id =~ /^(.*?)_PSEUDO/;
# # Remove longer short uuid (from 2015)
# $result = $1 if $query_id =~ /^(.*?)_[0-9a-f]{12}$/i;
# # Remove short uuid (from GenerateQueries)
# $result = $1 if $query_id =~ /^(.*?_[0-9a-f]{10})$/i;
# $result;
# }
### DO INCLUDE
# Derive a short identifier for this query from its first entrypoint and
# its slots. The key string is "<DOCID>:<START>:<END>:" followed by the
# colon-joined contents of $self->{SLOTS}; it is passed, together with the
# hard-coded length 10, to main::generate_uuid_from_string, whose result is
# returned.
sub get_short_uuid {
  my ($self) = @_;
### DO NOT INCLUDE
  # FIXME: Don't even try to UUID an unexpanded query?
### DO INCLUDE
  my $first = $self->get_entrypoint(0);
  my $slot_part = join(":", @{$self->{SLOTS}});
  my $key = join(":", $first->{DOCID}, $first->{START}, $first->{END}, $slot_part);
### DO NOT INCLUDE
  # FIXME: Don't hard-code the length
### DO INCLUDE
  return &main::generate_uuid_from_string($key, 10);
}
# sub get_hashname {
# my ($self) = @_;
# my $short_uuid = $self->get_short_uuid();
# my $query_base = $self->get('QUERY_ID_BASE');
# "${query_base}_$short_uuid";
# }
# sub rename_query {
# my ($self, $new_name) = @_;
# $new_name = $self->get_hashname() unless defined $new_name;
# $self->put('QUERY_ID', $new_name);
# }
# Return the entrypoint stored at position $pos (the first one when $pos is
# omitted), or undef if there is none at that position.
sub get_entrypoint {
  my ($self, $pos) = @_;
  my $index = defined $pos ? $pos : 0;
  return $self->{ENTRYPOINTS}[$index];
}
# Report how many entrypoints this query currently has.
sub get_num_entrypoints {
  my ($self) = @_;
  my $entrypoints = $self->{ENTRYPOINTS};
  return scalar(@{$entrypoints});
}
# Return every entrypoint of this query as a list, in stored order
# (in scalar context the caller gets the number of entrypoints instead).
sub get_all_entrypoints {
  my ($self) = @_;
  my $entrypoints = $self->{ENTRYPOINTS};
  return @{$entrypoints};
}
# Append an entrypoint, given as key/value pairs, to this query.
# Missing pieces are filled in before it is stored:
#   PROVENANCE  built from DOCID/START/END when not supplied
#   DOCID, START, END  taken from the provenance (its first triple) when
#               not supplied
#   LANGUAGE    always inferred from DOCID; also added to the query's
#               LANGUAGES list if not already there
#   UUID        generated from QUERY_ID, NAME and the provenance string
#               when not supplied
# Returns a reference to the stored entrypoint hash.
sub add_entrypoint {
  my ($self, %entrypoint) = @_;
  my $where = $entrypoint{WHERE} || 'NO_SOURCE';
  if (!defined($entrypoint{PROVENANCE})) {
    $entrypoint{PROVENANCE} = Provenance->new($self->{LOGGER},
                                              $where,
                                              'DOCID_OFFSET_OFFSET',
                                              $entrypoint{DOCID},
                                              $entrypoint{START},
                                              $entrypoint{END});
  }
  my $prov = $entrypoint{PROVENANCE};
  $entrypoint{DOCID} = $prov->{DOCID} unless defined $entrypoint{DOCID};
  foreach my $offset (qw(START END)) {
    next if defined $entrypoint{$offset};
    $entrypoint{$offset} = $prov->{TRIPLES}[0]{$offset};
  }
  my $language = $self->infer_language_from_documentid($entrypoint{DOCID}, $where);
  $entrypoint{LANGUAGE} = $language;
  my $already_listed = grep {$_ eq $language} @{$self->{LANGUAGES}};
  push(@{$self->{LANGUAGES}}, $language) unless $already_listed;
### DO NOT INCLUDE
  # FIXME: Don't hard-code the 12
### DO INCLUDE
  unless (defined $entrypoint{UUID}) {
    $entrypoint{UUID} = &main::generate_uuid_from_values($self->{QUERY_ID}, $entrypoint{NAME}, $prov->tostring(), 12);
  }
  push(@{$self->{ENTRYPOINTS}}, \%entrypoint);
  return \%entrypoint;
}
# Guess the language of a document from its ID.
# IDs starting with CMN are CHINESE, IDs starting with SPA are SPANISH, and
# IDs that start with ENG_ or contain _ENG_ are ENGLISH (all tests ignore
# case and are tried in that order). Any other ID is reported to the logger
# as FAILED_LANG_INFERENCE and treated as ENGLISH.
sub infer_language_from_documentid{
  my ($self, $documentid, $where) = @_;
  my @rules = (
    [qr/^CMN/i, "CHINESE"],
    [qr/^SPA/i, "SPANISH"],
    [qr/(^ENG_)|(_ENG_)/i, "ENGLISH"],
  );
  foreach my $rule (@rules) {
    my ($pattern, $language) = @{$rule};
    return $language if $documentid =~ $pattern;
  }
  $self->{LOGGER}->record_problem('FAILED_LANG_INFERENCE', $documentid, 'ENGLISH', $where);
  return "ENGLISH";
}
# Construct a Query.
#   $logger    stored as LOGGER
#   $text      optional; when defined it is handed to populate_from_text()
#   $filename  stored as FILENAME
# A new query starts at LEVEL 0 with empty ENTRYPOINTS, EXPANDED_QUERY_IDS
# and LANGUAGES lists.
sub new {
  my ($class, $logger, $text, $filename) = @_;
  my $self = bless({
    LOGGER             => $logger,
    LEVEL              => 0,
    ENTRYPOINTS        => [],
    EXPANDED_QUERY_IDS => [],
    LANGUAGES          => [],
    FILENAME           => $filename,
  }, $class);
  if (defined $text) {
    $self->populate_from_text($text);
  }
  return $self;
}
sub duplicate {
my ($self, @fields_to_omit) = @_;