#!/usr/bin/perl use warnings; use strict; # ResolveQueries.pl # Author: James Mayfield (james "dot" mayfield "at" jhuapl.edu) my $version = "1.0"; binmode(STDOUT, ":utf8"); # This program takes input from a TAC Cold Start query file and a # submitted knowledge base. It runs each evaluation query against the # KB, producing one line of output for each relation traversed. I # needed this to work on a machine without a great deal of memory, so # the KB is processed by iteratively reading the entire KB file, then # seeking back to the beginning of the file. As each assertion in the # submission is processed, the program checks whether it can fulfill # any of the tasks it has outstanding. There are three kinds of # tasks: # 1. FindEntrypointTask: The task knows the docid and character # offset of one of the evaluation queries. It is fulfilled if the # assertion is a mention, and the characters of the mention include # the entrypoint character. # 2. FillSlotTask: The task knows a KB entity and a predicate # name. It is fulfilled if the assertion is of that type, and has the # desired entity as subject. # 3. Entity2NameTask: The task knows an entity and a docid. It is # fulfilled if the assertion is a canonical_mention for that entity # in that document. # Processing starts by creating a task manager and adding a # FindEntrypointTask for each evaluation query to it. The KB # submission file is then processed one assertion at a time. If the # assertion fulfills any task that is active in the task manager, the # action associated with that task is executed. The actions for the # three types of task are: # - FindEntrypointTask: Create a FillSlotTask for the first relation # type in the evaluation query, starting from the entity that matched # the entry point character. Add the task to the task manager. # - FillSlotTask: Create and add to the task manager an # Entity2NameTask, which will find the canonical mention for the # object of the matched relation in the document that supports the # the relation. If the object is not an entity (e.g., for a # relation that takes a string filler), just record it as a slot # fill. If there are more hops to be found in the evaluation query, # it also creates another FillSlotTask that starts at the object of # the assertion that matched, and traverses the next slot in the # query. # - Entity2NameTask: Add the string found as canonical mention as a # slot fill. # As a courtesy, the task manager keeps track of all the successfully # matched slot fills. Once a task has been matched, or, in the case # of a FillSlotTask, once it has been tested against each assertion in # the KB submission, it is deleted from the task manager. Thus, once # the task manager has no more tasks, it terminates and returns the # slot fills it found. # Each type of task has a matching routine that tests whether an # assertion fulfills any of the active tasks of that type. They are # stored globally here for some reason. my @retrievers; ##################################################################################### # Patterns ##################################################################################### # Eliminate comments, ensuring that pound signs in the middle of # strings are not treated as comment characters # Here is the original slightly clearer syntax that unfortunately doesn't work with Perl 5.8 # s/^( # (?:[^#"]*+ # Any number of chars that aren't double quote or pound sign # (?:"(?:[^"\\]++|\\.)*+")? # Any number of double quoted strings # )*+ # The pair of them repeated any number of times # ) # Everything up to here is captured in $1 # (\#.*)$/$1/x; # Pound sign through the end of the line is not included in the replacement my $comment_pattern = qr/ ^( (?> (?: (?>[^#"]*) # Any number of chars that aren't double quote or pound sign (?:" # Beginning of double quoted string (?> # Start a possessive match of the string body (?:(?>[^"\\]+)|\\.)* # Possessively match any number of non-double quotes or match an escaped char )" # Possessively match the above repeatedly, before the closing double quote )? # There might or might not be a double quoted string )* # The pair of them repeated any number of times ) # Possessively match everything before a pound sign that starts the comment ) # Everything up to here is captured in $1 (\#.*)$/x; # Pound sign through the end of the line is not included in the replacement ##################################################################################### # Evaluation Queries ##################################################################################### # Read the evaluation queries file, building a hash to represent each, # and returning a list of all the queries read sub load_evaluation_queries { my ($filename) = @_; my $result = []; # Slurp the entire text open(my $infile, "<:utf8", $filename) or die "Could not open $filename: $!"; local($/); my $text = <$infile>; close $infile; # Repeatedly look for text that lies between and # tags. Pull out from each the query id, and the list of attributes # of that query while ($text =~ /\s*(.*?)\s*<\/query>/gs) { my $id = $1; my $fields = $2; # Now break each query into its separate fields. Those fields # appear in key-value order, so we can just capture both values # and shove the whole list directly into a hash my $struct = {$fields =~ /^\s*<(.*?)>([^<]*) tag. So # we add it to the result explicitly $struct->{queryid} = $id; # Find all the slots for this query (different evaluation queries # may have different numbers of slots to be traversed). Collect # them in a list to make it easier for a FillSlotTask to step # through the desired slots for (my $i = 0; ; $i++) { # Slots are all named something like 'slot3' my $key = "slot$i"; if (defined $struct->{$key}) { push (@{$struct->{slots}}, $struct->{$key}); } else { last; } } } $result; } ##################################################################################### # Task ##################################################################################### # This is the base class for the three task types package Task; { my $next_id = "00000"; sub new { my ($class, $query, $description, $parent) = @_; my $self = {QUERY => $query, DESCRIPTION => $description, ID => $next_id++}; # $self->{HOP} = (defined $parent ? $parent->{HOP} + 1 : 0); bless($self, $class); $self; } } ##################################################################################### # FindEntrypointTask ##################################################################################### package FindEntrypointTask; # Inherit from Task. Use -norequire because Task is defined in this file. use parent -norequire, 'Task'; sub new { my ($class, $query) = @_; my $self = $class->SUPER::new($query, "$class($query->{docid}, $query->{offset})"); bless($self, $class); $self; } # FindEntrypointTasks are indexed only under docid; matching the entry # point character to the extent of a mention is done at lookup time sub add_to_index { my ($self, $taskset) = @_; push(@{$taskset->{INDICES}{FindEntrypoint}{$self->{QUERY}{docid}}}, $self); } # Determine whether this mention includes the evaluation query offset sub match { my ($self, $assertion) = @_; $self->{QUERY}{offset} >= $assertion->{start} && $self->{QUERY}{offset} <= $assertion->{end}; } # Find FindEntrypointTasks in the index that match the given assertion push(@retrievers, sub { my ($taskset, $assertion) = @_; return () unless $assertion->{predicate} eq 'mention'; grep {$_->match($assertion)} @{$taskset->{INDICES}{FindEntrypoint}{$assertion->{docid}} || []}; }); # Remove this FindEntrypointTask from the TaskSet. This only reverses # add_to_index; indexing by the position of the assertion in the KB # file is handled by the TaskSet itself sub remove_from_index { my ($self, $taskset) = @_; $taskset->{INDICES}{FindEntrypoint}{$self->{QUERY}{docid}} = [grep {$_ != $self} @{$taskset->{INDICES}{FindEntrypoint}{$self->{QUERY}{docid}}}]; } # We've found an assertion that matches this # FindEntrypointTask. Create a task to fill the first slot of the # evaluation query, and remove this task from the TaskSet sub execute { my ($self, $taskset, $assertion) = @_; my $task = FillSlotTask->new($self->{QUERY}, 'NIL', $assertion->{entity}, 'NIL', @{$self->{QUERY}{slots}}); $taskset->add_task($task, $assertion->{position}); # There is only one fill for this task $taskset->remove($self); $taskset->{STATS}{ENTRYPOINTS_FOUND}++; } ##################################################################################### # FillSlotTask ##################################################################################### package FillSlotTask; use parent -norequire, 'Task'; sub new { my ($class, $query, $predecessor, $entity, $predecessor_assertion, @slots) = @_; die "Attempt to create $class with no slot list" unless @slots; my $self = $class->SUPER::new($query, "$class(" . join(", ", @slots). ") from " . join(", ", caller), $predecessor); $self->{ENTITY} = $entity; $self->{QUERY} = $query; $self->{PREDECESSOR} = $predecessor; $self->{PREDECESSOR_ASSERTION} = $predecessor_assertion; $self->{SLOT} = shift @slots; $self->{SLOTS} = [@slots]; $self->{HOP} = $predecessor eq 'NIL' ? '00' : sprintf("%02d", $predecessor->{HOP} + 1); bless($self, $class); $self; } # FillSlotTasks are indexed by entity and predicate sub add_to_index { my ($self, $taskset) = @_; my $pred = ($self->{PREDECESSOR} eq 'NIL' ? $self->{QUERY}{queryid} : $self->{PREDECESSOR}{ID}); # unless (defined $pred) { # print STDERR "PREDECESSOR = $self->{PREDECESSOR}\n"; # print STDERR "queryid = ", $self->{QUERY}{queryid} || 'undef', "\n"; # print STDERR "predid = ", $self->{ $taskset->{STATS}{"HOP$self->{HOP}_UNIQUE"}++ unless $taskset->{DEDUP}{HOP_UNIQUE}{$self->{HOP}}{$pred}++; $taskset->{STATS}{"HOP$self->{HOP}_TOTAL"}++ unless $taskset->{DEDUP}{HOP_TOTAL}{$self->{HOP}}{$self->{ID}}++; push(@{$taskset->{INDICES}{FillSlot}{$self->{ENTITY}}{$self->{SLOT}}}, $self); } push(@retrievers, sub { my ($taskset, $assertion) = @_; # Slot names have colons; others (such as mention or type) do not return () unless $assertion->{predicate} =~ /:/; @{$taskset->{INDICES}{FillSlot}{$assertion->{entity}}{$assertion->{predicate}} || []}; }); sub remove_from_index { my ($self, $taskset) = @_; $taskset->{INDICES}{FillSlot}{$self->{ENTITY}}{$self->{SLOT}} = [grep {$_ != $self} @{$taskset->{INDICES}{FillSlot}{$self->{ENTITY}}{$self->{SLOT}}}]; } sub execute { my ($self, $taskset, $assertion) = @_; # If the object of the assertion begins with a colon, it represents # an entity. if ($assertion->{object} =~ /^:/) { # If there are hops that have not yet been traversed, create a task to find the next one if (@{$self->{SLOTS}}) { my $task = FillSlotTask->new($self->{QUERY}, $self, $assertion->{object}, $assertion, @{$self->{SLOTS}}); $taskset->add_task($task, $assertion->{position}); } else { $taskset->{STATS}{FINAL_ENTITY_FILLS_FOUND}++; $taskset->{STATS}{FINAL_UNIQUE_ENTITY_FILLS_FOUND}++ unless $taskset->{DEDUP}{FINAL_UNIQUE_FILLS}{$self->{ID}}++; $taskset->{STATS}{FINAL_TOTAL_FILLS_FOUND}++; $taskset->{STATS}{FINAL_TOTAL_UNIQUE_FILLS_FOUND}++ unless $taskset->{DEDUP}{FINAL_UNIQUE_FILLS}{$self->{ID}}++; } # Whether this is the final hop or not, find the canonical mention # for this entity in the supporting document. That allows the # query thus far to be treated independently as a shorter query my $task = Entity2NameTask->new($self->{QUERY}, $self, $assertion->{object}, $assertion->{docid}, $assertion); $taskset->add_task($task, $assertion->{position}); } else { # If this is not an entity, it is a regular slot fill; there is no # need to look for a canonical mention. Add it to the set of # results $taskset->add_fill($self, $assertion); $taskset->{STATS}{FILLS_FOUND}++; $taskset->{STATS}{FINAL_STRING_FILLS_FOUND}++; $taskset->{STATS}{FINAL_UNIQUE_STRING_FILLS_FOUND}++ unless $taskset->{DEDUP}{FINAL_UNIQUE_FILLS}{$self->{ID}}++; $taskset->{STATS}{FINAL_TOTAL_FILLS_FOUND}++; $taskset->{STATS}{FINAL_TOTAL_UNIQUE_FILLS_FOUND}++ unless $taskset->{DEDUP}{FINAL_UNIQUE_FILLS}{$self->{ID}}++; } # Note that we do not remove this task from the taskset yet; slot fills # can be filled multiple times } ##################################################################################### # Entity2NameTask ##################################################################################### package Entity2NameTask; use parent -norequire, 'Task'; sub new { my ($class, $query, $parent, $entity, $docid, $parent_assertion) = @_; my $self = $class->SUPER::new($query, "$class($entity)", $parent); $self->{ENTITY} = $entity; $self->{DOCID} = $docid; $self->{PARENT} = $parent; $self->{PARENT_ASSERTION} = $parent_assertion; bless($self, $class); $self; } # Entity2NameTasks are indexed by entity and docid; they look for the # appropriate canonical_mention sub add_to_index { my ($self, $taskset) = @_; push(@{$taskset->{INDICES}{Entity2Name}{$self->{ENTITY}}{$self->{DOCID}}}, $self); } push(@retrievers, sub { my ($taskset, $assertion) = @_; return () unless $assertion->{predicate} eq 'canonical_mention'; @{$taskset->{INDICES}{Entity2Name}{$assertion->{entity}}{$assertion->{docid}} || []}; }); sub remove_from_index { my ($self, $taskset) = @_; $taskset->{INDICES}{Entity2Name}{$self->{ENTITY}}{$self->{DOCID}} = [grep {$_ != $self} @{$taskset->{INDICES}{Entity2Name}{$self->{ENTITY}}{$self->{DOCID}}}]; } sub execute { my ($self, $taskset, $assertion) = @_; $taskset->add_fill($self->{PARENT}, $self->{PARENT_ASSERTION}, $self, $assertion); # There is only one fill for this task, so the task can be deleted immediately $taskset->remove($self); } ##################################################################################### # TaskSet ##################################################################################### # A TaskSet maintains a set of open tasks. It processes the assertions # in a KB file one at a time, looking for assertions that fulfill any # of the tasks. If the end of the KB file is reached, seek is used to # return to the beginning, and processing continues. If all assertions # have been matched to a task, the task is deleted. package TaskSet; # COUNT is the number of open tasks currently in the TaskSet # OUTFILE is the file handle to which output should be sent sub new { my ($class, $outfile) = @_; my $self = {COUNT => 0, OUTFILE => $outfile}; bless($self, $class); $self; } # Keep track of which run is currently being processed sub set_runid { my ($self, $runid) = @_; $self->{RUNID} = $runid; } # Include a new task in the set of open tasks sub add_task { my ($self, $task, $position) = @_; die "You forgot to set the runid for the TaskSet!" unless defined $self->{RUNID}; # POSITION is the location in the input file at the time the task is # added. The next time we return to that place in the file, we can # remove this task (since we will have compared all assertions in # the file to the task description) $task->{POSITION} = $position; # Index this task according to the current position. We use a hash # rather than an array, because the entries will be sparse push(@{$self->{POSITIONS}{$position}}, $task); $self->{COUNT}++; # We have indexed this task according to its position in the # file. Now we also index it according to the particular type of # task $task->add_to_index($self); $self->{STATS}{TOTAL_TASKS}++; } # Convert an evaluation query to its initial FindEntryPointTask and # add it to the set of current tasks sub add_evaluation_query { my ($self, $query) = @_; my $initial_task = FindEntrypointTask->new($query); $self->add_task($initial_task, 0); } # Number of open tasks sub get_num_active_tasks { $_[0]->{COUNT}; } # Find any open tasks that match the assertion, by invoking each of # the retrieval routines stored in @retrievers sub retrieve_tasks { my ($self, $assertion) = @_; my @result = (); foreach my $retriever (@retrievers) { push(@result, &{$retriever}($self, $assertion)); } @result; } # We're done with this task, either because we tried all the # assertions, or because its execute routine was satisfied and asked # for the deletion sub remove { my ($self, $task) = @_; $task->remove_from_index($self); my $position = $task->{POSITION}; # remove_at_position might already have removed the task from POSITIONS if (defined $self->{POSITIONS}{$position}) { $self->{POSITIONS}{$position} = [grep {$_ != $task} @{$self->{POSITIONS}{$position}}]; } $self->{COUNT}--; } # Delete all tasks that started at the current position (we have # looped around and tried each of the assertions in the file, so we # should stop looking) sub remove_at_position { my ($self, $position) = @_; # Delete tasks at this position in bulk (delete returns the deleted tasks) my $tasks = delete $self->{POSITIONS}{$position}; if (defined $tasks) { foreach (@{$tasks}) { $self->remove($_); } } } # A filler might have tabs or newlines, which can't appear in the # assessment files. It is also likely to be surrounded by double # quotes, which must be removed (along with any escaped characters in # the string) sub normalize_filler { my ($filler) = @_; if ($filler =~ /^"(.*)"$/) { $filler = $1; $filler =~ s/\\(.)/$1/g; } $filler =~ s/\s/ /gs; $filler; } # Here are the columns that LDC is expecting: # # Column 1: response ID # Column 2: query and hop ID (e.g. CS_ENG_001_00) # Column 3: parent ID (ID for the filler from the previous hop level now acting as the entity; NIL if 0 hop) # Column 4: slot name # Column 5: docid that justifies the relation between the parent entity and the slot filler # Column 6: [possibly normalized] slot filler string # Column 7: start offset of unnormalized filler # Column 8: end offset of unnormalized filler # Column 9: start offset of justification # Column 10: end offset of justification # + # Column 11: judgment for slot filler string (Correct, Redundant w/KB, Inexact, Wrong) # Column 12: judgment for justification (Correct, Inexact, Wrong) # Column 13: equivalence class for slot filler string if Correct (0 otherwise) # Keep track of the filled slots that have been found. We collect them # in the TaskSet just as a convenience, since all the routines that # need to return a filled slot already have access to it sub add_fill { my ($self, $task, $assertion, $name_task, $name_assertion) = @_; # First, we construct each of the values in Columns 1-10 (the later # columns are produced by LDC during assessment) my $response_id = "$task->{ID}_$assertion->{position}"; my $query_and_hop = $task->{QUERY}{queryid} . "_" . $task->{HOP}; my $parent_id; if ($task->{PREDECESSOR} eq 'NIL') { $parent_id = 'NIL'; } else { # We can't easily store the ID of the predecessor of this entry # (i.e., the object of the previous hop). So instead we recreate # what it must be (the task ID followed by the position in the KB # file) $parent_id = "$task->{PREDECESSOR}{ID}_$task->{PREDECESSOR_ASSERTION}{position}"; } my $slot_name = $task->{SLOT}; my $docid = $assertion->{docid}; my $filler; my $fstart; my $fend; # This routine either receives a single task and matching assertion # (if this a string-valued slot) or two such pairs, one for the # final hop in the query and one bearing the canonical_mention for # the slot fill. if (defined $name_task) { $filler = &normalize_filler($name_assertion->{object}); $fstart = $name_assertion->{start}; $fend = $name_assertion->{end}; } else { $filler = &normalize_filler($assertion->{object}); $fstart = $assertion->{object_start}; $fend = $assertion->{object_end}; } my $jstart = $assertion->{predicate_start}; my $jend = $assertion->{predicate_end}; # We've calculated all of the necessary values, so print the result my $outfile = $self->{OUTFILE}; print $outfile join("\t", ($response_id, $query_and_hop, $parent_id, $slot_name, $docid, $filler, $fstart, $fend, $jstart, $jend)), "\n"; } package main; # For each type of relation that can appear in a KB submission file, # this table indicates the names and order of the columns my %predicate2labels = ( type => [qw(entity predicate object confidence)], mention => [qw(entity predicate object docid start end confidence)], canonical_mention => [qw(entity predicate object docid start end confidence)], default => [qw(entity predicate object docid entity_start entity_end predicate_start predicate_end object_start object_end confidence)], ); # Convert an assertion to a hash that holds the various fields of the assertion sub parse_assertion { my ($line, $position) = @_; # The spec didn't actually require a single tab between entries, so # we must ditch any fields that don't contain text my (@entries) = grep {/\S/} split(/\t/, $line); # Some folks include a confidence on type assertions, others # don't. So add a confidence if none is present to make all # assertions uniform push(@entries, "1.0") unless $entries[-1] =~ /\./; # Get the list of expected columns in the assertion statement my $predicate = lc $entries[1]; my $labels = $predicate2labels{$predicate} || $predicate2labels{default}; # Make sure the number of values provided matches the number # expected. This should always be true if the Validator has been # run, but do the check anyway just to make sure if (@{$labels} != @entries) { print STDERR "\nlabels = (", join(", ", @{$labels}), "); entries = (", join(", ", @entries), ")\n"; die "Wrong number of arguments for predicate $predicate"; } # Create the hash my $result = {map {$labels->[$_] => $entries[$_]} 0..$#{$labels}}; # Add the description and position fields, which are metadata about # the assertion that do not appear in it $result->{description} = "$result->{predicate}($result->{entity}, $result->{object}) ---> <<$line>>"; $result->{position} = $position; $result; } # Cycle back to the top of the KB file, find the run ID in the first # line, skip over that line (so that we don't try to interpret the run # ID as an assertion), and return the run ID (which may well be # ignored, but we don't care) sub seek_to_start { my ($infile, $taskset) = @_; seek($infile, 0, 0) or die "Could not seek to beginning of file"; my $runid = <$infile>; chomp $runid; $runid =~ s/$comment_pattern/$1/; $runid =~ s/^\s+//; $runid =~ s/\s+$//; die "No runid found" unless $runid; # The initial tasks for the evaluation queries go in at position # zero. Make sure we don't forget to delete them once we've read # through the file the first time $taskset->remove_at_position(0); $runid; } # Look to fulfill each of the evaluation queries in the current run file sub process_runfile { my ($runfile, $evaluation_queries) = @_; # FIXME: Should probably use a switch for this, rather than just # dumping the output into the runfile directory # my $outputfile = "$runfile.derived10.ldc.tab.txt"; my $outputfile = "$runfile.ldc.tab.txt"; open(my $infile, "<:utf8", $runfile) or die "Could not open $runfile: $!"; open(my $outfile, ">:utf8", $outputfile) or die "Could not open $outputfile: $!"; # Create a new task set my $taskset = TaskSet->new($outfile); # Call seek_to_start to skip over the run ID. seek_to_start wipes # out tasks at position 0, so add evaluation queries afterward my $runid = &seek_to_start($infile, $taskset); $taskset->set_runid($runid); foreach my $evaluation_query (@{$evaluation_queries}) { $taskset->add_evaluation_query($evaluation_query); } # Main loop for stepping through the KB file. We're done when no # active tasks remain while ($taskset->get_num_active_tasks()) { # Get the position of the assertion we're about to read in my $tell = tell($infile); # Remove any assertions already at this position; we've gone # through the entire file with them $taskset->remove_at_position($tell); # Get the next assertion local $_ = <$infile>; # If we didn't get anything, we're at the end of the KB file, so # seek back to the start and continue if (!defined $_) { &seek_to_start($infile, $taskset); next; } chomp; # KB files may contain comments. Delete them, but make sure to # handle double-quoted strings properly s/$comment_pattern/$1/; next unless /\S/; my $assertion = &parse_assertion($_, $tell); # Find any open tasks that are fulfilled by this assertion my @tasks = $taskset->retrieve_tasks($assertion); # If any are found, run the execute method on them foreach my $task (@tasks) { $task->execute($taskset, $assertion); } } close $infile; close $outfile; print STDERR "$taskset->{RUNID}:\n"; foreach (sort keys %{$taskset->{STATS}}) { print STDERR "\t$_ = ", $taskset->{STATS}{$_} || 0, "\n"; } } @ARGV >= 2 or die "Usage: perl $0 "; my ($query_filename, @runfiles) = @ARGV; my $queries = &load_evaluation_queries($query_filename); foreach my $runfile (@runfiles) { print STDERR "WARNING: $runfile might not be a Cold Start run file\n" if $runfile =~ /\./ && $runfile !~ /\.valid$/; &process_runfile($runfile, $queries); } 1;