#!/usr/bin/perl use warnings; use strict; my $version = "1.0"; binmode(STDOUT, ":utf8"); # This program pools the output of ResolveQueries.pl for # assessment. It anonymizes the entries, combines any entries that are # identical except for the run ID going back to the original query, # and sorts the output by subject-slot-filler-docid. # Author: James Mayfield # james.mayfield "@" jhuapl "dot" edu die "Usage: perl $0 " unless @ARGV > 1; # Tracks which labels represent the same slot-fill, and can thus be # assessed only once my %equivalence_classes; # Tracks slot-fills by their content, to identify those that should go # into the same equivalence class my %index; # The starting point for all _00 hops is the original query, which is # not represented in the ldc.tab.txt file. Thus, we need to pull them # out of the original query file. sub load_query_subjects { my ($filename) = @_; my $result = {}; open(my $infile, "<:utf8", $filename) or die "Could not open $filename: $!"; local($/); my $text = <$infile>; close $infile; # First, split out entire queries (delimited by tags) while ($text =~ /\s*(.*?)\s*<\/query>/gs) { my $id = $1; my $fields = $2; # Now break each query into its separate fields. Those fields # appear in key-value order, so we can just capture both values # and shove the whole list directly into a hash my %query = $fields =~ /^\s*<(.*?)>([^<]*) tag. So # we add it to the result explicitly $result->{$id} = $query{name}; } die "No query subjects found" unless keys %{$result}; $result; } # We use a couple of different namespaces. This routine generates a # function, which, when invoked, will return either the original label # associated with a given value, or, if the value hasn't been seen # before, generates a new label for it. For example: # my $fn = &gen_gensym("Foo01"); # &{$fn}(42) => Foo01 # &{$fn}(88) => Foo02 # &{$fn}(42) => Foo01 # Labels are created starting with the argument to gen_gensym, and # incrementing that value for each new token (so it's best to use an # integer or letters followed by an integer for the initial # value). Gensym is a bit of a misnomer here. While it does generate # new unseen tokens when called with a new argument, it's more like # intern for arguments previously seen sub gen_gensym { my ($initialsym) = @_; # The next token and the symbol table are held in a closure around # the constructed function my $nextsym = $initialsym; my %symtable; # The closure around the following constructed function is the return value sub { my ($key, $do_not_create) = @_; # Return the memoized label if it's in the symbol table return $symtable{$key} if defined $symtable{$key}; # It's not there; are we allowed to create it? die "Attempt to intern <<$key>> without permission" if $do_not_create; # Otherwise generate a new label, remember it, then return it my $sym = $nextsym++; $symtable{$key} = $sym; $sym; } } # Map the (gensym-ified) ID of the entry to the name of the entity # that serves as the subject of the relation my %idsym2fill; # Store the output lines here so that they can be sorted before being printed my @outputs; # The first namespace is used to build the pools and aggregate # identical lines under a single label my $gensym1 = &gen_gensym("S00000"); # The second namespace is used to rename the entry labels to be in # increasing order, once they have been sorted into the final desired # order my $gensym2 = &gen_gensym("T00000"); # The original run-name:pos-in-runfile keys are remembered here, so # the correct entry in the final anonymized output file can be found # for a given entry in an input file my @all_keys; # Comparator to reorder the output lines such that no child fact # appears before its parent sub compare_lines { my ($line1, $line2) = @_; my ($self1, $parent1) = $line1 =~ /^(.*?)\t.*?\t(.*?)\t/; my ($self2, $parent2) = $line2 =~ /^(.*?)\t.*?\t(.*?)\t/; die "No match in compare_lines" unless defined $self1 && defined $self2 && defined $parent1 && defined $parent2; return -1 if $self1 eq $parent2; return 1 if $self2 eq $parent1; return 0; } # Reorder the output lines such that no child fact appears before its # parent. I believe this can be greatly simplified by just sorting to # place the hops in order sub sort_lines { my (@lines) = @_; my %OK = (NIL => 'true'); my @result; while (@lines) { my @newlines; foreach my $line (@lines) { my ($self, $parent) = $line =~ /^(.*?)\t.*?\t(.*?)\t/; die "No match in sort_lines" unless defined $self && defined $parent; if ($OK{$parent}) { push(@result, $line); $OK{$self}++; } else { push (@newlines, $line); } } @lines = @newlines; } @result; } # Read in the queries my $queryfile = shift; my $queryid2fill = &load_query_subjects($queryfile); # March through each .ldc.tab.txt file foreach my $assessment_file (@ARGV) { open(my $infile, "<:utf8", $assessment_file) or die "Could not open $assessment_file: $!"; # The name of the run is not stored in the file, so get it from the filename my $runname = $assessment_file; $runname =~ s/^.*\///; $runname =~ s/\..*$//; # Load the entire file at once, so we can sort the lines to ensure # that all parents are processed before their children my @lines = <$infile>; close $infile; @lines = &sort_lines(@lines); # Process each entry in the .ldc.tab.txt file foreach (@lines) { next unless /\S/; my $line = $_; chomp; # This is the line format (for files without judgments; assessed # files contain three additional columns) my ($id, $query_hop, $parent, $predicate, $docid, $fill, $fill_start, $fill_end, $justification_start, $justification_end) = split(/\t/); # FIXME # LDC botched the offsets in the manual run if ($runname eq 'ldc') { $fill_start++; $fill_end++; $justification_start++; $justification_end++; } # The combination of the name of the run and the slot ID uniquely identifies this entry my $key = "$runname:$id"; push(@all_keys, $key); # Generate a symbol from the first namespace to represent this key my $idsym = &{$gensym1}($key); # Generate a symbol for the parent my $parentsym = $parent eq 'NIL' ? 'NIL' : &{$gensym1}("$runname:$parent"); # Map the parent to its equivalence class $parentsym = $equivalence_classes{$parentsym} unless $parentsym eq 'NIL'; die "Undefined parent in $assessment_file : $parent" unless defined $parentsym; # Find the equivalence class for this slot my $equiv_class = $index{$query_hop}{$parentsym}{$predicate}{$docid}{$fill}{$fill_start}{$fill_end}{$justification_start}{$justification_end}; # If we've already seen this equivalence class, just note the mapping if (defined $equiv_class) { $equivalence_classes{$idsym} = $equiv_class; } # Otherwise, create the new equivalence class and process the slot else { # We just use the anonymized ID of the first slot to match this # equivalence class as the name for the entire class $equivalence_classes{$idsym} = $idsym; $index{$query_hop}{$parentsym}{$predicate}{$docid}{$fill} {$fill_start}{$fill_end} {$justification_start}{$justification_end} = $idsym; # Extract the ID of the query from the query_and_hop field my $queryid = $query_hop; $queryid =~ s/^(.*)_\d+$/$1/; # Figure out the subject of this query; it is either the listed # parent, or the subject of the query if the parent is NIL my $subject = $idsym2fill{$parentsym}; $subject = $queryid2fill->{$queryid} unless defined $subject; die "Couldn't find subject for idsym = $idsym, id = $id, run = $runname, parent = $parent, parentsym = $parentsym, queryid = $queryid" unless defined $subject; # Keep a mapping from the equivalence class to the slot filler $idsym2fill{$idsym} = $fill; # Build a hash that contains all the information needed to write one line of output my $output = { ID => $id, QUERY_HOP => $query_hop, PARENT => $parent, PREDICATE => $predicate, DOCID => $docid, FILL => $fill, FILL_START => $fill_start, FILL_END => $fill_end, JUSTIFICATION_START => $justification_start, JUSTIFICATION_END => $justification_end, IDSYM => $idsym, PARENTSYM => $parentsym, SUBJECT => $subject, RUNNAME => $runname, }; # Don't output the entry immediately; rather wait until the # output lines have been placed in the desired order push(@outputs, $output); } } } # Now that the entire input has been processed, place the output lines # in the desired order. This order was requested by LDC to make # assessment easier @outputs = sort {lc $a->{SUBJECT} cmp lc $b->{SUBJECT} || $a->{PREDICATE} cmp $b->{PREDICATE} || lc $a->{FILL} cmp lc $b->{FILL} || $a->{DOCID} cmp $b->{DOCID} || $a->{FILL_START} <=> $b->{FILL_START}} @outputs; # Now we can generate the final symbol for the entry foreach my $output (@outputs) { $output->{IDSYM2} = &{$gensym2}($output->{IDSYM}); } # We also have to make sure the parent has been assigned a symbol in # the same namespace foreach my $output (@outputs) { $output->{PARENTSYM2} = $output->{PARENTSYM} eq 'NIL' ? 'NIL' : &{$gensym2}($output->{PARENTSYM}); } # Finally we can produce output. We start with the portion of the # output that will go to the assessors foreach my $output (@outputs) { print join("\t", map {$output->{$_}} qw(IDSYM2 QUERY_HOP PARENTSYM2 PREDICATE DOCID FILL FILL_START FILL_END JUSTIFICATION_START JUSTIFICATION_END)), "\n"; } # Write a separator, then output the mapping from the original ID in # the submission file to the matching output ID print "=======================================\n"; foreach my $key (sort @all_keys) { my $g1 = &{$gensym1}($key); print "$key\t", &{$gensym2}($equivalence_classes{$g1} || $g1), "\n" unless $key eq 'NIL'; } 1;