#!/usr/bin/perl -w
use strict;
use warnings;

# Author: Hoa Dang (hoa.dang@nist.gov)
#
# Date:    July 24, 2012
# Version: 1.1
#     Increased maximum run number to 5 (up from 3)
# Date:    July 2, 2012
# Version: 1.0
#     Initial version
#
# Check a TAC 2012 KBP Track slot-filling task submission for various
# common errors, including:
#   * invalid run tag (should be a concatenation of your TAC 2012 team ID
#     and the run number (1-5))
#   * multiple run tags
#   * invalid query id or slot name
#   * missing value for a question (at least one response required per
#     question)
#   * answer from an invalid document
# [NB: "question" refers to a (query id, slot name) pair]
#
# Messages regarding the submission are printed to an error log.
#
# Usage is:
#     check_kbp_slot-filling.pl doclist_file queries_file results_file
# where results_file is the name of the results submission file to be
# checked.
#
# The doclist file lists all the valid document IDs for the task, one ID
# per line.
#
# The queries file is an XML file.  This script does only limited parsing
# of it: it interprets the following lines, each of which must be on a
# line of its own (surrounding whitespace is ignored):
#     <query id="QUERY_ID">
#     <enttype>TYPE</enttype>
#     <ignore>slot_name slot_name ...</ignore>
#     </query>
# All other lines of the queries file are skipped.
#
# The results file has exactly one response per single-valued slot and
# at least one response for each list-valued slot, for each query
# (except that no response should be given for slots listed in the
# 'ignore' list of the query).
#
# A response is a line with the following tab-separated columns:
#   Column 1:  query id
#   Column 2:  slot name
#   Column 3:  a unique run id for the submission
#   Column 4:  NIL, if the system believes no information is learnable
#              for this slot; or a single docid that justifies the
#              relation between the query entity and the slot filler
#   Column 5:  a slot filler
#   Column 6:  start offset of filler
#   Column 7:  end offset of filler
#   Column 8:  start offset of justification
#   Column 9:  end offset of justification
#   Column 10: confidence score
#
# If Column 4 is NIL, then Columns 5-10 must be empty.
# The slot filler (Column 5) must not contain any embedded tab characters.
#
# This script also creates a new file, "input", in the current directory
# that has a more standard format (there will not be leading whitespace;
# and columns will be separated by one tab).  Only columns 1-5 of each
# accepted response are written to it.

# Change this variable to the directory where the error log should be put
my $errlog_dir = ".";

# Processing stops once more than this many errors have been logged.
my $MAX_ERRORS = 25;

# query id => { type => entity type, ignore => [slot names to ignore] }
my %queries;

# Slot Filler Type as defined in LDC publication "TAC KBP Slots";
# Version 2.3; June 11, 2012
# Slot names as defined in "Proposed Task Description for Knowledge-Base
# Population at TAC 2012 Version 1.1 of June 3, 2012"
my %slots = (
    'PER' => {
        'per:alternate_names'                => {quantity => 'list',   content => 'name'},
        'per:children'                       => {quantity => 'list',   content => 'name'},
        'per:cities_of_residence'            => {quantity => 'list',   content => 'name'},
        'per:city_of_birth'                  => {quantity => 'single', content => 'name'},
        'per:city_of_death'                  => {quantity => 'single', content => 'name'},
        'per:countries_of_residence'         => {quantity => 'list',   content => 'name'},
        'per:country_of_birth'               => {quantity => 'single', content => 'name'},
        'per:country_of_death'               => {quantity => 'single', content => 'name'},
        'per:employee_of'                    => {quantity => 'list',   content => 'name'},
        'per:member_of'                      => {quantity => 'list',   content => 'name'},
        'per:origin'                         => {quantity => 'list',   content => 'name'},
        'per:other_family'                   => {quantity => 'list',   content => 'name'},
        'per:parents'                        => {quantity => 'list',   content => 'name'},
        'per:schools_attended'               => {quantity => 'list',   content => 'name'},
        'per:siblings'                       => {quantity => 'list',   content => 'name'},
        'per:spouse'                         => {quantity => 'list',   content => 'name'},
        'per:stateorprovince_of_birth'       => {quantity => 'single', content => 'name'},
        'per:stateorprovince_of_death'       => {quantity => 'single', content => 'name'},
        'per:statesorprovinces_of_residence' => {quantity => 'list',   content => 'name'},
        'per:age'                            => {quantity => 'single', content => 'value'},
        'per:date_of_birth'                  => {quantity => 'single', content => 'value'},
        'per:date_of_death'                  => {quantity => 'single', content => 'value'},
        'per:cause_of_death'                 => {quantity => 'single', content => 'string'},
        'per:charges'                        => {quantity => 'list',   content => 'string'},
        'per:religion'                       => {quantity => 'single', content => 'string'},
        'per:title'                          => {quantity => 'list',   content => 'string'},
    },
    'ORG' => {
        'org:alternate_names'                 => {quantity => 'list',   content => 'name'},
        'org:city_of_headquarters'            => {quantity => 'single', content => 'name'},
        'org:country_of_headquarters'         => {quantity => 'single', content => 'name'},
        'org:founded_by'                      => {quantity => 'list',   content => 'name'},
        'org:member_of'                       => {quantity => 'list',   content => 'name'},
        'org:members'                         => {quantity => 'list',   content => 'name'},
        'org:parents'                         => {quantity => 'list',   content => 'name'},
        'org:political_religious_affiliation' => {quantity => 'list',   content => 'name'},
        'org:shareholders'                    => {quantity => 'list',   content => 'name'},
        'org:stateorprovince_of_headquarters' => {quantity => 'single', content => 'name'},
        'org:subsidiaries'                    => {quantity => 'list',   content => 'name'},
        'org:top_members_employees'           => {quantity => 'list',   content => 'name'},
        'org:date_dissolved'                  => {quantity => 'single', content => 'value'},
        'org:date_founded'                    => {quantity => 'single', content => 'value'},
        'org:number_of_employees_members'     => {quantity => 'single', content => 'value'},
        'org:website'                         => {quantity => 'single', content => 'string'},
    },
);

my %docids;    # set of valid docids
my %qids;      # number of answers returned for question
my %nils;      # number of NIL answers returned for question

# State shared with error(): running error count and the current line
# number of the results file (0 until the results file is being read).
my $num_errors = 0;
my $line_num   = 0;

if (@ARGV != 3) {
    print STDERR "Usage: $0 doclist_file query_file resultsfile\n";
    die "\n";
}
my ($doclist_file,    # list of valid docids
    $queries_file,    # list of eval queries
    $results_file)    # submission file to check/validate
    = @ARGV;

# set up output files
# The error log is named after the results file with any leading
# directory components (everything up to the last "/") removed.
(my $results_basename = $results_file) =~ s{\A.*/}{}s;
my $errlog = $errlog_dir . "/" . $results_basename . ".errlog";

# NB: "or", not "||" -- "||" would bind to the file name operand and the
# die would never be reached when open fails.
open my $errlog_fh, '>', $errlog
    or die "Cannot open error log for writing\n";
open my $input_fh, '>', 'input'
    or die "Cannot create `input' file: $!\n";

# read in doclist file
open my $doclist_fh, '<', $doclist_file
    or die "Unable to open document list file $doclist_file: $!";
while (my $line = <$doclist_fh>) {
    chomp $line;
    $docids{$line} = 1;
}
close $doclist_fh;

# read in queries file
# only do limited error checking
#
# NOTE(review): the opening-tag text of the three tag patterns below was
# reconstructed; confirm the exact <query id="..."> form against a real
# queries file.
open my $queries_fh, '<', $queries_file
    or die "Unable to open queries file $queries_file: $!";
{
    my ($q, $etype, @ignore);    # state of the query currently being read
    while (my $line = <$queries_fh>) {
        chomp $line;
        next if $line =~ /^\s*$/;
        $line =~ s/^\s*(.*\S)\s*$/$1/g;    # remove leading and trailing whitespace

        if ($line =~ /^<query id="(.*)">$/) {
            $q = $1;
            if (defined $queries{$q}) {
                error("duplicate query ids in queries file");
                next;
            }
        }
        elsif ($line =~ /^<\/query>$/) {
            if (defined $etype) {
                $queries{$q}{'type'}   = $etype;
                $queries{$q}{'ignore'} = [@ignore];
            }
            else {
                error("undefined enttype for query $q");
            }
            # Always start the next query from a clean slate, so that a
            # malformed query cannot leak its ignore list into the next one.
            undef $q;
            undef $etype;
            @ignore = ();
        }
        elsif ($line =~ /^<enttype>([A-Z]+)<\/enttype>$/) {
            if (defined $etype) {
                error("entity type $etype already defined");
                next;
            }
            $etype = $1;
        }
        elsif ($line =~ /^<ignore>([a-z_: ]+)<\/ignore>$/) {
            if (@ignore) {
                error("nonempty ignorelist already defined");
                next;
            }
            @ignore = split " ", $1;
        }
    }
}
close $queries_fh;

# read in and check the results file, one response per line
open my $results_fh, '<', $results_file
    or die "Unable to open results file $results_file: $!";
my $run_id = "";    # run tag seen on the first response line
while (my $line = <$results_fh>) {
    chomp $line;
    $line_num++;
    next if $line =~ /^\s*$/;

    if (invalid_UTF8($line)) {
        error("invalid character (non-UTF8)");
        next;
    }

    # A positive LIMIT keeps trailing empty fields, so a complete 10-column
    # line always yields a defined (possibly empty) $conf.
    my ($q, $slot_name, $tag, $docid, $answer,
        $fbeg, $fend, $jbeg, $jend, $conf) = split /\t/, $line, 10;

    if (!defined $docid || length($docid) == 0
        || !defined $tag || !defined $slot_name) {
        error("Wrong number of fields -- missing fields");
        next;
    }

    # make sure runtag is ok
    if (!$run_id) {
        # very first line --- remember tag
        $run_id = $tag;
        if ($run_id !~ /^[A-Za-z0-9._]{1,12}[1-5]$/) {
            error("Run tag `$run_id' is malformed)");
            next;
        }
    }
    elsif ($tag ne $run_id) {
        # otherwise just make sure one tag used
        error("Run tag inconsistent (`$tag' and `$run_id')");
        next;
    }

    # get query id
    if (!defined $queries{$q}) {
        error("Invalid query id ($q)");
        next;
    }

    # get slot name
    if (!defined $slots{$queries{$q}{'type'}}{$slot_name}) {
        error("Invalid slot name $slot_name for query $q with entity type $queries{$q}{'type'}");
        next;
    }

    # Count the response before validating its content: a malformed
    # response still counts as a response for the global checks below.
    $qids{$q}{$slot_name}++;

    if ($docid ne "NIL") {
        if (!defined($conf)) {
            error("Wrong number of fields -- missing fields");
            next;
        }

        # make sure docid valid
        if ($docid =~ /(\.sgm)/) {
            error("Unknown document ID `$docid' (looks like a file name instead of a document ID)");
            next;
        }
        elsif (!$docids{$docid}) {
            error("Unknown document `$docid'");
            next;
        }

        # make sure answer exists
        if (!defined $answer || length($answer) == 0) {
            error("Missing answer-string for slot $slot_name for query $q");
            next;
        }

        # check filler offsets and justification offsets
        if (!($fbeg =~ /^\d+$/ && $fend =~ /^\d+$/ && $fbeg <= $fend)) {
            error("invalid filler offsets: beg =`$fbeg' , end = `$fend'");
            next;
        }
        if (!($jbeg =~ /^\d+$/ && $jend =~ /^\d+$/ && $jbeg <= $jend)) {
            error("invalid justification offsets: beg =`$jbeg' , end = `$jend'");
            next;
        }
        if (!($conf =~ /^\d+\.\d+$/ && $conf <= 1.0)) {
            error("invalid confidence value `$conf'");
            next;
        }
    }
    else {
        # Test for a non-empty string rather than truthiness, so that an
        # answer of "0" on a NIL row is reported as well.
        if (defined $answer && length $answer) {
            error("Answer string given when docid is NIL");
            next;
        }
        $answer = "";
        $nils{$q}{$slot_name}++;
    }

    print {$input_fh} "$q\t$slot_name\t$tag\t$docid\t$answer\n";
}
close $results_fh;

# Do global checks:
#   error if some question that is not in the 'ignore' list has no response given for it
#   error if some question that is in the 'ignore' list has a response given for it
#   error if single-valued question has more than one response given for it
#   error if more than one response is given for a question where NIL has been given as an answer
#
# Keys are sorted so that the order of messages in the error log (and
# therefore which messages appear before the $MAX_ERRORS cutoff) is
# reproducible from run to run.
foreach my $q (sort keys %queries) {
    my $etype = $queries{$q}{'type'};
    foreach my $slot_name (sort keys %{ $slots{$etype} }) {
        my $num_responses = $qids{$q}{$slot_name};
        my $is_ignored    = is_member($slot_name, $queries{$q}{'ignore'}) != -1;

        if (!defined $num_responses && !$is_ignored) {
            error("No response given for slot $slot_name for query $q of type $etype");
        }
        if (defined $num_responses && $is_ignored) {
            error("$num_responses responses given for slot $slot_name, which should be ignored for query $q");
        }
        if ($slots{$etype}{$slot_name}{'quantity'} eq "single"
            && defined $num_responses && $num_responses > 1) {
            error("More than one response given for single-valued slot $slot_name for query $q");
        }
        if (defined $num_responses && $num_responses > 1
            && defined $nils{$q}{$slot_name}) {
            error("More than one response given for slot $slot_name for query $q, where NIL has been given as an answer");
        }
    }
}

print {$errlog_fh} "Finished processing $results_file\n";

close $input_fh  or die "Close failed for `input' file: $!\n";
close $errlog_fh or die "Close failed for error log $errlog: $!\n";

# Exit status 255 signals that at least one error was logged.
if ($num_errors) {
    exit 255;
}
exit 0;


# print error message, keeping track of total number of errors
#
# Takes the message text as its last argument, writes it to the error log
# prefixed with the script name, results file name and current results
# line number, and increments the error count.  Once more than
# $MAX_ERRORS errors have been logged, writes a final "Quit" line, closes
# the log and exits the program with status 255.
sub error {
    my $msg_string = pop(@_);

    print {$errlog_fh} "$0 of $results_file: Error on line $line_num --- $msg_string\n";

    $num_errors++;
    if ($num_errors > $MAX_ERRORS) {
        print {$errlog_fh} "$0 of $results_file: Quit. Too many errors!\n";
        close $errlog_fh or die "Close failed for error log $errlog: $!\n";
        exit 255;
    }
    return;
}


# Returns index of string element in array if present, else -1
#
# Arguments: the string to look for, and a reference to the array to
# search.  Comparison is by string equality; the first match wins.
sub is_member {
    my ($element, $arrayref) = @_;

    for my $i (0 .. $#{$arrayref}) {
        return $i if $element eq $arrayref->[$i];
    }
    return -1;
}


# Return 0 iff line is valid UTF-8; else return 1.
#
# The line is matched as a sequence of bytes against the well-formed
# UTF-8 byte patterns below (tab, LF, CR and printable ASCII are the only
# single-byte characters accepted).
# Regular expression from: http://www.w3.org/International/questions/qa-forms-utf-8
sub invalid_UTF8 {
    my ($line) = @_;

    return 0 if $line =~ m/\A(?:
          [\x09\x0A\x0D\x20-\x7E]            # ASCII
        | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
        |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
    )*\z/x;

    return 1;
}