#!/usr/bin/perl use warnings; use strict; ##################################################################################### # This program maps between several different representations of a TAC # Cold Start KB, and checks the validity of input files # # You are receiving this program because you signed up for a partner newsletter # # Author: James Mayfield # Please send questions or comments to james.mayfield "at" jhuapl.edu # # For usage, run with no arguments ##################################################################################### my $version = "1.2"; ##################################################################################### # This table lists the classes of problems that are checked for. ##################################################################################### my $problem_formats = <<'END_PROBLEM_FORMATS'; # Error Name Type Error Message # ---------- ---- ------------- AMBIGUOUS_PREDICATE ERROR %s: ambiguous predicate COLON_OMITTED WARNING Initial colon omitted from name of entity %s DUPLICATE_ASSERTION WARNING The same assertion is made more than once (%s) # This can be a warning instead of an error because we've already eliminated duplicates: DUPLICATE_SKB_ENTITY WARNING More than one SKB entry for entity %s ILLEGAL_CONFIDENCE_VALUE ERROR Illegal confidence value: %s ILLEGAL_DOCID ERROR DOCID %s is not a legal DOCID for this task ILLEGAL_ENTITY_NAME ERROR Illegal entity name: %s ILLEGAL_ENTITY_TYPE ERROR Illegal entity type: %s ILLEGAL_OFFSET ERROR %d is not a legal offset for DOCID %s ILLEGAL_OFFSET_PAIR ERROR (%d, %d) is not a legal offset pair ILLEGAL_PREDICATE ERROR Illegal predicate: %s ILLEGAL_PREDICATE_TYPE ERROR Illegal predicate type: %s MISSING_CANONICAL ERROR Entity %s has no canonical mention in document %s MISSING_INVERSE WARNING No inverse relation asserted for %s(%s, %s) MISSING_RUNID ERROR The first line of the file does not contain a legal runid MISSING_TYPEDEF WARNING No type asserted for Entity %s MULTIPLE_CANONICAL ERROR More than one canonical mention for Entity %s in document %s MULTIPLE_FILLS WARNING Entity %s has multiple %s fills, but should be single-valued MULTITYPED_ENTITY ERROR Entity %s has more than one type: %s NO_MENTIONS WARNING Entity %s has no mentions OVERLAPPING_MENTIONS ERROR Overlapping mentions at positions %s and %s in document %s PREDICATE_ALIAS WARNING Use of %s predicate; %s replaced with %s STRING_USED_FOR_ENTITY ERROR Expecting an entity, but got string %s SUBJECT_PREDICATE_MISMATCH ERROR Type of subject (%s) does not match type of predicate (%s) SYNTAX_ERROR ERROR Syntax error: %s UNASSERTED_CANONICAL WARNING Failed to assert canonical mention of Entity %s in document %s UNASSERTED_MENTION WARNING Failed to assert that canonical_mention %s in document %s is also a mention UNATTESTED_RELATION_ENTITY ERROR Relation %s uses entity %s, but that entity is has no mentions in document %s UNQUOTED_STRING WARNING String %s not surrounded by double quotes UNKNOWN_TYPE ERROR Cannot infer type for Entity %s END_PROBLEM_FORMATS ######################################################################################## # This table lists the legal predicates. An asterisk means the relation is single-valued ######################################################################################## my $predicates = <<'END_PREDICATES'; # DOMAIN NAME RANGE INVERSE NUM_OFFSET_PAIRS # ------ ---- ----- ------- ---------------- PER age* STRING none 3 PER,ORG alternate_names STRING none 3 GPE births_in_city PER city_of_birth* 3 GPE births_in_country PER country_of_birth* 3 GPE births_in_stateorprovince PER stateorprovince_of_birth* 3 PER cause_of_death* STRING none 3 PER charges STRING none 3 PER children PER parents 3 PER cities_of_residence GPE residents_of_city 3 PER city_of_birth* GPE births_in_city 3 PER city_of_death* GPE deaths_in_city 3 ORG city_of_headquarters* GPE headquarters_in_city 3 PER countries_of_residence GPE residents_of_country 3 PER country_of_birth* GPE births_in_country 3 PER country_of_death* GPE deaths_in_country 3 ORG country_of_headquarters* GPE headquarters_in_country 3 ORG date_dissolved* STRING none 3 ORG date_founded* STRING none 3 PER date_of_birth* STRING none 3 PER date_of_death* STRING none 3 GPE deaths_in_city PER city_of_death* 3 GPE deaths_in_country PER country_of_death* 3 GPE deaths_in_stateorprovince PER stateorprovince_of_death* 3 PER employee_of ORG,GPE employees 3 ORG,GPE employees PER employee_of 3 ORG founded_by PER,ORG,GPE organizations_founded 3 GPE headquarters_in_city ORG city_of_headquarters* 3 GPE headquarters_in_country ORG country_of_headquarters* 3 GPE headquarters_in_stateorprovince ORG stateorprovince_of_headquarters* 3 PER,ORG,GPE holds_shares_in ORG shareholders 3 PER member_of ORG membership 3 ORG,GPE member_of ORG members 3 ORG members ORG,GPE member_of 3 ORG membership PER member_of 3 ORG number_of_employees_members* STRING none 3 PER,ORG,GPE organizations_founded ORG founded_by 3 PER origin STRING none 3 PER other_family PER other_family 3 PER parents PER children 3 ORG parents ORG,GPE subsidiaries 3 ORG political_religious_affiliation STRING none 3 PER religion* STRING none 3 GPE residents_of_city PER cities_of_residence 3 GPE residents_of_country PER countries_of_residence 3 GPE residents_of_stateorprovince PER statesorprovinces_of_residence 3 PER schools_attended ORG students 3 ORG shareholders PER,ORG,GPE holds_shares_in 3 PER siblings PER siblings 3 PER spouse PER spouse 3 PER stateorprovince_of_birth* GPE births_in_stateorprovince 3 PER stateorprovince_of_death* GPE deaths_in_stateorprovince 3 ORG stateorprovince_of_headquarters* GPE headquarters_in_stateorprovince 3 PER statesorprovinces_of_residence GPE residents_of_stateorprovince 3 ORG students PER schools_attended 3 ORG,GPE subsidiaries ORG parents 3 PER title STRING none 3 PER top_member_employee_of ORG top_members_employees 3 ORG top_members_employees PER top_member_employee_of 3 ORG website* STRING none 3 # The following are not TAC slot filling predicates, but rather # predicates required by the Cold Start task PER,ORG,GPE mention STRING none 1 PER,ORG,GPE canonical_mention STRING none 1 PER,ORG,GPE type TYPE none 0 END_PREDICATES ##################################################################################### # This table lists known aliases of the legal predicates. ##################################################################################### my $predicate_aliases = <<'END_ALIASES'; # REASON ALIAS MAPS TO # ------ ----- ------- DEPRECATED dissolved date_dissolved DEPRECATED founded date_founded DEPRECATED number_of_employees/members number_of_employees_members DEPRECATED political/religious_affiliation political_religious_affiliation DEPRECATED stateorprovinces_of_residence statesorprovinces_of_residence DEPRECATED top_members/employees top_members_employees MISSPELLED ages age MISSPELLED canonical_mentions canonical_mention MISSPELLED city_of_residence cities_of_residence MISSPELLED country_of_residence countries_of_residence MISSPELLED mentions mention MISSPELLED spouses spouse MISSPELLED stateorprovince_of_residence statesorprovinces_of_residence MISSPELLED titles title END_ALIASES ##################################################################################### # Priority for the selection of problem locations ##################################################################################### my %use_priority = ( MENTION => 1, TYPEDEF => 2, SUBJECT => 3, OBJECT => 4, ); ##################################################################################### # Mappings from file extensions to import/export routines ##################################################################################### my %type2import = ( skb => \&load_skb, tac => \&load_tac, ); my %type2export = ( skb => \&export_skb, tac => \&export_tac, rdf => \&export_rdf, ); ##################################################################################### # Default values ##################################################################################### # Should entities be output in C case or Camel case? my $default_output_case_fn = \&string2c; # Warnings can be suppressed by warning type using this table my %ignore_warnings; # Should provenance and confidence values be included in the output? my $output_confidence; my $output_provenance; my $output_offsets; # Can the same assertion be made more than once? my %multiple_attestations = ( ONE => "only one allowed - no duplicates", ONEPERDOC => "at most one allowed per document", MANY => "any number of duplicate assertions allowed", ); my $multiple_attestations = 'ONE'; # Which triple labels should be output? my %output_labels = (); # How many provenance offset pairs do particular predicates take? The # default is three for predicates that do not appear in this table my %num_provenance_strings = ( 'type' => 0, 'mention' => 1, '*mention' => 1, 'canonical_mention' => 1, ); # Should skb output be decorated with handy names? my $display_names; # Should skb output be deduped? my $dedup; # Filehandles for program and error output my $program_output = *STDOUT; my $error_output = *STDERR; ##################################################################################### # Patterns ##################################################################################### # Eliminate comments, ensuring that pound signs in the middle of # strings are not treated as comment characters # Here is the original slightly clearer syntax that unfortunately doesn't work with Perl 5.8 # s/^( # (?:[^#"]*+ # Any number of chars that aren't double quote or pound sign # (?:"(?:[^"\\]++|\\.)*+")? # Any number of double quoted strings # )*+ # The pair of them repeated any number of times # ) # Everything up to here is captured in $1 # (\#.*)$/$1/x; # Pound sign through the end of the line is not included in the replacement my $comment_pattern = qr/ ^( (?> (?: (?>[^#"]*) # Any number of chars that aren't double quote or pound sign (?:" # Beginning of double quoted string (?> # Start a possessive match of the string body (?:(?>[^"\\]+)|\\.)* # Possessively match any number of non-double quotes or match an escaped char )" # Possessively match the above repeatedly, before the closing double quote )? # There might or might not be a double quoted string )* # The pair of them repeated any number of times ) # Possessively match everything before a pound sign that starts the comment ) # Everything up to here is captured in $1 (\#.*)$/x; # Pound sign through the end of the line is not included in the replacement ##################################################################################### # Utility routines ##################################################################################### # Convert a string to camel case { my %string2camel; sub string2camel { my ($slot) = @_; return $string2camel{$slot} if defined $string2camel{$slot}; my $type; if ($slot =~ /^(.*?):(.*)$/) { $type = $1; $slot = $2; } my @words = split(/_/, $slot); my $result = join("", map {ucfirst(lc($_))} @words); $result = "$type:$result" if defined $type; $string2camel{$slot} = $result; $result; } } # Convert a string to C case { my %string2c; sub string2c { my ($slot) = @_; return $string2c{$slot} if defined $string2c{$slot}; return $slot unless $slot =~ /[A-Z]/; my $type; if ($slot =~ /^(.*?):(.*)$/) { $type = $1; $slot = $2; } my @words = $slot =~ /[A-Z][a-z0-9]*/g; my $result = join("_", map {lc} @words); $result = "$type:$result" if defined $type; $string2c{$slot} = $result; $result; } } # Is this string currently in camel case? sub is_camel { my ($name) = @_; # Remove the leading type if present $name =~ s/.*://; $name =~ /^(?:[A-Z][a-z0-9]*)+$/; } # Is this string currently in C case? sub is_c { my ($name) = @_; # Remove the leading type if present $name =~ s/.*://; $name =~ /^[a-z0-9]+(?:_[a-z0-9]+)*$/; } ##################################################################################### # Reporting Problems ##################################################################################### # Convert the problem formats list above to an appropriate hash my %problem_formats; chomp $problem_formats; foreach (grep {!/^\S*#/} split(/\n/, $problem_formats)) { s/^\s+//; my ($problem, $type, $format) = split(/\s+/, $_, 3); $problem_formats{$problem} = {TYPE => $type, FORMAT => $format}; } # Don't report problems immediately when they're encountered; rather, aggregate them here my %problems; my %problem_counts; # Remember that a particular problem was encountered, for later reporting sub record_problem { my $source = pop(@_); my ($problem, @args) = @_; # Warnings can be suppressed here; errors cannot return if $ignore_warnings{$problem}; my $format = $problem_formats{$problem} || {TYPE => 'INTERNAL_ERROR', FORMAT => "Unknown problem $problem: %s"}; $problem_counts{$format->{TYPE}}++; my $message = "$format->{TYPE}: " . sprintf($format->{FORMAT}, @args); if (ref $source) { $problems{$problem}{$message}{"$source->{FILENAME} line $source->{LINENUM}"}++; } else { $problems{problem}{$message}{'NO_SOURCE'}++; } } # Report all of the problems that have been aggregated in %problems to STDERR sub report_all_problems { foreach my $problem (sort keys %problems) { foreach my $message (sort keys %{$problems{$problem}}) { my $num_instances = scalar keys %{$problems{$problem}{$message}}; print $error_output "$message"; my $example = (keys %{$problems{$problem}{$message}})[0]; if ($example ne 'NO_SOURCE') { print $error_output " ($example"; print $error_output " and ", $num_instances - 1, " other place" if $num_instances > 1; print $error_output "s" if $num_instances > 2; print $error_output ")"; } print $error_output "\n"; } } # Return the number of errors and the number of warnings encountered ($problem_counts{ERROR} || 0, $problem_counts{WARNING} || 0); } ##################################################################################### # Predicates ##################################################################################### package Predicate; # Keep track of all known predicates my %predicates; # Mapping from predicate name without domain info to set of fully-qualified predicate names #my %name2predicates; # Set of legal domain types (e.g., {PER, ORG, GPE}) my %legal_domain_types = ( per => 'true', gpe => 'true', org => 'true', ); # Set of legal range types (e.g., {PER, ORG, GPE}) my %legal_range_types = ( per => 'true', gpe => 'true', org => 'true', string => 'true', type => 'true', ); # Set of types that are entities my %legal_entity_types = ( per => 'true', gpe => 'true', org => 'true', ); # Is one type specification compatible with another? The second # argument must be a hash representing a set of types. The first # argument may either be the same representation, or a single type # name. The two are compatible if the second is a (possibly improper) # superset of the first. sub is_compatible { my ($type, $typeset) = @_; my @type_names; if (ref $type) { @type_names = keys %{$type}; } else { @type_names = ($type); } foreach (@type_names) { return unless $typeset->{$_}; } return "compatible"; } # Find all predicates with the given name that are compatible with the # domain and range given, if any sub lookup_predicate { my ($name, $domain, $range) = @_; my @candidates = @{$predicates{$name} || []}; @candidates = grep {&is_compatible($domain, $_->get_domain())} @candidates if defined $domain; @candidates = grep {&is_compatible($range, $_->get_range())} @candidates if defined $range; @candidates; } # Populate the predicates tables from $predicates, which is defined at # the top of this file sub create_predicates { my ($predicates, $label) = @_; chomp $predicates; foreach (grep {!/^\s*#/} split(/\n/, lc $predicates)) { my ($domain, $name, $range, $inverse, $num_provenance_strings) = split; my $lcpredicate = lc $name; $lcpredicate =~ s/\*$//; $num_provenance_strings{$lcpredicate} = $num_provenance_strings; # The "single-valued" marker (asterisk) is handled by Predicate->new Predicate->new($domain, $name, $range, $inverse, $label); } } &create_predicates($predicates, 'TAC'); sub load_predicates { my ($filename) = @_; my $base_filename = $filename; $base_filename =~ s/.*\///; die "Filename for predicates files should be