#!/usr/bin/perl

use warnings;
use strict;

my $version = "1.0";

binmode(STDOUT, ":utf8");

# This program pools the output of ResolveQueries.pl for
# assessment. It anonymizes the entries, combines any entries that are
# identical except for the run ID going back to the original query,
# and sorts the output by subject-slot-filler-docid.
# Author: James Mayfield
#         james.mayfield "@" jhuapl "dot" edu

die "Usage: perl $0 <queryfile> <ldc.tab.txt files>" unless @ARGV > 1;

# Tracks which labels represent the same slot-fill, and can thus be
# assessed only once
my %equivalence_classes;

# Tracks slot-fills by their content, to identify those that should go
# into the same equivalence class
my %index;

# The starting point for all _00 hops is the original query, which is
# not represented in the ldc.tab.txt file. Thus, we need to pull them
# out of the original query file.
sub load_query_subjects {
  my ($filename) = @_;
  my $result = {};
  open(my $infile, "<:utf8", $filename) or die "Could not open $filename: $!";
  local($/);
  my $text = <$infile>;
  close $infile;

  # First, split out entire queries (delimited by <query> tags)
  while ($text =~ /<query\s*id="(.*?)">\s*(.*?)\s*<\/query>/gs) {
    my $id = $1;
    my $fields = $2;
    # Now break each query into its separate fields. Those fields
    # appear in key-value order, so we can just capture both values
    # and shove the whole list directly into a hash
    my %query = $fields =~ /^\s*<(.*?)>([^<]*)</gm;
    # The Query ID is not a field, but comes from the <query> tag. So
    # we add it to the result explicitly
    $result->{$id} = $query{name};
  }
  die "No query subjects found" unless keys %{$result};
  $result;
}

# We use a couple of different namespaces. This routine generates a
# function, which, when invoked, will return either the original label
# associated with a given value, or, if the value hasn't been seen
# before, generates a new label for it.  For example:
#    my $fn = &gen_gensym("Foo01");
#    &{$fn}(42) => Foo01
#    &{$fn}(88) => Foo02
#    &{$fn}(42) => Foo01
# Labels are created starting with the argument to gen_gensym, and
# incrementing that value for each new token (so it's best to use an
# integer or letters followed by an integer for the initial
# value). Gensym is a bit of a misnomer here.  While it does generate
# new unseen tokens when called with a new argument, it's more like
# intern for arguments previously seen
sub gen_gensym {
  my ($initialsym) = @_;
  # The next token and the symbol table are held in a closure around
  # the constructed function
  my $nextsym = $initialsym;
  my %symtable;
  # The closure around the following constructed function is the return value
  sub {
    my ($key, $do_not_create) = @_;
    # Return the memoized label if it's in the symbol table
    return $symtable{$key} if defined $symtable{$key};
    # It's not there; are we allowed to create it?
    die "Attempt to intern <<$key>> without permission" if $do_not_create;
    # Otherwise generate a new label, remember it, then return it
    my $sym = $nextsym++;
    $symtable{$key} = $sym;
   $sym;
  }
}

# Map the (gensym-ified) ID of the entry to the name of the entity
# that serves as the subject of the relation
my %idsym2fill;
# Store the output lines here so that they can be sorted before being printed
my @outputs;
# The first namespace is used to build the pools and aggregate
# identical lines under a single label
my $gensym1 = &gen_gensym("S00000");
# The second namespace is used to rename the entry labels to be in
# increasing order, once they have been sorted into the final desired
# order
my $gensym2 = &gen_gensym("T00000");
# The original run-name:pos-in-runfile keys are remembered here, so
# the correct entry in the final anonymized output file can be found
# for a given entry in an input file
my @all_keys;

# Comparator to reorder the output lines such that no child fact
# appears before its parent
sub compare_lines {
  my ($line1, $line2) = @_;
  my ($self1, $parent1) = $line1 =~ /^(.*?)\t.*?\t(.*?)\t/;
  my ($self2, $parent2) = $line2 =~ /^(.*?)\t.*?\t(.*?)\t/;
  die "No match in compare_lines" unless defined $self1 && defined $self2 && defined $parent1 && defined $parent2;
  return -1 if $self1 eq $parent2;
  return 1 if $self2 eq $parent1;
  return 0;
}

# Reorder the output lines such that no child fact appears before its
# parent.  I believe this can be greatly simplified by just sorting to
# place the hops in order
sub sort_lines {
  my (@lines) = @_;
  my %OK = (NIL => 'true');
  my @result;
  while (@lines) {
    my @newlines;
    foreach my $line (@lines) {
      my ($self, $parent) = $line =~ /^(.*?)\t.*?\t(.*?)\t/;
      die "No match in sort_lines" unless defined $self && defined $parent;
      if ($OK{$parent}) {
	push(@result, $line);
	$OK{$self}++;
      }
      else {
	push (@newlines, $line);
      }
    }
    @lines = @newlines;
  }
  @result;
}

# Read in the queries
my $queryfile = shift;
my $queryid2fill = &load_query_subjects($queryfile);

# March through each .ldc.tab.txt file
foreach my $assessment_file (@ARGV) {
  open(my $infile, "<:utf8", $assessment_file) or die "Could not open $assessment_file: $!";
  # The name of the run is not stored in the file, so get it from the filename
  my $runname = $assessment_file;
  $runname =~ s/^.*\///;
  $runname =~ s/\..*$//;
  # Load the entire file at once, so we can sort the lines to ensure
  # that all parents are processed before their children
  my @lines = <$infile>;
  close $infile;
  @lines = &sort_lines(@lines);
  # Process each entry in the .ldc.tab.txt file
  foreach (@lines) {
    next unless /\S/;
    my $line = $_;
    chomp;
    # This is the line format (for files without judgments; assessed
    # files contain three additional columns)
    my ($id, $query_hop, $parent, $predicate, $docid,
	$fill, $fill_start, $fill_end,
	$justification_start, $justification_end) = split(/\t/);
    # FIXME
    # LDC botched the offsets in the manual run
    if ($runname eq 'ldc') {
      $fill_start++;
      $fill_end++;
      $justification_start++;
      $justification_end++;
    }
    # The combination of the name of the run and the slot ID uniquely identifies this entry
    my $key = "$runname:$id";
    push(@all_keys, $key);
    # Generate a symbol from the first namespace to represent this key
    my $idsym = &{$gensym1}($key);
    # Generate a symbol for the parent
    my $parentsym = $parent eq 'NIL' ? 'NIL' : &{$gensym1}("$runname:$parent");
    # Map the parent to its equivalence class
    $parentsym = $equivalence_classes{$parentsym} unless $parentsym eq 'NIL';
    die "Undefined parent in $assessment_file : $parent" unless defined $parentsym;
    # Find the equivalence class for this slot
    my $equiv_class = $index{$query_hop}{$parentsym}{$predicate}{$docid}{$fill}{$fill_start}{$fill_end}{$justification_start}{$justification_end};
    # If we've already seen this equivalence class, just note the mapping
    if (defined $equiv_class) {
      $equivalence_classes{$idsym} = $equiv_class;
    }
    # Otherwise, create the new equivalence class and process the slot
    else {
      # We just use the anonymized ID of the first slot to match this
      # equivalence class as the name for the entire class
      $equivalence_classes{$idsym} = $idsym;
      $index{$query_hop}{$parentsym}{$predicate}{$docid}{$fill}
	    {$fill_start}{$fill_end}
	    {$justification_start}{$justification_end} = $idsym;
      # Extract the ID of the query from the query_and_hop field
      my $queryid = $query_hop;
      $queryid =~ s/^(.*)_\d+$/$1/;
      # Figure out the subject of this query; it is either the listed
      # parent, or the subject of the query if the parent is NIL
      my $subject = $idsym2fill{$parentsym};
      $subject = $queryid2fill->{$queryid} unless defined $subject;
      die "Couldn't find subject for idsym = $idsym, id = $id, run = $runname, parent = $parent, parentsym = $parentsym, queryid = $queryid"
	unless defined $subject;
      # Keep a mapping from the equivalence class to the slot filler
      $idsym2fill{$idsym} = $fill;
      # Build a hash that contains all the information needed to write one line of output
      my $output = {
		    ID => $id,
		    QUERY_HOP => $query_hop,
		    PARENT => $parent,
		    PREDICATE => $predicate,
		    DOCID => $docid,
		    FILL => $fill,
		    FILL_START => $fill_start,
		    FILL_END => $fill_end,
		    JUSTIFICATION_START => $justification_start,
		    JUSTIFICATION_END => $justification_end,
		    IDSYM => $idsym,
		    PARENTSYM => $parentsym,
		    SUBJECT => $subject,
		    RUNNAME => $runname,
		   };
      # Don't output the entry immediately; rather wait until the
      # output lines have been placed in the desired order
      push(@outputs, $output);
    }
  }
}

# Now that the entire input has been processed, place the output lines
# in the desired order. This order was requested by LDC to make
# assessment easier
@outputs = sort {lc $a->{SUBJECT} cmp lc $b->{SUBJECT} ||
		 $a->{PREDICATE} cmp $b->{PREDICATE} ||
		 lc $a->{FILL} cmp lc $b->{FILL} ||
		 $a->{DOCID} cmp $b->{DOCID} ||
	         $a->{FILL_START} <=> $b->{FILL_START}} @outputs;
# Now we can generate the final symbol for the entry
foreach my $output (@outputs) {
  $output->{IDSYM2} = &{$gensym2}($output->{IDSYM});
}
# We also have to make sure the parent has been assigned a symbol in
# the same namespace
foreach my $output (@outputs) {
  $output->{PARENTSYM2} = $output->{PARENTSYM} eq 'NIL' ? 'NIL' : &{$gensym2}($output->{PARENTSYM});
}
# Finally we can produce output. We start with the portion of the
# output that will go to the assessors
foreach my $output (@outputs) {
  print join("\t", map {$output->{$_}}
	     qw(IDSYM2 QUERY_HOP PARENTSYM2 PREDICATE DOCID FILL FILL_START FILL_END
		JUSTIFICATION_START JUSTIFICATION_END)), "\n";
}
# Write a separator, then output the mapping from the original ID in
# the submission file to the matching output ID
print "=======================================\n";
foreach my $key (sort @all_keys) {
  my $g1 = &{$gensym1}($key);
  print "$key\t", &{$gensym2}($equivalence_classes{$g1} || $g1), "\n" unless $key eq 'NIL';
}

1;