#!/bin/sh -- # -*- perl -*- -w
eval 'exec perl -S $0 ${1+"$@"}'
    if 0;

#
# @(#)taldwrap	22.1 12/08/98
#
#


&set_global_variables();

&get_commandline_options();

&scliteout_2_taldin();

&run_tald();

&taldout_2_mucscorerin();

exit(0);

##################################################

sub set_global_variables{
    $scliteout = "";
    $taldin = "";
    $taldout = "";
    $mucscorein = "";
    $alignbin = "";

    $line = "";
    $here = "";
    %g_sentences = ();
}

########################################

sub get_commandline_options{
    use Getopt::Long;

    &GetOptions(
		"scliteout=s"  => \$scliteout,
		"taldin=s"     => \$taldin,
		"taldout=s"    => \$taldout,
		"mucscorein=s" => \$mucscorein,
		"aldistsmdir=s"   => \$alignbin
		);
    
# strip quotes from values
    map {s/^\"(.*)\"$/$1/ or s/^\'(.*)\'$/$1/} ( # "
						$scliteout,
						$taldin,
						$taldout,
						$mucscorein
						);
}

########################################

sub scliteout_2_taldin {

    my $sentencenum = 0;
    my $next_is_ref = 1;
    # (Flag to keep track of order of REF-HYP-REF-...
    #   so we can insert missing lines.)

    open SCOUT, "$scliteout" 
	or die "can't open '$scliteout'\n";
    open TALDIN, ">$taldin" 
	or die "can't open '$taldin'\n";
    while(defined($line=<SCOUT>)){
	chomp $line;
	if($line =~ /^((REF)|(HYP)): /){
	    if( $next_is_ref ) {
		$sentencenum++;
		printf TALDIN "\nID: (SENTENCE%s)\n", $sentencenum;
	    }
	    if($next_is_ref 
	       &&
	       $line =~ /^HYP:/){
		#
		# Missing a ref line.
		#
		# Print the (empty) REF
		printf TALDIN "REF:\n";
		# Make the flag match the current (HYP) line.
		$next_is_ref = 0;
	    } elsif( ! $next_is_ref
		     &&
		     $line =~ /^REF:/ ) {
		#
		# Missing a hyp line.
		#
		# Print the (empty) HYP
		printf TALDIN "HYP:\n";
		# Make the flag match the current (REF) line.
		$next_is_ref = 1;
	    } 
	    #
	    # Now $next_is_ref should agree with the line.
	    #
	    # take out "optionally deletable" parens
	    $line =~ s/\(//g;
	    $line =~ s/\)//g;

	    # Get the tags, save them in g_sentences list.
	    if( $line =~ /^REF:/ ){
		&get_tags($line,'REF',$sentencenum);
	    } elsif( $line =~ /^HYP:/ ){
		&get_tags($line,'HYP',$sentencenum);
	    }

	    # Remove SGML from the line.
	    $line =~ s/<[^<>]*>//g;

	    # Print the line to the input of tald3e_sm_export
	    print TALDIN $line, "\n";
	    
	    # Toggle the $next_is_ref flag.
	    $next_is_ref = 1 - $next_is_ref;
	}
    }
    close SCOUT;
    close TALDIN;
}

########################################

sub run_tald {
    print "tald3e_sm_export ...\n";
    $here = `pwd`; chomp $here;
    chdir $alignbin;
    if( $sysstat = system(
			  "tald3e_sm_export",
			  "$here/$taldout",
			  "$here/$taldin",
			  "pcdt_eng_p.txt",
			  "data/ttp411_l3r4.rls",
			  "data/nat+for",
			  "1.25",
			  "20000",
			  "0",
			  ">/dev/null"
			  ) ) {
	exit($sysstat);
    }
    chdir $here;
}

########################################

sub taldout_2_mucscorerin {
    my $line;
    my $refline;
    my $hypline;
    my $merged_refline;
    my $merged_hypline;
    my $sentence_num;

    open ALIGNED, $taldout 
	or die "can't open '$taldout'\n";
    open MSCIN, ">$mucscorein" 
	or die "can't open '$mucscorein'\n";

    $line = <ALIGNED>;
    SENTENCE: while( defined($line) ) {
	# Skip over lines until next sentence marker.
	if(! defined($line)	# check this because of redo below.
	   ||
	   $line !~ /^ID:\s*\(SENTENCE(\d+)\)/i){
	    next SENTENCE;
	}
	# Got the ID line.
	$line =~ /^ID:\s*\(SENTENCE(\d+)\)/i;
	$sentence_num = $1;
	print MSCIN "\nID: (SENTENCE$sentence_num)\n";

	# Get the ref and hyp lines, if they are there.
	$refline = "";
	$hypline = "";
	while(defined($line=<ALIGNED>)
	      &&
	      $line !~ /^ID:\s*\(SENTENCE(\d+)\)/i  ){
	    if( $line =~ /^REF:/ ){
		$refline = $line;
	    } elsif( $line =~ /^HYP:/ ) {
		$hypline = $line;
	    }
	}

	# If missing data from one line, get number of offset bars
	# from the other line, and fill in the missing line.
	($merged_refline,
	 $merged_hypline) = &fill_missing_sentence($refline,
						   $hypline,
						   $sentence_num);

	print MSCIN $merged_refline, "\n";
	print MSCIN $merged_hypline, "\n";

	# Don't get the next line, since we already got it,
	#    if it's there.
	redo SENTENCE;

    } continue {
	$line = <ALIGNED>;
    }

    close ALIGNED;
    close MSCIN;
}

########################################
    
sub fill_missing_sentence {

    my ($refline,
	$hypline,
	$sentence_num) = @_;

    my $merged_refline = "";
    my $merged_hypline = "";

    if($refline
       &&
       $hypline ) {
	# Both REF and HYP lines came out of aldist
	$merged_refline = &put_tags($sentence_num,
				    'REF',
				    $refline);
	$merged_hypline = &put_tags($sentence_num,
				    'HYP',
				    $hypline);
    } elsif($refline) {
	# One is empty, so insert the tags in the other one
	# and put the same number of offset bars as in the
	# one that is empty.
	$merged_hypline = "HYP: " . ("| " x ($refline =~ tr/\|/\|/));
	$merged_refline = &put_tags($sentence_num,
				    'REF',
				    $refline);
    } elsif($hypline) {
	$merged_refline = "REF: " . ("| " x ($hypline =~ tr/\|/\|/));
	$merged_hypline = &put_tags($sentence_num,
				    'HYP',
				    $hypline);
    } else {
	$merged_refline = "REF: ";
	$merged_hypline = "HYP: ";
    }	
    return ($merged_refline,
	    $merged_hypline);
}

########################################

sub get_tags{
    my ($line,
	$r_or_h,
	$sentencenum) = @_;

    my @words = ($line =~ m(
			    <[^<>]*>
			    |
			    [^<>\s]+
			    )xog );

    $g_sentences{$sentencenum}{$r_or_h} = \@words;
}

########################################

sub put_tags{
    my ($sentence_num,
	$r_or_h,
	$line) = @_;
    my @words;
    my $word;
    my $result = "";
    my @result_tokens;
    my $r_sentence;

    if( defined( $g_sentences{$sentence_num}{$r_or_h} ) ){
	$r_sentence = $g_sentences{$sentence_num}{$r_or_h};

	@words = ($line =~ m(
			     \|    # offset separators
			     
			     |
			     
			     [^\s\*\|]+          # words 
			     
			     )xog );
	
	&merge_tokens(\@words, $r_sentence, \@result_tokens);
	$result = join " ", @result_tokens;
    } else {
	# get the result from sclite output only (put a vertical
	#   bar at the beginning and end of the line)
	$result = $line;
	$result =~ s/^((REF|HYP):\s*)(.*)$/$1 \| $2\|/;
    }
    # Change # back into a single space.
    $result =~ s/#/ /g;
    return $result;
}

###################################################################
    
sub merge_tokens{
    my ($r_aft,			# aligned file tokens
	$r_tft,			# task file tokens
	$r_mt) = @_;		# merged tokens

    while( @$r_tft || @$r_aft ) {
	if( ! @$r_tft ){
	    push @$r_mt, shift @$r_aft;
	} elsif( ! @$r_aft ){
	    push @$r_mt, shift @$r_tft;
	} elsif(&data_vs_data($r_aft,
			      $r_tft)){
	    if(lc($$r_tft[0])
	       ne
	       lc($$r_aft[0]) ) {
		# words don't match.  An ne tag may have split
		# the word, so look ahead in the task token stream to 
		# see if we can reconstruct the word.
		&look_into_tft($r_aft,
			       $r_tft,
			       $r_mt);
	    } else {
		# words match; use the one from the alignment
		push @$r_mt, shift @$r_aft;
		shift @$r_tft;
	    }
	} elsif(&data_vs_tag($r_aft,$r_tft)) {
	    # data in alignment, tag in task -- push tag.
	    push @$r_mt, shift @$r_tft;
	} elsif(&bar_vs_data){
	    # Bar in alignment, word in task -- push bar.
	    push @$r_mt, shift @$r_aft;
	} elsif(&open_task_tag($r_aft,
			       $r_tft) ) { 
	    # Must be a bar in alignment (aft), and
	    # a tag in the task (tft),
	    # so look to see if tft tag is an open or
	    # a close. If it's an open tag, push
	    # bar first:
	    #
	    #    ...|<enamex>...
	    # 
	    push @$r_mt, shift @$r_aft;
	} else {
	    # Otherwise, it`s a close tag -- push the 
	    # tag before the bar:
	    #
	    #    ...</enamex>|...
	    #
	    push @$r_mt, shift @$r_tft;
	}
    }
}

########################################

sub look_into_tft{
    my ($r_aft,			# aligned file tokens
	$r_tft,			# task file tokens
	$r_mt) = @_;		# merged tokens
    # The aft and the tft both have a
    # data token at their head, but the data tokens don't match.
    # This should only be possible when a task tag has split a
    # token, e.g. "Doe</ENAMEX>'s".  So try to recover by looking
    # ahead in the tft stream.  Try to glue the split
    # tokens back together.  Success is when the glued-together
    # token matches the token at the head of aligned_file_tokens.
    # Failure is whenever the glued-together token is not a prefix
    # (ignoring case) of the head of aligned_file_tokens.

    my $atok = shift @$r_aft;	# aligned token
    my $sw   = $$r_tft[0];	# split word
    my $ttok = shift @$r_tft;	# task token (with tags left out)

    while(lc($atok) ne lc($ttok)){
	if((length($ttok) >= length($atok))
	   ||
	   (lc(substr($atok, 0, length($ttok))) ne lc($ttok))) {
	    die "merge mismatch: \'$atok\' and \'$sw\'\n";
	}
	# skip any tags in @$r_tft (but keep them for the output)
	while(@$r_tft
	      &&
	      $$r_tft[0] =~ /^<[^<>]*>$/o){
	    $sw .= shift @$r_tft;
	}
	if(@$r_tft){
	    $sw .= $$r_tft[0];
	    $ttok .= shift @$r_tft;
	} else {
	    # Ran out of pieces to glue onto $ttok before we 
	    # had a match.
	    die "Data mismatch (eol): \'$atok\' vs \'$ttok\'\n";
	}
    }
    push @$r_mt, $sw;
}

########################################

sub data_vs_data{
    my ($r_aft,
	$r_tft) = @_;
    if(@$r_aft
       &&
       @$r_tft
       &&
       ($$r_aft[0] !~ /^\|$/o)
       &&
       ($$r_tft[0] !~ /^<[^<>]*>$/o)){
	return 1;
    } else {
	return 0;
    }
}

########################################

sub data_vs_tag{
    my ($r_aft,
	$r_tft) = @_;
    if(@$r_aft
       &&
       @$r_tft
       &&
       ($$r_aft[0] !~ /^\|$/o)
       &&
       ($$r_tft[0] =~ /^<[^<>]*>$/o)){
	return 1;
    } else {
	return 0;
    }
}

########################################

sub bar_vs_data{
    my ($r_aft,
	$r_tft) = @_;
    if(@$r_aft
       &&
       @$r_tft
       &&
       ($$r_aft[0] =~ /^\|$/o)
       &&
       ($$r_tft[0] !~ /^<[^<>]*>$/o)){
	return 1;
    } else {
	return 0;
    }
}

########################################

sub open_task_tag{
    my ($r_aft,
	$r_tft) = @_;
    if(@$r_tft
       &&
       ($$r_tft[0] =~ /^<[^<>]*>$/o)
       &&
       ($$r_tft[0] !~ /^<\/[^<>]*>$/o)){
	return 1;
    } else {
	return 0;
    }
}

