#!/bin/sh -- # -*- perl -*- 
eval 'exec perl -S $0 ${1+"$@"}'
    if 0;

# @(#)sgml2tald	22.1 12/08/98

### Driver: reads an SGML alignment stream on STDIN and writes, for every
### alignment entry, a trace line "<ref_state> <hyp_state> <exclude> <entry>",
### plus an ID:/REF:/HYP: record each time flush_alignment() is triggered.
###
### All variables below are deliberately package globals: flush_alignment()
### reads @REF/@HYP/@EXCLUDE/$sent/$ncorr, and set_state() reports $p[$i]
### in its error messages, so @p and $i must keep these exact names.

$ref_state_out = 1;   # 1 = reference side is outside any NE tag, 0 = inside one
$hyp_state_out = 1;   # same, for the hypothesis side
$exclude = 0;         # 1 while the current <SPEAKER> id contains "excluded_region"

@REF = ();            # reference words accumulated since the last flush
@HYP = ();            # hypothesis words accumulated since the last flush
@EXCLUDE = ();        # per-word copy of $exclude

$ncorr = 0;           # length of the current run of consecutive "C" entries

$sent = 0;            # running sentence number used in the ID: line

### One alignment entry:  <eval>,"<ref>","<hyp>",<rest>
### Either quoted field may be missing entirely.  Inside the quotes a field
### is a sequence of complete <...> tags and characters other than " < >.
$entry_re = qr{
    ^
    (\w)                    # evaluation code
    ,
    (?: \"                  # optional quoted reference field
        ( (?: (?:<[^><]*>) | (?:[^\"><]) )* )
        \"
    )?
    ,
    (?: \"                  # optional quoted hypothesis field
        ( (?: (?:<[^><]*>) | (?:[^\"><]) )* )
        \"
    )?
    ,
    (.*)                    # remainder of the entry (not used further)
    $
}x;

while (<STDIN>){
    if ($_ =~ /^<SPEAKER id="([^"]*)"/){  # "
        $exclude = ($1 =~ /excluded_region/) ? 1 : 0;
    } elsif ($_ =~ /^<PATH/){
        print "<PATH>\n";

        ### The line after the <PATH ...> header holds the ':'-separated entries
        $_ = <STDIN>;
        @p = split(/[:\n]/,$_);
        for ($i=0; $i<=$#p; $i++){

            ### A list assignment in scalar context yields the number of
            ### values on the right-hand side, which is 0 when the match
            ### fails.  Stop instead of continuing with stale or undefined
            ### fields, which would silently corrupt the alignment.
            ($eval, $ref, $hyp) = ($p[$i] =~ $entry_re)
                or die "Error: unparsable alignment entry '$p[$i]'\n";

            set_state(*ref_state_out, $ref, "ref_state");
            set_state(*hyp_state_out, $hyp, "hyp_state");

            print "$ref_state_out $hyp_state_out $exclude $p[$i]\n";

            push (@REF, $ref);
            push (@HYP, $hyp);
            push (@EXCLUDE, $exclude);

            if ($eval eq "C"){
                $ncorr ++;
            } else {
                $ncorr = 0;
            }

            ### Cut a segment only after more than three consecutive correct
            ### words and while neither side is inside an open NE tag.
            flush_alignment() if ($ncorr > 3 &&
                                  $ref_state_out == 1 &&
                                  $hyp_state_out == 1) ;
        }
        ### Consume the line that follows the entries
        ### (presumably the closing </PATH> tag -- confirm against the input format).
        $_ = <STDIN>;
    }
}

### Emit whatever is still buffered at end of input
flush_alignment() if ($#REF >= 0);
exit 0;

###################################################
########## Subroutines

### flush_alignment()
###
### Prints one record for the words buffered in the globals @REF, @HYP and
### @EXCLUDE:
###     ID: (SENTENCEnnnnnn)
###     REF: <non-excluded reference words>
###     HYP: <non-excluded hypothesis words>
###     <blank line>
### Named-entity tags (enamex/timex/numex) that start or end on an excluded
### word are removed together with their matching partner tag first, so the
### remaining text stays balanced.  Bytes 0x80-0xFF are written as "0x<hex>".
###
### Side effects: increments $sent, empties @REF/@HYP/@EXCLUDE, resets $ncorr.
### Dies with "Error: internal failure" if a tag on an excluded word has no
### matching partner in the buffered words.
sub flush_alignment{
    my $w;
    my $reftext;
    my $hyptext;

    printf "ID: (SENTENCE%06d)\n",$sent++;

    ### REF and HYP are independent of each other, so each list is cleaned
    ### on its own by the same routine.
    _strip_excluded_tags(\@REF);
    _strip_excluded_tags(\@HYP);

    $reftext = "REF:";
    $hyptext = "HYP:";
    for ($w=0; $w<=$#REF; $w++){
        if ($EXCLUDE[$w] == 0){
            $reftext .= " $REF[$w]";
            $hyptext .= " $HYP[$w]";
        }
    }

    ### Tald doesn't handle extended ascii, so let's make it into a consistent
    ###    escaped notation
    $reftext =~ s/([\200-\377])/sprintf("0x%x",ord($1))/ge;
    $hyptext =~ s/([\200-\377])/sprintf("0x%x",ord($1))/ge;
    print "$reftext\n";
    print "$hyptext\n\n";

    @REF = ();
    @HYP = ();
    @EXCLUDE = ();

    $ncorr = 0;
}

### _strip_excluded_tags(\@words)
###
### Private helper of flush_alignment().  Edits @words in place: for every
### word flagged in @EXCLUDE that carries an NE tag, removes that tag and
### the partner tag it pairs with.
sub _strip_excluded_tags{
    my ($words) = @_;
    my $w;
    my $rw;
    my $tag;

    #### Look for ending ne tags in excluded regions
    for ($w=0; $w<=$#$words; $w++){
        if ($EXCLUDE[$w] == 1 && $$words[$w] =~ /<\/(enamex|timex|numex)>/i){
            $tag = $1;

            ### Search back (starting at this same word) for the opening tag.
            ### NOTE(review): the opening tag is only recognised in the form
            ### "<tag#...>" and case-sensitively -- confirm against the input.
            $$words[$w] =~ s/<\/${tag}>//i;
            for ($rw=$w; $rw >= 0; $rw--){
                last if ($$words[$rw] =~ s/<${tag}#[^>]+>//);
            }
            die "Error: internal failure" if ($rw < 0);
        }
    }

    #### Look for beginning ne tags in excluded regions
    for ($w=0; $w<=$#$words; $w++){
        if ($EXCLUDE[$w] == 1 && $$words[$w] =~ /<(enamex|timex|numex)/i){
            $tag = $1;

            ### Search forward for the matching closing tag.  The index runs
            ### upward here, so "not found" means it walked off the END of
            ### the list (a "< 0" test could never fire).
            $$words[$w] =~ s/<${tag}[^>]+>//i;
            for ($rw=$w; $rw <= $#$words; $rw++){
                last if ($$words[$rw] =~ s/<\/${tag}>//);
            }
            die "Error: internal failure" if ($rw > $#$words);
        }
    }
}

### set_state(*state_var, $text, $type)
###
### Updates the "outside a named-entity tag" flag for one side of the
### alignment after seeing the word $text.
###   *state_var  typeglob of the caller's flag (1 = outside, 0 = inside);
###               it is aliased to *state so writing $state updates it
###   $text       the word, possibly carrying timex/numex/enamex tags
###   $type       label used only in error messages ("ref_state"/"hyp_state")
### Dies on a tag that contradicts the current flag ("sync error") and on
### any other text containing '<' or '>' ("Poorly handled case").
### The sync-error messages quote the global $p[$i].
sub set_state{
    local *state = $_[0];
    my ($text, $type) = @_[1, 2];

    ### Classify the word once, up front; the three shapes exclude each other
    my $lone_open  = ($text =~ /^[^<>]*<(timex|numex|enamex)[^>]+>[^<>]*$/);
    my $lone_close = ($text =~ /^[^<>]*<\/(timex|numex|enamex)>[^<>]*$/);
    my $has_markup = ($text =~ /[<>]/);

    if ($state != 1){
        ### Currently inside a tag: only a lone closing tag is acceptable
        if ($lone_close){
            $state = 1;
        } elsif ($lone_open){
            die "sync error case ${type}_out=0 $p[$i]\n";
        } elsif ($has_markup){
            die "Poorly handled case ${type}_out=0 $text\n";
        }
    } else {
        ### Currently outside: a lone opening tag enters, a complete
        ### open+close pair within the word leaves the flag at 1
        if ($lone_open){
            $state = 0;
        } elsif ($lone_close){
            die "sync error ${type}_out=1 $p[$i]\n";
        } elsif ($text =~ /^[^<>]*<(timex|numex|enamex)[^>]+>[^<>]+<\/(timex|numex|enamex)>[^<>]*$/){
            $state = 1;
        } elsif ($has_markup){
            die "Poorly handled case ${type}_out=1 $text\n";
        }
    }
}
