# RCS: @(#) $Id: sclite.tcl,v 1.2 2003/08/26 10:28:03 barras Exp $

# sclite.tcl - extension of the Transcriber program
# Copyright (C) 2001, LIMSI

# This conversion filter for Transcriber was developed in the framework
# of the European project CORETEX (http://coretex.itc.it/)

# It takes as input a .sgml file generated by the NIST Speech Recognition
# Scoring Toolkit 'sclite' (see http://www.nist.gov/speech/tools/)
# Basically, this file contains the result of the alignment between a
# reference ".stm" transcription and an hypothesized ".ctm" automatic
# word transcription.

# When such a file is open in Transcriber, both levels (reference
# .stm and hypothesized .ctm) are shown under the signal.
# Matching segments are in white, substitution in grey,
# segments with lot of omissions or insersions in red.
# (this may be changed below)
# The reference .stm transcription is broken where errors start
# or stop, hypothesized .ctm remains time aligned at word level.

# Whenever a .trs transcription with the same basename as the
# file is found, it is open in the text editor (it may be the
# reference transcription from which the .stm was generated).
# Else the .stm is displayed (it should be in the same directory
# as the .sgml alignment).

# It is important to remember that the .stm is only time aligned
# at segment boundaries, and that word alignment between .stm and
# .ctm performed by 'sclite' within these segment boundaries with
# dynamic programmation may result in a wrong temporal position for
# some words of the reference.
# Also, omited segments from the reference absolutely lack of any
# time boundary, so they are inserted between previous and next
# segment with an arbitrary duration (% of both neighbours).

# INSTALLATION
#
# Just put the file into the 'convert' directory to be found
# for Unix at
#  $(INSTALL_PATH)/lib/transcriber1.4/convert/
# or for Windows typically (to be checked)
#  C:\Program Files\DGA & LDC\Transcriber\convert

# USE:
#
# Under Unix, the tool may be launched with e.g.:
#  %  trans scoring/sample.ctm.filt.sgml signal/sample.wav
#
# NB - if the signal is in the same directory as the .sgml and shares
# the same basename as the .sgml (until first dot) then it may be omited.


# colors for correct/substitutions/insertions/deletions
set ::v(colorCor) "white"
set ::v(colorSub) "grey75"
set ::v(colorIns) "#e78e86"
set ::v(colorDel) "#e78e86"

namespace eval ::convert::sclite {

  variable msg "NIST sclite .sgml alignment"
  variable ext ".sgml"

  proc guess {name} {
    variable ext
    set filext [string tolower [file extension $name]]
    set f [open $name]; set magic [read $f 8]; close $f
    if {$magic == "<SYSTEM " && [lsearch $ext $filext]>=0} {
      return 1
    } else {
      return 0
    }
  }

  # look for .stm (or .stm.norm) in same dir as .sgml or .wav then in their sibling paths 
  proc lookForFiles {name {exts {stm stm.norm}}} {
    global v

    set file_id [file root [file tail $v(sig,name)]]
    set file_id2 [lindex [split [file tail $name] .] 0]
    foreach ext $exts {
      foreach base [list $file_id $file_id2] {
	foreach path [list [file dir $name] [file dir $v(sig,name)] [file dir [file dir $name]]/* [file dir [file dir $v(sig,name)]]/*] {
	  
	  if {![catch {
	    set stm [lindex [glob $path/$base.$ext] 0]
	  }] && [file exists $stm]} {
	    return $stm
	  }
	}
      }
    }
    return ""
  }

  proc import {name} {
    global v

    LookForSignal [file dir $name]/[lindex [split [file tail $name] .] 0] ""
    set segmt [readSegmt [ReadFile $name]]
    
    # Try to open .trs file in main editor, else .stm
    set trs [lookForFiles $name {trs xml}]
    set stm [lookForFiles $name {stm stm.norm}]
    if {$trs != ""} {
      ::trs::import $trs
    } elseif {$stm != ""} {
      ::convert::stm::import $stm
    } else {
      SegmtToTrans $segmt
    }
    # Try to open partitioning infos (LIMSI specific part)
    catch {
      set pem [ReadFile [lookForFiles $name {pem3+om.lbl}]]
      ::convert::pem3om::readSegmt $pem
      set v(trans,speaker) [union $v(trans,speaker) $v(trans,background)]
    }

    # Hide unused levels
    foreach lvl {seg2  "gender" "bandwidth" "background"} {
      foreach s [array names v view,*.$lvl] {
	set v($s) 0
      }
    }
    catch {
      foreach w $v(wavfm,list) {
	SwitchSegmtView $w
      }
    }
  }
  
  proc readSegmt {content} {
    global v

    set file_id ""
    if {$v(sig,name) != ""} {
      set file_id [file root [file tail $v(sig,name)]]
      regsub -all -- "-" $file_id "_" file_id
    }
    set segmt {}
    set ref {}
    set hyp {}
    set t0 0
    set t3 0
    foreach line [split $content "\n"] {
      if {[anaTag $line type kind atts]} {
	if {$kind != "close" && $type == "PATH"} {
	  array set arr $atts
	  set id $arr(file)
	  set t0 $arr(R_T1)
	  set t3 $arr(R_T2)
	  unset arr
	  #puts "$id $t0 $t3"
	}
      } else {
	#if {$file_id !="" && $id != $file_id} continue
	#set file_id $id
	set errors {}
	set ndel [set nsub [set nins 0]]
	set lastcor $t0
	while {[regexp "^(\[CSDI]),(\"(\[^\"]*)\")?,(\"(\[^\"]*)\")?,((\[0-9.]+)\\+(\[0-9.]+))?(:(.*))?$" $line all type bid1 w1 bid2 w2 bid3 t1 t2 bid4 line]} {
	  #puts stderr "$type $w1 $w2 $t1 $t2"
	  if {$t1 == 0 && $t2 == 0} continue
	  set t1bis $t1
	  switch $type {
	    "C" {
	      # previous reference errors
	      if {$errors != {}} {
		#puts "t1 $t1 t2 $t2 lastcor $lastcor t1bis $t1bis errors $errors"
		if {$lastcor < $t1} {
		  set t1bis $t1
		} else {
		  set n [expr [llength $ref] - 1]
		  set preseg [lindex $ref $n]
		  set pret1 [lindex $preseg 1]
		  if {$n >= 0 && $pret1 == $t1} {
		    set pret0 [lindex $preseg 0]
		    catch {
		      set pret0 [max $pret0 [lindex [lindex $hyp end] 0]]
		    }
		    set lastcor [expr 0.33*$pret0+0.67*$pret1]
		    set t1bis [expr 0.67*$t1+0.33*$t2]
		    #puts "$pret0 $t0 $pret1 $t1 $t1bis $t2"
		    set ref [lreplace $ref $n $n [lreplace $preseg 1 1 $lastcor]]
		  } else {
		    set lastcor $t1
		    set t1bis [expr ($t1+$t2)/2]
		  }
		}
		if {$ndel > 2*$nsub+$nins} {
		  set color $v(colorDel)
		} else {
		  set color $v(colorSub)
		}
		lappend ref [list $lastcor $t1bis $errors $color]
		set errors {}
		set ndel [set nsub [set nins 0]]
		lappend ref [list $t1bis $t2 $w1 $v(colorCor)]
	      } else {
		set n [expr [llength $ref] - 1]
		set preseg [lindex $ref $n]
		foreach {pret0 pret1 txt col} $preseg break
		if {$n >= 0 && $t1-$pret1 < 0.1 && $col == $v(colorCor)} {
		  lappend txt $w1
		  set preseg [list $pret0 $t2 $txt $col]		  
		  set ref [lreplace $ref $n $n $preseg]
		} else {
		  lappend ref [list $t1 $t2 $w1 $v(colorCor)]
		}
	      }
	      set lastcor $t2
	      #lappend ref [list $t1bis $t2 $w1 $v(colorCor)]
	      lappend hyp [list $t1 $t2 $w2 $v(colorCor)]
	      #lappend segmt [list $t1 $t2 $w2]
	    }
	    "S" {
	      lappend errors $w1
	      incr nsub
	      #lappend ref [list $t1bis $t2 $w1 $v(colorSub)]
	      lappend hyp [list $t1 $t2 $w2 $v(colorSub)]
	      #lappend segmt [list $t1 $t2 "$w1 / $w2"]
	    }
	    "D" {
	      lappend errors $w1
	      incr ndel
	    }
	    "I" {
	      incr nins
	      lappend hyp [list $t1 $t2 $w2 $v(colorSub)]
	      #lappend segmt [list $t1 $t2 " / $w2"]
	    }
	  }
	  if {$t2 != ""} {
	    set t0 $t2
	    set oldt1 $t1
	  }
	}
	set line [string trim $line]
	if {$line != ""} {
	  puts stderr "Sclite file warning: unparsed \"$line\""
	}
	if {$errors != {}} {
	  if {$t3 <= $lastcor} {
	    #puts "$lastcor $t0 $t3 $errors"
	    set t3 $lastcor
 	    set n [expr [llength $ref] - 1]
 	    set preseg [lindex $ref $n]
 	    set pret1 [lindex $preseg 1]
 	    if {$n >= 0 && $pret1 == $lastcor} {
 	      set pret0 [lindex $preseg 0]
 	      set lastcor [expr ($pret0+$pret1)/2]
 	      set ref [lreplace $ref $n $n [lreplace $preseg 1 1 $lastcor]]
	    }
 	  }
	  if {$ndel > 2*$nsub+$nins} {
	    set color $v(colorDel)
	  } else {
	    set color $v(colorSub)
	  }
	  lappend ref [list $lastcor $t3 $errors $color]
	  set errors {}
	  set ndel [set nsub [set nins 0]]
	}
      }
    }
    #puts stderr [join $hyp \n]
    # sort segments
    foreach seg {ref hyp segmt} {
      set $seg [lsort -real -index 0 [set $seg]]
    }
    #puts $ref
    # display (only in the interactive case)
    if {[info command LookForSignal] != ""} {
      global v
      #puts $file_id
      if {$v(sig,name) == ""} {LookForSignal $file_id ""}
      foreach seg {ref hyp} {
	if {![info exists $seg] || [llength $seg] == 0} continue
	set v(trans,$seg) [set $seg]
	foreach wavfm $v(wavfm,list) {
	  CreateSegmentWidget $wavfm $seg -full white
	}
      }
    }
    set segmt $hyp
    return $segmt
  }


# returns 1 if $line can be parsed as a tag, 0 else.
# if yes, returns type of tag , kind of tag (open/close/empty)
# and list of attribute name/values in following variables
#
# An error is raised if the line can not be parsed correctily
# but contains a "<"

proc anaTag {line varType varKind varAtts} {
  upvar $varType type
  upvar $varKind kind
  upvar $varAtts listAtts
  set Name {[a-zA-Z_:][a-zA-Z0-9._:-]*}
  set S "\[ \n\r\t\]"
  set Eq "$S*=$S*"
  #set AttValue "(\"\[^%&\"]*\"|'\[^%&']*')"
  # Relaxed match with unquotted values for SGML case - but exclude entities
  set AttValue "(\"\[^%&\"]*\"|'\[^%&']*'|\[^%&'\" \n\r\t<>]*)"
  set Attribute "($Name)$Eq$AttValue"
  if {[regexp "^$S*<(/?)($Name)(($S+$Attribute)*)$S*(/?)>$S*\$" $line \
	   all start type atts att1 nam1 val1 end]} {
    if {$start != ""} {
      if {$end != "" || $atts != ""} {
	error "can not parse tag from $line"
      }
      set kind "close"
    } else {
      set listAtts {}
      while {[regexp "^$S+${Attribute}(.*)" $atts match name val atts]} {
	set c [string index $val 0]
	if {$c == "'" || $c == "\""} {
	  set val [string range $val 1 [expr [string length $val]-2]] 
	}
	lappend listAtts $name $val
      }
      if {$end != ""} {
	set kind "empty"
      } else {
	set kind "open"
      }
    }
    return 1
  } else {
    if {[string first < $line] >= 0} {
      #error "can not parse tag from $line"
    }
    return 0
  }
}

}
