#! /usr/bin/perl

    #	Read a mail folder and compute statistics of
    #	the classification of messages in it by
    #	annoyance-filter.  If a second file name is
    #	supplied after the required mail folder name,
    #	a CSV log will be written with a record for
    #	each message classified containing the following
    #	fields:
    #
    #	    0	Classification (Mail, Junk, or Indeterminate)
    #	    1	Junk probability
    #	    2	From header line
    #	    3	To header line
    #	    4	Subject header line
    #	    5	Date header line
    #
    #	The header line fields are all quoted, with embedded
    #	quotes doubled to force them.  Control characters
    #	in header fields which might confuse CSV parsing
    #	are deleted.
        
    #	by John Walker  http://www.fourmilab.ch/
    #	    	   September 2002

    $acmd = "./annoyance-filter --read dict.bin --trans - --test - |" .
    	"egrep '^X-Annoyance-Filter' | head -2 >score.tmp";

    if ($#ARGV < 0) {
    	print("Usage: perl testfolder.pl mail_folder_file [CSV _log_file]\n");
	exit(2);
    }
    
    $totalMail = 0;
    $totalJunk = 0;
    $totalMessages = 0;
    $sumProbability = 0;
    
    $writeLog = 0;
    
    $inf = $ARGV[0];
    if ($inf =~ m/\.gz$/) {
    	open(IN, "zcat $inf |") || die "Cannot open gzipped input file $inf";
    } else {
    	open(IN, "<$inf") || die "Cannot open input file $inf";
    }
    
    if ($#ARGV > 0) {
    	open(CSV, ">$ARGV[1]") || die "Cannot create CSV log file $ARGV[1]";
	$writeLog = 1;
    }

    while (($l = <IN>)) {
    	if ($l =~ m/^From /) {
    	    last;
	}
    }
    
    if (!$l) {
    	print("No messages in mail folder!\n");
	exit(0);
    }
    
    $eof = 0;
    $pattern = '';  	    	# Initialise search pattern
    
    while (!$eof) {
    
    	#   Read next message from mail folder.  At
	#   this point $l contains the first ("From ")
	#   line of the message.
	
	$nlines = 0;
	undef @message;
	$msize = length($l);
	&trim_end_of_line;
	$message[$nlines++] = $l;
	
	#   Read the balance of the message into the
	#   @message array.  Quit when the "From " line
	#   of the next message is encountered or the
	#   end of the folder is encountered.
	
	while ($l = <IN>) {
	    if ($l =~ m/^From /) {
		last;
	    } else {
	    	&trim_end_of_line;
	    	$message[$nlines++] = $l;
	    	$msize += length($l);
    	    }
	}
	if ($nlines > 0) {
	    &dispose_of_message;
	}
	$eof = !($l);
# if ($bail++ > 20) { $eof = 1; }  # Quick bail-out for testing
    }
    
    if ($writeLog) {
    	close(CSV);
    }
    
    #	Compute and display aggregate statistics
    
    print("Total messages: $totalMessages\n");
    print("    Mail: $totalMail\n");
    print("    Junk: $totalJunk\n");
    printf("Mean junk probability: %.4f\n", $sumProbability / $totalMessages);

#   Dispose of the message in the @message
#   array.
    
sub dispose_of_message {
    local($i, $from, $to, $subject, $disp,
    	  $afrom, $act, $score, $class);
    
    #	Parse message header for "interesting" items
    
    for ($i = 0; $i < $nlines; $i++) {
    	if ((length($message[$i]) == 0) ||
	    ($message[$i] =~ m/^\s*$/)) {
#print("Bail on line $i\n");
	    last;
	}
	
	if ($message[$i] =~ m/^From\s/) {
	    if (!($message[$i] =~ m/^From\s+-\s+/)) {
	    	$from = $message[$i];
	    } else {
	    	$afrom = $message[$i];
	    }
	}
	elsif ((!defined($from)) && ($message[$i] =~ m/^From:\s/)) {
	    $from = $message[$i];
	}
	elsif ($message[$i] =~ m/^To:\s/) {
	    $to = $message[$i];
	}
	elsif ($message[$i] =~ m/^Subject:\s/) {
	    $subject = $message[$i];
	}
	elsif ($message[$i] =~ m/^Date:\s/) {
	    $date = $message[$i];
	}
    }
    if (!defined($from)) {
    	$from = $afrom;
    }
    
    #	Open a pipe to annoyance-filter and feed the message
    #	to it.  Since we cannot read the output from
    #	the run via a pipe, it's redirected to a temporary
    #	file from which we snarf it after closing the pipe.

    open(OF, "|$acmd") || die "Cannot create pipeline $acmd";
    for ($i = 0; $i < $nlines; $i++) {
    	print(OF "$message[$i]\n");
    }
    close(OF);
    
    $score = `grep Probability score.tmp`;
    $score =~ s/(^\S+: )(\S+)//;
    $score = $2;
    
    $class = `grep Classification score.tmp`;
    $class =~ s/(^\S+: )(\S+)//;
    $class = $2;
    system("rm -f score.tmp");
    
    if ($writeLog) {
    	print(CSV "$class,$score," .
	    	&csvQuote($from) . "," .
	    	&csvQuote($to) . "," .
	    	&csvQuote($subject) . "," .
	    	&csvQuote($date) . "\n");
    }
    
print("$subject\n");
print("Score: $score $class\n");
    $totalMessages++;
    $sumProbability += $score;
    if ($class eq 'Mail') {
    	$totalMail++;
    } else {
    	$totalJunk++;
    }
}

#   Trim end of line sequences from line

sub trim_end_of_line {
    $l =~ s/(.*)([\r\n]+)$/\1/;
}

#   Quote string for output in CSV

sub csvQuote {
    local($s) = @_[0];
    
    $s =~ s/"/""/g;
    $s =~ tr/\t\r\n//d;
    return "\"$s\"";
}
