package Lire::AsciiDlf::GroupOp;

use strict;

use base qw/ Lire::AsciiDlf::ReportOperator /;

use Carp;

use Lire::Aggregate;
use Lire::DataTypes qw/ format_numeric_type /;

use Lire::Sum;
use Lire::Average;
use Lire::Min;
use Lire::Max;
use Lire::First;
use Lire::Last;
use Lire::Count;

#------------------------------------------------------------------------
# Method init_merge( $period_start, $period_end )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Default implementation: unpacks its arguments and does nothing else.
# Several packages later in this file (SimpleStat, Avg, First, Last)
# override it to do per-merge setup.
sub init_merge {
    # There is no explicit return, so the sub yields whatever this list
    # assignment evaluates to in the caller's context.
    my ( $self, $period_start, $period_end ) = @_;
}

#------------------------------------------------------------------------
# Method end_merge()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Nothing needs finalizing by default: returns an empty list (undef in
# scalar context).
sub end_merge {
    return;
}

#------------------------------------------------------------------------
# Method init_group_data()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# OPTIMIZATION OF THE DATA STRUCTURE
#
# Scalar refs take less space than array refs, which take less space
# than hash refs. We use only scalar or array refs.
#
# As a side effect, we also get a small speed improvement:
# perl -MBenchmark=cmpthese -e '
#   my $dummy = 0;
#   my $scalar = \$dummy;
#   my $array = [0]; 
#   my $hash = { 'value' => 0};
#   cmpthese( 500000, {
#		'SCALAR' => sub { $$scalar++ },
#		'ARRAY'  => sub { $array->[0]++ },
#		'HASH'   => sub { $hash->{'value'}++ },
#	     });
# Benchmark: timing 500000 iterations of ARRAY, HASH, SCALAR...
#     ARRAY:  0 wallclock secs ( 0.57 usr +  0.00 sys =  0.57 CPU) @ 877192.98/s (n=500000)
#     HASH:  1 wallclock secs ( 0.67 usr +  0.00 sys =  0.67 CPU) @ 746268.66/s (n=500000)
#    SCALAR:  1 wallclock secs ( 0.41 usr +  0.00 sys =  0.41 CPU) @ 1219512.20/s (n=500000)
#             Rate   HASH  ARRAY SCALAR
# HASH    746269/s     --   -15%   -39%
# ARRAY   877193/s    18%     --   -28%
# SCALAR 1219512/s    63%    39%     --
#
# The default implementation uses a scalar reference
sub init_group_data {
    # A fresh lexical on every call, so each group owns its own counter.
    my $counter = 0;

    return \$counter;
}

#------------------------------------------------------------------------
# Method end_group_data( $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# By default the group data needs no post-processing: $data is left
# untouched and an empty list (undef in scalar context) is returned.
sub end_group_data {
    return;
}

#------------------------------------------------------------------------
# Method add_entry_value( $entry, $data )
#
# Converts $data with data2dlf(), builds a value from the result with
# create_value() and adds that value to $entry. Returns nothing.
sub add_entry_value {
    my ( $self, $entry, $data ) = @_;

    # Both calls are made in list context, the group first.
    my @value_args = ( $entry->group(), $self->data2dlf( $data ) );
    my $value = $self->create_value( @value_args );

    # The value is dereferenced as a hash and passed on as a flat
    # key/value list.
    $entry->add_value( %{$value} );

    return;
}

# Method used by Lire::AsciiDlf::Aggregator:: to dispatch to
# Lire::*::create_value()
#
# add_entry_value() passes the result on to create_value(). This
# fallback always croaks, naming the class of the invocant, so every
# concrete operator has to provide its own version.
sub data2dlf {
    my ( $self ) = @_;

    croak "Unimplemented data2dlf() in ", ref $self;
}

#------------------------------------------------------------------------
# missing_cases( $data )
#
# Returns the number of missing-cases encountered while processing
# $data, or 0 when none were recorded. The counts are read from
# $self->{'_missing_cases'}, keyed by $data itself (that is, by the
# stringified reference).
sub missing_cases {
    # The arguments must be unpacked from @_: without that, $self and
    # $data are both undef and the method can only ever answer 0.
    my ( $self, $data ) = @_;

    # Look at the container first so that the per-data lookup cannot
    # autovivify $self->{'_missing_cases'}.
    my $counts = $self->{'_missing_cases'};
    return 0 unless defined $counts && defined $counts->{$data};

    return $counts->{$data};
}

package Lire::AsciiDlf::GroupOp::Count;

use base qw/ Lire::AsciiDlf::GroupOp Lire::Count /;

use Lire::DataTypes qw/ format_numeric_type /;
use Lire::Utils qw/ratio100/;
use Carp;

#------------------------------------------------------------------------
# Method init_group_data()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# With a key_maker the group data is an array reference holding the
# count (slot 0) and an empty hash (slot 1); without one it is a
# reference to a plain counter.
sub init_group_data {
    my ( $self ) = @_;

    return [ 0, {} ] if $self->{'key_maker'};

    my $count = 0;
    return \$count;
}

#------------------------------------------------------------------------
# Method merge_group_data( $value, $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Adds $value->{'value'} to the count held in $data and returns the
# new count.
sub merge_group_data {
    my ( $self, $value, $data ) = @_;

    # FIXME: We should add a note when a key_maker was used. For
    # example, there is no guarantee that the sum of the number of
    # hosts that visited during two reports is the same as the
    # number of hosts calculated over the period spanned by the two
    # reports.

    # To merge two counts, we simply add them
    my $count = $value->{'value'};

    # init_group_data() hands out [ count, {} ] when a key_maker is
    # set, and data2dlf() reads the count from slot 0 in that case, so
    # the count has to be updated in the same place.
    return $data->[0] += $count if $self->{'key_maker'};

    return $$data += $count;
}

#------------------------------------------------------------------------
# Method end_group_data( $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# When a key_maker is in use, the second slot of $data is reset to
# undef; the count in slot 0 is kept. Otherwise $data is untouched.
sub end_group_data {
    my ( $self, $data ) = @_;

    if ( $self->{'key_maker'} ) {
        $data->[1] = undef;
    }
}

# Implements Lire::AsciiDlf::GroupOp::data2dlf()
#
# Returns a hash reference with the count stored under the operator's
# name and the missing-cases count under "_lr_<name>_mc". The count is
# read from slot 0 of $data when a key_maker is set, and from the
# scalar that $data points to otherwise.
sub data2dlf {
    my ( $self, $data ) = @_;

    my $name  = $self->name();
    my $count = $self->{'key_maker'} ? $data->[0] : $$data;

    my %dlf = (
        $name            => $count,
        "_lr_${name}_mc" => $self->missing_cases( $data ),
    );

    return \%dlf;
}

package Lire::AsciiDlf::GroupOp::SimpleStat;

use base qw/ Lire::AsciiDlf::GroupOp /;

use Lire::DataTypes qw/ format_numeric_type /;

#------------------------------------------------------------------------
# Method init_merge( $period_start, $period_end)
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Looks up the operator's field in the report specification's schema
# and caches its type in $self->{'field_type'}. The period arguments
# are not used here. Returns $self.
sub init_merge {
    my ( $self, $period_start, $period_end ) = @_;

    my $schema = $self->{'report_spec'}->schema();
    $self->{'field_type'} = $schema->field( $self->field )->type;

    return $self;
}

# Implements Lire::AsciiDlf::GroupOp::data2dlf()
#
# Returns a hash reference with the statistic (the scalar that $data
# points to) under the operator's name and the missing-cases count
# under "_lr_<name>_mc".
sub data2dlf {
    my ( $self, $data ) = @_;

    my $name = $self->name();
    my @pairs = (
        $name            => $$data,
        "_lr_${name}_mc" => $self->missing_cases( $data ),
    );

    return +{ @pairs };
}

package Lire::AsciiDlf::GroupOp::Sum;

use base qw/ Lire::AsciiDlf::GroupOp::SimpleStat Lire::Sum /;

use Lire::DataTypes qw/format_numeric_type/;
use Lire::Utils qw/ratio100/;

#------------------------------------------------------------------------
# Method merge_group_data( $value, $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# The merged sum is the sum of the partial sums: $value->{'value'} is
# added to the scalar that $data points to, and the new total is
# returned.
sub merge_group_data {
    my ( $self, $value, $data ) = @_;

    my $partial = $value->{'value'};

    return $$data += $partial;
}

# Implements Lire::AsciiDlf::GroupOp::data2dlf()
#
# Returns a hash reference with the sum (the scalar that $data points
# to) under the operator's name and the missing-cases count under
# "_lr_<name>_mc".
sub data2dlf {
    my ( $self, $data ) = @_;

    my $name = $self->name();
    my %dlf  = (
        $name            => ${$data},
        "_lr_${name}_mc" => $self->missing_cases( $data ),
    );

    return \%dlf;
}

package Lire::AsciiDlf::GroupOp::Avg;

use base qw/ Lire::AsciiDlf::GroupOp Lire::Average /;

use Lire::AsciiDlf::ReportOperator qw/group_data_value/;
use Lire::DataTypes qw/ is_numeric_type format_numeric_type/;

use Carp;

#------------------------------------------------------------------------
# Method init_merge( $period_start, $period_end )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Creates the two helper operators (sum_op and n_op) and initializes
# both of them for the same period. Returns $self.
sub init_merge {
    my ( $self, $period_start, $period_end ) = @_;

    $self->create_avg_ops;

    my @period = ( $period_start, $period_end );
    $self->{'sum_op'}->init_merge( @period );
    $self->{'n_op'}->init_merge( @period );

    return $self;
}

#------------------------------------------------------------------------
# Method create_avg_ops()
#
# Initialize n_op and sum_op
#
# sum_op (the numerator) is a Sum over the operator's field when that
# field has a numeric type, a Count over that field when it does not,
# and a plain Count when no field is set. When a field is set, its type
# is also cached in $self->{'field_type'}.
#
# n_op (the denominator) is a Sum when there is exactly one by-field
# and it is numeric, and a Count over the by-fields otherwise.
#
# The helpers are built with Class->new( ... ) rather than the
# indirect-object form "new Class( ... )", which perl can misparse.
sub create_avg_ops {
    my ( $self ) = @_;

    my %common = (
        'parent'      => $self->parent,
        'report_spec' => $self->report_spec,
    );

    if ( $self->field ) {
        my $f = $self->report_spec->schema->field( $self->field );
        if ( is_numeric_type( $f->type ) ) {
            # Use a sum
            $self->{'sum_op'} = Lire::AsciiDlf::GroupOp::Sum->new(
                %common,
                'name'   => $self->name . ".sum",
                'field'  => $self->field,
                'weight' => $self->weight,
            );
        } else {
            # Use a counter
            $self->{'sum_op'} = Lire::AsciiDlf::GroupOp::Count->new(
                %common,
                'fields' => [ $self->field ],
                'name'   => $self->name . ".sum",
            );
        }
        $self->{'field_type'} = $f->type;
    } else {
        # Use a simple count operation
        $self->{'sum_op'} = Lire::AsciiDlf::GroupOp::Count->new(
            %common,
            'name' => $self->name . ".sum",
        );
    }

    # Special case for a numerical field. This enables
    # to compute arbitrary ratio.
    # FIXME: A new operator ratio operator should be defined
    if ( $self->by_fields && @{$self->by_fields} == 1 &&
         is_numeric_type( $self->report_spec->schema->field( $self->by_fields->[0])->type))
    {
        $self->{'n_op'} = Lire::AsciiDlf::GroupOp::Sum->new(
            %common,
            'field' => $self->by_fields->[0],
            'name'  => $self->name . ".n",
        );
    } else {
        $self->{'n_op'} = Lire::AsciiDlf::GroupOp::Count->new(
            %common,
            'fields' => $self->by_fields,
            'name'   => $self->name . ".n",
        );
    }
}

#------------------------------------------------------------------------
# Method init_group_data()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Returns a three-slot array reference: the result (initially "NaN"),
# then the group data of sum_op, then the group data of n_op.
sub init_group_data {
    my ( $self ) = @_;

    my @slots = (
        "NaN",    # result is held in the first element
        $self->{'sum_op'}->init_group_data,
        $self->{'n_op'}->init_group_data,
    );

    return \@slots;
}

#------------------------------------------------------------------------
# Method merge_group_data( $value, $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Feeds $value->{'total'} to sum_op (group data in slot 1) and
# $value->{'n'} to n_op (group data in slot 2). The result slot is not
# touched here. Returns whatever the n_op merge returns.
sub merge_group_data {
    my ( $self, $value, $data ) = @_;

    # The helper operators expect their input under the 'value' key.
    my %sum_part = ( value => $value->{'total'} );
    my %n_part   = ( value => $value->{'n'} );

    $self->{'sum_op'}->merge_group_data( \%sum_part, $data->[1] );

    return $self->{'n_op'}->merge_group_data( \%n_part, $data->[2] );
}

#------------------------------------------------------------------------
# Method end_group_data( $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Finalizes both helper operators, then stores sum / n, formatted with
# two decimals, in slot 0. When n is false (zero or undefined) slot 0 is
# left as it was.
sub end_group_data {
    my ( $self, $data ) = @_;

    $self->{'sum_op'}->end_group_data( $data->[1] );
    $self->{'n_op'}->end_group_data( $data->[2] );

    my $total = group_data_value( $data->[1] );
    my $cases = group_data_value( $data->[2] );

    # Guard against dividing by zero.
    $data->[0] = sprintf( "%.2f", $total / $cases )
      if $cases;
}

#------------------------------------------------------------------------
# Method end_merge()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Ends the merge on both helper operators, sum_op first. Returns
# whatever the n_op call returns.
sub end_merge {
    my ( $self ) = @_;

    $self->{'sum_op'}->end_merge();

    return $self->{'n_op'}->end_merge();
}

# Implements Lire::AsciiDlf::GroupOp::data2dlf()
#
# Returns a hash reference holding the average under the operator's
# name, the values of the two helper operators under "<name>_total" and
# "<name>_n", and the missing-cases count under "_lr_<name>_mc".
sub data2dlf {
    my ( $self, $data ) = @_;

    my $name = $self->name();
    my %dlf  = (
        $name            => $data->[0],
        "${name}_total"  => group_data_value( $data->[1] ),
        "${name}_n"      => group_data_value( $data->[2] ),
        "_lr_${name}_mc" => $self->missing_cases( $data ),
    );

    return \%dlf;
}


package Lire::AsciiDlf::GroupOp::Min;

use base qw/ Lire::AsciiDlf::GroupOp::SimpleStat Lire::Min /;

#------------------------------------------------------------------------
# Method init_group_data()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Returns a reference to a fresh, still undefined scalar: no minimum
# is known until the first value has been merged.
sub init_group_data {
    my $minimum;

    return \$minimum;
}

#------------------------------------------------------------------------
# Method merge_group_data( $value, $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Keeps the lowest of the merged values in the scalar that $data points
# to. A value of "NaN" (in any letter case) is skipped.
sub merge_group_data {
    my ( $self, $value, $data ) = @_;

    my $candidate = $value->{'value'};
    return if lc( $candidate ) eq 'nan';

    # The first real value becomes the minimum.
    if ( !defined $$data ) {
        $$data = $candidate;
    }

    # To merge two mins, we keep the lowest
    if ( $candidate < $$data ) {
        $$data = $candidate;
    }
}

#------------------------------------------------------------------------
# Method end_group_data( $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# A group that never received a value reports "NaN" as its minimum.
sub end_group_data {
    my ( $self, $data ) = @_;

    unless ( defined ${$data} ) {
        ${$data} = "NaN";
    }
}

package Lire::AsciiDlf::GroupOp::Max;

use base qw/ Lire::AsciiDlf::GroupOp::SimpleStat Lire::Max  /;

#------------------------------------------------------------------------
# Method init_group_data()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Returns a reference to a fresh, still undefined scalar: no maximum
# is known until the first value has been merged.
sub init_group_data {
    my $maximum;

    return \$maximum;
}

#------------------------------------------------------------------------
# Method merge_group_data( $value, $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Keeps the highest of the merged values in the scalar that $data
# points to. A value of "NaN" (in any letter case) is skipped.
sub merge_group_data {
    my ( $self, $value, $data ) = @_;

    my $candidate = $value->{'value'};
    return if lc( $candidate ) eq 'nan';

    # The first real value becomes the maximum.
    $$data = $candidate if !defined $$data;

    # To merge two max, we keep the highest
    if ( $candidate > $$data ) {
        $$data = $candidate;
    }
}

#------------------------------------------------------------------------
# Method end_group_data( $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# A group that never received a value reports "NaN" as its maximum.
sub end_group_data {
    my ( $self, $data ) = @_;

    unless ( defined ${$data} ) {
        ${$data} = "NaN";
    }
}

package Lire::AsciiDlf::GroupOp::First;

use base qw/ Lire::AsciiDlf::GroupOp::SimpleStat Lire::First /;

use Lire::DataTypes qw/ is_numeric_type format_numeric_type /;

use Carp;

#------------------------------------------------------------------------
# Method init_merge( $period_start, $period_end )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Runs the inherited initialization, then installs
# $self->{'sort_func'}: a code reference taking two array references
# of sort-key values (one value per sort field, in sort-field order)
# and returning a negative, zero or positive number like <=> and cmp
# do. Numeric fields are compared with <=>, all others with cmp. When
# no sort fields are configured, the schema's timestamp field is used.
# Finally calls init_common(). Returns $self.
sub init_merge {
    my ( $self, $period_start, $period_end ) = @_;

    $self->SUPER::init_merge( $period_start, $period_end );

    my $sort_fields = $self->sort_fields;
    unless ( $sort_fields && @$sort_fields ) {
        $sort_fields = [ $self->report_spec->schema->timestamp_field->name ];
    }

    # One flag per sort field: true when the field compares numerically.
    my @is_numeric = ();
    foreach my $f ( @$sort_fields ) {
        my $type = $self->report_spec->schema->field( $f )->type;
        push @is_numeric, ( is_numeric_type( $type ) ? 1 : 0 );
    }

    # Multi-key comparison: the first field that differs decides, and
    # later fields only break ties. (Chaining the per-field comparisons
    # with "&&" gets this wrong: a tie on the first field yields 0, and
    # a difference on it is replaced by the next comparison.) A closure
    # does the job without compiling code from a string.
    $self->{'sort_func'} = sub {
        my ( $left, $right ) = @_;

        foreach my $i ( 0 .. $#is_numeric ) {
            my $order = $is_numeric[$i]
              ? $left->[$i] <=> $right->[$i]
              : $left->[$i] cmp $right->[$i];
            return $order if $order;
        }

        return 0;
    };

    $self->init_common;

    return $self;
}

#------------------------------------------------------------------------
# Method init_common()
#
# Initialization common to both init_report() and init_merge()
# (only init_merge() calls it in this file).
#
# Installs $self->{'total_func'}: a code reference that joins, with a
# single space, the elements of the array reference it is given found
# at the schema positions of the sort fields. It returns "" when its
# argument is undefined. When no sort fields are configured, the
# schema's timestamp field is used.
#
# NOTE(review): merge_group_data() stores the already-split list of
# sort-key values in $data->[1], and data2dlf() passes that list to
# this closure, which indexes it by schema position -- confirm that
# this is the intended structure in merge context.
sub init_common {
    my ( $self ) = @_;

    my $sort_fields = $self->sort_fields;
    $sort_fields = [ $self->report_spec->schema->timestamp_field->name ]
      unless $sort_fields && @$sort_fields;

    my @positions = ();
    foreach my $field_name ( @$sort_fields ) {
        push @positions, $self->report_spec->schema->field( $field_name )->pos;
    }

    # Extracts the sort values to save them in the total attribute
    $self->{'total_func'} = sub {
        my ( $record ) = @_;

        return "" unless defined $record;

        return join( " ", map { $record->[$_] } @positions );
    };
}

#------------------------------------------------------------------------
# Method init_group_data()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Returns a new, empty array reference. merge_group_data() fills slot 0
# with the value and slot 1 with its sort-key values.
sub init_group_data {
    return [];
}

#------------------------------------------------------------------------
# Method merge_group_data( $value, $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# $value->{'total'} carries the space-separated sort-key values that
# belong to $value->{'value'}; an empty key means there is nothing to
# merge. The value is stored in $data->[0] and its split key in
# $data->[1] when nothing is stored yet, or when its key sorts before
# the stored one according to $self->{'sort_func'}.
sub merge_group_data {
    my ( $self, $value, $data ) = @_;

    my $packed_key = $value->{'total'};
    return if $packed_key eq "";

    my $fields = [ split / /, $packed_key ];

    if ( !defined $data->[0] ) {
        # Nothing stored yet: this value wins by default.
        $data->[0] = $value->{'value'};
        $data->[1] = $fields;
        return;
    }

    # Change the value only if the fields sort before the stored ones
    my $order = $self->{'sort_func'}->( $fields, $data->[1] );
    if ( $order < 0 ) {
        $data->[0] = $value->{'value'};
        $data->[1] = $fields;
    }
}

# Implements Lire::AsciiDlf::GroupOp::data2dlf()
#
# Returns a hash reference with the selected value ($data->[0]) under
# the operator's name, the result of total_func applied to $data->[1]
# under "<name>_key", and the missing-cases count under
# "_lr_<name>_mc".
sub data2dlf {
    my ( $self, $data ) = @_;

    my $name = $self->name();
    my %dlf  = (
        $name            => $data->[0],
        "${name}_key"    => $self->{'total_func'}->( $data->[1] ),
        "_lr_${name}_mc" => $self->missing_cases( $data ),
    );

    return \%dlf;
}

package Lire::AsciiDlf::GroupOp::Last;

use base qw/ Lire::AsciiDlf::GroupOp::SimpleStat Lire::Last /;

use Lire::DataTypes qw/ is_numeric_type format_numeric_type /;

use Carp;

#------------------------------------------------------------------------
# Method init_merge( $period_start, $period_end )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Runs the inherited initialization, then installs
# $self->{'sort_func'}: a code reference taking two array references
# of sort-key values (one value per sort field, in sort-field order)
# and returning a negative, zero or positive number like <=> and cmp
# do. Numeric fields are compared with <=>, all others with cmp. When
# no sort fields are configured, the schema's timestamp field is used.
# Finally calls init_common(). Returns $self.
sub init_merge {
    my ( $self, $period_start, $period_end ) = @_;

    $self->SUPER::init_merge( $period_start, $period_end );

    my $sort_fields = $self->sort_fields;
    unless ( $sort_fields && @$sort_fields ) {
        $sort_fields = [ $self->report_spec->schema->timestamp_field->name ];
    }

    # One flag per sort field: true when the field compares numerically.
    my @is_numeric = ();
    foreach my $f ( @$sort_fields ) {
        my $type = $self->report_spec->schema->field( $f )->type;
        push @is_numeric, ( is_numeric_type( $type ) ? 1 : 0 );
    }

    # Multi-key comparison: the first field that differs decides, and
    # later fields only break ties. (Chaining the per-field comparisons
    # with "&&" gets this wrong: a tie on the first field yields 0, and
    # a difference on it is replaced by the next comparison.) A closure
    # does the job without compiling code from a string.
    $self->{'sort_func'} = sub {
        my ( $left, $right ) = @_;

        foreach my $i ( 0 .. $#is_numeric ) {
            my $order = $is_numeric[$i]
              ? $left->[$i] <=> $right->[$i]
              : $left->[$i] cmp $right->[$i];
            return $order if $order;
        }

        return 0;
    };

    $self->init_common;

    return $self;
}

#------------------------------------------------------------------------
# Method init_common()
#
# Initialization common to both init_report() and init_merge()
# (only init_merge() calls it in this file).
#
# Installs $self->{'total_func'}: a code reference that joins, with a
# single space, the elements of the array reference it is given found
# at the schema positions of the sort fields. It returns "" when its
# argument is undefined. When no sort fields are configured, the
# schema's timestamp field is used.
#
# NOTE(review): merge_group_data() stores the already-split list of
# sort-key values in $data->[1], and data2dlf() passes that list to
# this closure, which indexes it by schema position -- confirm that
# this is the intended structure in merge context.
sub init_common {
    my ( $self ) = @_;

    my $sort_fields = $self->sort_fields;
    $sort_fields = [ $self->report_spec->schema->timestamp_field->name ]
      unless $sort_fields && @$sort_fields;

    my @positions = ();
    foreach my $field_name ( @$sort_fields ) {
        push @positions, $self->report_spec->schema->field( $field_name )->pos;
    }

    # Extracts the sort values to save them in the total attribute
    $self->{'total_func'} = sub {
        my ( $record ) = @_;

        return "" unless defined $record;

        return join( " ", map { $record->[$_] } @positions );
    };
}

#------------------------------------------------------------------------
# Method init_group_data()
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# Returns a new, empty array reference. merge_group_data() fills slot 0
# with the value and slot 1 with its sort-key values.
sub init_group_data {
    return [];
}

#------------------------------------------------------------------------
# Method merge_group_data( $value, $data )
#
# Method required by Lire::AsciiDlf::ReportOperator
#
# $value->{'total'} carries the space-separated sort-key values that
# belong to $value->{'value'}; an empty key means there is nothing to
# merge. The value is stored in $data->[0] and its split key in
# $data->[1] when nothing is stored yet, or when its key sorts after
# the stored one according to $self->{'sort_func'}.
sub merge_group_data {
    my ( $self, $value, $data ) = @_;

    my $packed_key = $value->{'total'};
    return if $packed_key eq "";

    my $fields = [ split / /, $packed_key ];

    if ( !defined $data->[0] ) {
        # Nothing stored yet: this value wins by default.
        $data->[0] = $value->{'value'};
        $data->[1] = $fields;
        return;
    }

    # Change the value only if the fields sort after the stored ones
    my $order = $self->{'sort_func'}->( $fields, $data->[1] );
    if ( $order > 0 ) {
        $data->[0] = $value->{'value'};
        $data->[1] = $fields;
    }
}

# Implements Lire::AsciiDlf::GroupOp::data2dlf()
#
# Returns a hash reference with the selected value ($data->[0]) under
# the operator's name, the result of total_func applied to $data->[1]
# under "<name>_key", and the missing-cases count under
# "_lr_<name>_mc".
sub data2dlf {
    my ( $self, $data ) = @_;

    my $name = $self->name();
    my %dlf  = (
        $name            => $data->[0],
        "${name}_key"    => $self->{'total_func'}->( $data->[1] ),
        "_lr_${name}_mc" => $self->missing_cases( $data ),
    );

    return \%dlf;
}

# keep perl happy
1;

__END__

=pod

=head1 NAME

Lire::AsciiDlf::GroupOp - group operators (count, sum, avg, min, max, first, last) used when merging reports

=head1 SYNOPSIS


=head1 DESCRIPTION

Reports consist of several periods.  E.g. a weekly report can consist of 2002
week no 23, 2002 week no 24, 2002 week no 25, ..., 2002 week no 52, 2003 week
no 1, 2003 week no 2.

A Timegroup is a hash, which holds an array, which holds a report's data.  (We
omit the complication caused by the fact these things can be nested.)

See Lire::NestableAggregator:

    my $self = bless {
                      'report_spec'   => $report_spec,
                      'element_name'  => $args{'element_name'},
                      'ops'           => [],
                      'op'            => $args{'element_name'},
                      'name'          => $name,
                      field         ...
                      ...
                     }, $class;

Each period's data is contained in a slot in that array in the Timegroup hash.
E.g. week 2002 no 25 is represented by a certain slot.
For example, with a one-month period the array is structured like

  [ sep2002, oct2002, nov2002, dec2002, jan2003, ...]

Such an array is created by the init_group_data() routine.

All operations (like avg, count, sum, etc.) have an init_group_data(),
update_group_data() and end_group_data() method.  These 3 functions are
the high-level interface to GroupOp.

Suppose we're creating a monthly requests-by-day report containing

 <timegroup><count/></timegroup>

In the data structure

 ([ sep2002, oct2002, ...])

sep2002 is the data structure created by the GroupOp::Count operation and which
is used to "count" the sep2002 stats.

If the report was a req-by-user-by-day period, sep2002 would be the data
structure created by the Group::init_group_data() method.

When not optimizing for large data sets, the group's data structure is a hash
instead of an array.  Keys of these hashes are the group's keys.  In such a
case, the values of this hash contain whatever data structure created by the
operator contained in the group element (count in the case of the last
example).

So, the structure boils down to:

 Timegroup: [ sep2002 = { 'user1' => ?, user2 => },
         oct2002 = { 'user1' => ?, user2 => }, etc. ]

Each { 'user1' => ?, user2 => } hash is created by the same Group instance.  The
update_group_data() function will be called with the appropriate data structure
for the DLF record we are currently processing.


A NestableAggregator is an Aggregator that implements the GroupOp interface.
The only non-nestable Aggregator is the Summary element which doesn't do real
aggregation (all the DLF records are aggregated in one "group") whereas the
other aggregators (group, timeslot, timegroup, rangegroup) will aggregate the
DLF records into different groups (based on key (for Group), time period (for
Timegroup), etc.)


=head1 VERSION

$Id: GroupOp.pm,v 1.48 2004/03/26 00:27:34 wsourdeau Exp $

=head1 COPYRIGHT

Copyright (C) 2001, 2002 Stichting LogReport Foundation LogReport@LogReport.org

This file is part of Lire.

Lire is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html or write to the Free Software 
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.

=head1 AUTHOR

Francis J. Lacoste <flacoste@logreport.org>

=cut
