!
!  Robust weighted mean
!
!  Copyright © 2016-7 F.Hroch (hroch@physics.muni.cz)
!
!  This file is part of Munipack.
!
!  Munipack is free software: you can redistribute it and/or modify
!  it under the terms of the GNU General Public License as published by
!  the Free Software Foundation, either version 3 of the License, or
!  (at your option) any later version.
!
!  Munipack is distributed in the hope that it will be useful,
!  but WITHOUT ANY WARRANTY; without even the implied warranty of
!  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!  GNU General Public License for more details.
!
!  You should have received a copy of the GNU General Public License
!  along with Munipack.  If not, see <http://www.gnu.org/licenses/>.
!
!

module weightedmean

  ! This module provides subroutines for estimation of both
  ! robust mean and its deviations.
  !
  ! rmean should be called as:
  !
  !  call rmean(x,dx,t,dt,sig,reliable)
  !
  ! on input:
  !   x - array of data values to be estimated
  !   dx - array of statistical errors of x
  !
  ! on output are estimated:
  !   t - robust mean
  !   dt - standard error
  !   sig - standard deviation (optional)
  !   reliable - indicates reliability of results (optional)
  !
  ! The given results  means that a true value X of the sample x can be,
  ! with 70% probability, found in interval
  !
  !         t - dt  <  X  <  t + dt
  !
  ! and the distribution can be described by Normal distribution N(t,sig).
  ! No studentizing applied.
  !
  !
  ! WARNING
  !
  ! This routine gives unexpected results when input errors in dx
  ! are incorrectly determined. In this case, sig is significantly
  ! different from number one. The reliable flag is set to .false.
  !
  !
  ! Robust estimators has been prepared on base of
  !  * Hogg in Launer, Wilkinson: Robustness in Statistics
  !  * Hubber: Robust Statistics
  !  * my experiences


  implicit none
  private

  ! print debug information ?
  logical, parameter, private :: debug = .false.
  logical, private :: verbose = .false.

  ! numerical precision of real numbers
  integer, parameter, private :: dbl = selected_real_kind(15)

  ! 50% quantil of N(0,1)
  real(dbl), parameter, private :: q50 = 0.6745

  ! private working arrays
  real(dbl), dimension(:), allocatable, private :: r,rs,psi,dpsi,z,dz

  ! implemened in both single and double precision
  interface rwmean
     module procedure rwmean2, rwmean1
  end interface rwmean

  private :: newton, zmean, zerofun, zero_graph
  public :: rwmean, rwmean2, rwmean1, rwinit

contains

  subroutine rwmean2(x,dx,t,dt,sig,reliable,verb)

    use rfun
    use entropyscale

    real(dbl), dimension(:),intent(in) :: x, dx
    real(dbl), intent(out) :: t, dt
    real(dbl), intent(out), optional :: sig
    logical, intent(out), optional :: reliable
    logical, intent(in), optional :: verb

    real(dbl), parameter :: machtol = epsilon(t)
    real(dbl) :: s,smed,s0,t0
    integer :: n
    logical :: converge, reli, treli, sreli, xverbose

    xverbose = verbose
    if( present(verb) ) verbose = verb

    if( size(x) /= size(dx) ) stop 'rwmean: size(x) /= size(dx)'
    if( .not. ( all(dx > 0) ) ) stop 'rwmean: any dx < 0'

    converge = .false.
    dt = -1

    n = size(x)

    if( n == 0 ) then

       t = 0
       dt = 0
       if( present(sig) ) sig = 1
       if( present(reliable) ) reliable = .false.
       return

    else if( n == 1 ) then

       t = x(1)
       dt = dx(1)
       if( present(sig) ) sig = 1
       if( present(reliable) ) reliable = .false.
       return

    else if( n == 2 ) then

       smed = (dx(1) + dx(2)) / 2
       t = (x(1) + x(2)) / 2
       s = abs(x(1) - x(2)) / smed
       dt = s / 1.41

       if( present(sig) ) sig = s
       if( present(reliable) ) reliable = .true.
       return

    end if

    ! initial estimate
    call rwinit(x,dx,t,s,smed)
    if( verbose ) write(*,'(a,1p3g17.7)') 'rwinit:',t,s,smed

    if( abs(s) < 2*epsilon(s)  )then

       ! The power of the robust method can be performed
       ! only on large dataset.

       ! If all the data are identical, results can be considered as reliable.
       reli = sum(abs(x-t)) / n < 10*epsilon(s)

       dt = smed / sqrt(real(n))
       if( present(reliable) ) reliable = reli
       if( present(sig) ) sig = s
       return

    endif
    ! we are suppose that n > 2 from this point

    ! keep initial estimation
    t0 = t
    s0 = s

    ! Allocates working arrays. A memory can be allocated on the fly,
    ! but it may slow-down the computations.
    allocate(r(n),rs(n),psi(n),dpsi(n))

    if( debug ) call zero_graph(x,dx,t,s)

    ! zmean is a solver without derivatives
    call zmean(x,dx,smed,s,t,treli)

    ! The scale parameter estimates average dispersion of residuals
    ! scaled by theirs, a priory known, statistical errors (dx).
    ! Normally, the value is expected near number one: escale ~= 1.
    call escale((x-t)/dx,s,sreli)

    reli = treli .and. sreli
    if( verbose ) write(*,'(a,1p2g17.7,1x,2l1)') 'zmean, escale:',t,s,treli,sreli

    if( reli ) then

       ! keep the solution
       t0 = t
       s0 = s

       ! Newton's iterations are used mainly to get more precise result and
       ! to estimate of std.err.
       call newton(x,dx,s,t,dt,converge)
       reli = converge .and. (minval(x) < t .and. t < maxval(x))

       if( verbose ) write(*,'(a,1p3g17.7,2l3)') 'Newton:',t,dt,reli,converge

    end if

    if( .not. reli ) then
       ! failback alternative:
       !  * parameters are poorly estimated, all psi'(.) are zero
       !  * no convergence
       !  * final result is out of range of the zmean result
       !
       !  As the consequence, results will not realiable.

       t = t0
       s = s0
       if( dt < 0 ) dt = smed * sqrt(s**2/n)

       if( debug ) write(*,*) 'rmean5 (not reliable):',t,t0,s,s0,dt,converge

    end if

    if( verbose ) write(*,'(a,3g15.5,1x,l1)') 'rwmean2:',real(t),real(dt),real(s),reli
    deallocate(r,rs,psi,dpsi)

    ! fill-up optional items
    if( present(sig) ) sig = s
    if( present(reliable) ) reliable = reli
    verbose = xverbose

  end subroutine rwmean2

  subroutine rwmean1(x,dx,t,dt,sig,reliable,verb)

    ! single precision version

    real, dimension(:),intent(in) :: x, dx
    real, intent(out) :: t, dt
    real, intent(out), optional :: sig
    logical, intent(out), optional :: reliable
    logical, intent(in), optional :: verb

    real(dbl) :: q,dq,sig2
    real(dbl), dimension(:), allocatable :: y,dy
    logical :: reliable2

    if( present(verb) ) verbose = verb

    allocate(y(size(x)),dy(size(x)))
    y = x
    dy = dx

    call rwmean2(y,dy,q,dq,sig2,reliable2)

    t = real(q)
    dt = real(dq)
    if( present(sig) ) sig = real(sig2)
    if( present(reliable) ) reliable = reliable2

    deallocate(y,dy)

  end subroutine rwmean1


  subroutine newton(x,dx,s,t,dt,converge)

    ! This routine implements Newton's method for solving of equation
    !
    !  f(t) = 0,  where f(t) = sum( psi( (x-t)/(s*dx) ) )
    !
    ! t   is estimation of mean,
    ! dt  is standard error of t
    ! sig is standard deviation of the sample

    use rfun

    real(dbl), dimension(:), intent(in) :: x, dx
    real(dbl), intent(in out) :: t,s
    real(dbl), intent(out) :: dt
    logical, intent(out) :: converge

    integer, parameter :: maxiter = precision(t)
    ! Number of iterations is limited by numerical precision.
    ! We belives in quadratic convergence of Newton's method so,
    ! two orders are reached by each iteration, at least.

    integer :: n,iter
    real(dbl) :: d,f,df,f2,tol

    converge = .false.
    dt = -1
    if( .not. (s > 0) ) return

    n = size(x)

    ! Adjusting of tolerance
    tol = 10*n*epsilon(t)*(abs(t) + 1)

    ! Newton's iterations
    do iter = 1, maxiter

       ! derivation of derivatives
       r = (x - t) / dx
       rs =  r / s
       call tukeys(rs,psi)
       call dtukeys(rs,dpsi)
       f = sum(psi/dx)
       df = sum(dpsi/dx**2)

       ! derivative (denominator) checkpoint:
       ! * If all values are out of range: |df| < epsilon
       ! * df < 0 indicates "re-descending M-estimate" function problem ... the problem
       !   usually appears when distribution strongly deviates from Normal distribution
       if( df < epsilon(df) ) exit

       ! corrector for mean
       d = s * f / df

       ! update location
       t = t + d

       if( debug ) &
            write(*,'(a,i2,g15.5,1p3g12.3)') "mean, incr., f, f': ",iter,t,d,-f/s,df/s**2

       ! exit of iterations: the absolute errot must be at least |d| < tol
       converge = abs(d) < tol
       if( converge ) exit

    enddo

    ! estimation of standard error
    if( converge ) then
       f2 = sum(psi**2/dx**2)
       if( f2 > 0 .and. df > 0 ) then
          s = s*sqrt(n/(n-1)*f2/df)
          dt = s/sqrt(df)
       end if
    end if

  end subroutine newton

  subroutine zmean(x,dx,smed,s,t,reli)

    use fmm

    real(dbl), dimension(:),intent(in) :: x, dx
    real(dbl), intent(in) :: s,smed
    real(dbl), intent(in out) :: t
    logical, intent(out) :: reli

    real(dbl) :: tol,tmin,tmax,xmin,xmax,d
    integer :: n

    n = size(x)
    d = s * smed / q50
    tol = 0.001*d
    allocate(z(n),dz(n))

    xmin = minval(x)
    xmax = maxval(x)

    if( n > 23 ) then
       tmin = max(t - d,xmin)
       tmax = min(t + d,xmax)
    else
       tmin = xmin
       tmax = xmax
    end if

    z = x
    dz = s*dx
    t = zeroin(tmin,tmax,zerofun,tol)
    deallocate(z,dz)

    ! d gives precision of localisation on base of std.dev.
    ! Number of calls of 0-fun will approximately ln((tmax-tmin)/tol).

    d = 2*tol
    reli = abs(t - tmin) > d .and. abs(tmax - t) > d

    ! if result is too close to interval endpoints, one raise suspicion ...

  end subroutine zmean


  function zerofun(t)

    use rfun

    real(dbl) :: zerofun
    real(dbl), intent(in) :: t

    rs = (t - z) / dz
    call tukeys(rs,psi)
!    psi = rs

    zerofun = sum(psi/dz)

  end function zerofun


  subroutine rwinit(x,dx,t,s,d)

    ! This routine performs an initial estimate of both mean and scale
    ! using both median and absolute value.

    use medians
    use qmeans

    real(dbl), dimension(:),intent(in) :: x, dx
    real(dbl), intent(out) :: t,s,d

    real(dbl) :: mad,med
    integer :: n

    n = size(x)

    if( n > 50 ) then

       ! This choice of the threshold used for median computation method
       ! assumes that odd and even elements in sequence are the same (within
       ! requested precision). Note that qmed is the fastest known algorithm
       ! (~2*n) while median is simpler and slower (~n*log(n)).

       d = qmedian(dx)
       med = qmedian(x)
       mad = qmedian(abs(x - med)) / d

       ! correction for Normal distribution
       s = mad / q50
       t = med

    else if( n > 13 ) then

       ! correct way to compute median (two middle values)
       d = median(dx)
       med = median(x)
       mad = median(abs(x - med)) / d

       ! correction for Normal distribution
       s = mad / q50
       t = med

    else if( n > 2 ) then

       ! compute parameters by empirical CDF quantilies
       call qmean(dx,d,s)
       call qmean(x,med,s)
       call qmean(abs(x - med),mad,s)
       mad = mad / d
       s = mad / q50
       t = med

    else if( n == 2 ) then

       d = (dx(1) + dx(2)) / 2
       t = (x(1) + x(2)) / 2
       s = abs(x(1) - x(2)) / d

    else if( n == 1 ) then

       d = dx(1)
       t = x(1)
       s = 1

    else

       d = 1
       t = 0
       s = 1

    end if

  end subroutine rwinit



 ! diagnostics routines


  subroutine zero_graph(x,dx,t0,s)

    use rfun
    use medians

    real(dbl), dimension(:),intent(in) :: x, dx
    real(dbl), intent(in) :: t0,s

    integer :: i,n
    real(dbl) :: t,d

    d = median(s*dx) / 0.6745

    n = size(x)
    allocate(z(n),dz(n))
    z = x
    dz = dx

    open(1,file='/tmp/tw')
    do i = -500,500,10
       t = t0 + d * i / 100.0
       write(1,*) t,zerofun(t)
    end do
    close(1)
    deallocate(z,dz)

  end subroutine zero_graph



end module weightedmean
