/*
    Theseus - maximum likelihood superpositioning of macromolecular structures

    Copyright (C) 2004-2010 Douglas L. Theobald

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the:

    Free Software Foundation, Inc.,
    59 Temple Place, Suite 330,
    Boston, MA  02111-1307  USA

    -/_|:|_|_\-
*/

#include <pthread.h>
#include "Kabsch.h"
#include "Threads.h"
#include "MultiPose_local.h"
#include "MultiPose.h"


void
SuperPose2Anchor(CoordsArray *scratchA, CoordsArray *baseA, char *anchorf_name)
{
    double        **anchormat = MatAlloc(3, 3);
    double         *anchortrans = malloc(3 * sizeof(double));
    double         *tmpanchortrans = malloc(3 * sizeof(double));
    double         *trans = malloc(3 * sizeof(double));
    double          norm1, norm2, innprod;
    int             i, j, anchor = 0;

    for (i = 0; i < baseA->cnum; ++i)
    {
        if (strncmp(anchorf_name, baseA->coords[i]->filename, FILENAME_MAX - 1) == 0)
        {
            anchor = i;
            break;
        }
    }

    SuperPose(scratchA->coords[anchor], baseA->coords[anchor], anchormat, anchortrans,
              &norm1, &norm2, &innprod);

    for (i = 0; i < baseA->cnum; ++i)
    {
        InvRotVec(tmpanchortrans, anchortrans, scratchA->coords[i]->matrix);

        for (j = 0; j < 3; ++j)
            scratchA->coords[i]->center[j] = scratchA->coords[i]->translation[j] =
                scratchA->coords[i]->center[j] - tmpanchortrans[j];

        Mat3MultIp(scratchA->coords[i]->matrix, (const double **) anchormat);
    }

    for (j = 0; j < 3; ++j)
        scratchA->avecoords->center[j] = scratchA->avecoords->translation[j] =
            anchortrans[j];

    Mat3Cpy(scratchA->avecoords->matrix, (const double **) anchormat);

    free(trans);
    free(anchortrans);
    free(tmpanchortrans);
    MatDestroy(&anchormat);
}


/* static void */
/* CenMassWtHVarIp_old3D(Coords *coords, const double *wts, const double wtnorm, */
/*                 const double *mean, const double *var, const double precision) */
/* { */
/*     int             i; */
/*     double          tempx, tempy, tempz; */
/*     double          wti, wtsum; */
/*     const double   *x = (const double *) coords->x, */
/*                    *y = (const double *) coords->y, */
/*                    *z = (const double *) coords->z; */
/*  */
/*     tempx = tempy = tempz = wtsum = 0.0; */
/*     for (i = 0; i < coords->vlen; ++i) */
/*     { */
/*         wti = wts[i]; */
/*         wtsum += wti; */
/*         tempx += (wti * x[i]); */
/*         tempy += (wti * y[i]); */
/*         tempz += (wti * z[i]); */
/*     } */
/*  */
/*     if (wtsum < precision * wtnorm / var[0]) */
/*     { */
/*  */
/*         coords->center[0] = - wtnorm * mean[0]; */
/*         coords->center[1] = - wtnorm * mean[1]; */
/*         coords->center[2] = - wtnorm * mean[2]; */
/*     } */
/*     else */
/*     { */
/*         coords->center[0] = (tempx - wtnorm*mean[0] / var[0]) / (wtsum + wtnorm / var[0]); */
/*         coords->center[1] = (tempy - wtnorm*mean[1] / var[1]) / (wtsum + wtnorm / var[1]); */
/*         coords->center[2] = (tempz - wtnorm*mean[2] / var[2]) / (wtsum + wtnorm / var[2]); */
/*     } */
/* } */


/* For superimposing to an alignment, we don't need to weight by occupancy
   since we are using pseudo-coordinates here from the E-M expectation step */
/* static void */
/* CalcTranslations_old3D(CoordsArray *scratchA, Algorithm *algo) */
/* { */
/*     Coords        **coords = scratchA->coords; */
/*     int             i, j, cnt; */
/*     double          chi2; */
/*  */
/*     if (algo->notrans == 0) */
/*     { */
/*         if (algo->htrans == 1 && algo->rounds > 1) */
/*         { */
/*             double      logL, lvar, varsum; */
/*             double     *mean = malloc(3 * sizeof(double)); */
/*             double     *var = malloc(3 * sizeof(double)); */
/*             double     *trans = malloc(scratchA->cnum * sizeof(double)); */
/*  */
/*             varsum = FLT_MAX; */
/*             cnt = 0; */
/*             do */
/*             { */
/*                 lvar = varsum; */
/*                 ++cnt; */
/*                 varsum = 0.0; */
/*                 for (j = 0; j < 3; ++j) */
/*                 { */
/*                     for (i = 0; i < scratchA->cnum; ++i) */
/*                         trans[i] = -coords[i]->center[j]; */
/*  */
/*                     chi2 = normal_fit((const double *) trans, scratchA->cnum, &mean[j], &var[j], &logL); */
/*                     varsum += var[j]; */
/*  */
/*                     fflush(NULL); */
/*                 } */
/*  */
/*                 for (i = 0; i < scratchA->cnum; ++i) */
/*                     CenMassWtHVarIp(coords[i], scratchA->w, scratchA->stats->wtnorm, mean, var, algo->precision); */
/*                 break; */
/*             } */
/*             while(fabs(lvar - varsum)/varsum > algo->precision); */
/*  */
/*             scratchA->stats->htrans_ave = mean[0]; */
/*             scratchA->stats->htrans_var = var[0]; */
/*             scratchA->stats->htrans_chi2 = chi2; */
/*  */
/*             free(trans); */
/*             free(var); */
/*             free(mean); */
/*         } */
/*         else */
/*         { */
/*             for (i = 0; i < scratchA->cnum; ++i) */
/*             { */
/*                 if (algo->covweight == 0) */
/*                 { */
/*                     if (algo->alignment == 1 && algo->rounds < 3) */
/*                         CenMassWtIpOcc(coords[i], scratchA->w); */
/*                     else */
/*                         CenMassWtIp(coords[i], scratchA->w); */
/*                 } */
/*                 else */
/*                     CenMassCov(coords[i], (const double **) scratchA->WtMat); */
/*             } */
/*         } */
/*     } */
/* } */


static void
CenMassWtHVarIp(Coords *coords, const double *wts, const double wtnorm,
                const double *mean, const double var, const double precision)
{
    int             i;
    double          tempx, tempy, tempz;
    double          wti, wtsum;
    const double   *x = (const double *) coords->x,
                   *y = (const double *) coords->y,
                   *z = (const double *) coords->z;

    tempx = tempy = tempz = wtsum = 0.0;
    for (i = 0; i < coords->vlen; ++i)
    {
        wti = wts[i];
        wtsum += wti;
        tempx += (wti * x[i]);
        tempy += (wti * y[i]);
        tempz += (wti * z[i]);
    }
/* printf("\nwtsum: %f wtnorm: %f", wtsum, wtnorm); */
/* printf("\n% f % f % f    %f %f %f", tempx / wtsum, tempy / wtsum, tempz / wtsum, wtsum, wtnorm, wtnorm / var); */

    printf("\nbefore: % f % f % f", coords->center[0], coords->center[1], coords->center[2]);
    if (var * wtsum < precision * wtnorm)
    {
        coords->center[0] = -mean[0];
        coords->center[1] = -mean[1];
        coords->center[2] = -mean[2];
    }
    else
    {
        coords->center[0] = (tempx*var - wtnorm*mean[0]) / (wtsum*var + wtnorm);
        coords->center[1] = (tempy*var - wtnorm*mean[1]) / (wtsum*var + wtnorm);
        coords->center[2] = (tempz*var - wtnorm*mean[2]) / (wtsum*var + wtnorm);
    }

    printf("\nafter:  % f % f % f", coords->center[0], coords->center[1], coords->center[2]);
    fflush(NULL);
}


void
CalcTranslationsOp(CoordsArray *scratchA, CoordsArray *baseA, Algorithm *algo)
{
    int             i, j, cnt;
    double          chi2;

    if (algo->notrans == 0)
    {
        if (algo->htrans == 1 && algo->rounds > 1)
        {
            double      logL, lvar, varsum/* , chi2 */;
            double     *mean = malloc(3 * sizeof(double));
            double     *var = malloc(3 * sizeof(double));
            double     *trans = malloc(scratchA->cnum * sizeof(double));

            varsum = FLT_MAX;
            cnt = 0;
            do
            {
                lvar = varsum;
                ++cnt;
                varsum = 0.0;
                for (j = 0; j < 3; ++j)
                {
                    for (i = 0; i < scratchA->cnum; ++i)
                        trans[i] = -baseA->coords[i]->center[j];

                    chi2 = normal_fit((const double *) trans, scratchA->cnum, &mean[j], &var[j], &logL);
                    varsum += var[j];
                    printf("\n %3d:%d chi2: %f mean: %f  var: %f logL: %f wtnorm: %f",
                           cnt, j, chi2, mean[j], var[j], logL, scratchA->stats->wtnorm);
                    fflush(NULL);
                }

                printf("\n %3d: varsum:%f", cnt, varsum);

                for (i = 0; i < scratchA->cnum; ++i)
                    CenMassWtHVarIp(baseA->coords[i], scratchA->w, scratchA->stats->wtnorm, mean, varsum, algo->precision);
            }
            while(fabs(lvar - varsum) > algo->precision * varsum);

            scratchA->stats->htrans_ave = mean[0];
            scratchA->stats->htrans_var = varsum;
            scratchA->stats->htrans_chi2 = chi2;

            free(trans);
            free(var);
            free(mean);
        }
        else
        {
            for (i = 0; i < scratchA->cnum; ++i)
            {
                if (algo->covweight == 0)
                {
                    if (algo->alignment == 1 && algo->rounds < 3)
                    {
                        CenMassWtIpOcc(baseA->coords[i], scratchA->w);
                    }
                    else
                    {
                        if (algo->commandeur == 1)
                            CenMassWtIpEM(baseA->coords[i], scratchA->avecoords, scratchA->w);
                        else
                            CenMassWtIp(baseA->coords[i], scratchA->w);
                    }
                }
                else
                {
                    CenMassCov(baseA->coords[i], (const double **) scratchA->WtMat);
                }

                //printf("\n********** cen[%d]: %f %f %f", i+1, baseA->coords[i]->center[0], baseA->coords[i]->center[1], baseA->coords[i]->center[2]);
            }
        }
    }

//  fflush(NULL);

/*  for (i = 0; i < scratchA->cnum; ++i) */
/*      memcpy(scratchA->coords[i]->center, baseA->coords[i]->center, 3 * sizeof(double)); */
}


void
CalcTranslationsIp(CoordsArray *scratchA, Algorithm *algo)
{
    Coords        **coords = scratchA->coords;
    int             i, j, cnt;
    double          chi2;

    if (algo->notrans == 0)
    {
        if (algo->htrans == 1 && algo->rounds > 1)
        {
            double      logL, lvar, varsum/* , chi2 */;
            double     *mean = malloc(3 * sizeof(double));
            double     *var = malloc(3 * sizeof(double));
            double     *trans = malloc(scratchA->cnum * sizeof(double));

            varsum = FLT_MAX;
            cnt = 0;
            do
            {
                lvar = varsum;
                ++cnt;
                varsum = 0.0;
                for (j = 0; j < 3; ++j)
                {
                    for (i = 0; i < scratchA->cnum; ++i)
                        trans[i] = -coords[i]->center[j];

                    chi2 = normal_fit((const double *) trans, scratchA->cnum, &mean[j], &var[j], &logL);
                    varsum += var[j];
                    printf("\n %3d:%d chi2: %f mean: %f  var: %f logL: %f wtnorm: %f",
                           cnt, j, chi2, mean[j], var[j], logL, scratchA->stats->wtnorm);
                    fflush(NULL);
                }

                printf("\n %3d: varsum:%f", cnt, varsum);

                for (i = 0; i < scratchA->cnum; ++i)
                    CenMassWtHVarIp(coords[i], scratchA->w, scratchA->stats->wtnorm, mean, varsum, algo->precision);
                break; /* iterating converges to singularities */
            }
            while(fabs(lvar - varsum) > algo->precision * varsum);

            scratchA->stats->htrans_ave = mean[0];
            scratchA->stats->htrans_var = varsum;
            scratchA->stats->htrans_chi2 = chi2;

            free(trans);
            free(var);
            free(mean);
        }
        else
        {
            for (i = 0; i < scratchA->cnum; ++i)
            {
                if (algo->covweight == 0)
                {
/*                     if (algo->alignment == 1 && algo->rounds < 3) */
/*                     { */
/*                         CenMassWtIpOcc(scratchA->coords[i], scratchA->w); */
/*                     } */
/*                     else */
/*                     { */
                    if (algo->commandeur == 1)
                        CenMassWtIpEM(scratchA->coords[i], scratchA->avecoords, scratchA->w);
                    else
                        CenMassWtIp(scratchA->coords[i], scratchA->w);
/*                     } */
                }
                else
                {
                    CenMassCov(scratchA->coords[i], (const double **) scratchA->WtMat);
                }
            }
        }
    }
}


void
MatMultCoordsMultMatDiag(Coords *outcoords, const double **matK, const Coords *coords, const double *axesw)
{
    int             i, k;
    const int       vlen = coords->vlen;
    double        **TmpMat = MatAlloc(vlen, 3);
    double          matKik;
    double          xi, yi, zi;

    for (i = 0; i < vlen; ++i)
    {
        for (k = 0; k < vlen; ++k)
        {
            matKik = matK[i][k];
            TmpMat[i][0] += matKik * coords->x[k];
            TmpMat[i][1] += matKik * coords->y[k];
            TmpMat[i][2] += matKik * coords->z[k];
        }
    }

    for (i = 0; i < vlen; ++i)
    {
        xi = TmpMat[i][0];
        yi = TmpMat[i][1];
        zi = TmpMat[i][2];

        outcoords->x[i] = xi * axesw[0];
        outcoords->y[i] = yi * axesw[1];
        outcoords->z[i] = zi * axesw[2];
    }

    MatDestroy(&TmpMat);
}


void
MatMultCoordsMultMat(Coords *outcoords, const double **matK, const Coords *coords, const double **matD)
{
    int             i, k;
    const int       vlen = coords->vlen;
    double        **TmpMat = MatAlloc(vlen, 3);
    double          matKik;
    double          xi, yi, zi;

    for (i = 0; i < vlen; ++i)
    {
        for (k = 0; k < vlen; ++k)
        {
            matKik = matK[i][k];
            TmpMat[i][0] += matKik * coords->x[k];
            TmpMat[i][1] += matKik * coords->y[k];
            TmpMat[i][2] += matKik * coords->z[k];
        }
    }

    for (i = 0; i < vlen; ++i)
    {
        xi = TmpMat[i][0];
        yi = TmpMat[i][1];
        zi = TmpMat[i][2];

        outcoords->x[i] = xi * matD[0][0] + yi * matD[1][0] + zi * matD[2][0];
        outcoords->y[i] = xi * matD[0][1] + yi * matD[1][1] + zi * matD[2][1];
        outcoords->z[i] = xi * matD[0][2] + yi * matD[1][2] + zi * matD[2][2];
    }

    MatDestroy(&TmpMat);
}


void
MatDiagMultCoordsMultMat(Coords *outcoords, const double *diag, const Coords *coords, const double **matD)
{
    int             i;
    const int       vlen = coords->vlen;
    double          diagi;
    double          xi, yi, zi;

    for (i = 0; i < vlen; ++i)
    {
        diagi = diag[i];
        xi = diagi * coords->x[i];
        yi = diagi * coords->y[i];
        zi = diagi * coords->z[i];

        outcoords->x[i] = xi * matD[0][0] + yi * matD[1][0] + zi * matD[2][0];
        outcoords->y[i] = xi * matD[0][1] + yi * matD[1][1] + zi * matD[2][1];
        outcoords->z[i] = xi * matD[0][2] + yi * matD[1][2] + zi * matD[2][2];
    }
}


void
MatDiagMultCoordsMultMatDiag(Coords *outcoords, const double *wtK, const Coords *coords, const double *wtD)
{
    int             i;
    double          wtKi;
    const double   *x = (const double *) coords->x,
                   *y = (const double *) coords->y,
                   *z = (const double *) coords->z;

    for (i = 0; i < coords->vlen; ++i)
    {
        wtKi = wtK[i];

        outcoords->x[i] = wtKi * x[i] * wtD[0];
        outcoords->y[i] = wtKi * y[i] * wtD[1];
        outcoords->z[i] = wtKi * z[i] * wtD[2];
    }
}


static double
InnerProduct(double *A, Coords *coords1, Coords *coords2, const int len, const double *weight)
{
    double          x1, x2, y1, y2, z1, z2;
    int             i;
    const double   *fx1 = coords1->x, *fy1 = coords1->y, *fz1 = coords1->z;
    const double   *fx2 = coords2->x, *fy2 = coords2->y, *fz2 = coords2->z;
    double          G1 = 0.0, G2 = 0.0;
    A[0] = A[1] = A[2] = A[3] = A[4] = A[5] = A[6] = A[7] = A[8] = A[9] = 0.0;
    if (weight != NULL)
   {
      for (i = 0; i < len; ++i)
      {
         x1 = weight[i] * fx1[i];
         y1 = weight[i] * fy1[i];
         z1 = weight[i] * fz1[i];

         G1 += x1 * fx1[i] + y1 * fy1[i] + z1 * fz1[i];

         x2 = fx2[i];
         y2 = fy2[i];
         z2 = fz2[i];

         G2 += weight[i] * (x2 * x2 + y2 * y2 + z2 * z2);

         A[0] +=  (x1 * x2);
         A[1] +=  (x1 * y2);
         A[2] +=  (x1 * z2);

         A[3] +=  (y1 * x2);
         A[4] +=  (y1 * y2);
         A[5] +=  (y1 * z2);

         A[6] +=  (z1 * x2);
         A[7] +=  (z1 * y2);
         A[8] +=  (z1 * z2);   
      }
   }
   else
   {
      for (i = 0; i < len; ++i)
      {
         x1 = fx1[i];
         y1 = fy1[i];
         z1 = fz1[i];

         G1 += x1 * x1 + y1 * y1 + z1 * z1;

         x2 = fx2[i];
         y2 = fy2[i];
         z2 = fz2[i];

         G2 += (x2 * x2 + y2 * y2 + z2 * z2);

         A[0] +=  (x1 * x2);
         A[1] +=  (x1 * y2);
         A[2] +=  (x1 * z2);

         A[3] +=  (y1 * x2);
         A[4] +=  (y1 * y2);
         A[5] +=  (y1 * z2);

         A[6] +=  (z1 * x2);
         A[7] +=  (z1 * y2);
         A[8] +=  (z1 * z2);  
      }
   }
    return (G1 + G2) * 0.5;
}


static
int FastCalcRMSDAndRotation(double *rot, double *A, double *msd, double E0, int len, double minScore)
{
   double          Sxx, Sxy, Sxz, Syx, Syy, Syz, Szx, Szy, Szz;
   double          Szz2, Syy2, Sxx2, Sxy2, Syz2, Sxz2, Syx2, Szy2, Szx2,
                   SyzSzymSyySzz2, Sxx2Syy2Szz2Syz2Szy2, Sxy2Sxz2Syx2Szx2,
                   SxzpSzx, SyzpSzy, SxypSyx, SyzmSzy,
                   SxzmSzx, SxymSyx, SxxpSyy, SxxmSyy;
   double          C[4];
   int i;
   double mxEigenV; 
   double oldg = 0.0;
   double b, a, delta, ms;
   double q1, q2, q3, q4, normq;
   double d11, d12, d13, d14, d21, d22, d23, d24;
   double d31, d32, d33, d34, d41, d42, d43, d44;
   double a2, x2, y2, z2; 
   double xy, az, zx, ay, yz, ax; 
   double d3344_4334, d3244_4234, d3243_4233, d3143_4133,d3144_4134, d3142_4132; 

   Sxx = A[0]; Sxy = A[1]; Sxz = A[2];
   Syx = A[3]; Syy = A[4]; Syz = A[5];
   Szx = A[6]; Szy = A[7]; Szz = A[8];

   Sxx2 = Sxx * Sxx;
   Syy2 = Syy * Syy;
   Szz2 = Szz * Szz;

   Sxy2 = Sxy * Sxy;
   Syz2 = Syz * Syz;
   Sxz2 = Sxz * Sxz;

   Syx2 = Syx * Syx;
   Szy2 = Szy * Szy;
   Szx2 = Szx * Szx;

   SyzSzymSyySzz2 = 2.0*(Syz*Szy - Syy*Szz);
   Sxx2Syy2Szz2Syz2Szy2 = Syy2 + Szz2 - Sxx2 + Syz2 + Szy2;

   C[2] = -2.0 * (Sxx2 + Syy2 + Szz2 + Sxy2 + Syx2 + Sxz2 + Szx2 + Syz2 + Szy2);
   C[1] = 8.0 * (Sxx*Syz*Szy + Syy*Szx*Sxz + Szz*Sxy*Syx - Sxx*Syy*Szz - Syz*Szx*Sxy - Szy*Syx*Sxz);

   SxzpSzx = Sxz + Szx;
   SyzpSzy = Syz + Szy;
   SxypSyx = Sxy + Syx;
   SyzmSzy = Syz - Szy;
   SxzmSzx = Sxz - Szx;
   SxymSyx = Sxy - Syx;
   SxxpSyy = Sxx + Syy;
   SxxmSyy = Sxx - Syy;
   Sxy2Sxz2Syx2Szx2 = Sxy2 + Sxz2 - Syx2 - Szx2;

   C[0] = Sxy2Sxz2Syx2Szx2 * Sxy2Sxz2Syx2Szx2
          + (Sxx2Syy2Szz2Syz2Szy2 + SyzSzymSyySzz2) * (Sxx2Syy2Szz2Syz2Szy2 - SyzSzymSyySzz2)
          + (-(SxzpSzx)*(SyzmSzy)+(SxymSyx)*(SxxmSyy-Szz)) * (-(SxzmSzx)*(SyzpSzy)+(SxymSyx)*(SxxmSyy+Szz))
          + (-(SxzpSzx)*(SyzpSzy)-(SxypSyx)*(SxxpSyy-Szz)) * (-(SxzmSzx)*(SyzmSzy)-(SxypSyx)*(SxxpSyy+Szz))
          + (+(SxypSyx)*(SyzpSzy)+(SxzpSzx)*(SxxmSyy+Szz)) * (-(SxymSyx)*(SyzmSzy)+(SxzpSzx)*(SxxpSyy+Szz))
          + (+(SxypSyx)*(SyzmSzy)+(SxzmSzx)*(SxxmSyy-Szz)) * (-(SxymSyx)*(SyzpSzy)+(SxzmSzx)*(SxxpSyy-Szz));


   mxEigenV = E0;
   for (i = 0; i < 50; ++i)
   {
     oldg = mxEigenV;
     x2 = mxEigenV*mxEigenV;
     b = (x2 + C[2])*mxEigenV;
     a = b + C[1];
     delta = ((a*mxEigenV + C[0])/(2.0*x2*mxEigenV + b + a));
     mxEigenV -= delta;
     if (fabs(mxEigenV - oldg) < fabs((1e-6)*mxEigenV)) {
       break;
     }
   }
   if (i == 50) 
   {
      fprintf(stderr," more than %d iterations needed, something wrong!\b", i);
   }

   ms = 2.0 * (E0 - mxEigenV) / len;
   (*msd) = ms;

   if (minScore > 0) 
   {
        if (ms < minScore)
        {
            // Don't bother with rotation. 
            return -1;
        }   
   }

   d11 = SxxpSyy + Szz-mxEigenV; d12 = SyzmSzy; d13 = - SxzmSzx; d14 = SxymSyx;
   d21 = SyzmSzy; d22 = SxxmSyy - Szz-mxEigenV; d23 = SxypSyx; d24= SxzpSzx;
   d31 = d13; d32 = d23; d33 = Syy-Sxx-Szz - mxEigenV; d34 = SyzpSzy;
   d41 = d14; d42 = d24; d43 = d34; d44 = Szz - SxxpSyy - mxEigenV;
   d3344_4334 = d33 * d44 - d43 * d34; d3244_4234 = d32 * d44-d42*d34;
   d3243_4233 = d32 * d43 - d42 * d33; d3143_4133 = d31 * d43-d41*d33;
   d3144_4134 = d31 * d44 - d41 * d34; d3142_4132 = d31 * d42-d41*d32;
   q1 =  d22*d3344_4334-d23*d3244_4234+d24*d3243_4233;
   q2 = -d21*d3344_4334+d23*d3144_4134-d24*d3143_4133;
   q3 =  d21*d3244_4234-d22*d3144_4134+d24*d3142_4132;
   q4 = -d21*d3243_4233+d22*d3143_4133-d23*d3142_4132;

   normq = sqrt(q1 * q1 + q2 * q2 + q3 * q3 + q4 * q4);
   q1 /= normq; q2 /= normq; q3 /= normq; q4 /= normq;

   a2 = q1 * q1;
   x2 = q2 * q2;
   y2 = q3 * q3;
   z2 = q4 * q4;

   xy = q2 * q3;
   az = q1 * q4;
   zx = q4 * q2;
   ay = q1 * q3;
   yz = q3 * q4;
   ax = q1 * q2;

   rot[0] = a2 + x2 - y2 - z2;
   rot[1] = 2 * (xy + az);
   rot[2] = 2 * (zx - ay);
   rot[3] = 2 * (xy - az);
   rot[4] = a2 - x2 + y2 - z2;
   rot[5] = 2 * (yz + ax);
   rot[6] = 2 * (zx + ay);
   rot[7] = 2 * (yz - ax);
   rot[8] = a2 - x2 - y2 + z2;

   return 0;
}


/* static void  */
/* CenterCoords(Coords *coords, const int len) */
/* { */
/*     int             i; */
/*     double          xsum, ysum, zsum; */
/*     double         *x = coords->x, *y = coords->y, *z = coords->z; */
/*  */
/*     xsum = ysum = zsum = 0.0; */
/*     for (i = 0; i < len; ++i) */
/*     { */
/*         xsum += x[i]; */
/*         ysum += y[i]; */
/*         zsum += z[i]; */
/*     } */
/*  */
/*     xsum /= len; */
/*     ysum /= len; */
/*     zsum /= len; */
/*  */
/*     for (i = 0; i < len; ++i) */
/*     { */
/*         x[i] -= xsum; */
/*         y[i] -= ysum; */
/*         z[i] -= zsum; */
/*     } */
/* } */


static double
CalcRMSDRotationalMatrix(Coords *coords1, Coords *coords2, const int len, double *rot, const double *weight)
{
    double          A[9];
    double          rmsd;

    /* center the structures */
    //CenterCoords(coords1, len);
    //CenterCoords(coords2, len);

    /* calculate the (weighted) inner product of two structures */
    double E0 = InnerProduct(A, coords1, coords2, len, weight);

    /* calculate the RMSD & rotational matrix */
    FastCalcRMSDAndRotation(rot, A, &rmsd, E0, len, -1);

    return rmsd;
}


double
CalcRotations(CoordsArray *cdsA)
{
    Coords        **coords = cdsA->coords;
    const Coords   *avecoords = cdsA->avecoords;
    const double   *wts = (const double *) cdsA->w;
    const double   *axesw = (const double *) cdsA->axesw;
    Coords         *tcoords = cdsA->tcoords;
    double          deviation = 0.0, deviation_sum = 0.0;
    int             i;

    if (cdsA->algo->norot == 0)
    {
        if (cdsA->algo->method == 3) /* default */
        {
            if (cdsA->algo->covweight == 1)
            {
                MatMultCoordsMultMatDiag(tcoords,
                                         (const double **) cdsA->WtMat,
                                         avecoords,
                                         axesw);
            }
            else if (cdsA->algo->varweight == 1 || cdsA->algo->leastsquares == 1)
            {
                MatDiagMultCoordsMultMatDiag(tcoords,
                                             wts,
                                             avecoords,
                                             axesw);
            }

            for (i = 0; i < cdsA->cnum; ++i)
            {
                if (cdsA->algo->tenberge == 1)
                {
                    AveCoordsTB(cdsA, i);
                    MatDiagMultCoordsMultMatDiag(tcoords,
                                 wts,
                                 avecoords,
                                 axesw);
                }

                if (cdsA->algo->pu == 1)
                {
                    //printf("\nHERE!!\n");
                    deviation = CalcRMSDRotationalMatrix(coords[i], tcoords, coords[i]->vlen, &coords[i]->matrix[0][0], NULL);
                }
                else
                {
                    deviation = Kabsch(coords[i],
                                       tcoords,
                                       coords[i]->matrix,
                                       cdsA->tmpmat3a,
                                       cdsA->tmpmat3b,
                                       cdsA->tmpmat3c,
                                       cdsA->tmpvec3a);

                    /* note that the avecoords are already multiplied by the weight matrices */
/*                     deviation = ProcLAPACKSVDvan(coords[i], */
/*                                                  tcoords, */
/*                                                  coords[i]->matrix, */
/*                                                  cdsA->tmpmat3a, */
/*                                                  cdsA->tmpmat3b, */
/*                                                  cdsA->tmpmat3c, */
/*                                                  cdsA->tmpvec3a); */
                }
        /*      RotateCoordsIp(coords[i], (const double **) coords[i]->matrix); */
        /*      if (cdsA->algo->noave == 0) */
        /*          AveCoords(cdsA); */

                /* find global rmsd and average coords (both held in structure) */
                coords[i]->wRMSD_from_mean = sqrt(deviation / (3 * cdsA->vlen));
                deviation_sum += deviation;

    /*             if (cdsA->algo->verbose == 1) */
    /*             { */
                    /* rmsd from mean would usually need a 2 in denom,\
                       but this is already deviation from mean,
                       since structure #2 is the average structure */
    /*                 printf("\n %5d %8.3f %13.3f ", */
    /*                        i+1, */
    /*                        coords[i]->wRMSD_from_mean, */
    /*                        deviation); */
    /*             } */
            }

            /* Correct for the bias of the dim-weighted rotation.
               this step is formally unnecessary, since the rotations are relative,
               but in our case it helps convergence since we require that the superposition
               be aligned with the principal axes of the dimensional covariance matrix */
            if (cdsA->algo->dimweight != 0)
            {
                ProcLAPACKSVDvan(avecoords,
                                 tcoords,
                                 avecoords->matrix,
                                 cdsA->tmpmat3a,
                                 cdsA->tmpmat3b,
                                 cdsA->tmpmat3c,
                                 cdsA->tmpvec3a);

                Mat3TransposeIp(avecoords->matrix);

                for (i = 0; i < cdsA->cnum; ++i)
                    Mat3MultIp(coords[i]->matrix, (const double **) avecoords->matrix);
            }
        }
    }

//     else /* These are really just for debugging; they aren't supported or
//             guarranteed to work */
//     {
//         for (i = 0; i < cdsA->cnum; ++i)
//         {
//             switch(cdsA->algo->method)
//             {
//                 case 0:
//                     deviation = Kabsch(coords[i],
//                                        avecoords,
//                                        coords[i]->matrix,
//                                        wts, axesw,
//                                        cdsA->tmpmat3a,
//                                        cdsA->tmpmat3b,
//                                        cdsA->tmpmat3c,
//                                        cdsA->tmpvec3a);
//                     break;
// 
//                 case 1:
//                     deviation = Kearsley(coords[i],
//                                          avecoords,
//                                          wts,
//                                          coords[i]->matrix,
//                                          coords[i]->evals);
//                     break;
// 
//                 case 2:
//                     deviation = Horn(coords[i],
//                                      avecoords,
//                                      wts,
//                                      NULL,
//                                      coords[i]->matrix);
//                     break;
// 
//                 case 4:
//                     deviation = Kearsley_LAPACK(coords[i],
//                                                 avecoords,
//                                                 wts,
//                                                 coords[i]->matrix,
//                                                 coords[i]->evals);
//                     break;
// 
//                 case 5:
//                     deviation = KabschLAPACK(coords[i],
//                                        avecoords,
//                                        coords[i]->matrix,
//                                        wts, axesw,
//                                        cdsA->tmpmat3a,
//                                        cdsA->tmpmat3b,
//                                        cdsA->tmpmat3c);
//                     break;
// 
//                 case 6:
//                     deviation = ProcLAPACKSVDAx(coords[i], avecoords,
//                                                 coords[i]->matrix,
//                                                 wts, axesw,
//                                                 (const double **) cdsA->AxWtMat,
//                                                 cdsA->tmpmat3a,
//                                                 cdsA->tmpmat3b,
//                                                 cdsA->tmpmat3c,
//                                                 cdsA->tmpvec3a);
//                     break;
// 
//                 case 7:
//                     deviation = ProcLAPACKSVD(coords[i], avecoords,
//                                               coords[i]->matrix,
//                                               wts, axesw,
//                                               cdsA->tmpmat3a,
//                                               cdsA->tmpmat3b,
//                                               cdsA->tmpmat3c,
//                                               cdsA->tmpvec3a);
//                     break;
// 
//                 case 8:
//                     deviation = KabschJacobi(coords[i],
//                                              avecoords,
//                                              coords[i]->matrix,
//                                              wts, axesw,
//                                              cdsA->tmpmat3a,
//                                              cdsA->tmpmat3b,
//                                              cdsA->tmpmat3c,
//                                              cdsA->tmpvec3a);
//                     break;
// 
//                 case 10:
//                     deviation = ProcJacobiSVD(coords[i], avecoords,
//                                               coords[i]->matrix,
//                                               wts, axesw,
//                                               cdsA->tmpmat3a,
//                                               cdsA->tmpmat3b,
//                                               cdsA->tmpmat3c,
//                                               cdsA->tmpvec3a);
//                     break;
// 
//                 case 11:
//                     deviation = KabschJacobiCyc(coords[i],
//                                                 avecoords,
//                                                 coords[i]->matrix,
//                                                 wts, axesw,
//                                                 cdsA->tmpmat3a,
//                                                 cdsA->tmpmat3b,
//                                                 cdsA->tmpmat3c,
//                                                 cdsA->tmpvec3a);
//             }
// 
//             /* find global rmsd and average coords (both held in structure) */
//             coords[i]->wRMSD_from_mean = sqrt(deviation / (3 * cdsA->vlen));
//             deviation_sum += deviation;
// 
// /*             if (cdsA->algo->verbose == 1) */
// /*             { */
//                 /* rmsd from mean would usually need a 2 in denom,
//                    but this is already deviation from mean,
//                    since structure #2 is the average structure */
// /*                 printf("\n %5d %8.3f %13.3f ", */
// /*                        i+1, */
// /*                        coords[i]->wRMSD_from_mean, */
// /*                        deviation); */
// /*             } */
//         }
//     }

    return(deviation_sum);
}


/* This is the classic iterative (not eigendecomp) solution given by Gower 1975 and in 
   Gower and Dijksterhuis 2004, Ch 9, page 113, Eqn 9.21 */
double
CalcScaleFactors(CoordsArray *cdsA)
{
    Coords         *cdsi = NULL;
    Coords        **cds = cdsA->coords;
    Coords         *avecds = cdsA->avecoords;
    double         *wts = cdsA->w;
    int             i;
    const int       cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, selfprod, innprod, norm, avecdstr, oldscale, factor;


    if (cdsA->algo->leastsquares == 1)
    {
        avecdstr = TrCoordsInnerProd(avecds, vlen);

        norm = 0.0;
        for (i = 0; i < cnum; ++i)
            norm += TrCoordsInnerProd(cds[i], vlen);
    }
    else if (cdsA->algo->varweight == 1)
    {
        avecdstr = TrCoordsInnerProdWt(avecds, vlen, wts);

        norm = 0.0;
        for (i = 0; i < cnum; ++i)
            norm += TrCoordsInnerProdWt(cds[i], vlen, wts);
    }
    else
    {
        norm = avecdstr = 1.0;
    }

        for (i = 0; i < vlen; ++i)
            wts[i] = 1.0 / cdsA->var[i];

    scaleprod = 0.0;
    for (i = 0; i < cnum; ++i)
    {
        cdsi = cdsA->coords[i];
        oldscale = cdsi->scale;

        if (cdsA->algo->leastsquares == 1)
        {
            selfprod = TrCoordsInnerProd(cdsi, vlen) / (oldscale * oldscale);
            innprod = TrCoordsInnerProd2(cdsi, avecds, vlen) / oldscale;

        }
        else if (cdsA->algo->varweight == 1)
        {
            selfprod = TrCoordsInnerProdWt(cdsi, vlen, wts) / (oldscale * oldscale);
            innprod = TrCoordsInnerProdWt2(cdsi, avecds, vlen, wts) / oldscale-1.0;
        }
        else
        {
            innprod = selfprod = 1.0;
        }

        cdsi->scale = norm * innprod / (cnum * avecdstr * selfprod);
        cdsi->scale = (sqrt(innprod*innprod + 12.0 * (double) vlen * selfprod) + innprod) / (2.0 * selfprod);
        //cds[i]->scale = innprod / selfprod;
        scaleprod += log(cds[i]->scale);
        factor = cdsi->scale / oldscale;
        ScaleCoords(cdsi, factor);
        printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
    }

    scaleprod = exp(scaleprod / (double) cnum);

    double bsum = 0.0;
    for (i = 0; i < cnum; ++i)
        bsum += cdsA->coords[i]->scale;

    for (i = 0; i < cnum; ++i)
        printf("\nscale[%3d]: %12.6f", i+1, 15.5 * 30.0 * cdsA->coords[i]->scale / bsum);

//    for (i = 0; i < cnum; ++i)
//        cds[i]->scale /= scaleprod;

    return(scaleprod);
}


void
ConstrainCovMat(CoordsArray *cdsA)
{
    int             i, j;
    double          tmpx, tmpy, tmpz;
    double          lagrange, vari;
    const int       cnum = cdsA->cnum, vlen = cdsA->vlen;
    double         *var = cdsA->var;
    const Coords  **coords = (const Coords **) cdsA->coords;
    Coords         *coordsj;
    const double   *avex = (const double *) cdsA->avecoords->x,
                   *avey = (const double *) cdsA->avecoords->y,
                   *avez = (const double *) cdsA->avecoords->z;

    lagrange = 0.0;
    for (j = 0; j < cnum; ++j)
    {
        coordsj = (Coords *) coords[j];

		for (i = 0; i < vlen; ++i)
		{
			tmpx = coordsj->x[i] - avex[i];
			vari = tmpx * tmpx;
			tmpy = coordsj->y[i] - avey[i];
			vari += tmpy * tmpy;
			tmpz = coordsj->z[i] - avez[i];
			vari += tmpz * tmpz;

            lagrange += vari / var[i];
		}
    }

    lagrange = lagrange / (3.0 * cnum * vlen) - 1.0;
    
    if (lagrange < 0.0)
        lagrange = 0.0;
    
    printf("\nlagrange = % 12.6e", lagrange);

    for (i = 0; i < vlen; ++i)
        var[i] -= lagrange * (avex[i]*avex[i] + avey[i]*avey[i] + avez[i]*avez[i]) / (3.0 * cnum);
}


/* This is the unconstrained ML solution */
double
CalcScaleFactorsML(CoordsArray *cdsA)
{
    Coords        **cds = cdsA->coords;
    Coords         *avecds = cdsA->avecoords;
    Coords         *cdsi = NULL;
    double         *wts = cdsA->w;
    int             i;
    const int       cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          bsum, scaleprod, phi, gamma, sigma2, oldscale, factor, nkd = 3.0 * cnum * vlen;
//    double          var = cdsA->stats->var;

    CalcCovariances(cdsA);
    CalcWts(cdsA);

    scaleprod = 0.0;
    if (cdsA->algo->leastsquares == 1)
    {
        sigma2 = 0.0;
        for (i = 0; i < vlen; ++i)
            sigma2 += cdsA->var[i];
        sigma2 /= (double) vlen;

        bsum = 1.0;
    
        printf("\nsigma2 = %12.6e \n", sigma2);

        for (i = 0; i < cnum; ++i)
        {
            cdsi = cdsA->coords[i];
            oldscale = cdsi->scale;
            phi = bsum * TrCoordsInnerProd(cdsi, vlen) / (oldscale * oldscale);
            gamma = TrCoordsInnerProd2(cdsi, avecds, vlen) / oldscale;
            cdsi->scale = (sqrt(gamma*gamma + 12.0 * vlen * sigma2 * phi) + gamma) / (2.0 * phi);
            scaleprod += log(cdsi->scale);
            factor = cdsi->scale / oldscale;
            ScaleCoords(cdsi, factor);
            printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
        }

        /* This is to verify that our implicit constraint is actually in effect. */
        bsum = 0.0;
        for (i = 0; i < cnum; ++i)
            bsum += log(cdsA->coords[i]->scale);

        printf("\nblogsum = %12.6e", bsum);

        bsum = 0.0;
        for (i = 0; i < cnum; ++i)
            bsum += TrCoordsInnerProd(cds[i], vlen) - TrCoordsInnerProd2(cds[i], avecds, vlen);

        printf("\nbsum = %12.6e %12.6e % 12.6e", bsum/sigma2, nkd, bsum/sigma2 - nkd);
        //bsum = (bsum / (3.0 * cnum * vlen)) + 1.0;

        scaleprod = exp(scaleprod / (double) cnum);
    }
    else if (cdsA->algo->varweight == 1)
    {
        for (i = 0; i < vlen; ++i)
            wts[i] = 1.0 / cdsA->var[i];

        double constraint = 0.0;

        for (i = 0; i < vlen; ++i)
            constraint += wts[i] * (avecds->x[i] * avecds->x[i] + 
                                    avecds->y[i] * avecds->y[i] + 
                                    avecds->z[i] * avecds->z[i]);

        constraint = constraint / (3.0 * vlen) + 1.0;
        printf("\nconstraint = % 12.6e", constraint);

        for (i = 0; i < cnum; ++i)
        {
            cdsi = cdsA->coords[i];
            oldscale = cdsi->scale;
            phi = constraint * TrCoordsInnerProdWt(cdsi, vlen, wts) / (oldscale * oldscale);
            gamma = TrCoordsInnerProdWt2(cdsi, avecds, vlen, wts) / oldscale;
            cdsi->scale = (sqrt(gamma*gamma + 12.0 * vlen * phi) + gamma) / (2.0 * phi);
            scaleprod += log(cdsi->scale);
            factor = cdsi->scale / oldscale;
            ScaleCoords(cdsi, factor);
            printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
        }

        /* This is to verify that our implicit constraint is actually in effect. */
        bsum = 0.0;
        for (i = 0; i < cnum; ++i)
            bsum += TrCoordsInnerProdWt(cds[i], vlen, wts) - TrCoordsInnerProdWt2(cds[i], avecds, vlen, wts);

        printf("\nbsum = %12.6e %12.6e % 12.6e", bsum, nkd, bsum - nkd);

        double phisum = 0.0;
        for (i = 0; i < cnum; ++i)
            phisum += TrCoordsInnerProdWt(cds[i], vlen, wts);

        double gammasum = 0.0;
        for (i = 0; i < cnum; ++i)
            gammasum +=TrCoordsInnerProdWt2(cds[i], avecds, vlen, wts);

        printf("\nphisum, gammasum: % 12.6e % 12.6e % 12.6e", phisum, gammasum, 3.0*vlen);

        scaleprod = exp(scaleprod / (double) cnum);

/*         CalcRotations(cdsA); */
/*         for (i = 0; i < cnum; ++i) */
/*             RotateCoordsIp(cds[i], (const double **) cds[i]->matrix); */
/*         AveCoords(cdsA); */
        CalcCovariances(cdsA);
        //ConstrainCovMat(cdsA);
        CalcWts(cdsA);
    }
    else
    {
        gamma = phi = 1.0;
        for (i = 0; i < cnum; ++i)
            cds[i]->scale = 1.0;
        scaleprod = 1.0;
    }

/*     bsum = 0.0; */
/*     for (i = 0; i < cnum; ++i) */
/*         bsum += cdsA->coords[i]->scale; */
/*  */
/*     for (i = 0; i < cnum; ++i) */
/*         printf("\nscale[%3d]: %12.6f", i+1, 15.5 * 30.0 * cdsA->coords[i]->scale / bsum); */

/*     for (i = 0; i < cnum; ++i) */
/*         cds[i]->scale /= scaleprod; */

    return(scaleprod);
}


static void
evallognormal(const double beta, const double phi, const double gamma, const double mu, const int vlen, const double lambda, double *fx, double *dfx)
{
    *fx = phi * beta * beta - gamma * beta + log(beta)/mu - 3.0 * vlen - lambda;
    *dfx = 2.0 * beta * phi - gamma + 1.0 / (mu * beta);
}


static double
NewtRaphScaleLogNorm(const double init, const double phi, const double gamma, const double mu, const int vlen, const double lambda,  const double tol)
{
    int            i;
    double         beta, fx, dfx;

    /* Use Newton-Raphson to find ML estimate of lognormally distributed
       scale factors.

       must find root of:

          F1 =  = 0

       where the first derivative with repect to the lognormal scale
       estimate x (dF1/dx) is:

          F1' = 
    */
    beta = init;
    for (i = 0; i < 200; ++i)
    {
        evallognormal(beta, phi, gamma, mu, vlen, lambda, &fx, &dfx);

        if (fabs(fx) < tol)
            break; /* success */

        beta -= (fx / dfx); /* Newton-Raphson correction */
    }

    if (i == 200)
        beta = init;

    printf("\n init, beta: %10.5f %10.5f", init, beta);
    return(beta);
}


static double
CalcMu(CoordsArray *cdsA)
{
    int             i;
    const int       cnum = cdsA->cnum;
    double          logb, logbsum;

    logbsum = 0.0;
    for (i = 0; i < cnum; ++i)
    {
        logb = log(cdsA->coords[i]->scale);
        logbsum += logb*logb;
    }
    printf("\n logbsum: %10.5f", logbsum);

    return(0.5 * (sqrt(1.0 + 4.0 * logbsum / cnum) - 1.0));
}


static double
CalcTheta(CoordsArray *cdsA)
{
    int             i;
    const int       cnum = cdsA->cnum;
    double          logb, logbsum;

    logbsum = 0.0;
    for (i = 0; i < cnum; ++i)
    {
        logb = log(cdsA->coords[i]->scale);
        logbsum += logb*logb;
    }

    return(logbsum / cnum);
}


double
CalcScaleFactorsMLLogNorm(CoordsArray *cdsA)
{
    Coords        **cds = cdsA->coords;
    Coords         *cdsi = NULL;
    Coords         *avecds = cdsA->avecoords;
    double         *wts = cdsA->w;
    const int       cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, init, mu, theta, lambda, phi, gamma, sigma2, oldscale, factor;
    int             i;
    double          tol = cdsA->algo->precision;
    //double         *variance = cdsA->var;

    scaleprod = 0.0;
    if (cdsA->algo->leastsquares == 1)
    {
        lambda = 0.0;
        for (i = 0; i < cnum; ++i)
            lambda += TrCoordsInnerProd(cds[i], vlen) - TrCoordsInnerProd2(cds[i], avecds, vlen);

        lambda = (lambda - 3.0 * vlen * cnum) / cnum;

        sigma2 = 0.0;
        for (i = 0; i < vlen; ++i)
            sigma2 += cdsA->var[i];
        sigma2 /= (double) vlen;

        printf("\nsigma2 = %12.6e \n", sigma2);

        for (i = 0; i < cnum; ++i)
        {
/*             lambda = 0.0; */
/*             for (j = 0; j < cnum; ++j) */
/*                 lambda += TrCoordsInnerProd(cds[j], vlen) - TrCoordsInnerProd2(cds[j], avecds, vlen); */
/*      */
/*             lambda = (lambda - 3.0 * vlen * cnum + cnum) / cnum; */
/*             printf("\nlambda = %12.6e \n", lambda); */
            lambda = 0.0;

            cdsi = cdsA->coords[i];
            oldscale = cdsi->scale;
            phi = TrCoordsInnerProd(cdsi, vlen) / (sigma2 * oldscale * oldscale);
            gamma = TrCoordsInnerProd2(cdsi, avecds, vlen) / (sigma2 * oldscale);
            init = gamma / phi;
            mu = CalcMu(cdsA);
            theta = CalcTheta(cdsA);
            printf("\nmu = %12.6e \n", mu);
            printf("\ntheta = %12.6e \n", theta);
            cdsi->scale = NewtRaphScaleLogNorm(init, phi, gamma, mu, vlen, lambda, tol);
            //cdsi->scale = (sqrt(gamma*gamma + 12.0 * (double) vlen * sigma2 * phi) + gamma) / (2.0 * phi);
            scaleprod += log(cdsi->scale);
            factor = cdsi->scale / oldscale;
            ScaleCoords(cdsi, factor);
            printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
        }

        scaleprod = exp(scaleprod / (double) cnum);
    }
    else if (cdsA->algo->varweight == 1)
    {
/*         int j;         */
/*         lambda = 0.0; */
/*         for (j = 0; j < cnum; ++j) */
/*             lambda += TrCoordsInnerProdWt(cds[j], vlen, wts) - TrCoordsInnerProdWt2(cds[j], avecds, vlen, wts); */
/*  */
/*         lambda = (lambda - 3.0 * vlen * cnum) / cnum; */
/*         printf("\nlambda = %12.6e \n", lambda); */

        lambda = 0.0;

        for (i = 0; i < vlen; ++i)
            wts[i] = 1.0 / cdsA->var[i];

        for (i = 0; i < cnum; ++i)
        {
            cdsi = cdsA->coords[i];
            oldscale = cdsi->scale;
            phi = TrCoordsInnerProdWt(cdsi, vlen, wts) / (oldscale * oldscale);
            gamma = TrCoordsInnerProdWt2(cdsi, avecds, vlen, wts) / oldscale;

            if (cdsA->algo->rounds > 8)
                init = cdsi->scale;
            else
                init = gamma / phi;

            mu = CalcMu(cdsA);
            theta = CalcTheta(cdsA);
            printf("\nmu = %12.6e ", mu);
            printf("\ntheta = %12.6e ", theta);
            cdsi->scale = NewtRaphScaleLogNorm(init, phi, gamma, mu, vlen, lambda, tol);
            //cdsi->scale = (sqrt(gamma*gamma + 12.0 * (double) vlen * phi) + gamma) / (2.0 * phi);
            scaleprod += log(cdsi->scale);
            factor = cdsi->scale / oldscale;
            ScaleCoords(cdsi, factor);
            printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
        }
    }
    else
    {
        phi = gamma = 1.0;
        for (i = 0; i < cnum; ++i)
            cds[i]->scale = 1.0;
        scaleprod = 1.0;
    }

    double bsum = 0.0;
    for (i = 0; i < cnum; ++i)
        bsum += cdsA->coords[i]->scale;

    for (i = 0; i < cnum; ++i)
        printf("\nscale[%3d]: %12.6f", i+1, 15.5 * 30.0 * cdsA->coords[i]->scale / bsum);

    return(scaleprod);
}


/* */
double
CalcScaleFactorsMLLog(CoordsArray *cdsA)
{
    Coords        **cds = cdsA->coords;
    Coords         *avecds = cdsA->avecoords;
    const double   *wts = (const double *) cdsA->w;
    int             i, cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, selfprod, innprod, sigma2, theta;

    theta = 0.0;
    for (i = 0; i < cnum; ++i)
        theta += cds[i]->scale * cds[i]->scale;
    theta /= (double) cnum;

    CalcCovariances(cdsA);
    CalcWts(cdsA);

    sigma2 = 0.0;
    for (i = 0; i < vlen; ++i)
        sigma2 += cdsA->var[i];
    sigma2 /= (double) vlen;

    printf("\nsigma2 = %12.6e \n", sigma2);

    scaleprod = 0.0;
    if (cdsA->algo->leastsquares == 1)
    {
        for (i = 0; i < cnum; ++i)
        {
            innprod = TrCoordsInnerProd2(cds[i], avecds, vlen);
            selfprod = TrCoordsInnerProd(cds[i], vlen);
            cds[i]->scale =
            (sqrt(innprod*innprod + 4.0 * (3.0 * (double) vlen  - 1.0 - log(cds[i]->scale)/theta) * sigma2 * selfprod) + innprod) / (2.0 * selfprod);
            scaleprod += log(cds[i]->scale);
        }

        scaleprod = exp(scaleprod / (double) cnum);
    }
    else if (cdsA->algo->varweight == 1)
    {
        for (i = 0; i < cnum; ++i)
        {
            innprod = TrCoordsInnerProdWt2(cds[i], avecds, vlen, wts);
            selfprod = TrCoordsInnerProdWt(cds[i], vlen, wts);
            cds[i]->scale = (sqrt(innprod*innprod + 4.0 * (3.0 * (double) vlen  - 1.0 - log(cds[i]->scale)/theta) * selfprod) + innprod) / (2.0 * selfprod);
            scaleprod += log(cds[i]->scale);
        }

        scaleprod = exp(scaleprod / (double) cnum);
    }
    else
    {
        innprod = selfprod = 1.0;
        for (i = 0; i < cnum; ++i)
            cds[i]->scale = 1.0;
        scaleprod = 1.0;
    }

    return(scaleprod);
}


/* This is the constrained ML solution, without the scale factor Jacobian in the PDF */
double
CalcScaleFactorsMLConstr(CoordsArray *cdsA)
{
    Coords        **cds = cdsA->coords;
    Coords         *avecds = cdsA->avecoords;
    const double   *wts = (const double *) cdsA->w;
    int             i, cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, selfprod, innprod, sigma2, trsig;
    double         *varu = malloc(vlen * sizeof(double));

    CalcCovariances(cdsA);

    memcpy(varu, cdsA->var, vlen);

    CalcWts(cdsA);

    sigma2 = 0.0;
    for (i = 0; i < vlen; ++i)
        sigma2 += cdsA->var[i];
    sigma2 /= (double) vlen;

    printf("\nsigma2 = %12.6e \n", sigma2);

    scaleprod = 0.0;

    if (cdsA->algo->leastsquares == 1)
    {
        for (i = 0; i < cnum; ++i)
        {
            innprod = TrCoordsInnerProd2(cds[i], avecds, vlen);
            selfprod = TrCoordsInnerProd(cds[i], vlen);
            cds[i]->scale = (sqrt(innprod*innprod + 12.0 * (double) vlen * sigma2 * selfprod) + innprod) / (2.0 * selfprod);
            scaleprod += log(cds[i]->scale);
        }
    }
    else if (cdsA->algo->varweight == 1)
    {
        trsig = 0.0;
        for (i = 0; i < vlen; ++i)
            trsig += varu[i]/cdsA->var[i];

        printf("\ntrsig = %12.6e \n", trsig);

        for (i = 0; i < cnum; ++i)
        {
            innprod = TrCoordsInnerProdWt2(cds[i], avecds, vlen, wts);
            selfprod = TrCoordsInnerProdWt(cds[i], vlen, wts);
            cds[i]->scale = (sqrt(innprod*innprod + 12.0 * trsig * selfprod) + innprod) / (2.0 * selfprod);
            scaleprod += log(cds[i]->scale);
        }
    }
    else
    {
        innprod = selfprod = 1.0;
        cds[i]->scale = 1.0;
        scaleprod += log(cds[i]->scale);
    }

    free(varu);

    return(exp(scaleprod / (double) cnum));
}


/* This is the constrained ML solution, with (or without) the scale factor Jacobian in the PDF */
double
CalcScaleFactorsMLGoodall(CoordsArray *cdsA)
{
    Coords        **cds = cdsA->coords;
    Coords         *avecds = cdsA->avecoords;
    const double   *wts = (const double *) cdsA->w;
    int             i, cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, selfprod, innprod, sigma2;

    CalcCovariances(cdsA);
    CalcWts(cdsA);

    sigma2 = 0.0;
    for (i = 0; i < vlen; ++i)
        sigma2 += cdsA->var[i];
    sigma2 /= (double) vlen;

    printf("\nsigma2 = %12.6e \n", sigma2);

    scaleprod = 0.0;
    for (i = 0; i < cnum; ++i)
    {
        if (cdsA->algo->leastsquares == 1)
        {
            innprod = TrCoordsInnerProd2(cds[i], avecds, vlen);
            selfprod = TrCoordsInnerProd(cds[i], vlen);
        }
        else if (cdsA->algo->varweight == 1)
        {
            innprod = TrCoordsInnerProdWt2(cds[i], avecds, vlen, wts);
            selfprod = TrCoordsInnerProdWt(cds[i], vlen, wts);
        }
        else
        {
            innprod = selfprod = 1.0;
        }

        cds[i]->scale = (sqrt(innprod*innprod + 12.0 * (double) vlen * sigma2 * selfprod) + innprod) / (2.0 * selfprod);
        scaleprod += log(cds[i]->scale);
    }

    return(exp(scaleprod / (double) cnum));
}


/* constrained LS, so that \Prod_i^N scale_i = 1 */
double
CalcScaleFactorsML2(CoordsArray *cdsA)
{
    Coords        **cds = cdsA->coords;
    Coords         *avecds = cdsA->avecoords;
    const double   *wts = (const double *) cdsA->w;
    int             i, cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, selfprod, innprod;


    scaleprod = 1.0;
    for (i = 0; i < cnum; ++i)
    {
        if (cdsA->algo->leastsquares == 1)
        {
            innprod = TrCoordsInnerProd2(cds[i], avecds, vlen);
            selfprod = TrCoordsInnerProd(cds[i], vlen);
        }
        else if (cdsA->algo->varweight == 1)
        {
            innprod = TrCoordsInnerProdWt2(cds[i], avecds, vlen, wts);
            selfprod = TrCoordsInnerProdWt(cds[i], vlen, wts);
        }
        else
        {
            innprod = selfprod = 1.0;
        }

        cds[i]->scale = innprod / selfprod;
        scaleprod *= cds[i]->scale;
    }

    scaleprod = powf(scaleprod, 1.0 / (double) cnum);

    for (i = 0; i < cnum; ++i)
        cds[i]->scale /= scaleprod;

    return(scaleprod);
}


void
ScaleCoordsArray(CoordsArray *cdsA)
{
    Coords        **coords = cdsA->coords;
    int             i;

    for (i = 0; i < cdsA->cnum; ++i)
    {
        ScaleCoords(coords[i], coords[i]->scale);
        printf("Scale[%3d]: %12.6f\n", i, coords[i]->scale);
    }
}


static void
*CalcRot_pth(void *rotdata_ptr)
{
    int             i;
    double          deviation;
    RotData        *rotdata = (RotData *) rotdata_ptr;
    Coords         *coords;

    for (i = rotdata->start; i < rotdata->end; ++i)
    {
        coords = rotdata->coords[i];
        /* note that the avecoords are already multiplied by the weight matrices */
        deviation = ProcLAPACKSVDvan(coords,
                                     rotdata->tcoords,
                                     coords->matrix,
                                     coords->tmpmat3a,
                                     coords->tmpmat3b,
                                     coords->tmpmat3c,
                                     coords->tmpvec3a);

        /* rotate the scratch coords with new rotation matrix */
        RotateCoordsIp(coords, (const double **) coords->matrix);

        /* find global rmsd and average coords (both held in structure) */
        coords->wRMSD_from_mean = sqrt(deviation / (3 * rotdata->vlen));
    }

    pthread_exit((void *) 0);
}


static double
CalcRotations_pth(CoordsArray *cdsA, RotData **rotdata, pthread_t *callThd,
                  pthread_attr_t *attr, const int thrdnum)
{
    Coords        **coords = cdsA->coords;
    const Coords   *avecoords = cdsA->avecoords;
    const double   *wts = (const double *) cdsA->w;
    const double   *axesw = (const double *) cdsA->axesw;
    Coords         *tcoords = cdsA->tcoords;
    double          deviation_sum = 0.0;
    int             i, rc = 0, incr;

    if (cdsA->algo->covweight == 1)
    {
        MatMultCoordsMultMatDiag(tcoords,
                                 (const double **) cdsA->WtMat,
                                 avecoords,
                                 axesw);
    }
    else if (cdsA->algo->varweight == 1 || cdsA->algo->leastsquares == 1)
    {
        MatDiagMultCoordsMultMatDiag(tcoords,
                                     wts,
                                     avecoords,
                                     axesw);
    }

    incr = cdsA->cnum / thrdnum;

    for (i = 0; i < thrdnum - 1; ++i)
    {
        rotdata[i]->coords = coords;
        rotdata[i]->tcoords = tcoords;
        rotdata[i]->start = i * incr;
        rotdata[i]->end = i*incr + incr;
        rotdata[i]->vlen = cdsA->vlen;

        rc = pthread_create(&callThd[i], attr, CalcRot_pth, (void *) rotdata[i]);

        if (rc)
        {
            printf("ERROR811: return code from pthread_create() %d is %d\n", i, rc);
            exit(EXIT_FAILURE);
        }
    }

    rotdata[thrdnum - 1]->coords = coords;
    rotdata[thrdnum - 1]->tcoords = tcoords;
    rotdata[thrdnum - 1]->start = (thrdnum - 1) * incr;
    rotdata[thrdnum - 1]->end = cdsA->cnum;
    rotdata[thrdnum - 1]->vlen = cdsA->vlen;

    rc = pthread_create(&callThd[thrdnum - 1], attr, CalcRot_pth, (void *) rotdata[thrdnum - 1]);

    if (rc)
    {
        printf("ERROR811: return code from pthread_create() %d is %d\n", i, rc);
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < thrdnum; ++i)
    {
        rc = pthread_join(callThd[i], (void **) NULL);

        if (rc)
        {
            printf("ERROR812: return code from pthread_join() %d is %d\n", i, rc);
            exit(EXIT_FAILURE);
        }
    }

    for (i = 0; i < cdsA->cnum; ++i)
        deviation_sum += 3 * cdsA->vlen * coords[i]->wRMSD_from_mean * coords[i]->wRMSD_from_mean;

    /* Correct for the bias of the dim-weighted rotation.
       this step is formally unnecessary, since the rotations are relative,
       but in our case it helps convergence since we require that the superposition
       be aligned with the principal axes of the dimensional covariance matrix */
    if (cdsA->algo->dimweight != 0)
    {
        ProcLAPACKSVDvan(avecoords,
                         tcoords,
                         avecoords->matrix,
                         cdsA->tmpmat3a,
                         cdsA->tmpmat3b,
                         cdsA->tmpmat3c,
                         cdsA->tmpvec3a);

        Mat3TransposeIp(avecoords->matrix);

        for (i = 0; i < cdsA->cnum; ++i)
            Mat3MultIp(coords[i]->matrix, (const double **) avecoords->matrix);
    }

    return(deviation_sum);
}


void
LedoitWolfShrinkVar(CoordsArray *cdsA, int method)
{
    switch(method)
    {
        case -1:
        case 0:
            break;

        case 1: /* Preferred parametric */
            CalcLedoitVarNewPar(cdsA);
            break;

        case 2: /* Preferred non-parametric */
            CalcLedoitVarNew(cdsA);
            break;

        case 3:
            CalcLedoitVar1Par(cdsA);
            break;

        case 4:
            CalcLedoitVar1(cdsA);
            break;

        case 5:
            CalcLedoitVar3(cdsA);
            break;

        case 6:
            CalcSteinVar(cdsA);
            break;

        default:
            printf("\n  ERROR:  Bad -L Ledoit-Wolf option \"%d\" \n", method);
            Usage(0);
            exit(EXIT_FAILURE);
            break;
    }
}


void
LedoitWolfShrinkCov(CoordsArray *cdsA, int method)
{
    switch(method)
    {
        case -1:
        case 0:
            break;

        case 1:
            CalcLedoitCovMatPar(cdsA);
            break;

        case 2:
            CalcLedoitCovMat(cdsA);
            break;

        default:
            printf("\n  ERROR:  Bad -L Ledoit-Wolf option \"%d\" \n", method);
            Usage(0);
            exit(EXIT_FAILURE);
            break;
    }
}


void
HierarchVars(CoordsArray *cdsA)
{
    int             i;
    double          mean, mu, lambda, zeta, sigma;

    switch(cdsA->algo->hierarch)
    {
        case 0:
            break;

        /* Assuming a known shape param c, real ML-EM fit */
        case 1:
            if (cdsA->algo->rounds > 4)
                InvGammaEMFixedCFitEvals(cdsA, 0.5, 1);
            else
                InvGammaEMFixedCFitEvals(cdsA, 0.5, 0);
            break;

        /* real ML-EM fit, fitting unknown b and c inverse gamma params (scale and shape, resp.) */
        case 2:
            if (cdsA->algo->rounds > 4)
                InvGammaMLFitEvals(cdsA, 1);
            else
                InvGammaMLFitEvals(cdsA, 0);
            break;

        case 3:
            InvGammaFitEvalsBfact(cdsA, 1); 
            break;

        case 4: 
            /* This is the old approximate method, used in versions 1.0-1.1  */
            /* inverse gamma fit of variances, excluding the smallest 3 */
            /* This accounts for the fact that the smallest three eigenvalues of the covariance
               matrix are always zero, i.e. the covariance matrix is necessarily of rank
               vlen - 3 (or usually less, with inadequate amounts of data 3N-6). */
            if (cdsA->algo->rounds > 4)
                InvGammaFitEvals(cdsA, 1);
            else
                InvGammaFitEvals(cdsA, 0);
            break;

        case 5: /* inverse gamma fit of variances, excluding the smallest 3 */
            /* This accounts for the fact that the smallest three eigenvalues of the covariance
               matrix are always zero, i.e. the covariance matrix is necessarily of rank
               vlen - 3 (or usually less, with inadequate amounts of data 3N-6). 
               __Bayesian Bernardo reference prior on the scale and shape params__. */
            if (cdsA->algo->rounds > 4)
                InvGammaBayesFitEvals(cdsA, 1);
            else
                InvGammaBayesFitEvals(cdsA, 0);
            break;

        case 6:
            if (cdsA->algo->rounds > 4)
                InvGammaBayesFitEvals3(cdsA, 1);
            else
                InvGammaBayesFitEvals3(cdsA, 0);
            break;

//         case 7:
//             /* InvGammaFitVars_minc(cdsA, 1.0, 1); */
//             if (cdsA->algo->rounds > 4)
//                 InvGammaFitEvalsEq(cdsA, 1);
//             else
//                 InvGammaFitEvalsEq(cdsA, 0);
// 
//             if (cdsA->algo->verbose != 0)
//                 printf("    HierarchVars() chi2:%f\n", cdsA->stats->hierarch_chi2);
//             break;

        case 7:
            if (cdsA->algo->rounds > 4)
                InvGammaMLFixedCFitEvals(cdsA, cdsA->algo->minc, 1);
            else
                InvGammaMLFixedCFitEvals(cdsA, cdsA->algo->minc, 0);
            break;

        case 8:
            if (cdsA->algo->rounds > 4)
                InvGammaMLFixedCFitEvals(cdsA, 0.5, 1);
            else
                InvGammaMLFixedCFitEvals(cdsA, 0.5, 0);
            break;

/*         case 8: */
/*             // InvGammaMMFitVars(cdsA, &b, &c); */
/*             if (cdsA->algo->rounds > 4) */
/*                 InvGammaFitVars_minc(cdsA, cdsA->algo->minc, 1); */
/*             else */
/*                 InvGammaFitVars_minc(cdsA, cdsA->algo->minc, 0); */
/*             break; */

        case 9:
            if (cdsA->algo->rounds > 4)
                InvGammaFitVars_fixed_c(cdsA, cdsA->algo->minc, 1);
            else
                InvGammaFitVars_fixed_c(cdsA, cdsA->algo->minc, 0);
            break;

        case 10:
            InvGammaFitVars_minc(cdsA, cdsA->algo->minc, 0);
            break;

        case 11:
            if (cdsA->algo->rounds > 4)
                InvGammaFitModeEvals(cdsA, 1);
            else
                InvGammaFitModeEvals(cdsA, 0);
            break;

        case 12: /* inverse gamma fit of variances, excluding the smallest 3 */
            /* This accounts for the fact that the smallest three eigenvalues of the covariance
               matrix are always zero, i.e. the covariance matrix is necessarily of rank
               vlen - 3 (or usually less, with inadequate amounts of data 3N-6). 
               No iterations */
            InvGammaFitEvals(cdsA, 0);
            break;

        case 13: /* Bayesian diagonal Wishart prior (proportional to the identity mat) on the
                    variances/covmat, assuming improper reference prior on the precision 
                    hyperparameter */
            WishartFitVar(cdsA, 1);
            break;

        case 14:
            WishartFitVar2(cdsA, 1);
//             if (cdsA->algo->rounds > 4)
//                 InvGammaBayesFitVars_fixed_c(cdsA, cdsA->algo->minc, 1);
//             else
//                 InvGammaBayesFitVars_fixed_c(cdsA, cdsA->algo->minc, 0);
// 
//             if (cdsA->algo->verbose != 0)
//                 printf("    HierarchVars() chi2:%f\n", cdsA->stats->hierarch_chi2);
            break;

        case 15:
            if (cdsA->algo->rounds >= 10)
                WishartAdjustVar(cdsA->var, cdsA->var, cdsA->vlen, cdsA->cnum, cdsA->stats->lsvar);
            break;

        case 16:
            if (cdsA->algo->rounds >= 10)
                ConjBayesAdjustVar(cdsA->var, cdsA->var, cdsA->vlen, cdsA->cnum, cdsA->stats->lsvar);
            break;

        case 17: /* inverse gamma fit of variances, excluding the smallest 3 */
            /* This accounts for the fact that the smallest three eigenvalues of the covariance
               matrix are always zero, i.e. the covariance matrix is necessarily of rank
               vlen - 3 (or usually less, with inadequate amounts of data 3N-6). */
            if (cdsA->algo->rounds > 4)
                InvGammaFitEvalsNoN(cdsA, 1);
            else
                InvGammaFitEvalsNoN(cdsA, 0);
            break;

        case 18:
            WishartAdjustVar(cdsA->var, cdsA->var, cdsA->vlen, cdsA->cnum, cdsA->algo->param[0]);
            break;

        case 19:
            WishartFitVar2(cdsA, 1);
            break;

/*             for (i = 0; i < cdsA->vlen; ++i) */
/*                 cdsA->var[i] = cdsA->CovMat[i][i]; */
/*             cdsA->algo->covweight = 0; */
/*             cdsA->algo->varweight = 1; */
/*             InvGammaFitVars(cdsA, 1); */
/*             cdsA->algo->covweight = 1; */
/*             cdsA->algo->varweight = 0; */
/*             CovMat2CorMat(cdsA->CovMat, cdsA->vlen); */
/*             CorMat2CovMat(cdsA->CovMat, (const double *) cdsA->var, cdsA->vlen); */
/*             break; */

        case 20: /* ML fit of variances to a reciprocal inverse gaussian dist */
            RecipInvGaussFitVars(cdsA, &mu, &lambda);
            RecipInvGaussAdjustVars(cdsA, mu, lambda);
            break;

        case 21: /* ML fit of variances to a lognorml distribution */
            LognormalFitVars(cdsA, &zeta, &sigma);
            LognormalAdjustVars(cdsA, zeta, sigma);             
            break;

        case 22:
            InvgaussFitVars(cdsA, &mean, &lambda);
            InvgaussAdjustVars(cdsA, zeta, sigma);
            break;

        case 30: /* inv gamma fit to eigenvalues of covariance mat, but only weighting by variances */
            cdsA->algo->covweight = 1;
            cdsA->algo->varweight = 0;
            if (cdsA->algo->alignment == 1)
                CalcCovMatOcc(cdsA);
            else
                CalcCovMat(cdsA);
            InvGammaFitEvals(cdsA, 1);
            cdsA->algo->covweight = 0;
            cdsA->algo->varweight = 1;
            for (i = 0; i < cdsA->vlen; ++i)
                cdsA->var[i] = cdsA->CovMat[i][i];
            break;

        case 31: /* inv gamma fit to eigenvalues of covariance mat, but only weighting by variances */
            cdsA->algo->covweight = 1;
            cdsA->algo->varweight = 0;
            if (cdsA->algo->alignment == 1)
                CalcCovMatOcc(cdsA);
            else
                CalcCovMat(cdsA);
            InvGammaFitVars(cdsA, 0); /* no iterations */
            cdsA->algo->covweight = 0;
            cdsA->algo->varweight = 1;
            for (i = 0; i < cdsA->vlen; ++i)
                cdsA->var[i] = cdsA->CovMat[i][i];
            break;

        default:
            printf("\n  ERROR:  Bad -g option \"%d\" \n", cdsA->algo->hierarch);
            Usage(0);
            exit(EXIT_FAILURE);
            break;

        if (cdsA->algo->verbose != 0)
            printf("    HierarchVars() chi2:%f\n", cdsA->stats->hierarch_chi2);
    }

    if (cdsA->algo->lele5 == 1 && cdsA->algo->covweight != 0)
    {
        /* Correct Lele's 5-landmark testset covariance matrix (only two off-diags are non-zero) */
        cdsA->CovMat[0][1] = 0.0;
        cdsA->CovMat[0][2] = 0.0;
        cdsA->CovMat[0][3] = 0.0;
        cdsA->CovMat[0][4] = 0.0;

        cdsA->CovMat[1][0] = 0.0;
        cdsA->CovMat[1][2] = 0.0;
        cdsA->CovMat[1][4] = 0.0;

        cdsA->CovMat[2][0] = 0.0;
        cdsA->CovMat[2][1] = 0.0;
        cdsA->CovMat[2][3] = 0.0;
        cdsA->CovMat[2][4] = 0.0;

        cdsA->CovMat[3][0] = 0.0;
        cdsA->CovMat[3][2] = 0.0;
        cdsA->CovMat[3][4] = 0.0;

        cdsA->CovMat[4][0] = 0.0;
        cdsA->CovMat[4][1] = 0.0;
        cdsA->CovMat[4][2] = 0.0;
        cdsA->CovMat[4][3] = 0.0;
    }
}


int
CheckConvergenceInner(CoordsArray *cdsA, const double precision)
{
    Algorithm      *algo = cdsA->algo;
    int             i;

    if (algo->abort == 1)
        return(1);

    for (i = 0; i < cdsA->cnum; ++i)
    {
        if (TestIdentMat((const double **) cdsA->coords[i]->matrix, 3, precision) == 0)
        /* if (Mat3FrobEq((const double **) cdsA->coords[i]->last_matrix, (const double **) cdsA->coords[i]->matrix, precision) == 0) */
            return(0);
    }

    return(1);
}


int
CheckConvergenceOuter(CoordsArray *cdsA, int round, const double precision)
{
    Algorithm      *algo = cdsA->algo;
    int             i;

    if (round >= algo->iterations)
        return(1);

    if (algo->abort == 1)
        return(1);

/*     else if (algo->alignment == 1 && round < 10) */
/*         return(0); */
    else if (round > 6)
    {
/*         if (Mat3FrobEq((const double **) cdsA->coords[0]->matrix, (const double **) cdsA->coords[0]->last_matrix, algo->precision) == 0) */
/*             return(1); */
/*         else */
/*             return(0); */

        cdsA->stats->precision = 0.0;
        for (i = 0; i < cdsA->cnum; ++i)
            cdsA->stats->precision += FrobDiffNormIdentMat((const double **) cdsA->coords[i]->matrix, 3);
        cdsA->stats->precision /= cdsA->cnum;

        if (cdsA->stats->precision > precision)
            return(0);
        else
            return(1);
    }
    else
        return(0);
}


double
SuperPoseArray2Orig(CoordsArray *cdsA, CoordsArray *targetA, double *sumdev)
{
    int             i, j;
    const int       vlen = cdsA->vlen;
    const int       cnum = cdsA->cnum;
    double         *trans = malloc(3 * sizeof(double));
    double          norm1, norm2, innprod, fpe;
    Coords        **coords = cdsA->coords;
    PDBCoordsArray *pdbA;

    /* Superimpose originals on ML superimposed family, just to clean up
       any floating point problems due to extensive iteration,
       and to get the proper rotations & translations to apply to the original
       PDB coords. */
    pdbA = PDBCoordsArrayInit();
    PDBCoordsArrayAlloc(pdbA, cnum, vlen);

    for (i = 0; i < cnum; ++i)
        CopyCoords2PDB(pdbA->coords[i], coords[i]);

    if (cdsA->algo->verbose == 1)
    {
        char *ca_name = mystrcat(cdsA->algo->rootname, "_MultiPose_CA.pdb");
        WriteModelFile(pdbA, ca_name);
        free(ca_name);
    }

    *sumdev = fpe = 0.0;
    for (i = 0; i < cnum; ++i)
    {
        *sumdev += fabs(SuperPose(targetA->coords[i], coords[i], coords[i]->matrix, trans,
                                 &norm1, &norm2, &innprod));

        fpe += fabs(norm1 - innprod)/innprod;

        for (j = 0; j < 3; ++j)
            coords[i]->center[j] = coords[i]->translation[j] =
            targetA->coords[i]->center[j] = targetA->coords[i]->translation[j] =
            -trans[j];

/*             printf("\n new trans: %f %f %f", */
/*                    coords[i]->center[0], coords[i]->center[1], coords[i]->center[2]); */
    }

    free(trans);
    PDBCoordsArrayDestroy(&pdbA);

    *sumdev = sqrt(*sumdev / (cnum * vlen));

    return(fpe / cnum);
}


double
CalcInnProd(const Coords *coords1, const Coords *coords2)
{
    int             i;
    double          innprod, tmpx, tmpy, tmpz;

    innprod = 0.0;
    for (i = 0; i < coords1->vlen; ++i)
    {
        tmpx = coords1->x[i] * coords2->x[i];
        /* printf("\n% 8.3f %8.3f", coords1->x[i], coords2->x[i]); */
        tmpy = coords1->y[i] * coords2->y[i];
        tmpz = coords1->z[i] * coords2->z[i];
        innprod += (tmpx*tmpx + tmpy*tmpy + tmpz*tmpz);
    }

    return(innprod / coords1->vlen);
}


static void
WriteInstModelFile(char *fext, CoordsArray *cdsA)
{
    int         i;
    PDBCoordsArray *mpA;
    mpA = PDBCoordsArrayInit();
    char       *fext_name = NULL;

    PDBCoordsArrayAlloc(mpA, cdsA->cnum, cdsA->vlen);

    for (i = 0; i < mpA->cnum; ++i)
        CopyCoords2PDB(mpA->coords[i], cdsA->coords[i]);

    fext_name = mystrcat(cdsA->algo->rootname, fext);
    WriteTheseusModelFileNoStats(mpA, cdsA->algo, fext_name);

    free(fext_name);
    PDBCoordsArrayDestroy(&mpA);
}


/* The real thing */
int
MultiPose(CoordsArray *baseA)
{
    /* double         *tmpaxesw = malloc(3 * sizeof(double)); */
    /* FILE           *fp; */
    int             i, round, innerround;
    int             slxn; /* index of random coord to select as first */
    double          frobnorm, sumdev, percent, lastpercent, logL, lastlogL, lastscale;
    double          deviation_sum = 0.0;
    double        **lastmat = MatAlloc(3,3);
    const int       cnum = baseA->cnum;
    const int       vlen = baseA->vlen;
    Algorithm      *algo = NULL;
    Statistics     *stats = NULL;
    Coords        **coords = NULL;
    Coords         *avecoords = NULL;
    CoordsArray    *scratchA = NULL; /* working scratch array, also holds average coords, */
                                     /* rotation matrices, translation and center vectors */

#if defined(__APPLE__)
    double          starttime, endtime;
    double          init, setup = 0.0, innerloop, exitloop;

    starttime = seconds();
#endif

/*     for (i = 0; i < cnum; ++i) */
/*         for (int j = 0; j < vlen; ++j) */
/*             baseA->coords[i]->z[j] = 0.0; */

    /* setup scratchA */
    scratchA = CoordsArrayInit();
    CoordsArrayAlloc(scratchA, cnum, vlen);
    CoordsArraySetup(scratchA);

    baseA->scratchA = scratchA;

    /* duplicate baseA -- copy to scratchA */
    CoordsArrayCopy(scratchA, baseA);

    /* setup local aliases based on scratchA */
    algo = scratchA->algo;
    stats = scratchA->stats;
    coords = scratchA->coords;
    avecoords = scratchA->avecoords;

    if (algo->covweight == 1)
    {
        SetupCovWeighting(scratchA); /* DLT debug */
        SetupCovWeighting(baseA); /* DLT debug */
    }

    memsetd(scratchA->w, 1.0, vlen);
    memsetd(baseA->w, 1.0, vlen);

    stats->hierarch_p1 = 1.0;
    stats->hierarch_p2 = 1.0;

    //algo->constant = 0.001;

#if defined(__APPLE__)
    endtime = seconds();
    init = (double) (endtime - starttime) / 0.001;
    starttime = seconds();
#endif

    /* Initialize the algorithm -- we need a centered mean structure as first guess */
    /* determine a structure to use as the initial mean structure */
    if (algo->embedave != 0 || algo->alignment == 1)
    {
        printf("    Calculating distance matrix for embedding average ... \n");
        fflush(NULL);

        CoordsCopyAll(avecoords, coords[0]);
        DistMatsAlloc(scratchA);

        if (algo->alignment == 1)
            CalcMLDistMatOcc(scratchA);
        else
            CalcMLDistMat(scratchA);

        printf("    Embedding average structure (ML) ... \n");
        fflush(NULL);

        EmbedAveCoords(scratchA);

        for (i = 0; i < vlen; ++i)
            avecoords->resSeq[i] = i+1;

        // DistMatsDestroy(scratchA); // DLT debug FIX

        printf("    Finished embedding \n");
        fflush(NULL);

        if (algo->write_file == 1)
        {
            char *embed_ave_name = mystrcat(algo->rootname, "_embed_ave.pdb");
            WriteAveCoordsFile(scratchA, embed_ave_name);
            free(embed_ave_name);
        }
    }
    else
    {
        slxn = (int) (genrand_real2() * cnum);
        CoordsCopyAll(avecoords, baseA->coords[slxn]);
    }

    if (algo->notrans == 0)
    {
        CenMass(avecoords);
        ApplyCenterIp(avecoords);
    }

    if (algo->seed == 1)
    {
        CalcStats(scratchA);
        round = 10;
    }

    if (algo->bfact > 0)
    {
        for (i = 0; i < cnum; ++i)
            Bfacts2PrVars(scratchA, i);
    }

    //if (algo->alignment == 1)
        CalcDf(scratchA);

    if (algo->scale > 0)
    {
		//double fac, facsum = 0.0;
		for (i = 0; i < cnum; ++i)
		{
			//fac = genrand_real2() * 100.0;
			//facsum += fac;
			//printf("\nfac[%3d]: % 12.6f", i+1, fac);
			//ScaleCoords(scratchA->coords[i], 1.0 / (i+1.0));
			//ScaleCoords(scratchA->coords[i], 1.0 / fac);
			scratchA->coords[i]->scale = 1.0 / (i+1.0);
		}
		//printf("\nfacsum: %12.6f", facsum);
    }

    /* The EM algorithm */
    /* The outer loop:
       (1) First calculates the translations
       (2) Does inner loop -- calc rotations and average till convergence
       (3) Holding the superposition constant, calculates the covariance
           matrices and corresponding weight matrices, looping till 
           convergence when using a dimensional/axial covariance matrix */
    round = 0;
    percent = lastpercent = 0.0;
    logL = lastlogL = lastscale = -DBL_MAX;
    while(1)
    {
/*         if (round % 62 == 0) */
/*              printf("\n    "); */
/*         else */
/*             putchar('.'); */
/*         fflush(NULL); */

        if (algo->nullrun == 1)
            break;

        lastlogL = logL;
        ++round;
        baseA->algo->rounds = algo->rounds = round;

        if (algo->verbose == 1)
        {
            printf("\n\n\nNew Outer Round:%3d ////////////////////////////////////////////////////////////",
                   round);
            fflush(NULL);
        }

        /* Calculate the minimum variance empirically -- this is really just inherent floating point error */
        if (round == 2 && algo->constant < 0.0)
        {
            SuperPoseArray2Orig(scratchA, baseA, &sumdev);
            algo->constant = sumdev * sumdev;
/*             printf("\n    Minimum variance: %8.3e (sigma:%8.3e)", sumdev*sumdev, sumdev); */
/*             fflush(NULL); */
        }

        /* Find weighted center and translate all coords */
        CalcTranslationsIp(scratchA, algo);
        //CalcTranslationsOp(scratchA, baseA, algo);

        for (i = 0; i < cnum; ++i)
            ApplyCenterIp(coords[i]);
            //ApplyCenterOp(coords[i], (const Coords *) baseA->coords[i]);

        Mat3Cpy(lastmat, (const double **) scratchA->AxCovMat);

        /* save the translation vector for each coord in the array */
        for (i = 0; i < cnum; ++i)
            memcpy(coords[i]->translation, coords[i]->center, 3 * sizeof(double));

        /* when superimposing to an alignemnt, initially iterate unweighted LS for a few rounds */
        //if (algo->alignment == 1 && round < 10) /* DLT debug -- I changed this just to find the LS answer first */
        //    memsetd(scratchA->w, 1.0, vlen);

        /* Inner loop:
           (1) Calc rotations given weights/weight matrices
           (2) Rotate coords with new rotations
           (3) Recalculate average

           Loops till convergence, holding constant the weights, variances, and covariances
           (and thus the translations too) */
        innerround = 0;
        do
        {
/*          putchar('*'); */
/*          fflush(NULL); */
            ++innerround;
            algo->innerrounds += innerround;

/* char *tempstr = malloc(512 * sizeof(char)); */
/* sprintf(tempstr, "_mp_%d.pdb", algo->innerrounds); */
/* WriteInstModelFile(tempstr, scratchA); */
/* free(tempstr); */

            if (algo->verbose == 1)
            {
                printf("\n    New Inner Round:%d \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", innerround);
                fflush(NULL);
            }

            /* save the old rotation matrices to test convergence at bottom of loop */
            for (i = 0; i < cnum; ++i)
                MatCpySym(coords[i]->last_matrix, (const double **) coords[i]->matrix, 3);

////////////////////////////////////////////////////////////////////////////////////////////////////
/*             double *mytrans = malloc(3 * sizeof(double)); */
/*              */
/*             mytrans[0] = 20.0; mytrans[1] = 30.0; mytrans[2] = 40.0; */
/*  */
/*             TransCoordsIp(coords[0], mytrans); */
/*             CalcRotations(scratchA); */
/*              */
/*             printf("\nBefore[%d][%d]:", innerround, round); */
/*             Mat3Print(coords[0]->matrix); */
/*              */
/*             mytrans[0] = -20.0; mytrans[1] = -30.0; mytrans[2] = -40.0; */
/*             TransCoordsIp(coords[0], mytrans); */
/*              */
/*             free(mytrans); */

            /* find the optimal rotation matrices */
            if (algo->alignment == 1 /* && (round == 1 || cnum == 2) */)
                deviation_sum = CalcRotationsOcc(scratchA);
            else
                deviation_sum = CalcRotations(scratchA);

/*             printf("\nAfter:"); */
/*             Mat3Print(coords[0]->matrix); */
////////////////////////////////////////////////////////////////////////////////////////////////////

            if (algo->verbose == 1 && innerround == 1)
            {
                frobnorm = 0.0;
                for (i = 0; i < cnum; ++i)
                    /* frobnorm += MatFrobNorm((const double **) coords[i]->last_matrix, (const double **) coords[i]->matrix, 3, 3); */
                    frobnorm += FrobDiffNormIdentMat((const double **) coords[i]->matrix, 3);
                frobnorm /= cnum;

                printf("-----<<<<< %3d Frobenius Norm (Outer): % 8.3e //////////////////////////////\n",
                       round, frobnorm);
                fflush(NULL);
            }

            if (innerround == 1 &&
                CheckConvergenceOuter(scratchA, round, algo->precision) == 1 &&
                MatFrobNorm((const double **) lastmat, (const double **) scratchA->AxCovMat, 3, 3) < algo->precision)
                   goto outsidetheloops;

            if (stats->precision > 0.0)
                percent = 100.0 * log(fabs(stats->precision))/log(algo->precision);
            else
                percent = 0.0;

//            if (percent > lastpercent)
//            {
//                lastpercent = percent;
//                printf("    %5.1f%%\n", percent);
//                /* printf("\n%e\n", stats->precision); */
//                printf("\033[<1>A"); /* moves the cursor up one line */
//                fflush(NULL);
//            }

            /* rotate the scratch coords with new rotation matrix */
            for (i = 0; i < cnum; ++i)
            {
                RotateCoordsIp(coords[i], (const double **) coords[i]->matrix);
                //printf("\n\nCoords %d\n", i);
                //PrintCoords(coords[i]);
            }

            if (algo->scale > 0)
            {
                lastscale = coords[0]->scale;

                double scaleprod;
    
                if (algo->scale == 1)
                    scaleprod = CalcScaleFactorsML(scratchA);
                else if (algo->scale == 2)
                    scaleprod = CalcScaleFactors(scratchA);
                else if (algo->scale == 3)
                    scaleprod = CalcScaleFactorsMLLogNorm(scratchA);
                else
                    scaleprod = 1.0;
    
                printf("\n%5d scaleprod = %12.6f\n", round, scaleprod);
            }

            /* find global rmsd and average coords (both held in structure) */
            if (algo->noave == 0)
            {
                if (algo->alignment == 1)
                {
                    AveCoordsOcc(scratchA);
                    EM_MissingCoords(scratchA);
                    //printf("\n\nAveCoords\n");
                    //PrintCoords(scratchA->avecoords);
                }
                else
                {
                    AveCoords(scratchA);

/* Calculate the ML estimate of a hierarchical mean, where the variance-weighted atoms 
   are normally distributed with mean zero */
/* See pdbUtils.c */
/*                     double oldpsi = 0.0, psi = 0.0; */
/*                     int q; */
/*  */
/*                     q = 0; */
/*                     do */
/*                     { */
/*                         oldpsi = psi; */
/*                         psi = HierAveCoords(scratchA); */
/*                         //printf("\n  psi[%d] = %e", q, psi); */
/*                         q++; */
/*                     } */
/*                     while(fabs(psi - oldpsi) > psi * algo->precision); */
                }
                //PrintCoords(scratchA->avecoords);
            }

            if (algo->mbias == 1)
                UnbiasMean(scratchA);

            stats->wRMSD_from_mean = sqrt(deviation_sum / (3 * vlen * cnum));



            if (algo->verbose == 1)
            {
                frobnorm = 0.0;
                for (i = 0; i < cnum; ++i)
                    frobnorm += FrobDiffNormIdentMat((const double **) coords[i]->matrix, 3);
                frobnorm /= cnum;
                printf("    ----->>>>> %3d Frobenius Norm (Inner %d): % e\n", round, innerround, frobnorm);
                printf("    End Inner Round:%d \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", innerround);
                fflush(NULL);
            }

            if (algo->noinnerloop == 1)
                break;
            else if (innerround > 160)
            {
                putchar(',');
                fflush(NULL);
                break;
            }
        }
        //while(CheckConvergenceInner(scratchA, algo->precision) == 0 &&
          //    fabs(coords[0]->scale - lastscale) > coords[0]->scale * algo->precision);
        while(CheckConvergenceInner(scratchA, algo->precision) == 0);

        if (round < 10)
        {
            if (algo->alignment == 1)
                VarianceCoordsOcc(scratchA);
            else
                VarianceCoords(scratchA);

            stats->lsvar = stats->stddev * stats->stddev;
        }

/* printf("\nvar  = %10.5e",   VarianceCoordsOcc(scratchA)); */
/* printf("\nrmsd = %10.5e\n", CalcPRMSD(scratchA)); */
        if (algo->instfile == 1)
            WriteInstModelFile("_inst.pdb", scratchA);

        /* Weighting by dimensional, axial Xi covariance matrix, here diagonal. */
        /* Holding the superposition constant, calculates the covariance
           matrices and corresponding weight matrices, looping till 
           convergence. */
        CalcCovariances(scratchA);
        
        //if (algo->scale > 0)
          //  ConstrainCovMat(scratchA);

        if (scratchA->algo->lele5 == 1)
        {
            /* Correct Lele's 5-landmark testset covariance matrix (only two off-diags are non-zero) */
            scratchA->CovMat[0][1] = 0.0;
            scratchA->CovMat[0][2] = 0.0;
            scratchA->CovMat[0][3] = 0.0;
            scratchA->CovMat[0][4] = 0.0;

            scratchA->CovMat[1][0] = 0.0;
            scratchA->CovMat[1][2] = 0.0;
            scratchA->CovMat[1][4] = 0.0;

            scratchA->CovMat[2][0] = 0.0;
            scratchA->CovMat[2][1] = 0.0;
            scratchA->CovMat[2][3] = 0.0;
            scratchA->CovMat[2][4] = 0.0;

            scratchA->CovMat[3][0] = 0.0;
            scratchA->CovMat[3][2] = 0.0;
            scratchA->CovMat[3][4] = 0.0;

            scratchA->CovMat[4][0] = 0.0;
            scratchA->CovMat[4][1] = 0.0;
            scratchA->CovMat[4][2] = 0.0;
            scratchA->CovMat[4][3] = 0.0;
        }

        if (CheckZeroVariances(scratchA) == 1)
        {
            algo->varweight = 0;
            algo->covweight = 0;
            algo->leastsquares = 1;
        }

        /* calculate the weights/weight matrices */
        /* and first the hierarchical adjustment */
        CalcWts(scratchA);

        if (algo->printlogL == 1)
        {
            if (algo->leastsquares == 1)
                CalcNormResidualsLS(scratchA);
            else
                CalcNormResiduals(scratchA);
            logL = CalcLogL(scratchA);
            printf("----> %4d logL: % e  % e <----\n", round, logL, logL - lastlogL);
        }

        if (algo->verbose == 1)
        {
            printf("END Outer Round:%3d ////////////////////////////////////////////////////////////\n\n",
                   round);
            fflush(NULL);
        }
    }

    outsidetheloops:

/*     for (i = 0; i < cnum; ++i) */
/*      printf("\ntrans [%3d]: %f %f %f", */
/*             i+1, */
/*             coords[i]->transsum[0], */
/*             coords[i]->transsum[1], */
/*             coords[i]->transsum[2]); */

    if (algo->seed == 1)
        round -= 10;

    if (algo->verbose == 1)
    {
        printf(" AxCovMat:\n");
        write_C_mat((const double **) scratchA->AxCovMat, 3, 8, 0);
        fflush(NULL);
    }

    if (algo->bayes > 0)
    {
        #include "GibbsMet.h"
        printf("    Calculating Gibbs-Metropolis Bayesian superposition ... \n");
        fflush(NULL);
        GibbsMet(scratchA);
    }

#if defined(__APPLE__)
    endtime = seconds();
    innerloop = (double) (endtime - starttime) / 0.001;
    starttime = seconds();
#endif

    printf("    Calculating statistics ... \n");
    fflush(NULL);

    if (algo->instfile == 1)
        WriteInstModelFile("_inst_final.pdb", scratchA);

/*
    fp = fopen("distcor.txt", "w");
    if (scratchA->CovMat == NULL)
        scratchA->CovMat = MatAlloc(vlen, vlen);

    CalcCovMat(scratchA);
    DistMatsAlloc(cdsA);

    CalcMLDistMat(scratchA);

    for (i = 0; i < vlen; ++i)
        for (j = 0; j < i; ++j)
            fprintf(fp, "%6d % 10.3f  % 8.3e\n",
                   i-j,
                   scratchA->Dij_matrix[i][j],
                   scratchA->CovMat[i][j] / sqrt(scratchA->CovMat[i][i] * scratchA->CovMat[j][j]));

    fclose(fp);
*/

/*     if (algo->weight == 200) */
/*         unremlvar(scratchA); */

/* #include "internmat.h" */
/* AveCoords(scratchA); */
/* CalcCovMat(scratchA); */
/* PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "cov.mat"); */
/* for (i = 0; i < vlen; ++i) */
/*  for (j = 0; j < vlen; ++j) */
/*      scratchA->CovMat[i][j] -= internmat[i][j]; */
/* PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "covdiff.mat"); */

/*     CovMat2CorMat(scratchA->CovMat, vlen); */
/*     PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "corr.mat"); */
/*     memcpy(&scratchA->CovMat[0][0], &internmat[0][0], vlen * vlen * sizeof(double)); */
/*     PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "cov_true.mat"); */
/*     CovMat2CorMat(scratchA->CovMat, vlen); */
/*     PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "corr_true.mat"); */

/*     CovMatsDestroy(scratchA); */ /* DLT debug */
/*     CovMatsDestroy(baseA); */

    if (algo->covweight == 1 && (algo->write_file > 0 || algo->info != 0))
    {
        double         *evals = malloc(vlen * sizeof(double));
        double        **evecs = scratchA->tmpmatKK2;
        char           *mp_cov_name = NULL;

        eigenvalsym2((const double **) scratchA->CovMat, evals, evecs, vlen, &scratchA->tmpmatKK1[0][0]);
        /* VecPrint(evals, vlen); */
        mp_cov_name = mystrcat(algo->rootname, "_mp_cov.mat");
        PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, mp_cov_name);
        free(mp_cov_name);
/*         CovMat2CorMat(scratchA->CovMat, vlen); */
/*         PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, mystrcat(algo->rootname, "_cor.mat")); */
        CalcPRMSD(scratchA);
        WriteInstModelFile("_mp.pdb", scratchA);
        free(evals);
    }

    { /* Write out a taxa distance matrix in NEXUS format */
        #include "DistMat.h"

        DISTMAT *distmat;
        double sum;
        int j,k;
        char num[32];
        char *ptr = NULL;
        char *tree_name = NULL;

        distmat = DISTMATalloc(cnum);

        for (i = 0; i < cnum; ++i)
        {
            strcpy(distmat->taxa[i], coords[i]->filename);
            ptr = strrchr(distmat->taxa[i], '.');
            if (ptr != NULL)
                *ptr = '\0';
            sprintf(num, "_%d", i);
            strcat(distmat->taxa[i], num);
        }

        for (i = 0; i < cnum; ++i)
        {
            for (j = 0; j < cnum; ++j)
            {
                sum = 0.0;
                for (k = 0; k < vlen; ++k)
                    sum += SqrCoordsDistMahal2((const Coords *) coords[i], k,
                                               (const Coords *) coords[j], k,
                                               (const double) scratchA->w[k]);

                distmat->dist[i][j] = sqrt(sum);
            }
        }

        tree_name = mystrcat(algo->rootname, "_tree.nxs");
        print_NX_distmat(distmat, tree_name);

        if (tree_name != NULL)
            free(tree_name);

        DISTMATdestroy(&distmat);
    }

    CalcStats(scratchA);
    stats->fperr = SuperPoseArray2Orig(scratchA, baseA, &stats->minvar);

    if (algo->ssm == 1)
    {
        printf("    Calculating SSM ... \n");
        fflush(NULL);

        #include "pdbSSM.h"
        SSM *ssm = SSMInit();
        SSMAlloc(ssm, scratchA);
        //for (i=0; i < 2; ++i)
        SSMCalc(ssm, scratchA);

        printf("    Writing SSM ... \n");
        fflush(NULL);

        WriteSSM(ssm);
        SSMDestroy(&ssm);
    }

    if (baseA->anchorf_name != NULL) /* orient entire family to a user-specified structure */
        SuperPose2Anchor(scratchA, baseA, baseA->anchorf_name);
    else if (algo->princaxes == 1) /* orient the family perpendicular to principal axes of the average coords -- */
        RotPrincAxes(scratchA);    /* makes for nice viewing */

    if (algo->write_file == 1)
    {
        char *transf_name = mystrcat(algo->rootname, "_transf2.txt");
        WriteTransformations(scratchA, transf_name);
        free(transf_name);
    }

    if (algo->olve == 1 && algo->write_file == 1)
    {
        PDBCoordsArray *olveA;
        printf("    Writing Olve's file ... \n");
        fflush(NULL);

        olveA = PDBCoordsArrayInit();
        PDBCoordsArrayAlloc(olveA, cnum, vlen);

        for (i = 0; i < cnum; ++i)
            CopyCoords2PDB(olveA->coords[i], coords[i]);

        char *olve_name = mystrcat(algo->rootname, "_olve.pdb");
        WriteOlveModelFile(olveA, algo, stats, olve_name);
        free(olve_name);
        PDBCoordsArrayDestroy(&olveA);
    }

    CopyStats(baseA, scratchA);

    /* wRMSD_from_mean does not need 2 in denominator, since it is already from the average */
    stats->wRMSD_from_mean = sqrt(deviation_sum / (double) (vlen * cnum));

#if defined(__APPLE__)
    endtime = seconds();
    exitloop = (double) (endtime - starttime) / 0.001;
    if (algo->verbose == 1)
    {
        printf("    init    setup  inner loop  exit loop \n");
        printf(" %7.2f  %7.2f     %7.2f    %7.2f (ms) \n", init, setup, innerloop, exitloop);
        fflush(NULL);
    }
#endif

    CoordsArrayDestroy(&scratchA);
    MatDestroy(&lastmat);

    return(round);
}


int
MultiPose_pth(CoordsArray *baseA)
{
    /* double         *tmpaxesw = malloc(3 * sizeof(double)); */
    /* FILE           *fp; */
    int             i, round, innerround;
    int             slxn; /* index of random coord to select as first */
    double          frobnorm, sumdev, percent, lastpercent;
    double          deviation_sum = 0.0;
    double        **lastmat = MatAlloc(3,3);
    const int       cnum = baseA->cnum;
    const int       vlen = baseA->vlen;
    double         *evals = malloc(3 * sizeof(double));
    Algorithm      *algo = NULL;
    Statistics     *stats = NULL;
    Coords        **coords = NULL;
    Coords         *avecoords = NULL;
    CoordsArray    *scratchA = NULL;

    const int       thrdnum = baseA->algo->threads;
    RotData       **rotdata = malloc(thrdnum * sizeof(RotData *));
    AveData       **avedata = malloc(thrdnum * sizeof(AveData *));
    pthread_t      *callThd = malloc(thrdnum * sizeof(pthread_t));
    pthread_attr_t  attr;


#if defined(__APPLE__)
    double          starttime, endtime;
    double          init, setup = 0.0, innerloop, exitloop;

    starttime = seconds();
#endif

    for (i = 0; i < thrdnum; ++i)
    {
        rotdata[i] = malloc(sizeof(RotData));
        avedata[i] = malloc(sizeof(AveData));
    }

    pthread_attr_init(&attr);
/*     pthread_attr_getstacksize (&attr, &stacksize); */
/*     printf("\nDefault stack size = %d", (int) stacksize); */
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);

/*     if (baseA->algo->weight == 200) */
/*         remlvar(baseA); */

    /* setup scratchA */
    scratchA = CoordsArrayInit();
    CoordsArrayAlloc(scratchA, cnum, vlen);
    CoordsArraySetup(scratchA);

    baseA->scratchA = scratchA;

    /* duplicate baseA -- copy to scratchA */
    CoordsArrayCopy(scratchA, baseA);

    /* setup local aliases based on scratchA */
    algo = scratchA->algo;
    stats = scratchA->stats;
    coords = scratchA->coords;
    avecoords = scratchA->avecoords;

    SetupCovWeighting(scratchA); /* DLT debug */
    SetupCovWeighting(baseA); /* DLT debug */

    stats->hierarch_p1 = 1.0;
    stats->hierarch_p2 = 1.0;

#if defined(__APPLE__)
    endtime = seconds();
    init = (double) (endtime - starttime) / 0.001;
    starttime = seconds();
#endif

    if (algo->embedave != 0 || algo->alignment == 1) // DLTIP
    {
        printf("    Calculating distance matrix for embedding average ... \n");
        fflush(NULL);

        CoordsCopyAll(avecoords, coords[0]);
        DistMatsAlloc(scratchA);

        if (algo->alignment == 1)
            CalcMLDistMatOcc(scratchA);
        else
            CalcMLDistMat(scratchA);

        printf("    Embedding average structure (ML) ... \n");
        fflush(NULL);

        EmbedAveCoords(scratchA);

        for (i = 0; i < vlen; ++i)
            avecoords->resSeq[i] = i+1;

        printf("    Finished embedding \n");
        fflush(NULL);

        if (algo->write_file == 1)
        {
            char *embed_ave_name = mystrcat(algo->rootname, "_embed_ave.pdb");
            WriteAveCoordsFile(scratchA, embed_ave_name);
            free(embed_ave_name);
        }
    }
    else
    {
        slxn = (int) (genrand_real2() * cnum);
        CoordsCopyAll(avecoords, baseA->coords[slxn]);
    }

    if (algo->notrans == 0)
    {
        CenMass(avecoords);
        ApplyCenterIp(avecoords);
    }

    if (algo->seed == 1)
    {
        CalcStats(scratchA);
        round = 10;
    }

    /* The outer loop:
       (1) First calculates the translations
       (2) Does inner loop -- calc rotations and average till convergence
       (3) Holding the superposition constant, calculates the covariance
           matrices and corresponding weight matrices, looping till 
           convergence when using a dimensional/axial covariance matrix 
    */
    percent = lastpercent = 0.0;
    round = 0;
    while(1)
    {
/*         if (round % 62 == 0) */
/*              printf("    \n"); */
/*         else */
/*             putchar('.'); */
/*         fflush(NULL); */

        if (algo->nullrun == 1)
            break;

        ++round;
        baseA->algo->rounds = algo->rounds = round;

        if (algo->verbose == 1)
        {
            printf("\n\n\nNew Outer Round:%3d ////////////////////////////////////////////////////////////",
                   round);
            fflush(NULL);
        }

        /* Calculate the minimum variance empirically */
        if (round == 2 && algo->constant < 0.0)
        {
            SuperPoseArray2Orig(scratchA, baseA, &sumdev);
            algo->constant = sumdev * sumdev;
/*             printf("\n    Minimum variance: %8.3e (sigma:%8.3e)", sumdev*sumdev, sumdev); */
/*             fflush(NULL); */
        }

        /* Find weighted center and translate all coords */
        CalcTranslationsIp(scratchA, algo);
        for (i = 0; i < cnum; ++i)
            ApplyCenterIp(coords[i]);

        Mat3Cpy(lastmat, (const double **) scratchA->AxCovMat);

        /* save the translation vector for each coord in the array */
        for (i = 0; i < cnum; ++i)
            memcpy(coords[i]->translation, coords[i]->center, 3 * sizeof(double));

        /* when superimposing to an alignemnt, initially iterate into unwted LS for a few rounds */
//        if (algo->alignment == 1 && round < 5)
//            memsetd(scratchA->w, 1.0, vlen);

        /* Inner loop:
           (1) Calc rotations given weights/weight matrices
           (2) Rotate coords with new rotations
           (3) Recalculate average

           Loops till convergence, holding constant the weights, variances, and covariances
           (and thus the translations too) */
        innerround = 0;
        do
        {
/*          putchar('*'); */
/*          fflush(NULL); */
            ++innerround;
            algo->innerrounds += innerround;

            if (algo->verbose == 1)
            {
                printf("\n    New Inner Round:%d \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", innerround);
                fflush(NULL);
            }

            /* save the old rotation matrices to test convergence at bottom of loop */
            for (i = 0; i < cnum; ++i)
                MatCpySym(coords[i]->last_matrix, (const double **) coords[i]->matrix, 3);

            /* find the optimal rotation matrices */
            if (algo->alignment == 1 /* && (round == 1 || cnum == 2) */)
                deviation_sum = CalcRotationsOcc(scratchA);
            else
                deviation_sum = CalcRotations_pth(scratchA, rotdata, callThd, &attr, thrdnum);

            if (algo->verbose == 1 && innerround == 1)
            {
                frobnorm = 0.0;
                for (i = 0; i < cnum; ++i)
                    /* frobnorm += MatFrobNorm((const double **) coords[i]->last_matrix, (const double **) coords[i]->matrix, 3, 3); */
                    frobnorm += FrobDiffNormIdentMat((const double **) coords[i]->matrix, 3);
                frobnorm /= cnum;

                printf("-----<<<<< %3d Frobenius Norm (Outer): % 8.3e //////////////////////////////\n",
                       round, frobnorm);
                fflush(NULL);
            }

            if (innerround == 1 &&
                CheckConvergenceOuter(scratchA, round, algo->precision) == 1 &&
                MatFrobNorm((const double **) lastmat, (const double **) scratchA->AxCovMat, 3, 3) < algo->precision)
                   goto outsidetheloops;

            if (stats->precision > 0.0)
                percent = 100.0 * log(fabs(stats->precision))/log(algo->precision);
            else
                percent = 0.0;

            if (percent > lastpercent)
            {
                lastpercent = percent;
                printf("    %5.1f%%\n", percent);
                /* printf("\n%e\n", stats->precision); */
                printf("\033[<1>A"); /* moves the cursor up one line */
                fflush(NULL);
            }

            /* find global rmsd and average coords (both held in structure) */
            if (algo->noave == 0)
            {
                if (algo->alignment == 1)
                {
                    AveCoordsOcc(scratchA);
                    EM_MissingCoords(scratchA);
                    /* PrintCoords(scratchA->avecoords); */
                }
                else
                {
                    AveCoords_pth(scratchA, avedata, callThd, &attr, thrdnum);
                    /* AveCoords(scratchA); */
                }
            }

            if (algo->mbias == 1)
                UnbiasMean(scratchA);

            stats->wRMSD_from_mean = sqrt(deviation_sum / (3 * vlen * cnum));

            if (algo->verbose == 1)
            {
                frobnorm = 0.0;
                for (i = 0; i < cnum; ++i)
                    frobnorm += FrobDiffNormIdentMat((const double **) coords[i]->matrix, 3);
                frobnorm /= cnum;
                printf("    ----->>>>> %3d Frobenius Norm (Inner %d): % e\n", round, innerround, frobnorm);
                printf("    End Inner Round:%d \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", innerround);
                fflush(NULL);
            }

            if (algo->noinnerloop == 1)
                break;
            else if (innerround > 160)
            {
                putchar(',');
                fflush(NULL);
                break;
            }
        }
        while(CheckConvergenceInner(scratchA, algo->precision) == 0);

        /* Weighting by dimensional, axial Xi covariance matrix, here diagonal. */
        /* Holding the superposition constant, calculates the covariance
           matrices and corresponding weight matrices, looping till 
           convergence. */
        CalcCovariances(scratchA);

        /* calculate the weights/weight matrices */
        CalcWts(scratchA);
        /* printf("\n----> logL: %e <----", CalcLogL(scratchA)); */
    }

    outsidetheloops:

    if (algo->seed == 1)
        round -= 10;

    if (algo->verbose == 1)
    {
        printf(" AxCovMat:\n");
        write_C_mat((const double **) scratchA->AxCovMat, 3, 8, 0);
        fflush(NULL);
    }

#if defined(__APPLE__)
    endtime = seconds();
    innerloop = (double) (endtime - starttime) / 0.001;
    starttime = seconds();
#endif

    printf("    Calculating statistics ... \n");
    fflush(NULL);

/*
    fp = fopen("distcor.txt", "w");
    if (scratchA->CovMat == NULL)
        scratchA->CovMat = MatAlloc(vlen, vlen);

    CalcCovMat(scratchA);
    DistMatsAlloc(cdsA);

    CalcMLDistMat(scratchA);

    for (i = 0; i < vlen; ++i)
        for (j = 0; j < i; ++j)
            fprintf(fp, "%6d % 10.3f  % 8.3e\n",
                   i-j,
                   scratchA->Dij_matrix[i][j],
                   scratchA->CovMat[i][j] / sqrt(scratchA->CovMat[i][i] * scratchA->CovMat[j][j]));

    fclose(fp);
*/

/*     if (algo->weight == 200) */
/*         unremlvar(scratchA); */

/* #include "internmat.h" */
/* AveCoords(scratchA); */
/* CalcCovMat(scratchA); */
/* PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "cov.mat"); */
/* for (i = 0; i < vlen; ++i) */
/*  for (j = 0; j < vlen; ++j) */
/*      scratchA->CovMat[i][j] -= internmat[i][j]; */
/* PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "covdiff.mat"); */

/*     CovMat2CorMat(scratchA->CovMat, vlen); */
/*     PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "corr.mat"); */
/*     memcpy(&scratchA->CovMat[0][0], &internmat[0][0], vlen * vlen * sizeof(double)); */
/*     PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "cov_true.mat"); */
/*     CovMat2CorMat(scratchA->CovMat, vlen); */
/*     PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, "corr_true.mat"); */

/*     CovMatsDestroy(scratchA); */ /* DLT debug */
/*     CovMatsDestroy(baseA); */

    CalcStats(scratchA);
    stats->fperr = SuperPoseArray2Orig(scratchA, baseA, &stats->minvar);

    /* orient the family perpendicular to principal axes of the average coords --
       makes for nice viewing */
    if (baseA->anchorf_name != NULL)
        SuperPose2Anchor(scratchA, baseA, baseA->anchorf_name);
    else if (algo->princaxes == 1)
        RotPrincAxes(scratchA);

    if (algo->write_file == 1)
    {
        char *transf_name = mystrcat(algo->rootname, "_transf.txt");
        WriteTransformations(scratchA, transf_name);
        free(transf_name);
    }

    if (algo->olve == 1 && algo->write_file == 1)
    {
        PDBCoordsArray *olveA;
        printf("    Writing Olve's file ... \n");
        fflush(NULL);

        olveA = PDBCoordsArrayInit();
        PDBCoordsArrayAlloc(olveA, cnum, vlen);

        for (i = 0; i < cnum; ++i)
            CopyCoords2PDB(olveA->coords[i], coords[i]);

        /* WriteTheseusModelFile(olveA, algo, stats, "theseus_olve.pdb"); */
        char *olve_name = mystrcat(algo->rootname, "_olve.pdb");
        WriteOlveModelFile(olveA, algo, stats, olve_name);
        free(olve_name);
        PDBCoordsArrayDestroy(&olveA);
    }

    CopyStats(baseA, scratchA);

    /* wRMSD_from_mean does not need 2 in denominator, since it is already from the average */
    stats->wRMSD_from_mean = sqrt(deviation_sum / (double) (vlen * cnum));

    if (algo->write_file == 1)
    {
        char *cov_name = mystrcat(algo->rootname, "_cov.mat");
        char *cor_name = mystrcat(algo->rootname, "_cor.mat");
        CalcCovMat(scratchA);
        PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, cov_name);
        CovMat2CorMat(scratchA->CovMat, vlen);
        PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, cor_name);
        free(cov_name);
        free(cor_name);
    }

    CoordsArrayDestroy(&scratchA);
    free(evals);
    MatDestroy(&lastmat);
    pthread_attr_destroy(&attr);
    for (i = 0; i < thrdnum; ++i)
        free(rotdata[i]);
    for (i = 0; i < thrdnum; ++i)
        free(avedata[i]);
    free(rotdata);
    free(avedata);
    free(callThd);

#if defined(__APPLE__)
    endtime = seconds();
    exitloop = (double) (endtime - starttime) / 0.001;
    if (algo->verbose == 1)
    {
        printf("    init    setup  inner loop  exit loop \n");
        printf(" %7.2f  %7.2f     %7.2f    %7.2f (ms) \n", init, setup, innerloop, exitloop);
        fflush(NULL);
    }
#endif

    return(round);
}


void
RotPrincAxes(CoordsArray *cdsA)
{
    int             i;
    double        **x90z90 = MatAlloc(3,3);
    /* double x90[3][3]    = {{ 1, 0, 0}, { 0, 0, 1}, { 0,-1, 0}}; */
    /* double z90[3][3]    = {{ 0, 1, 0}, {-1, 0, 0}, { 0, 0, 1}}; */
    /* double x90z90[3][3] = {{ 0, 1, 0}, { 0, 0, 1}, { 1, 0, 0}}; */

    /* this orients the least -> most variable axes along x, y, z respectively */
    CalcCoordsPrincAxes(cdsA->avecoords, cdsA->avecoords->matrix);

    memset(&x90z90[0][0], 0, 9 * sizeof(double));
    x90z90[0][1] = x90z90[1][2] = x90z90[2][0] = 1.0;

    /* Rotate the family 90deg along x and then along z.
       This puts the most variable axis horizontal, the second most variable
       axis vertical, and the least variable in/out of screen. */
    Mat3MultIp(cdsA->avecoords->matrix, (const double **) x90z90);

    for (i = 0; i < cdsA->cnum; ++i)
        Mat3MultIp(cdsA->coords[i]->matrix, (const double **) cdsA->avecoords->matrix);

    MatDestroy(&x90z90);
}


/* Jackknife! */
void
SuperJack(CoordsArray *baseA)
{
    int             i, j, k, m;
    int             ticker;
    const double   *wts = (const double *) baseA->w;
    const double    vlen = baseA->vlen, cnum = baseA->cnum;
    Coords         *jkcoords = baseA->jkcoords;
    Coords         *avecoords = baseA->avecoords;
    Coords        **coords = baseA->coords;

    ticker = baseA->algo->jackknife / 70;

    printf("\n I==-===-===-===-===-===-===-===-===-===-===-===-===-===-===-===-===-I \n ");
    MultiPose(baseA);
    CoordsCopy(jkcoords, (const Coords *) avecoords);

    for (i = 0; i < baseA->algo->jackknife; ++i)
    {
        multinomial_eqdev(vlen, 1.0/vlen, avecoords->o, vlen);

        for (j = 0; j < vlen; ++j)
            for (k = 0; k < cnum; ++k)
                coords[k]->o[j] = avecoords->o[j];

        MultiPose(baseA);
        /* CoordsAdd(baseA->jkcoords, baseA->avecoords); */
        /* calc relative rotmat by superimposing avecoords on jkcoords */
        ProcLAPACKSVD(avecoords, jkcoords, jkcoords->matrix, wts, baseA->axesw,
                      baseA->tmpmat3a, baseA->tmpmat3b, baseA->tmpmat3c, baseA->tmpvec3a);

        for (k = 0; k < cnum; ++k)
        {
            Mat3MultIp(coords[k]->matrix, (const double **) jkcoords->matrix);
            RotMatAddIp(coords[k]->jackmat, (const double **) coords[k]->matrix);

            for (m = 0; m < 3; ++m)
                coords[k]->jktranslation[m] += coords[k]->translation[m];
        }

        if (i%ticker == 0)
            putchar('.');

        fflush(NULL);
    }

    for (j = 0; j < vlen; ++j)
        for (k = 0; k < cnum; ++k)
            coords[k]->o[j] = avecoords->o[j] = 1.0;

    NormJKCoords(baseA);
}


/* Calculates weights corresponding to the atomic, row-wise covariance matrix only */
void
CalcWts(CoordsArray *cdsA)
{
    int             i, j;
    Algorithm      *algo = cdsA->algo;
    double         *variance = cdsA->var;
    double         *weight = cdsA->w;
    const int       vlen = cdsA->vlen;

    if (algo->leastsquares != 0)
    {
        for (i = 0; i < vlen; ++i)
            weight[i] = 1.0;

        return;
    }

    if (algo->varweight != 0)
    {
        for (i = 0; i < vlen; ++i)
            if (variance[i] < algo->constant)
                variance[i] = algo->constant;

        HierarchVars(cdsA);

        for (i = 0; i < vlen; ++i)
        {
            if (variance[i] >= DBL_MAX)
                weight[i] = 0.0;
            else if (variance[i] == 0.0)
                weight[i] = 0.0;
            else
                weight[i] =  1.0 / variance[i];
        }

/*         if (algo->alignment == 1) */
/*             cdsA->stats->wtnorm = NormalizeWeightsOcc(weight, cdsA->avecoords->o, vlen); */
/*         else */
        cdsA->stats->wtnorm = NormalizeWeights(weight, vlen);

// for (i=0; i<vlen;++i)
//      printf("\n%d: % e  % e", i, variance[i], weight[i]);
    }
    else if (algo->covweight != 0)
    {
        if (cdsA->algo->rounds < 5)
        {
            for (i = 0; i < vlen; ++i)
                for (j = 0; j < i; ++j)
                    cdsA->CovMat[i][j] = cdsA->CovMat[j][i] = 0.0;
        }

        if (algo->hierarch != 0 && algo->rounds > 2)
            HierarchVars(cdsA);

//        LedoitWolfShrinkCov(cdsA, algo->LedoitWolf);

        /* minimum variance boundary condition */
        for (i = 0; i < vlen; ++i)
            if (cdsA->CovMat[i][i] < algo->constant)
                cdsA->CovMat[i][i] = algo->constant;

        /* CovInvWeightLAPACK(cdsA); */
        /* pseudoinv_sym(cdsA->CovMat, cdsA->WtMat, vlen, DBL_MIN); */
        InvSymEigenOp(cdsA->WtMat, (const double **) cdsA->CovMat, vlen, cdsA->tmpvecK, cdsA->tmpmatKK1, DBL_MIN);

        cdsA->stats->wtnorm = NormalizeCovMat(cdsA->WtMat, vlen);

        cdsA->stats->trace_inv_sigma = 0.0;
        for (i = 0; i < vlen; ++i)
            for (j = 0; j < vlen; ++j)
                cdsA->stats->trace_inv_sigma += cdsA->WtMat[i][j];
    }
}


void
CalcWtsFinal(CoordsArray *cdsA)
{
    int             i;
    double         *weight = cdsA->w;
    const double   *variance = (const double *) cdsA->var;

    for (i = 0; i < cdsA->vlen; ++i)
    {
        if (variance[i] >= DBL_MAX)
            weight[i] = 0.0;
        else
            weight[i] = 1.0 / variance[i];
    }

    NormalizeWeights(weight, cdsA->vlen);
}


double
SuperPose(Coords *coords1, Coords *coords2, double **rotmat, double *trans,
          double *norm1, double *norm2, double *innprod)
{
    const int       vlen = coords1->vlen;
    double        **tmpmat1 = MatAlloc(3, 3);
    double        **tmpmat2 = MatAlloc(3, 3);
    double        **tmpmat3 = MatAlloc(3, 3);
    double         *tmpvec = malloc(3 * sizeof(double));
    double         *newtrans = malloc(3 * sizeof(double));
    double         *cen1 = calloc(3, sizeof(double));
    double         *cen2 = calloc(3, sizeof(double));
    double          sumdev;
    int             i;

    CenMassOccVec(coords1, cen1);
    CenMassOccVec(coords2, cen2);

    NegTransCoordsIp(coords1, cen1);
    NegTransCoordsIp(coords2, cen2);

    sumdev = ProcLAPACKSVDvanOcc(coords1, coords2, rotmat,
                                 tmpmat1, tmpmat2, tmpmat3, tmpvec,
                                 norm1, norm2, innprod);

    if (sumdev > 1)
    {
        printf("  ERROR1111: -> sumdev: % 12.7e % 12.7e \n",
                0.5 * sumdev / vlen, sqrt(fabs(0.5 * sumdev / vlen)) ); 
        printf("  ERROR1111: Please report to dtheobald@brandeis.edu \n");
//        PrintTheseusTag();
//        exit(EXIT_FAILURE);
    }

    TransCoordsIp(coords1, cen1);
    TransCoordsIp(coords2, cen2);

    InvRotVec(newtrans, cen2, rotmat);

/*     printf("\n nt: %f %f %f", */
/*            newtrans[0], newtrans[1], newtrans[2]); */

    for (i = 0; i < 3; ++i)
        trans[i] = newtrans[i] - cen1[i];

    MatDestroy(&tmpmat1);
    MatDestroy(&tmpmat2);
    MatDestroy(&tmpmat3);
    free(tmpvec);
    free(newtrans);
    free(cen1);
    free(cen2);

    return(sumdev);
}
