/*************************************************************
*  This file is part of the Surface Evolver source code.     *
*  Programmer:  Ken Brakke, brakke@susqu.edu                 *
*************************************************************/

/* matrix.c */

/* matrix routines, mostly from Numerical Recipes */

/* Note: matrix allocation routine assumes whole matrix can be 
    allocated in one call to calloc.  If not true (say for large
    matrices on an IBM PC), dmatrix() and free_matrix() will
    have to be modified */

/* Note: Matrices allocated as row-pointer structures.
    The -1 index is used to store a private copy of 
    pointer to memory, so users may swap row pointers freely.
*/

#include "include.h"

#ifdef BLAS
/* prototypes */
void DPOTRF ARGS((char*,int*,double*,int*,int*));
void DPOTRS ARGS((char*,int*,int*,double*,int*,double*,int*,int*));
void DGEMM ARGS((char*,char*,int*,int*,int*,double*,
           double*,int*,double*,int*,double*,double*,int*));
void DSYMM ARGS((char*,char*,int*,int*,double*,double*,int*,
          double*,int*,double*,double*,int*));
void DGEMV ARGS((char*,int*,int*,double*,double*,int*,double*,int*,double*,  
              double*,int*));
void DSYMV ARGS((char*,int*,double*,double*,int*,double*,int*,double*,  
              double*,int*));
#endif

/*****************************************************************
*
* Function: nerror()
*
* Purpose: Numerical Recipes error reporting
*
*/

void nrerror(error_text)
char *error_text;
{ 
  kb_error(1204,error_text,RECOVERABLE);
}

/******************************************************************
*
* Function: matcopy()
*
* Purpose:  copies matrix b to matrix a 
*           for zero-base indexing only 
*/

void matcopy(a,b,rows,cols)
REAL **a,**b;
int rows,cols;
{
  int i;

  for ( i = 0 ; i < rows ; i++ )
     memcpy((char *)a[i],(char *)b[i],cols*sizeof(REAL));
}

/******************************************************************
*
* Function: kb_dmatrix()
*
* Purpose: Allocates zeroed 2D matrix in pointer-pointer form,
*          a REAL matrix with range [rlo..rhi][clo..chi] 
*/

#ifdef MEMSTRINGS
REAL **kb_dmatrix(rlo,rhi,clo,chi,file,line)
int rlo,rhi,clo,chi;
char *file; int line;
#else
REAL **kb_dmatrix(rlo,rhi,clo,chi)
int rlo,rhi,clo,chi;
#endif
{
  int i;
  REAL **m;

#ifdef MEMSTRINGS
  if ( memdebug)
  { sprintf(msg,"dmatrix from %s  %d.\n",file,line);
     outstring(msg);
  }
#endif
  if ( rhi-rlo+1 == 0 ) return NULL;
  m = (REAL **)mycalloc((unsigned)(rhi-rlo+1+1),sizeof(REAL *));
  if ( !m ) nrerror("dmatrix allocation error.");
  m -= rlo;
  m++; /* room for private pointer */

  m[rlo-1] = (REAL *) mycalloc((unsigned)(chi-clo+1),(rhi-rlo+1)*sizeof(REAL));
  if ( !m[rlo-1] ) nrerror("dmatrix allocation error.");
  for ( i = rlo ; i <= rhi ; i++ )
        m[i] = m[rlo-1] + (i - rlo)*(chi - clo + 1) - clo;

  return m;
}

/******************************************************************
*
* Function: kb_dmatrix3()
*
* Purpose: Allocates a zeroed 3D matrix in pointer-pointer-pointer form,
*  a REAL matrix with range [0..n1-1][0..n2-1][0..n3-1] 
*
*/

#ifdef MEMSTRINGS
REAL ***kb_dmatrix3(n1,n2,n3,file,line)
int n1,n2,n3;
char * file; int line;
#else
REAL ***kb_dmatrix3(n1,n2,n3)
int n1,n2,n3;
#endif
{
  int i,j;
  REAL ***m;

#ifdef MEMSTRINGS
  if ( memdebug )
  { sprintf(msg,"dmatrix3 from %s  %d.\n",file,line);
     outstring(msg);
  }
#endif

  if ( n1 <= 0 ) n1 = 1;
  if ( n2 <= 0 ) n2 = 1;
  /* assumes all pointers same machine size and alignment */
  m = (REAL ***)mycalloc((n2+1)*n1+1,sizeof(REAL **));
  if ( !m ) nrerror("dmatrix3 allocation error.");

  m++; /* room for private pointer to doubles */
  for ( i = 0 ; i < n1 ; i++ )
     m[i] = (REAL **)(m + n1 + i*n2);
  m[0][0] = (REAL *) mycalloc(n1*n2*n3,sizeof(REAL));
  m[-1] = (REAL **)(m[0][0]);
  if ( !m[0][0] ) nrerror("dmatrix3 allocation error.");
  for ( i = 0 ; i < n1 ; i++ )
     for ( j = 0 ; j < n2 ; j++ )
        m[i][j] = m[0][0] + i*n2*n3 + j*n3;

  return m;
}

/*********************************************************************
*
* function: matrix3_reorder()
*
* purpose: Reorder data of a 3D matrix into canonical order.
*          Creates new matrix, and frees old one.
*
* return: 
*/
REAL *** matrix3_reorder(a,maxi,maxj,maxk)
REAL ***a;
int maxi,maxj,maxk; /* dimensions */
{ int i,j,k;
  REAL ***newa;
  newa = kb_dmatrix3(maxi,maxj,maxk);
  for ( i = 0 ; i < maxi; i++ )
    for ( j = 0 ; j < maxj ; j++ )
      for ( k = 0 ; k < maxk ; k++ )
        newa[i][j][k] = a[i][j][k];
  free_matrix3(a);
  return newa;
}

/******************************************************************
*
* Function: kb_dmatrix4()
*
* Purpose: Allocates a zeroed 4D matrix in pointer form,
*  a REAL matrix with range [0..n1-1][0..n2-1][0..n3-1][0..n4-1] 
*/

#ifdef MEMSTRINGS
REAL ****kb_dmatrix4(n1,n2,n3,n4,file,line)
int n1,n2,n3,n4;
char * file; int line;
#else
REAL ****kb_dmatrix4(n1,n2,n3,n4)
int n1,n2,n3,n4;
#endif
{
  int i,j,k;
  REAL ****m;

#ifdef MEMSTRINGS
  if ( memdebug )
  { sprintf(msg,"dmatrix4 from %s  %d.\n",file,line);
     outstring(msg);
  }
#endif

  if ( n1 <= 0 ) n1 = 1;
  if ( n2 <= 0 ) n2 = 1;
  if ( n3 <= 0 ) n3 = 1;

  /* assumes all pointers same machine size and alignment */
  m = (REAL ****)mycalloc(1+n1+n1*n2+n1*n2*n3,sizeof(REAL ***));
  if ( !m ) nrerror("dmatrix4 allocation error.");

  m++; /* room for private pointer */
  for ( i = 0 ; i < n1 ; i++ )
     { m[i] = (REAL ***)(m + n1 + i*n2);
        for ( j = 0 ; j < n2 ; j++ )
          m[i][j] = (REAL **)(m + n1 + n1*n2 + i*n2*n3 + j*n3);
     }
  m[0][0][0] = (REAL *) mycalloc(n1*n2*n3*n4,sizeof(REAL));
  m[-1] = (REAL***)(m[0][0][0]);
  if ( !m[0][0][0] ) nrerror("dmatrix4 allocation error.");
  for ( i = 0 ; i < n1 ; i++ )
     for ( j = 0 ; j < n2 ; j++ )
        for ( k = 0 ; k < n3 ; k++ )
          m[i][j][k] = m[0][0][0] + i*n2*n3*n4 + j*n3*n4 + k*n4;

  return m;
}

/******************************************************************
*
* Function: matNd_setup()
*
* Purpose: routines for initializing matrices declared as local variables 
*          with MAT2D etc macros.  Note it does not zero the entries!
*/

REAL ** mat2d_setup(name,spacename,rows,cols)
REAL **name;
REAL *spacename;
int rows,cols;
{ REAL **spot = name;
  for ( ; rows > 0 ; rows--,spacename += cols,spot++ )
     *spot = spacename;
  return name;
}

REAL *** mat3d_setup(name,spacename,rows,cols,levels)
REAL ***name;
REAL *spacename;
int rows,cols,levels;
{ int i;
  REAL ***spot;
  REAL **row = (REAL **)(name + rows);
  for ( spot = name ; rows > 0 ; rows--,spot++ )
  { *spot = row;
     for ( i = 0 ; i < cols ; i++,spacename += levels, row++ )
        *row = spacename;
  }
  return name;
}

REAL **** mat4d_setup(name,spacename,rows,cols,levels,tiers)
REAL ****name;
REAL *spacename;
int rows,cols,levels,tiers;
{ int i,j;
  REAL ***row = (REAL ***)(name + rows);
  REAL **col = (REAL **)(name + rows + rows*cols);
  REAL ****spot;
  for (spot=name ; rows > 0 ; rows--,spot++ )
  { *spot = row;
     for ( i = 0 ; i < cols ; i++, row++ )
     { *row = col;
        for ( j = 0 ; j < levels ; j++,spacename += tiers, col++ )
          *col = spacename;
     }
  }
  return name;
}
/* end local declaration routines */

/******************************************************************
*
* Function: ivector()
*
* Purpose: allocate integer or real vector with given index range.
*/

int *ivector(lo,hi)
int lo,hi;
/* allocates a int vector with range [lo..hi] */
{
  int *v;

  v = (int *)mycalloc((unsigned)(hi-lo+1),sizeof(int));
  return v-lo;
}

void free_ivector(v,lo,hi)
int *v,lo,hi;
{
  myfree((char *)(v+lo));
}

/******************************************************************
*
* Function: free_matrixN()
*
* Purpose: Deallocate storage allocated by kb_dmatrixN().
*/

void free_matrix(m)
REAL **m;
{
  if ( !m ) return;
  myfree((char *)m[-1]);  /* using private pointer */
  myfree((char *)(m-1));
}

void free_matrix3(m)
REAL ***m;
{
  if ( !m ) return;
  myfree((char *)m[-1]);
  myfree((char *)(m-1));
}

void free_matrix4(m)
REAL ****m;
{
  if ( !m ) return;
  myfree((char *)m[-1]);
  myfree((char *)(m-1));
}

/******************************************************************
*
* Function: vector_add()
*
* Purpose: add vector b to vector a 
*/
void vector_add(a,b,n)
REAL *a,*b;
int n;
{ for(;n!=0;n--) *(a++) += *(b++);
}

/******************************************************************
*
* Function: vector_add_smul()
*
* Purpose: add scalar multiple of vector b to vector a 
*/
void vector_add_smul(a,b,c,n)
REAL *a,*b;
REAL c;
int n;
{ for(;n!=0;n--) *(a++) += c*(*(b++));
}

/******************************************************************
*
* Function: vector_sub()
*
* Purpose: subtract vector b from vector a  
*/

void vector_sub(a,b,n)
REAL *a,*b;
int n;
{ for(;n!=0;n--) *(a++) -= *(b++);
}

/******************************************************************
*
* Function: vnormal()
*
* Purpose: given 3 points, find cross product of sides 
*/
void vnormal(a,b,c,n)
REAL *a,*b,*c,*n;
{
  REAL aa[MAXCOORD],bb[MAXCOORD];
  int i;

  for ( i = 0 ; i < SDIM ; i++ )
  { aa[i] = a[i] - c[i];
    bb[i] = b[i] - c[i];
  }
  cross_prod(aa,bb,n);
}
  
/******************************************************************
*
* Function: cross_product()
*
* Purpose; Find 3D cross product of a and b, return in c
*/
void cross_prod(a,b,c)
REAL *a,*b,*c;
{
  c[0] = a[1]*b[2] - a[2]*b[1];
  c[1] = a[2]*b[0] - a[0]*b[2];
  c[2] = a[0]*b[1] - a[1]*b[0];
} 

/******************************************************************
*
* Function: triple_prod()
*
* Purpose: Find scalar triple product in 3D.
*/
REAL triple_prod(a,b,c)
REAL *a,*b,*c;
{
  return  a[0]*(b[1]*c[2] - b[2]*c[1]) - a[1]*(b[0]*c[2] - b[2]*c[0])
             + a[2]*(b[0]*c[1] - b[1]*c[0]);
}

/******************************************************************
*
* Function: dot(), dotdf(), dotf()
*
* Purpose: dot products of various REALs and floats.
*
*/
/* dot product of REALS */
REAL dot(a,b,n)
REAL *a,*b;
int n;  /* number of items */
{
  REAL x = 0.0;
  for (  ; --n >= 0 ;  ) x += (*(a++))*(*(b++));
  return x;
}

/* dot product for doubles and floats */
REAL dotdf(a,b,n)
REAL *a;
float *b;
int n;  /* number of items */
{
  REAL x = 0.0;
  for (  ; --n >= 0 ;  ) x += (*(a++))*(*(b++));
  return x;
}

/* dot product for floats */
REAL dotf(a,b,n)
float *a,*b;
int n;  /* number of items */
{
  REAL x = 0.0;
  for (  ; --n >= 0 ;  ) x += (*(a++))*(*(b++));
  return x;
}


/******************************************************************
*
* Function: matvec_mul()
*
* Purpose:  matrix times vector multiplication, c = a * b 
*/
void matvec_mul(a,b,c,rows,cols)
REAL **a,*b,*c;
int rows,cols;
{
  int i,j;

  for ( i = 0 ; i < rows ; i++ )
  { c[i] = 0.0;
    for ( j = 0 ; j < cols ; j++ )
      c[i] += a[i][j]*b[j];
  }
}


/******************************************************************
*
* Function: vec_mat_mul()
*
* Purpose: vector times matrix multiplication, c = a * b 
*/
void vec_mat_mul(a,b,c,rows,cols)
REAL *a,**b,*c;
int rows,cols;
{
  int i,j;

  for ( i = 0 ; i < cols ; i++ )
  { c[i] = 0.0;
    for ( j = 0 ; j < rows ; j++ )
      c[i] += a[j]*b[j][i];
  }
}

/******************************************************************
*
* Function: mat_mult()
*
* Purpose: matrix by matrix multiplication,  c = a * b 
*          a is imax x jmax, b is jmax x kmax, c is imax x kmax
*       a, b, and c need not be distinct.
*       Tests for zero entries in first matrix for larger matrices,
*         so if one matrix is sparse, put it first.
*/
void mat_mult(a,b,c,imax,jmax,kmax)
REAL **a,**b,**c;  /* not assumed distinct */
int imax,jmax,kmax;
{ int i,j,k;

  if ( (a == c) || (b == c) )
  { if ( (imax<=MAXCOORD)&&(kmax<=MAXCOORD))
    { MAT2D(temp,MAXCOORD,MAXCOORD);  /* local temp space */
      for ( i = 0 ; i < imax ; i++ )
        for ( k = 0 ; k < kmax ; k++ )
          for ( j = 0, temp[i][k] = 0.0 ; j < jmax ; j++ )
             temp[i][k] += a[i][j]*b[j][k];
      matcopy(c,temp,imax,kmax);
    }
    else /* have to go to the effort to get temp work space */
    { REAL **temp = dmatrix(0,imax-1,0,kmax-1);  /* temporary storage */
      for ( i = 0 ; i < imax ; i++ )
        for ( j = 0 ; j < jmax ; j++ )
        { REAL aa = a[i][j];
          if ( aa==0.0 ) continue;
          for ( k = 0 ; k < kmax ; k++ )
            temp[i][k] += aa*b[j][k];
        }
      matcopy(c,temp,imax,kmax);
      free_matrix(temp);
    }
  }
  else
  { for ( i = 0 ; i < imax ; i++ )
    { for ( k = 0 ; k < kmax ; k++ ) c[i][k] = 0.0;
      for ( j = 0 ; j < jmax ; j++ )
      { REAL aa = a[i][j];
        if ( aa == 0.0 ) continue;
        for ( k = 0 ; k < kmax ; k++ )
          c[i][k] += aa*b[j][k];
      }
    }
  }
}

/******************************************************************
*
* Function: tr_mat_mul()
*
* Purpose: matrix transpose by matrix multiplication 
*          output: c = aT*b 
*          a is imax x jmax, b is imax x kmax, c is jmax x kmax 
*       a, b, and c need not be distinct.
*/
void tr_mat_mul(a,b,c,imax,jmax,kmax)
REAL **a,**b,**c;  /* not assumed distinct */
int imax,jmax,kmax; 
{
  REAL **temp;  /* temporary storage, if needed */
  int i,j,k;

  if ( (a == c) || (b == c) )
  { temp = dmatrix(0,jmax-1,0,kmax-1);  /* temporary storage */
    for ( j = 0 ; j < jmax ; j++ )
      for ( k = 0 ; k < kmax ; k++ )
        for ( i = 0 ; i < imax ; i++ )
          temp[j][k] += a[i][j]*b[i][k];
    matcopy(c,temp,jmax,kmax);
    free_matrix(temp);
  }
  else
  { REAL *s;
    for ( j = 0 ; j < jmax ; j++ )
      for ( k = 0, s = c[j] ; k < kmax ; k++,s++ )
      { *s = 0.0;
         for ( i = 0 ; i < imax ; i++ )
           *s += a[i][j]*b[i][k];
      }
  }
}

/******************************************************************
*
* Function: mat_mul_tr()
*
* Purpose: matrix by matrix transpose multiplication,  c = a * bT 
*       a is imax x jmax, b is kmax x jmax, c is imax x kmax
*       a, b, and c need not be distinct.
*/  
void mat_mul_tr(a,b,c,imax,jmax,kmax)
REAL **a,**b,**c;  /* not assumed distinct */
int imax,jmax,kmax;
{
  REAL **temp;  /* temporary storage, if needed */
  int i,j,k;

  if ( (a == c) || (b == c) )
  { temp = dmatrix(0,imax-1,0,kmax-1);  /* temporary storage */
    for ( i = 0 ; i < imax ; i++ )
      for ( j = 0 ; j < jmax ; j++ )
        for ( k = 0 ; k < kmax ; k++ )
          temp[i][k] += a[i][j]*b[k][j];
    matcopy(c,temp,imax,kmax);
    free_matrix(temp);
  }
  else
  { 
    for ( k = 0 ; k < kmax ; k++ )
    { for ( i = 0 ; i < imax ; i++ ) c[i][k] = 0.0;
      for ( j = 0 ; j < jmax ; j++ ) 
      { REAL bb = b[k][j];
        if ( bb == 0.0 ) continue;
        for ( i = 0 ; i < imax ; i++ )
          c[i][k] += a[i][j]*bb;
      }
    }
  }
}

/******************************************************************
*
* Function: mat_tsquare()
*
* Purpose: matrix times own transpose, b = a*aT
*          a and b must be different. 
*/
void  mat_tsquare(a,b,n,m)
REAL **a; /* original */
REAL **b; /* square  b = a*aT */
int n,m; /* a is nxm, b is nxn */
{
  int i,j;
  if ( a == b )
    kb_error(2141,"mat_tsquare: a and b same (internal error).\n",RECOVERABLE);
  for ( i = 0 ; i < n ; i++ )
    for ( j = 0 ; j <= i ; j++ )
      b[i][j] = b[j][i] = dot(a[i],a[j],m);
}

/******************************************************************
*
* Function: quadratic_form()
*
* Purpose: quadratic form evaluation, a*b*c; only uses lower triangle 
*/
REAL quadratic_form(a,b,c,n)
REAL *a,**b,*c;
int n; /* size */ 
{ int i,j;
  REAL sum = 0.0;
  REAL temp;

  for ( i = 0 ; i < n ; i++ )
  { temp = b[i][0]*c[0];
    for ( j = 1 ; j <= i ; j++ )
      temp += b[i][j]*c[j];
    for (  ; j < n ; j++ )
      temp += b[j][i]*c[j];
    sum += a[i]*temp;
  }

  return sum;
}


/******************************************************************
*
* Function: mat_inv()
*
* Purpose: in-place matrix inverse by gauss-jordan 
* returns -1 for singular matrix, 
*          >= 0 for nonsingular, value is index, i.e. number of neg pivots
*           
*/

#define SWAP(a,b) {REAL temp = (a); (a) = (b); (b) = temp; }
#define SMALL 10

int mat_inv(a,n)
REAL **a;     /* matrix to invert in place */
int n;        /* size of matrix */
{
  int *indxc,*indxr,*ipiv;
  int i,icol=0,irow=0,j,k,l,ll;
  REAL big,dum,pivinv;
  int retval = 1;  /* default return value is success */
  int temp1[SMALL],temp2[SMALL],temp3[SMALL]; /* avoid alloc for small sizes */
  int neg_index = 0;

  if ( n <= SMALL )
  { indxc = temp1; indxr = temp2; ipiv = temp3; }
  else
  { /* large size */
    indxc = ivector(0,n-1);
    indxr = ivector(0,n-1);
    ipiv  = ivector(0,n-1);
  }
  for ( j = 0 ; j < n ; j++ ) ipiv[j] = -1;
  for ( i = 0 ; i < n ; i++ )
  { big = 0.0;
    for ( j = 0 ; j < n ; j++ )
      if ( ipiv[j] != 0 )
         for ( k = 0 ; k < n ; k++ )
         { if ( ipiv[k] == -1 )
           { if ( fabs(a[j][k]) >= big )
             { big = fabs(a[j][k]);
               irow = j;
               icol = k;
             }
           }
           else if ( ipiv[k] > 0 ) { retval = -1; goto mat_inv_exit; }
         }
      ++(ipiv[icol]);

      if ( irow != icol )
         for ( l = 0 ; l < n ; l++ ) SWAP(a[irow][l],a[icol][l])
      indxr[i] = irow;
      indxc[i] = icol;
      if ( a[icol][icol] == 0.0 ) { retval = -1; goto mat_inv_exit; }
      if ( a[icol][icol] < 0.0 ) neg_index++;
      pivinv = 1/a[icol][icol];
      a[icol][icol] = 1.0;
      for ( l = 0 ; l < n ; l++ ) a[icol][l] *= pivinv;
      for ( ll = 0  ; ll < n ; ll++ )
        if ( ll != icol )
        { dum = a[ll][icol];
          a[ll][icol] = 0.0;
          for ( l = 0 ; l < n ; l++ ) a[ll][l] -= a[icol][l]*dum;
        }
  }
  for ( l = n-1 ; l >= 0 ; l-- )
  { if ( indxr[l] != indxc[l] )
       for ( k = 0 ; k < n ; k++ )
          SWAP(a[k][indxr[l]],a[k][indxc[l]])
  }
  retval = neg_index;

mat_inv_exit:
  if ( n > SMALL )
  { free_ivector(ipiv,0,n-1);
    free_ivector(indxr,0,n-1);
    free_ivector(indxc,0,n-1);
  }
  return retval;
}

/*************************************************************************
*
* function: LD_factor()
*
* purpose: factor dense symmetric matrix into LDL^T form.  Touches only
*          lower triangle.  Does not pivot, so should be used only
*          on positive definite matrices.
*/

int LD_factor( H, N )
REAL **H;  /* lower triangular, row by row */
int N;     /* size */
{ int i,j,k;
  int negs = 0;

#ifdef BLAS
  if ( blas_flag )
  { char uplo = 'U';
    int stride = H[1] - H[0];
    int info=0;
    DPOTRF(&uplo,&N,&H[0][0],&stride,&info);
  }
  else
#endif
  for ( i = 0 ; i < N ; i++ )   /* pivot (i,i) */
  { REAL pivot = H[i][i];
    if ( pivot == 0.0 )
    { kb_error(2553,"Trying to factor singular matrix; using 1 as pivot.\n",
       WARNING);
      pivot = 1.0;
    }
    if ( pivot < 0 ) negs++;
    pivot = 1/pivot;
    for ( j = i+1 ; j < N ; j++ )  /* row j */
    { REAL x = H[j][i];
      for ( k = i+1 ; k < j ; k++ ) /* col k */
        H[j][k] -= x*H[k][i];
      H[j][i] = x*pivot;
      H[j][j] -= x*x*pivot;
    }
  }
 return negs;
}

/***********************************************************************
*
* function: LD_solve
*
* purpose: solve for a single rhs using matrix factored by LD_factor.
*/

void LD_solve(LD,B,X,N)
REAL **LD;  /* from LD_factor() */
REAL *B;  /* rhs */
REAL *X;  /* solution */
int N;
{ int i,j;
#ifdef BLAS
  if ( blas_flag )
  { char uplo = 'U';
    int stride = LD[1] - LD[0];
    int info=0;
    int nrhs = 1; /* number of rhs */
    for ( i = 0 ; i < N ; i++ ) X[i] = B[i]; /* since DPOTRS overwrites */
    DPOTRS(&uplo,&N,&nrhs,&LD[0][0],&stride,&X[0],&N,&info);
  }
  else
#endif
  {  for ( i = 0 ; i < N ; i++ )
     { X[i] = B[i];
       for ( j = 0 ; j < i ; j++ )
         X[i] -= LD[i][j]*X[j];
     }
     for ( i = 0 ; i < N ; i++ )
       X[i] /= LD[i][i];
     for ( i = N-1 ; i >= 0 ; i-- )
     { for ( j = i+1 ; j < N ; j++ )
        X[i] -= LD[j][i]*X[j];
     }
  }
}


/******************************************************************
*
* Function: det_adjoint()
*
* Purpose: calculates determinant in place and leaves adjoint transpose 
*/
REAL  det_adjoint(a,n)
REAL **a;     /* matrix to change in place */
int n;        /* size of matrix */
{
  int *indxc,*indxr,*ipiv;
  int i,icol=0,irow=0,j,k,l,ll;
  REAL big,dum,pivinv,piv;
  int temp1[SMALL],temp2[SMALL],temp3[SMALL]; /* avoid alloc for small sizes */
  REAL det = 1.0;  /* will multiply by pivots */

  if ( n <= 0 )
    kb_error(1205,"Internal error: Matrix size not positive.",RECOVERABLE);

  if ( n == 1 ) { det = a[0][0]; a[0][0] = 1.0; return det; }
  if ( n == 2 )
  { REAL temp;
    det = a[0][0]*a[1][1] - a[0][1]*a[1][0];
    temp = a[0][0]; a[0][0] = a[1][1]; a[1][1] = temp;
    a[0][1] = -a[0][1]; a[1][0] = -a[1][0];
    return det;
  }

  if ( n <= SMALL )
  { indxc = temp1; indxr = temp2; ipiv = temp3; }
  else
  { /* large size */
    indxc = ivector(0,n-1);
    indxr = ivector(0,n-1);
    ipiv  = ivector(0,n-1);
  }
  for ( j = 0 ; j < n ; j++ ) ipiv[j] = -1;
  for ( i = 0 ; i < n-1 ; i++ )
  { big = 0.0;
    for ( j = 0 ; j < n ; j++ )
      if ( ipiv[j] != 0 )
         for ( k = 0 ; k < n ; k++ )
         { if ( ipiv[k] == -1 )
           { if ( fabs(a[j][k]) >= big )
             { big = fabs(a[j][k]);
               irow = j;
               icol = k;
             }
           }
           else if ( ipiv[k] > 0 )
           { kb_error(1206,"Internal: ipiv > 0.\n",WARNING); det = 0.0; goto det_exit; }
         }
      ++(ipiv[icol]);

      if ( irow != icol )
      { for ( l = 0 ; l < n ; l++ ) SWAP(a[irow][l],a[icol][l])
        det = -det;
      }
      indxr[i] = irow;
      indxc[i] = icol;
      det *= a[icol][icol];  /* build determinant */
      if ( a[icol][icol] == 0.0 ) { goto det_lowrank; }
      pivinv = 1/a[icol][icol];
      a[icol][icol] = 1.0;
      for ( l = 0 ; l < n ; l++ ) a[icol][l] *= pivinv;
      for ( ll = 0  ; ll < n ; ll++ )
        if ( ll != icol )
        { dum = a[ll][icol];
          a[ll][icol] = 0.0;
          for ( l = 0 ; l < n ; l++ ) a[ll][l] -= a[icol][l]*dum;
        }
  }
  /* special treatment for last pivot; works even if zero */
  for ( j = 0 ; j < n ; j++ )
     if ( ipiv[j] != 0 ) { irow = icol = j; break; }
  indxr[n-1] = irow;
  indxc[n-1] = icol;
  piv = a[icol][icol];
  a[icol][icol] = 1.0;
  for ( l = 0 ; l < n ; l++ ) a[icol][l] *= det;
  for ( ll = 0  ; ll < n ; ll++ )
    if ( ll != icol )
    { dum = a[ll][icol];
      a[ll][icol] = 0.0;
      for ( l = 0 ; l < n ; l++ ) 
         a[ll][l] = a[ll][l]*piv*det - a[icol][l]*dum;
    }
  det *= piv;

  for ( l = n-1 ; l >= 0 ; l-- )
  { if ( indxr[l] != indxc[l] )
      for ( k = 0 ; k < n ; k++ )
        SWAP(a[k][indxr[l]],a[k][indxc[l]])
  }

det_exit:
  if ( n > SMALL )
  { free_ivector(ipiv,0,n-1);
    free_ivector(indxr,0,n-1);
    free_ivector(indxc,0,n-1);
  }
  return det;

det_lowrank: /* rank less than n-1, so adjoint = 0 */
  for ( i = 0 ; i < n ; i++ )
     for ( j = 0 ; j < n ; j++ )
        a[i][j] = 0.0;
  det = 0.0;
  goto det_exit;
  
}


/******************************************************************
*
* Function: determinant()
*
* Purpose: calculates determinant; no change in matrix for 3x3 or smaller 
*           otherwise calls det_adjoint()
*/
REAL  determinant(a,n)
REAL **a;     /* matrix to change in place */
int n;        /* size of matrix */
{
  if ( n == 1 ) { return a[0][0];  }
  if ( n == 2 ) { return  a[0][0]*a[1][1] - a[0][1]*a[1][0]; }
  if ( n == 3 )
     { return a[0][0]*(a[1][1]*a[2][2] - a[1][2]*a[2][1])
              - a[0][1]*(a[1][0]*a[2][2] - a[1][2]*a[2][0])
              + a[0][2]*(a[1][0]*a[2][1] - a[1][1]*a[2][0]);
     }
  return det_adjoint(a,n);  /* other cases */
}

/******************************************************************
*
* Function: print_matrix()
*
*/
void print_matrix(a,rows,cols)
REAL **a;
int rows,cols;
{
  int i,j;

  for ( i = 0 ; i < rows ; i++ )
    { msg[0] = 0;
      for ( j = 0 ; j < cols ; j++ )
        sprintf(msg+strlen(msg),"%10.6f ",(DOUBLE)a[i][j]);
      strcat(msg,"\n");
      outstring(msg);
    }
}

/******************************************************************
*
* Function: exterior_product()
*
* Purpose: conversion of k vectors to a k-vector 
*          components in index lexicographic order 
*/
void exterior_product(v,w,k,n)
REAL **v;  /* list of k vectors */
REAL *w;    /* returned k-vector */
int k;      /* number of vectors */
int n;      /* space dimension */
{
  /* anticipate only small k, so just brute force */
  int i1,i2,i3;

  switch ( k )
    {
      case 1:  for ( i1 = 0 ; i1 < n ; i1++ ) *(w++) = v[0][i1];
               break;

      case 2:  for ( i1 = 0 ; i1 < n ; i1++ )
                 for ( i2 = i1+1 ; i2 < n ; i2++ )
                    *(w++) = v[0][i1]*v[1][i2] - v[0][i2]*v[1][i1];
               break;

      case 3:  for ( i1 = 0 ; i1 < n ; i1++ )
                 for ( i2 = i1+1 ; i2 < n ; i2++ )
                   for ( i3 = i2+1 ; i3 < n ; i3++ )
                      *(w++) = v[0][i1]*v[1][i2]*v[2][i3]
                             + v[0][i2]*v[1][i3]*v[2][i1] 
                             + v[0][i3]*v[1][i1]*v[2][i2] 
                             - v[0][i1]*v[1][i3]*v[2][i2] 
                             - v[0][i3]*v[1][i2]*v[2][i1] 
                             - v[0][i2]*v[1][i1]*v[2][i3] ;
               break;

      default: sprintf(errmsg,"Exterior product of %d vectors.\n",k);
               kb_error(1207,errmsg,RECOVERABLE);

               break;
    }
}

/**********************************************************************
*
*  function: kernel_basis()
*
*  purpose:  Find basis for kernel of matrix (nullspace of rows)
*/

int kernel_basis(a,ker,imax,jmax)
REAL **a;  /* the matrix, will be altered */
REAL **ker; /* for basis vectors in columns */
int imax,jmax;  /* rows and columns of a */
{
  int i,j,k;
  int pivrow[20];    /* pivot row in column */
  int n; /* nullity */

  for ( j = 0 ; j < jmax ; j++ ) pivrow[j] = -1;  /* mark as no pivot in col */

  /* get row echelon form, pivot largest in each row */
  for ( i = 0 ; i < imax ; i++ )
  { int piv = -1;
    REAL b,big,p;

    /* find largest element in row */
    big = 0.0;
    for ( j = 0 ; j < jmax ; j++ )
      if ( fabs(a[i][j]) > big )
      { big = fabs(a[i][j]);
        piv = j;
      }
    if ( piv == -1 ) continue; /* row of zeros */
    pivrow[piv] = i;

    /* pivot step */
    p = a[i][piv];
    for ( j = 0 ; j < jmax ; j++ )
      a[i][j] /= p;
    for ( k = 0 ; k < imax ; k++ )
    { if ( k == i ) continue;
      b = a[k][piv];
      for ( j = 0 ; j < jmax ; j++ )
           a[k][j] -= b*a[i][j];
    }
  }         

  /* now find kernel basis */
  for ( j = 0, n = 0 ; j < jmax ; j++ )
  { if ( pivrow[j] >= 0 ) continue;  /* column has leading 1 */
    /* column j is parameter column */
    for ( k = 0 ; k < jmax ; k++ )
    { if ( pivrow[k] >= 0 )
         ker[k][n] = -a[pivrow[k]][j];
      else if ( k == j )
         ker[k][n] = 1.0;
      else ker[k][n] = 0.0;
    }
    n++;
  }
  return n; /* nullity */
}

/**********************************************************************
*
*  function: kernel_basis_rows()
*
*  purpose:  Find basis for kernel of matrix (nullspace of rows)
*                Returns basis rowwise.
*      basis vectors normalized, but not orthohormal.
*/

int kernel_basis_rows(a,ker,imax,jmax)
REAL **a;  /* the matrix, will be altered */
REAL **ker; /* for basis vectors in rows */
int imax,jmax;  /* rows and columns of a */
{
  int i,j,k;
  int pivrow[20];    /* pivot row in column */
  int pivcol[20];    /* pivot column  in row */
  int n; /* nullity */
  int  detsign=1;  /* to try to keep orientation of normal positive */

  for ( j = 0 ; j < jmax ; j++ ) pivrow[j] = -1;  /* mark as no pivot in col */

  /* get row echelon form, pivot largest in each row */
  for ( i = 0 ; i < imax ; i++ )
  { int piv = -1;
    REAL b,big,p;

    /* find largest element in row */
    big = 0.0;
    for ( j = 0 ; j < jmax ; j++ )
      if ( fabs(a[i][j]) > big )
      { big = fabs(a[i][j]);
        piv = j;
      }
    if ( piv == -1 ) continue; /* row of zeros */
    pivrow[piv] = i; pivcol[i] = piv;

    /* pivot step */
    p = a[i][piv];   if ( p < 0 ) detsign = -detsign;
    for ( j = 0 ; j < jmax ; j++ )
      a[i][j] /= p;
    for ( k = 0 ; k < imax ; k++ )
    { if ( k == i ) continue;
      b = a[k][piv];
      for ( j = 0 ; j < jmax ; j++ )
         a[k][j] -= b*a[i][j];
    }
  }         

  /* now find kernel basis */
  for ( j = 0, n = 0 ; j < jmax ; j++ )
  { int sign;
    if ( pivrow[j] >= 0 ) continue;  /* column has leading 1 */
    /* column j is parameter column */
    pivcol[imax+n] = j;
    /* get sign for pos det */
    for (sign = detsign, k=0 ; k <= imax+n ; k++ )
      for ( i = k+1 ; i <= imax+n ; i++ )
        if ( pivcol[k] > pivcol[i] ) sign = -sign;
    for ( k = 0 ; k < jmax ; k++ )
    { if ( pivrow[k] >= 0 )
        ker[n][k] = -sign*a[pivrow[k]][j];
      else if ( k == j )
        ker[n][k] = sign;
      else ker[n][k] = 0.0;
    }
    n++;
  }
  /* normalize */
  for ( i = 0 ; i < n ; i ++ )
  { REAL mag;
    mag = sqrt(SDIM_dot(ker[i],ker[i]));
    for ( j = 0 ; j < SDIM ; j++ )
      ker[i][j] /= mag;
  } 

  return n; /* nullity */
}


/*********************************************************************
*
* function: matrix_index()
*
* purpose:  return number of negative eigenvalues of matrix
*           Does not destroy original.  For symmetric matrices.
*
*/

int matrix_index(M,n)
REAL **M;  /* square matrix */
int n;  /* size */
{ REAL **a = dmatrix(0,n-1,0,n-1);
  REAL *tempptr;
  int row,col,prow=0;
  REAL maxp;
  int i,j;
  int indx = 0;
  REAL temp;
  REAL *firstrow;  /* for proper freeing after swapping rows */

  firstrow = a[0];
  matcopy(a,M,n,n);


  /* basically, gauss elimination to lower triangular form
     with partial pivoting.  
  */
  for ( col = 0 ; col < n ; col++ )
  { /* find max pivot in diagonal */
    maxp = 0.0;
    for ( row = col ; row < n ; row++ )
       if ( fabs(a[row][row]) > maxp ) { maxp = fabs(a[row][row]); prow = row; }
    if ( maxp == 0.0 ) continue;
    if ( prow != col )
    { /* swap rows and colums to keep symmetric */
      tempptr = a[prow]; a[prow] = a[col]; a[col] = tempptr;
      for ( j = col; j < n ; j++ )
      { temp = a[j][col]; a[j][col] = a[j][prow]; a[j][prow] = temp; }
    }
    if ( a[col][col] < 0.0 ) indx++;
    for ( row = col+1 ; row < n ; row++ )
      for ( i = col+1 ; i < n ; i++ )
      { a[row][i] -= a[row][col]/a[col][col]*a[col][i];
      }
  }
  a[0] = firstrow; free_matrix(a);
  return indx;
}


/****************************************************************************
*
* function: jacobi_eigenpairs()
*
* purpose: find eigenpairs of small dense symmetric matrix by jacobi rotations
*          From Numerical Recipes 11.1.
*
* output: eigenvalues in d[], sorted in descending order, and
*         corresponding eigenvectors in columns of v[][].
*/
void jacobi_eigenpairs(a,n,d,v,work)
REAL **a;  /* input matrix, destroyed */
int n; /* size */
REAL *d; /* for return of eigenvalues */
REAL **v;  /* for return of eigenvectors */
REAL *work;  /* space for 2*n values */
{ REAL sm,h,tresh,dum,g,t,theta,tau,s,c;
  int i,iq,ip,nrot,j;
  REAL *z,*b;

  z = work;
  b = work+n;

  /* initialize v to identity */
  for ( ip = 0 ; ip < n ; ip++ )
  { for ( iq = 0 ; iq < n ; iq++ ) v[ip][iq] = 0.0;
    v[ip][ip] = 1.0;
  }

  for ( ip = 0 ; ip < n ; ip++ )
  { b[ip] = a[ip][ip];
    d[ip] = b[ip];
    z[ip] = 0.0;
  }

  nrot = 0;
  for ( i = 1 ; i < 50 ; i++ )
  { sm = 0.0;
    for ( ip = 0 ; ip < n-1; ip++ )
      for ( iq = ip + 1 ; iq < n ; iq++ )
        sm += fabs(a[ip][iq]);
    if ( sm == 0.0 ) goto jacobi_exit; /* normal exit */

    if ( i < 4 ) tresh = .2*sm/n/n;
    else tresh = 0.0;

    for ( ip = 0 ; ip < n-1 ; ip++ )
     for ( iq = ip+1 ; iq < n ; iq++ )
     { g = 100*fabs(a[ip][iq]);
       dum = fabs(d[ip]);
       if ( (i > 4) && (dum+g == dum) && (fabs(d[iq])+g == fabs(d[iq])) )
         a[ip][iq] = 0.0;
       else if ( fabs(a[ip][iq]) > tresh ) 
       { h = d[iq] - d[ip];
         if ( fabs(h) + g == fabs(h) ) t = a[ip][iq]/h;
         else 
         { theta = .5*h/a[ip][iq];
           t = 1.0/(fabs(theta) + sqrt(1 + theta*theta));
           if ( theta < 0.0 ) t = -t;
         }
         c = 1.0/sqrt(1 + t*t);
         s = t*c;
         tau = s/(1+c);
         h = t*a[ip][iq];
         z[ip] -= h;
         z[iq] += h;
         d[ip] -= h;
         d[iq] += h;
         a[ip][iq] = 0.0;
         for ( j = 0 ; j <= ip-1 ; j++ )
         { g = a[j][ip];
           h = a[j][iq];
           a[j][ip] = g - s*(h+g*tau);
           a[j][iq] = h + s*(g-h*tau);
         }
         for ( j = ip+1 ; j <= iq-1 ; j++ )
         { g = a[ip][j];
           h = a[j][iq];
           a[ip][j] = g - s*(h+g*tau);
           a[j][iq] = h + s*(g-h*tau);
         }
         for ( j = iq+1 ; j < n ; j++ )
         { g = a[ip][j];
           h = a[iq][j];
           a[ip][j] = g - s*(h+g*tau);
           a[iq][j] = h + s*(g-h*tau);
         }
         for ( j = 0 ; j < n ; j++ )
         { g = v[j][ip];
           h = v[j][iq];
           v[j][ip] = g - s*(h+g*tau);
           v[j][iq] = h + s*(g-h*tau);
         }
         nrot++;
      } /* end if */

     } /* end iq */
      /* end ip */

     for ( ip = 0 ; ip < n ; ip++ )
     { b[ip] += z[ip];
       d[ip] = b[ip];
       z[ip] = 0.0;
     }
  } /* end i */

  kb_error(2548,"50 Jacobi iterations should never happen.\n",WARNING);
  { temp_free((char*)z); temp_free((char*)b); return; }

jacobi_exit:
  /* sort eigenpairs in descending order, insertion sort */
  for ( i = 0 ; i < n-1 ; i++ )
  { REAL p;
    int k;
    k = i;
    p = d[i];
    for ( j = i + 1 ; j < n ; j++ )
    { if ( d[j] >= p ) 
      { k = j; p = d[j]; }
    }
    if ( k != i )
    { d[k] = d[i];
      d[i] = p;
      for ( j = 0 ; j < n ; j++ ) 
      { p = v[j][i]; v[j][i] = v[j][k]; v[j][k] = p; }
    }
  }
  return; 
}

/**********************************************************************
*
* function: det_hess(a,h,n)
*
* Purpose: find hessian of determinant as function of entries
*
* Returns  h[i1][j1][i2][j2] as d^2 det(a)/da[i1][j1]/da[i2][j2]
*/

void det_hess(a,h,n)
REAL **a;
REAL ****h;
int n;  /* size */
{ int i1,i2,jj1,j2,k;

  /* copy original matrix into h lots of times */
  for ( i1 = 0 ; i1 < n ; i1++ )
    for ( jj1 = 0 ; jj1 < n ; jj1++ )
     for ( i2 = 0 ; i2 < n ; i2++ )
      for ( j2 = 0 ; j2 < n ; j2++ )
         h[i1][jj1][i2][j2] = a[i2][j2];

  /* replace element row and column with identity stuff */
  for ( i1 = 0 ; i1 < n ; i1++ )
    for ( jj1 = 0 ; jj1 < n ; jj1++ )
    { for ( k = 0 ; k < n ; k++ )
      { h[i1][jj1][i1][k] = 0.0;
         h[i1][jj1][k][jj1] = 0.0;
      }
      h[i1][jj1][i1][jj1] = 1.0;
    }

  /* find adjoints */
  for ( i1 = 0 ; i1 < n ; i1++ )
    for ( jj1 = 0 ; jj1 < n ; jj1++ )
    { det_adjoint(h[i1][jj1],n);
      h[i1][jj1][jj1][i1] = 0.0; /* need fixup */
    }

  /* transpose to get back to hessian */
  for ( i1 = 0 ; i1 < n ; i1++ )
    for ( jj1 = 0 ; jj1 < n ; jj1++ )
     for ( i2 = 1 ; i2 < n ; i2++ )
      for ( j2 = 0 ; j2 < i2 ; j2++ )
      { REAL tmp =  h[i1][jj1][i2][j2];
         h[i1][jj1][i2][j2] = h[i1][jj1][j2][i2];
         h[i1][jj1][j2][i2] = tmp;
      }
}

/**********************************************************************
*
* function: gram_schmidt()
*
* purpose: orthonormalize rows of a matrix
*
* return: number of independent rows
*/

int gram_schmidt(mat,rows,cols)
REAL **mat;
int rows, cols;
{ int i,j,k;
  REAL d;
  for ( i = 0 ; i < rows ; i++ )
  { for ( j = 0 ; j < i ; j++ )
    { REAL c = dot(mat[i],mat[j],cols);
      for ( k = 0 ; k < cols ; k++ ) mat[i][k] -= c*mat[j][k];
    }
    d = dot(mat[i],mat[i],cols);
    if ( d == 0.0 ) 
    { rows--; 
      for ( k = 0 ; k < cols ; k++ ) mat[i][k] = mat[rows][k];
      i--;
    }
    else
    { d = 1/sqrt(d);
      for ( k = 0 ; k < cols ; k++ ) mat[i][k] *= d;
    }
  }
  return rows;
} 

/**************************************************************************
*
* function: mat_inv_sparse()
*
* purpose: invert large sparse symmetric matrix in place, with input in 
*          dense format.
*          Should be just temporary until callers get fully integrated
*          with sparse matrix stuff.
*/

int mat_inv_sparse(a,n)
REAL **a;     /* matrix to invert in place */
int n;        /* size of matrix */
{ struct linsys S;
  int i,j,k;
  REAL *BB,*Y;

  memset(&S,0,sizeof(S));
  S.N = n;
  /* count entries */
  S.IA = (int *)temp_calloc(n+1,sizeof(int));
  for ( i = 0, k = 0 ; i < n ; i++ )
  { S.IA[i] = k + A_OFF;
    for ( j = i ; j < n ; j++ )
      if ( a[i][j] != 0.0 ) k++;
  }
  S.IA[i] = k + A_OFF;

  /* allocate main space and fill */
  S.JA = (int *)temp_calloc(k,sizeof(int));
  S.A  = (REAL *)temp_calloc(k,sizeof(REAL));
  S.P = (int *)temp_calloc(n,sizeof(int));
  S.IP = (int *)temp_calloc(n,sizeof(int));
  for ( i = 0, k = 0 ; i < n ; i++ )
  { 
    for ( j = i ; j < n ; j++ )
      if ( a[i][j] != 0.0 )
      { S.A[k] = a[i][j];
        S.JA[k] = j + A_OFF;
        k++;
      }
  }

#ifdef USEMINDEG
  xmd_factor(&S);  /* use mindeg, since internals known */
#else
  ysmp_factor(&S);
#endif

  /* solve back for inverse matrix */
  BB = (REAL*)temp_calloc(S.N+10,sizeof(REAL));  /* intermediate solutions */
  Y = (REAL*)temp_calloc(S.N+10,sizeof(REAL));  /* intermediate solutions */
  for ( i = 0 ; i < n ; i++ )
  { 
    memset(BB,0,n*sizeof(REAL));
    BB[i] = 1.0;
#ifdef USEMINDEG
/* mindeg is too specific, using vertex structure to optimize for hessian */
    /* solve U^T Y = B */
   int *jp;REAL *e;
    for ( j = 0 ; j < S.N ; j++ )
    { int start,end;
      Y[j] = BB[S.LJA[S.LIJA[j]]];  /* for BK inner permutation */
      if ( S.psize[j] == FIRSTOFPAIR ) start = 2;
      else start = 1;
      end = S.LIA[j+1];
      for ( i=S.LIA[j]+start, e=S.LA+i , jp=S.LJA+S.LIJA[j]+start ;
                   i < end ; i++,e++,jp++ )
        BB[*jp] -= (*e)*Y[j];
    }

    /* solve D V = Y (will use Y to store V) */
    for ( j = 0 ; j < S.N ; j++ )
    { if ( S.psize[j] == ONEBYONE )
          Y[j] /= S.LA[S.LIA[j]];
      else if ( S.psize[j] == ZEROPIVOT ) Y[j] = 0.0;  
      else
      { REAL piv[2][2];
        REAL pinv[2][2];
        REAL det,yy;
        piv[0][0] = S.LA[S.LIA[j]];
        piv[0][1] = piv[1][0] = S.LA[S.LIA[j]+1];
        piv[1][1] = S.LA[S.LIA[j+1]];
        det = piv[0][0]*piv[1][1] - piv[0][1]*piv[1][0];
        pinv[0][0] = piv[1][1]/det;
        pinv[1][0] = pinv[0][1] = -piv[0][1]/det;
        pinv[1][1] = piv[0][0]/det;
        yy = Y[j]*pinv[0][0] + Y[j+1]*pinv[1][0];
        Y[j+1] = Y[j]*pinv[0][1] + Y[j+1]*pinv[1][1];
        Y[j] = yy;
        j++;
      }
    }

    /* solve U X = V */
    for ( j = S.N-1 ; j >= 0 ; j-- )
    { int start,end;
      if ( S.psize[j] == FIRSTOFPAIR ) start = 2;
      else start = 1;
      end = S.LIA[j+1];
      for ( k=S.LIA[j]+start, e=S.LA+k, jp=S.LJA+S.LIJA[j]+start  ;
           k < end ; k++,e++,jp++ )
         Y[j] -= (*e)*BB[*jp];
      BB[S.LJA[S.LIJA[j]]] = Y[j];
    }

    /* unpermute */
    for ( j = 0 ; j < S.N ; j++ )
      a[i][S.P[j]] = BB[j];
/* end of mindeg version */
#else
    ysmp_solve(&S,BB,BB);
    for ( j = 0 ; j < S.N ; j++ )
      a[i][j] = BB[j];
#endif
  }

  temp_free((char*)Y);
  temp_free((char*)BB);
  free_system(&S);
  return 1;
}

/*************************************************************************
  General sparse matrix construction scheme.  Does not assume symmetry.
  Usage: call sp_hash_init() to initialize,
              sp_hash() to add value,
              sp_hash_end() at end to sort and gather into linear system.

  Some benchmarks for using sparse_constraints vs dense constraints on
  tgrid4:               Seconds for 'g'
            Bodies     Sparse     Dense
              16
              64                  0.0141
             256                  0.9281
            1024        0.0953   58.36      memory 13MB (dense)
            4096        0.687
           16384        3.737
           65536       27.3
          262144      182.5      10^9       memory 555MB (sparse) 64GB(dense)
  So there is a five million fold speed-up for the largest sparse
  system my machine can handle.
  No jiggling, so only one pass through volume projection.  With jiggling
  and zero scale, single iteration shows volumes projected back to
  targets with maximum error 5e-7!

  Benchmarking on twointor, likewise no jiggle, m 0, g:
           Bodies        Sparse    Dense
              2          0.0048    0.0025
              4          0.0066    0.0040
             16          0.0181    0.0147
             64          0.0722    0.0713
            256          0.285     1.118
           1024          1.73     59.1        memory 45MB(dense)
           4096         16.3                  memory 116MB(sparse)
           8192        145.5                  memory 250MB(sparse)
  Note twointor.fe has more elements per body (13 v, 36 f) than
  tgrid4.fe (1 v, 1 f) and has a denser constraint matrix, being
  three dimensional instead of two dimensional. 

**************************************************************************/
#define SP_PRIME 99991
#define sp_hash(row,col)  (abs((row)*97+(col)*SP_PRIME))

/********************************************************************
* 
* function: sp_hash_init()
*
* purpose: Initialize hash table.
*/
void sp_hash_init(S,size_estimate)
struct linsys *S;
int size_estimate; /* from sp_hash_end of previous invocation */
{ int i;
 
  S->table_size = size_estimate > 0 ? size_estimate : 
         6*SDIM*web.skel[VERTEX].max_ord;
  S->max_fill = 4*S->table_size/5;  /* leave enough empty for decent hash */
  if ( !hessian_quiet_flag )
  { sprintf(msg,"Sparse init alloc: %d\n",S->table_size);
    outstring(msg);
  }
  S->hashcount = 0;
  if ( S->hashtable ) temp_free((char*)S->hashtable);
  S->hashtable = 
     (struct sp_entry *)temp_calloc(S->table_size,sizeof(struct sp_entry));
  for ( i = 0 ; i < S->table_size ; i++ ) S->hashtable[i].row = HASHEMPTY;
  S->hash_extraprobes = 0;
}

/********************************************************************
* 
* function: sp_hash_expand()
*
* purpose: Expands hash table
*/

void sp_hash_expand(S)
struct linsys *S;
{ struct sp_entry *newtable,*oldtable;
  int i;
  struct sp_entry *e;
  int newsize; 
  int oldsize = S->table_size;

  if ( !S->hashtable ) sp_hash_init(S,0);
  newsize = S->table_size*2; 
  oldtable = S->hashtable;
  newtable = 
     (struct sp_entry *)temp_calloc(newsize,sizeof(struct sp_entry));
  for ( i = 0 ; i < newsize ; i++ ) newtable[i].row = HASHEMPTY;
  S->table_size =  newsize;
  S->max_fill = 4*S->table_size/5;
  S->hashtable = newtable;

  /* reinsert */
  for ( i = 0, e = oldtable ; i < oldsize ; i++,e++ )
     if ( e->row != HASHEMPTY )
     { int spot = sp_hash(e->row,e->col) % S->table_size;
       struct sp_entry *ee;
       for ( ee = S->hashtable + spot; ee->row != HASHEMPTY ; spot++ )
       { if ( spot == S->table_size ) spot = 0;
         ee = S->hashtable + spot;
       }
       *ee = *e;
     }
  temp_free((char*)oldtable);

  if ( !hessian_quiet_flag )
  { sprintf(msg,"Expanded hashtable size: %d.\n",S->table_size);
    outstring(msg);
  }
}

/********************************************************************
* 
* function: sp_hash_search()
*
* purpose: Finds existing entry or allocates entry.
*          Installs key values, and adds hessian value.
*/
void sp_hash_search(S,row,col,value)
struct linsys *S;
int row,col;  /* Note these are in reverse order as from hessian version */
REAL value;  /* value to add */
{
  struct sp_entry *e;
  int spot;

  if ( value == 0.0 ) return;   

  if ( S->hashcount >= S->max_fill ) 
     sp_hash_expand(S);

  /* search hash table */
  spot = sp_hash(row,col) % S->table_size;
  e = S->hashtable + spot;
  while ( e->row != HASHEMPTY )
  { if ( (e->row == row) && (e->col == col) )
     { e->value += value; return; 
     }
     spot++;
     if ( spot >= S->table_size ) spot -= S->table_size;
     e = S->hashtable + spot;
     S->hash_extraprobes++;
  }
  /* if here, then have empty slot and need to insert */
  e->col = col; e->row = row;  S->hashcount++; 
  e->value = value;
}


/********************************************************************
* 
* function: sp_hash_end()
*
* purpose: Puts entries in sparse packed format; allocates own lists.
*          Create sorted list of sparse entries, row major order
*          Does 2-digit radix sort on row,col with count sort on each.
*          Deallocate hessian hash table.
*
* return:  Estimated size of hash table to use next time.
*/
int sp_hash_end(S,rows,cols,index_start)
struct linsys *S;
int rows; /* number of rows */

int cols; /* number of columns */
int index_start; /* for 0 or 1 based indexing */
{ int i;
  struct sp_entry *e;
  int *counts;
  int *starts;
  int *spots; 
  int sum,oldsum;

  S->N = rows;
  S->maxN = rows > cols ? rows : cols;
  S->IA = (int *)temp_calloc(S->maxN+1,sizeof(int));
  counts = (int *)temp_calloc(3*cols+1,sizeof(int));
  starts = counts + cols;
  spots = starts + cols + 1;

  /* first stage: column binning */
  /* count entries in columns */
  for ( i = 0, e = S->hashtable ; i < S->table_size ; i++, e++ )
    if ( e->row != HASHEMPTY ) 
       counts[e->col]++;

  /* get starting points of each column */
  for ( i = 0, sum = 0 ; i < cols  ; i++ )
  { 
    spots[i] = starts[i] = sum;
    sum += counts[i];
  } 
  starts[cols] = sum;
  S->maxA = sum + S->maxN;  /* extra room for later expansion */
  S->JA = (int *)temp_calloc(S->maxA,sizeof(int));
  S->A = (REAL *)temp_calloc(S->maxA,sizeof(REAL));

  /* sort into column bins */
  for ( i = 0, e = S->hashtable ; i < S->table_size ; i++,e++ )
  { 
    struct sp_entry eorig;
    struct sp_entry etmp;
    if ( e->row == HASHEMPTY ) continue;
    if ( i < spots[e->col] ) continue; /* already in place */
    eorig = *e;
    e->row = HASHEMPTY;
    do
    { /* follow chain of replacements */
      int spot;
      spot = spots[eorig.col]++;
      etmp = S->hashtable[spot];
      S->hashtable[spot] = eorig;
      eorig = etmp;
    }
    while ( eorig.row != HASHEMPTY ); /* stop when get back to empty */ 
  }


  /* Second stage: row binning */
  /* count entries in row */

  for ( i = 0, e = S->hashtable ; i < sum ; i++, e++ )
     S->IA[e->row]++;

  /* get starting points of each row */
  for ( i = 0, sum = 0 ; i < rows  ; i++ )
  { oldsum = sum;
    sum += S->IA[i];
    S->IA[i] = oldsum;
  } 
  S->IA[rows] = sum;

  /* sort into row bins */
  for ( i = 0, e = S->hashtable ; i < sum ; i++,e++ )
  { 
    S->JA[S->IA[e->row]] = e->col;
    S->A[S->IA[e->row]] = e->value;
    S->IA[e->row]++;
  }
  /* put back IA */
  for ( i = rows ; i > 0 ; i-- )
    S->IA[i] = S->IA[i-1];
  S->IA[0] = 0;

  if ( index_start )
  { 
    for ( i = 0 ; i < S->IA[rows] ; i++ )
       S->JA[i] += index_start;
    for ( i = 0 ; i <= rows ; i++ )
       S->IA[i] += index_start;
  }

  temp_free((char*)counts);

  if ( !hessian_quiet_flag )
  { sprintf(msg,"Sparse entries: %d  Final hashtable size: %d\n",
        S->hashcount, S->table_size);
    outstring(msg);
    sprintf(msg,"Hash extra probes: %d\n",S->hash_extraprobes);
    outstring(msg);
  }
 
  temp_free((char*)S->hashtable);
  S->hashtable = NULL;

  return S->IA[S->N] + S->IA[S->N]/3;  /* estimate for next time */
}
/********************************************************************
     End sparse hash routines.
********************************************************************/

/*******************************************************************
     Using BLAS and LAPACK to factor large dense symmetric matrix
     for Hessian with lots of constraints.  To enable, -DBLAS in
     Makefile, and link with libblas and liblapack (ATLAS or whatever;
     Intel Math Kernel Library mkl_c.lib for Intel; needs mkl_sys.dll
     also).  Command line toggle blas_enable.  

     Usage: Incoming matrix should be block lower triangular, with
     blocksize BLAS_BLOCKSIZE.  Full blocks on diagonal, although
     only lower triangle has to filled on entry to LD_block_factor().
     Last odd rows also have to be full multiple of BLAS_BLOCKSIZE.
     Call LD_block_factor(H,N) first, then LD_block_solve(H,B,X,N)
     to solve for one right hand side.
********************************************************************/

/* Timings on block factoring, 1200 MHz dual Athlon MP.  Entries are Mflops
                              Matrix size
       block     512  1024  2048  4096  8192
        size   
         16      233   225   204   192   184
         32      340   360   366   371   373
         64      315   362   410   443   459
        128      211   283   340   377   399
        256      131   184   240   288   321
        512       99   122   164   213   254
  For comparison, my LDL in plain C runs at 32 Mflops, 
  and LAPACK DPPTRF at 66 Mflops.  Seems to use both CPUs
  for the DGEMM matrix multiplies, but only one for factoring.
*/

#ifdef BLAS

/* CPU clock timing */
__int32 LD_block_factor_elapsed_time[2];


int blocksize = BLAS_BLOCKSIZE;
static REAL workspace[BLAS_BLOCKSIZE][BLAS_BLOCKSIZE];
static REAL *work[BLAS_BLOCKSIZE];
static REAL workv[BLAS_BLOCKSIZE];
static int ipiv[BLAS_BLOCKSIZE]; /* for Bunch-Kaufman pivot info */

/*************************************************************************
*
* function: LD_block_factor()
*
* Purpose: BLAS and LAPACK used to block factor symmetric dense matrix.
*    Usage: Incoming matrix should be block lower triangular, with
*    blocksize BLAS_BLOCKSIZE.  Full blocks on diagonal, although
*    only lower triangle has to filled on entry to LD_block_factor().
*    Last odd rows also have to be full multiple of BLAS_BLOCKSIZE.
*    H will be overwritten.
* 
* Return value: >= 0  index of matrix
*               < 0   error
*/

/**********************************************************************
*
*  function: kb_DSYTRF()
*
*  purpose:  My replacement for buggy MKL code for DSYTRF
*            Does in-place Bunch-Kaufman factoring of lowet
*            triangular dense matrix (although full storage room).
*
*  returns: index of matrix.
*/

REAL BK_alpha = 0.6404;

void L_swap_columns(H,N,stride,i,r)
REAL *H;
int N;
int stride;
int i,r; /* the columns to swap */
{ int j;
  REAL tmp;
  REAL *ispot;
  REAL *rspot;

  if ( i == r ) return;
  if ( i > r ) { j = i; i = r; r = j; }

  for ( j=0, ispot=H+i*stride, rspot=H+r*stride ; j < i ; j++,ispot++,rspot++ )
  { tmp = *ispot; *ispot = *rspot; *rspot = tmp; }
  tmp = *ispot;
  *ispot = H[r*stride+r];
  H[r*stride+r] = tmp;
  for ( j = i+1, ispot += stride, rspot++ ; j < r ; j++,ispot+=stride,rspot++ )
  { tmp = *ispot; *ispot = *rspot; *rspot = tmp; } 
  for ( j = r+1, ispot+=stride,rspot+=stride; j < N ;
                                   j++,ispot+=stride,rspot+= stride)
  { tmp = *ispot; *ispot = *rspot; *rspot = tmp; } 

}

/************************************************************************
*
* function: kb_DSYTRF()
*
* purpose: Bunch-Kaufman factor dense matrix.
*          Uses lower triangle of full matrix.
*
* return: Index of matrix (number of negative entries on diagonal)
*/

int kb_DSYTRF(N,H,stride,ipiv,info)
int N;    /* matrix size */
REAL *H;  /* start of storage */
int stride;  /* entries between row starts */
int *ipiv;   /* returned pivot info; my own scheme: 
                  1-based indexing
                  abs value is what original column wound up here;
                  pos for 1x1, neg for first(!) of 2x2 pivots.
             */
int *info; /* 0 for success, -1 error, 1 singular */
{ int negs = 0;
  int i,j,k;

  *info = 0;  /* default is success */
  for ( i = 0 ; i < N ; i++ ) ipiv[i] = i+1; /* identity perm */

  for ( i = 0 ; i < N ; i++ )
  { REAL colmax,rcolmax;
    int r; /* row of max value */
    int pivotsize = 1;   
    REAL *ispot,*jspot,*kspot;

    /* first step is pivot selection */
    colmax = 0.0;
    r = -1;
    for ( j = i+1 ; j < N ; j++ )
    { REAL v = fabs(H[j*stride + i]);
      if ( v > colmax ) { colmax = v; r = j; }
    }
    if ( r == -1 ) /* zero column */
    { if ( H[i*stride+i] < 0.0 ) negs++;
      else if ( H[i*stride+i] == 0.0 ) *info = 1;
      goto skippivot;
    }
    if ( fabs(H[i*stride + i]) >= BK_alpha*colmax )
      goto dopivot;
    /* now need max in column r */
    rcolmax = colmax;
    for ( j = i+i ; j <= r ; j++ )
    { REAL v = fabs(H[r*stride+j]);
      if ( v > rcolmax ) rcolmax = v;
    }
    for ( j = r+1 ; j < N ; j++ )
    { REAL v = fabs(H[j*stride+r]);
      if ( v > rcolmax ) rcolmax = v;
    }
    if ( fabs(H[i*stride+i])*rcolmax >= BK_alpha*colmax*colmax )
      goto dopivot;
    if ( fabs(H[r*stride+r]) >= BK_alpha*rcolmax )
    { 
      L_swap_columns(H,N,stride,i,r);
      j = ipiv[i]; ipiv[i] = ipiv[r]; ipiv[r] = j;
    }
    else
    { pivotsize = 2; 
      L_swap_columns(H,N,stride,i+1,r);
      j = ipiv[i+1]; ipiv[i+1] = ipiv[r]; ipiv[r] = j;
    }
  
dopivot:
    if ( pivotsize == 1 )
    { REAL pivot;
      ispot = H+i*stride+i;
      pivot = *ispot;
      if ( pivot == 0.0 ) {*info = 1; goto skippivot;}
      if ( pivot < 0.0 ) negs++;
      pivot = 1/pivot; 
      for ( j = i+1  ; j < N ; j++ )
      { REAL x;
        jspot = H+j*stride+i;
        x = *jspot;
        *jspot *= pivot;
        kspot = ispot+stride;
        for ( k = i+1, jspot++ ; k <= j ; k++, jspot++, kspot+=stride )
          *jspot -= *kspot*x;
      }
    }
    else /* pivotsize 2 */
    { REAL det,p11,p21,p22,tmp;
      ispot = H+i*stride+i;
      p11 = *ispot; ispot += stride;
      p21 = *ispot; 
      p22 = ispot[1];
      det = p11*p22 - p21*p21;
      if ( (det > 0.0) && (p11+p22 < 0.0) ) negs += 2;
      else if ( det < 0.0 ) negs++;
      tmp = p11;
      p11 = p22/det;
      p22 = tmp/det;
      p21 = -p21/det;
      for ( j = i+2; j < N ; j++ )
      { REAL x1,x2;
        jspot = H+j*stride+i;
        x1 = jspot[0]; x2 = jspot[1];
        jspot[0] = p11*x1 + p21*x2;
        jspot[1] = p21*x1 + p22*x2;
        kspot = ispot+stride;
        for ( k = i+2, jspot+=2 ; k <= j ; k++, jspot++, kspot+=stride )
          *jspot -= kspot[0]*x1 + kspot[1]*x2;
      }
      ipiv[i] = -ipiv[i];
      i++;
    }
skippivot: ;
  }
  return negs;
}
/**********************************************************************
*
* function: kb_DSYTRS()
*
* purpose: solve for single rhs using factored matrix.
*/

void kb_DSYTRS(N,LD,stride,ipiv,B,work,info)
int N;
REAL *LD;  /* from kb_DSYTRF() */
int stride;
int *ipiv; /* ipiv[i] is what column wound up in column i, 1-based indexing */
REAL *B;  /* rhs,solution */
REAL *work; /* must be size N */
int *info;
{ int i,j;
  REAL *X = work;
  REAL *ispot,*jspot;

  /* permute */
  for ( i = 0 ; i < N ; i++ ) work[i] = B[abs(ipiv[i])-1];

  /* solve left factor */
  for ( i = 0, ispot = LD ; i < N ; i++, ispot += stride )
  { if ( ipiv[i] > 0 )   /* 1x1 pivot */
    { for ( j = 0, jspot = ispot ; j < i ; j++,jspot++ )
        X[i] -= (*jspot)*X[j];
    }
    else /* 2x2 pivot */
    { for ( j = 0, jspot = ispot ; j < i ; j++,jspot++ )
      { X[i] -= (*jspot)*X[j];
        X[i+1] -= jspot[stride]*X[j];
      }
      i++; ispot += stride;
    }
  }
  for ( i = 0, ispot = LD ; i < N ; i++, ispot+=stride+1 )
  { if ( ipiv[i] > 0 )   /* 1x1 pivot */
     X[i] /= *ispot;
    else /* 2x2 pivot */
    { REAL p11 = *ispot; 
      REAL p21 = ispot[stride];
      REAL p22 = ispot[stride+1];
      REAL det = p11*p22 - p21*p21;
      REAL x1 = X[i], x2 = X[i+1];
      X[i] = (p22*x1 - p21*x2)/det;
      X[i+1] = (-p21*x1 + p11*x2)/det;
      i++; ispot += stride+1;
    }
  }
  for ( i = N-1 ; i >= 0 ; i-- )
  { if ( ipiv[i] > 0 )   /* 1x1 pivot */
      for ( j = i+1, jspot = ispot ; j < N ; j++ )
        X[i] -= LD[j*stride+i]*X[j];
    else /* 2x2 pivot */
    { for ( j = i+2 ; j < N ; j++ )
      { X[i] -= LD[j*stride+i]*X[j];
      }
    }
  }
  /* permute */
  for ( i = 0 ; i < N ; i++ ) 
    B[abs(ipiv[i])-1] = work[i];

}

/************************************************************************
*
* function: kb_DSYTRI()
*
* purpose: convert factored matrix to inverse.
*
*/

void kb_DSYTRI(N,H,stride,ipiv,info)
int N;
REAL *H;  /* from kb_DSYTRF() */
int stride;
int *ipiv; /* ipiv[i] is what column wound up in column i, 1-based indexing */
int *info;
{ int i,j,k;
  REAL *ispot,sum,tmp;

  /* convert L to inverse */
  for ( i = 0 ; i < N ; i++ )
  { int jstart = (ipiv[i] < 0) ? i+2 : i+1;
    for ( j = jstart ; j < N ; j++ )
    { int endk = ((i>0) && (ipiv[i-1] < 0)) ? i-1 : i;
      for ( k = 0 ; k < endk ; k++ )
        H[j*stride+k] -= H[i*stride+k]*H[j*stride+i];
      H[j*stride+i] = -H[j*stride+i];
    }
  }

  /* convert D to inverse */
  for ( i = 0 ; i < N ; i++ )
  { ispot = H + i*stride + i;
    if ( ipiv[i] > 0 )
      *ispot = 1/(*ispot);
    else /* 2x2 pivot */
    { REAL p11 = *ispot; 
      REAL p21 = ispot[stride];
      REAL p22 = ispot[stride+1];
      REAL det = p11*p22 - p21*p21;
      H[i*stride+i] = p22/det;
      H[(i+1)*stride+i] = -p21/det;
      H[(i+1)*stride+i+1] = p11/det;
      i++;
    }
  }

  /* multiply out L^TD into upper triangle */
  for ( i = 0 ; i < N ; i++ )
  { if ( ipiv[i] > 0 )
    { REAL p = H[i*stride+i];
      for ( k = 0 ; k < i ; k++ )
        H[k*stride+i] = p*H[i*stride+k];
    }
    else
    { REAL p11 = H[i*stride+i];
      REAL p21 = H[(i+1)*stride+i];
      REAL p22 = H[(i+1)*stride+i+1];
      for ( k = 0 ; k < i ; k++ )
      { H[k*stride+i] = p11*H[i*stride+k] + p21*H[(i+1)*stride+k];
        H[k*stride+i+1] = p21*H[i*stride+k] + p22*H[(i+1)*stride+k];
      }
      H[i*stride+i+1] = H[(i+1)*stride+i];
      i++;
    }
  }

  /* multiply the two triangles L^TD*L into the lower */
  for ( i = 0 ; i < N ; i++ )  
  {
    if ( ipiv[i] > 0 )  /* 1x1 pivot */
    {
      for ( j = 0 ; j < i ; j++ )
      { 
        for ( k = i, sum = 0.0 ; k < N ; k++ )
          sum += H[i*stride+k]*H[k*stride+j];
        H[i*stride+j] = sum;
      }
      for ( j = i+1, sum = 0.0 ; j < N ; j++ )
       sum += H[i*stride+j]*H[j*stride+i];
      H[i*stride+i] += sum;
    }
    else /* 2x2 pivot */
    {
      for ( j = 0 ; j < i ; j++ )
      { REAL tmp; 
        for ( k = i, sum = 0.0 ; k < N ; k++ )
          sum += H[i*stride+k]*H[k*stride+j];
        tmp = sum;
        for ( k = i, sum = 0.0 ; k < N ; k++ )
          sum += H[(i+1)*stride+k]*H[k*stride+j];
        H[(i+1)*stride+j] = sum;
        H[i*stride+j] = tmp;
      }

      /* diagonal */
      for ( j = i+2, sum = 0.0 ; j < N ; j++ )
       sum += H[(i+1)*stride+j]*H[j*stride+i];
      tmp = sum; 
      for ( j = i+2, sum = 0.0 ; j < N ; j++ )
       sum += H[i*stride+j]*H[j*stride+i];
      H[i*stride+i] += sum;
      for ( j = i+2, sum = 0.0 ; j < N ; j++ )
       sum += H[(i+1)*stride+j]*H[j*stride+(i+1)];
      H[(i+1)*stride+(i+1)] += sum;
      H[(i+1)*stride+i] += tmp;
     
      i++;
    }
  }

  /* permute, using upper triangle as work space */
  /* diagonal, top row as work space */

  for ( i = 0 ; i < N ; i++ )
    H[0*stride+abs(ipiv[i])-1] = H[i*stride+i];
  for ( i = 1 ; i < N ; i++ )
    H[i*stride+i] = H[0*stride+i];
  /* now the main part */
  for ( i = 1 ; i < N ; i++ )
  { int fromi = abs(ipiv[i])-1;
    for ( j = 0 ; j < i ; j++ )
    { int fromj = abs(ipiv[j])-1;
      if ( fromi > fromj )
        H[fromj*stride+fromi] = H[i*stride+j];
      else
        H[fromi*stride+fromj] = H[i*stride+j];
    }
  }

  /* copy upper to lower */
  for ( i = 1 ; i < N ; i++ )
    for ( j = 0 ; j < i ; j++ )
      H[i*stride+j] = H[j*stride+i];
}

/*************************************************************************
*
* function: LD_block_factor()
*
* purpose: Do LDL factoring in block form to take advantage of DGEMM.
*
* return: Index.
*/

/* returns index */
int LD_block_factor( H, N )  /* actually, D^-1 on diagonal */
REAL **H;  /* lower block triangular, row by row */
           /* i.e. each consecutive BLOCKSIZE rows same length, */
           /* in multiples of BLOCKSIZE */
int N;     /* size */
{ int i,j,k;
  int info;
  int rowcount; /* rows in block; less than blocksize for last block */
  int negs = 0;
 
  for ( i = 0 ; i < N ; i += blocksize )   /* pivot (i,i) */
  { /* get pivot inverse in place */
    int stride = i+blocksize;
    rowcount = (N - i > blocksize) ? blocksize : N - i;

    /* Inverse of diagonal block */
    /* Bunch-Kaufman factor */
    negs += kb_DSYTRF(rowcount,&H[i][i],stride,ipiv,&info);
    /* now the inverse */
    kb_DSYTRI(rowcount,&H[i][i],stride,ipiv,&info);

    /* update row */
    for ( j = i+blocksize ; j < N ; j += blocksize )  /* row j */
    { REAL x = H[j][i];
      char transa = 'T';
      char transb = 'N';
      REAL alpha = -1.0;
      REAL beta  =  1.0;
      int stridea;
      int strideb;
      int stridec;
      char side = 'L';
      int ii,jj;
      char uplo;

      rowcount = (N - j > blocksize) ? blocksize : N - j;

      for ( k = i+blocksize ; k < j ; k += blocksize ) /* col k */
      { stridea = k+blocksize;
        strideb = j+blocksize;
        stridec = j+blocksize;
        DGEMM(&transa,&transb,&blocksize,&rowcount,&blocksize,&alpha,
           &H[k][i],&stridea,&H[j][i],&strideb,&beta,&H[j][k],&stridec);
      }
      /* H[j][i] = x*pivot;  temp store result in work */
      alpha =  1.0;
      beta  =  0.0;
      stridea = i+blocksize;
      strideb = j+blocksize;
      stridec = blocksize;
      alpha = 1.0; beta = 0.0;
      side = 'L';  /* symmetric matrix on the left */
      uplo = 'U';  /* transposed since Fortran */
      DSYMM(&side,&uplo,&blocksize,&rowcount,&alpha,&H[i][i],&stridea,
          &H[j][i],&strideb,&beta,&workspace[0][0],&stridec);

      /* H[j][j] -= x*pivot*x; */
      transa = 'T';
      transb = 'N';
      alpha = -1.0;
      beta  =  1.0;
      stridea = j+blocksize;
      strideb = blocksize;
      stridec = j+blocksize;
      DGEMM(&transa,&transb,&rowcount,&rowcount,&blocksize,&alpha,
         &H[j][i],&stridea,&workspace[0][0],&strideb,&beta,&H[j][j],&stridec);
      /* copy workspace back to H[j][i] */
      for ( jj = 0 ; jj < rowcount ; jj++ )
        for ( ii = 0 ; ii < blocksize ; ii++ )
          H[j+jj][i+ii] = workspace[jj][ii];
    }
  }
  return negs;
}

/**************************************************************************
*
* function: LD_block_solve()
*
* purpose: Solve for single right hand side using factoring produced
*          by LD_block_factor().
*/

void LD_block_solve(LD,B,X,N)
REAL **LD;  /* from LD_block_factor(), inverse blocks on diagonal */
REAL *B;  /* rhs */
REAL *X;  /* solution */
int N;  /* size of system */
{ int i,j;
  REAL alpha = 0.0;
  REAL beta = 0.0;
  int stridea = 1;
  int incx=1,incy=1;
  char trans = 'N';
  int rowcount;  /* rows in block; less than blocksize at end */
 
  for ( i = 0 ; i < N ; i++ ) X[i] = B[i];

  for ( i = 0 ; i < N ; i += blocksize )
  { 
    rowcount = (N - i > blocksize) ? blocksize : N - i;
    for ( j = 0 ; j < i ; j += blocksize )
    { /*  X[i] -= LD[i][j]*X[j]; */
      alpha = -1.0;
      beta = 1.0;
      stridea = i+blocksize;
      incx = 1;
      incy = 1;
      trans = 'T';
      DGEMV(&trans,&blocksize,&rowcount,&alpha,&LD[i][j],&stridea,
         &X[j],&incx,&beta,&X[i],&incy);
    }
  }
  for ( i = 0 ; i < N ; i += blocksize )
  { /* X[i] /= LD[i][i]; */
    char uplo = 'U';
    rowcount = (N - i > blocksize) ? blocksize : N - i;
    alpha = 1.0;
    beta = 0.0;
    stridea = i+blocksize;
    incx = 1;
    incy = 1;
    DSYMV(&uplo,&rowcount,&alpha,&LD[i][i],&stridea,&X[i],&incx,
       &beta,&workv[0],&incy);  /* need separate destination */
    for ( j = 0 ; j < rowcount ; j++ )
       X[i+j] = workv[j];
  }

  for ( i = N-rowcount ; i >= 0 ; i -= blocksize )
  { 
    for ( j = i+blocksize ; j < N ; j += blocksize )
    { /*  X[i] -= LD[j][i]*X[j]; */
      rowcount = (N - j > blocksize) ? blocksize : N - j;
      alpha = -1.0;
      beta = 1.0;
      stridea = j+blocksize;
      incx = 1;
      incy = 1;
      trans = 'N';
      DGEMV(&trans,&blocksize,&rowcount,&alpha,&LD[j][i],&stridea,
         &X[j],&incx,&beta,&X[i],&incy);
    }
  }
}


#endif
/**********************************************************************/

 
