

#include "default.h"
#include "poa.h"
#include "seq_util.h"
#include "lpo.h"



/** For every letter of seq_y[] that is aligned to a seq_x[] letter with a
  different residue, walks the align_ring of that seq_x[] letter looking for
  a ring member carrying the same residue as the seq_y[] letter.  On the
  first such member the alignment is moved onto it.

  al_x[] (indexed by seq_x position, holding seq_y positions) and
  al_y[] (indexed by seq_y position, holding seq_x positions) are updated
  in place.  len_x is accepted but not used by this routine. */
void fuse_ring_identities(int len_x,LPOLetter_T seq_x[],
			  int len_y,LPOLetter_T seq_y[],
			  LPOLetterRef_T al_x[],
			  LPOLetterRef_T al_y[])
{
  int iy,ring_pos;
  LPOLetterRef_T partner;

  LOOP (iy,len_y) {
    partner=al_y[iy]; /* seq_x POSITION CURRENTLY ALIGNED TO seq_y[iy] */
    if (partner<0) /* NOT ALIGNED AT ALL */
      continue;
    if (seq_x[partner].letter == seq_y[iy].letter) /* ALREADY IDENTICAL */
      continue;

    /* VISIT THE OTHER MEMBERS OF THE RING UNTIL WE COME BACK TO partner */
    ring_pos=seq_x[partner].align_ring;
    while (ring_pos!=partner) {
      if (seq_x[ring_pos].letter == seq_y[iy].letter) {
        al_x[partner]= INVALID_LETTER_POSITION; /* DROP THE OLD PAIRING */
        al_y[iy]=ring_pos; /* PAIR WITH THE IDENTICAL RING MEMBER */
        al_x[ring_pos]=iy;
        break; /* FIRST IDENTITY WINS */
      }
      ring_pos=seq_x[ring_pos].align_ring;
    }
  }
}




/** Aligns the nseq entries of seq[] to new_seq one after another.  Each
  entry is aligned with align_lpo(), optionally passed through
  fuse_ring_identities() when use_aggressive_fusion is non-zero, and then
  merged into new_seq with fuse_lpo().  After merging, the entry's letter[]
  is released and its letter pointer set to NULL.

  Entries whose letter pointer is NULL on entry are first set up through
  initialize_seqs_as_lpo().

  Before each alignment a size estimate is compared against POA_MAX_ALLOC;
  when the estimate exceeds it a warning is issued and the remaining
  sequences are skipped.
  NOTE(review): the estimate is held in an int, so very long inputs could
  overflow it -- confirm whether that matters for the supported sizes.

  Returns new_seq. */
LPOSequence_T *buildup_lpo(LPOSequence_T *new_seq,
			   int nseq,LPOSequence_T seq[],
			   ResidueScoreMatrix_T *score_matrix,
			   int use_aggressive_fusion,
			   int use_global_alignment)
{
  int iseq,needed,peak_needed=0;
  LPOLetterRef_T *map_lpo=NULL,*map_seq=NULL;
  LPOSequence_T *cur;

  lpo_index_symbols(new_seq,score_matrix); /* TRANSLATE THE LPO SYMBOLS */

  for (iseq=0;iseq<nseq;iseq++) {
    cur=seq+iseq;
    if (cur->letter == NULL) /* NEVER SET UP AS AN LPO: DO IT NOW */
      initialize_seqs_as_lpo(1,cur,score_matrix);

    /* SIZE ESTIMATE FOR THIS ALIGNMENT; ONLY A NEW PEAK IS CHECKED */
    needed=new_seq->length*cur->length
      + sizeof(LPOLetter_T)*new_seq->length;
    if (needed>peak_needed) {
      peak_needed=needed;
#ifdef REPORT_MAX_ALLOC
      fprintf(stderr,"max_alloc: %d bytes\n",peak_needed);
#endif
      if (peak_needed>POA_MAX_ALLOC) {
        WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",peak_needed),"$Revision: 1.2.2.1 $");
        break; /* STOP ADDING SEQUENCES, RETURN WHAT WE HAVE */
      }
    }

    align_lpo(new_seq,cur,score_matrix,
              &map_lpo,&map_seq,use_global_alignment);
    if (use_aggressive_fusion)
      fuse_ring_identities(new_seq->length,new_seq->letter,
                           cur->length,cur->letter,map_lpo,map_seq);
    fuse_lpo(new_seq,cur,map_lpo,map_seq); /* MERGE INTO THE COMPOSITE */

    free_lpo_letters(cur->length,cur->letter,TRUE); /* NO LONGER NEEDED */
    cur->letter=NULL; /* SO NOBODY USES THE FREED ARRAY */
    FREE(map_lpo); /* RELEASE THE MAPPINGS FROM align_lpo() */
    FREE(map_seq);
  }

  return new_seq;
}
/**@memo example: aligning a set of sequences to a partial order:
      lpo_out=buildup_lpo(lpo_in,nseq,seq,&score_matrix,
                          use_aggressive_fusion,use_global_alignment);
*/



/** Clips seq->letter[] to just the segment between the first and last
  positions that are aligned according to al[].

  A newly allocated copy of that segment replaces seq->letter, and
  seq->length is set to the segment length.  The previous seq->letter[] is
  *NOT* freed: the caller must keep it or free it.

  seq          sequence to clip; must be a pure linear sequence
  al[]         for each position of seq, the aligned position in letter_x[]
               (negative if unaligned)
  len_x        number of entries in al_x[]
  letter_x[]   letters that seq was aligned to
  al_x[]       for each position of letter_x[], the aligned position in seq;
               non-negative entries are shifted in place so that they index
               the clipped copy
  offset       if non-NULL, receives the index in the original letter[]
               where the clipped copy starts
  match_length if non-NULL, receives the length of the first-to-last
               aligned span

  Returns the number of aligned positions whose residue is identical in seq
  and letter_x[].  If nothing is aligned, seq becomes an empty (length 0)
  clipped copy, *offset and *match_length are set to 0, al_x[] is left
  untouched and 0 is returned. */
int clip_unaligned_ends(LPOSequence_T *seq,
			LPOLetterRef_T al[],
			int len_x,LPOLetter_T letter_x[],
			LPOLetterRef_T al_x[],int *offset,int *match_length)
{
  int i,start,end,allow_end_length=0,nidentity=0;
  LPOLetter_T *temp=NULL;
  CALLOC(temp,seq->length,LPOLetter_T); /* ALLOCATE NEW letter[] COPY */
  for (start=0;start<seq->length;start++) /* FIND 1ST ALIGNED POS */
    if (al[start]>=0)
      break;

  for (end=seq->length -1;end>=0;end--) /* FIND LAST ALIGNED POS */
    if (al[end]>=0)
      break;

  if (start>end) { /* NOTHING ALIGNED AT ALL */
    /* WITHOUT THIS GUARD end-start+1 IS NEGATIVE AND WOULD BE USED AS THE
       NEW LENGTH AND AS THE memcpy() SIZE BELOW */
    if (match_length)
      *match_length = 0;
    if (offset)
      *offset = 0;
    seq->length=0;
    seq->letter=temp; /* CALLER STILL GETS A SEPARATE COPY TO DISPOSE OF */
    return 0;
  }

  for (i=start;i<=end;i++) /* COUNT IDENTITIES TO letter_x[] */
    if (al[i]>=0 && seq->letter[i].letter==letter_x[al[i]].letter)
      nidentity++;
  if (match_length) /* RETURN THE MATCH LENGTH TO THE CALLER */
    *match_length = end-start+1;

  if (start>allow_end_length) /* ALLOW EXTRA RESIDUES ON EITHER END*/
    start-=allow_end_length;
  else /* KEEP IN BOUNDS */
    start=0;
  if (end+allow_end_length<seq->length)
    end+=allow_end_length;
  else /* KEEP IN BOUNDS */
    end=seq->length-1;

  LOOP (i,len_x) /* THE CLIPPED COPY STARTS AT start, SO POSITIONS STORED */
    if (al_x[i]>=0) /* IN al_x MUST BE TRANSLATED CORRESPONDINGLY */
      al_x[i]-= start;

  seq->length=end-start+1; /* NOW TRANSLATE left, right, align_ring, ring_id*/
  memcpy(temp,seq->letter+start,sizeof(LPOLetter_T)*(seq->length));
  LOOP (i,seq->length) { /* THIS *ONLY* WORKS FOR PURE LINEAR SEQUENCE!!! */
    temp[i].left.ipos -= start; /*IF <0, BECOMES INVALID BY DEFINITION, OK*/
    temp[i].right.ipos -= start;
    if (temp[i].right.ipos>=seq->length) /* PAST THE NEW, CLIPPED END */
      temp[i].right.ipos= INVALID_LETTER_POSITION;
    temp[i].ring_id=temp[i].align_ring=i; /* EACH LETTER ITS OWN RING */
  }

  if (offset) /* RETURN THE OFFSET TO THE CALLER */
    *offset = start;

  seq->letter=temp; /* NEW START: FIRST ALIGNED POSITION */
  return nidentity; /* NEW LENGTH: FROM 1ST TO LAST ALIGNED POS*/
}





/** Puts a sequence back to a previously saved letter[] array and length.
  The letter[] array currently attached to seq is released through
  free_lpo_letters() using the current seq->length; afterwards seq->length
  and seq->letter are set to the supplied length and letter. */
void restore_lpo_size(LPOSequence_T *seq,int length,LPOLetter_T *letter)
{
  LPOLetter_T *clipped=seq->letter;
  int clipped_length=seq->length;

  free_lpo_letters(clipped_length,clipped,TRUE); /* DISCARD CURRENT COPY */

  seq->letter=letter; /* REINSTALL THE SAVED ARRAY ... */
  seq->length=length; /* ... AND ITS LENGTH */
}



/** Builds up the alignment in new_seq from the nseq entries of seq[], but
  clips the unaligned ends of each new sequence before adding it.

  For each entry: it is aligned to new_seq with align_lpo(), temporarily
  clipped with clip_unaligned_ends(), fused into new_seq with fuse_lpo()
  only if the clipped region contains at least one identity, and finally
  restored to its original letter[] and length with restore_lpo_size().

  Entries whose letter pointer is NULL on entry are first set up through
  initialize_seqs_as_lpo().  Before each alignment a size estimate is
  compared against POA_MAX_ALLOC; when the estimate exceeds it a warning is
  issued and the remaining sequences are skipped.

  On completion a line reporting the identity of the first sequence
  (seq[0]) is written to stderr.  When there is nothing to report (no
  identities recorded for seq[0]) the percentage is printed as 0.0 instead
  of dividing by zero.

  Returns new_seq. */
LPOSequence_T *buildup_clipped_lpo(LPOSequence_T *new_seq,
				   int nseq,LPOSequence_T seq[],
				   ResidueScoreMatrix_T *score_matrix,
				   int use_global_alignment)
{
  int i,ntemp,offset=0,nidentity,length_max=0,match_length=0;
  int total_alloc,max_alloc=0;
  LPOLetterRef_T *al1=NULL,*al2=NULL;
  LPOLetter_T *temp;
  float identity_max=0.;

  lpo_index_symbols(new_seq,score_matrix); /* MAKE SURE LPO IS TRANSLATED */
  for (i=0;i<nseq;i++) { /* ALIGN ALL SEQUENCES TO new_seq ONE BY ONE */
    if (seq[i].letter == NULL) /* HMM.  HASN'T BEEN INITIALIZED AT ALL YET */
      initialize_seqs_as_lpo(1,seq+i,score_matrix);
    total_alloc=new_seq->length*seq[i].length
      + sizeof(LPOLetter_T)*new_seq->length;
    if (total_alloc>max_alloc) { /* DP RECTANGLE ARRAY SIZE */
      max_alloc=total_alloc;
#ifdef REPORT_MAX_ALLOC
      fprintf(stderr,"max_alloc: %d bytes (%d x %d)\n",max_alloc,
	      new_seq->length,seq[i].length);
#endif
      if (max_alloc>POA_MAX_ALLOC) {
	WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.1 $");
	break; /* JUST RETURN AND FINISH */
      }
    }
    align_lpo(new_seq, &seq[i],
	      score_matrix,&al1,&al2,use_global_alignment); /* ALIGN ONE MORE SEQ */
    ntemp=seq[i].length; /* SAVE letter[] BEFORE CLIPPING IT TO ALIGNED AREA*/
    temp=seq[i].letter;
    if ((nidentity=clip_unaligned_ends(seq+i,al2,/*THERE IS AN ALIGNED REGION*/
			new_seq->length,new_seq->letter,al1,&offset,
				       &match_length))>0) {
      if (0==i) { /* REPORT IDENTITY OF TOP HIT (FIRST SEQUENCE) ONLY */
	identity_max=nidentity;
	length_max=match_length;
      }
      /* al2 IS INDEXED BY ORIGINAL POSITION; SKIP THE CLIPPED-OFF PREFIX */
      fuse_lpo(new_seq,seq+i,al1,al2+offset); /*ADD CLIPPED REGION TO LPO*/
    }
    restore_lpo_size(seq+i,ntemp,temp); /* REVERT FROM CLIPPED TO ORIGINAL*/
    FREE(al1); /* DUMP TEMPORARY MAPPING ARRAYS FROM align_lpo() */
    FREE(al2);
  }

  /* length_max STAYS 0 WHEN seq[0] CONTRIBUTED NO IDENTITIES (OR nseq==0);
     AVOID 0/0 IN THAT CASE */
  fprintf(stderr,"%s\tmaximum identity\t%3.1f%%\t%.0f/%d\n",new_seq->name,
	  length_max>0 ? 100*identity_max/length_max : 0.,
	  identity_max,length_max);
  return new_seq;
}


/** Looks up a sequence by name.
  Scans seq[0..nseq-1] in LOOP order and returns the index of the first
  entry visited whose name compares equal (strcmp) to name[], or -1 when
  no entry matches. */
int find_seq_name(int nseq,LPOSequence_T seq[],char name[])
{
  int iseq,found= -1;

  LOOP (iseq,nseq) {
    if (strcmp(seq[iseq].name,name)==0) {
      found=iseq;
      break; /* STOP AT THE FIRST HIT */
    }
  }
  return found;
}


/** One scored pair of sequences: the two scores read for the pair and the
  indices (i,j) of the two sequences it refers to. */
typedef struct {
  double score;
  double bitscore;
  int i;
  int j;
} SeqPairScore_T;


/** qsort() comparison for SeqPairScore_T records.
  Orders by ascending score; records with equal score are ordered by
  descending bitscore.  Returns -1, 0 or 1.

  Every path returns a value: the case of equal score with
  a->bitscore < b->bitscore yields 1 (a sorts after b). */
int seqpair_score_qsort_cmp(const void *void_a,const void *void_b)
{
  const SeqPairScore_T *a=(const SeqPairScore_T *)void_a,
  *b=(const SeqPairScore_T *)void_b;

  if (a->score < b->score) /* PRIMARY KEY: score, ASCENDING */
    return -1;
  if (a->score > b->score)
    return 1;

  if (a->bitscore > b->bitscore) /* TIE BREAK: bitscore, DESCENDING */
    return -1;
  if (a->bitscore < b->bitscore)
    return 1;
  return 0;
}




/** Reads a table of pairwise sequence scores from ifile.

  Each record consists of four whitespace-separated fields:
      name1 name2 bitscore score
  Reading stops at the first record that does not parse as such.  Both
  names are looked up in seq[] with find_seq_name(); only records whose
  first index is lower than the second are stored, the rest are skipped.

  The stored records are sorted with seqpair_score_qsort_cmp() and returned
  in a newly allocated array that the caller must release.  If p_nscore is
  non-NULL it receives the number of stored records.

  Returns NULL (after issuing a warning and releasing the array) when a
  name is not found in seq[], or when the file holds more storable records
  than the nseq*nseq/2 entries allocated for the table.

  Names are read into 256-byte buffers, so at most 255 characters are
  consumed per name field. */
SeqPairScore_T *read_seqpair_scorefile(int nseq,LPOSequence_T seq[],
			       FILE *ifile,int *p_nscore)
{
  int i,j,nscore=0,max_nscore=nseq*nseq/2;
  SeqPairScore_T *score=NULL;
  double v,x;
  char name1[256],name2[256];

  CALLOC(score,max_nscore,SeqPairScore_T);
  while (fscanf(ifile," %255s %255s %lf %lf",name1,name2,
		&v,&x)==4) { /*READ SCORE FILE; WIDTHS MATCH BUFFER SIZES*/
    i=find_seq_name(nseq,seq,name1);
    j=find_seq_name(nseq,seq,name2);
    if (i<0 || j<0) {
      WARN_MSG(USERR,(ERRTXT,"invalid sequence pair, not found: %s,%s",name1,name2),"$Revision: 1.2.2.1 $");
      FREE(score);
      return NULL;
    }

    if (i<j) { /* DON'T SAVE UPPER, DUPLICATE HALF OF THE MATRIX */
      if (nscore>=max_nscore) { /* E.G. REPEATED PAIRS: TABLE IS FULL */
	WARN_MSG(USERR,(ERRTXT,"too many sequence pairs in score file (limit %d)",max_nscore),"$Revision: 1.2.2.1 $");
	FREE(score);
	return NULL;
      }
      score[nscore].score=x; /* 4TH FIELD */
      score[nscore].bitscore=v; /* 3RD FIELD */
      score[nscore].i=i;
      score[nscore].j=j;
      nscore++;
    }
  }

  /* NOW SORT THESE IN ASCENDING ORDER AND HAND BACK TO CALLER */
  qsort(score,nscore,sizeof(SeqPairScore_T),seqpair_score_qsort_cmp);
  if (p_nscore) /* RETURN LENGTH OF PAIR SCORE TABLE IF REQUESTED */
    *p_nscore=nscore;
  return score;
}


/** Progressively fuses the sequences of seq[] into clusters, following the
  pair scores read from score_file.

  The pair table is obtained from read_seqpair_scorefile() and processed in
  the order that function returns it.  Every sequence starts as its own
  cluster, identified by a sequence index.  For each pair whose members are
  currently in different clusters, the cluster with the higher index is
  aligned to the one with the lower index using align_lpo_po() with
  scoring_function, optionally passed through fuse_ring_identities() when
  use_aggressive_fusion is non-zero, and merged into it with fuse_lpo().
  The letter[] of the absorbed cluster is then released and set to NULL.

  Before each alignment a size estimate is compared against POA_MAX_ALLOC;
  when the estimate exceeds it a warning is issued and processing stops.

  Returns the cluster selected as master in the last iteration that got as
  far as selecting one, or NULL if score_file could not be opened, the
  score table could not be read, or no pair required fusing.  The score
  file is closed on every path. */
LPOSequence_T *buildup_progressive_lpo(int nseq,LPOSequence_T seq[],
				       ResidueScoreMatrix_T *score_matrix,
				       int use_aggressive_fusion,
				       char score_file[],
				       LPOScore_T (*scoring_function)
				       (int,int,LPOLetter_T [],LPOLetter_T [],
					ResidueScoreMatrix_T *),
				       int use_global_alignment)
{
  int i,max_alloc=0,total_alloc;
  LPOLetterRef_T *al1=NULL,*al2=NULL;
  SeqPairScore_T *score=NULL;
  FILE *ifile;
  LPOSequence_T *new_seq=NULL;
  int *seq_cluster=NULL,cluster_i,cluster_j,nscore=0,iscore;

  ifile=fopen(score_file,"r");
  if (!ifile)
    goto free_and_exit;
  score=read_seqpair_scorefile(nseq,seq,ifile,&nscore);
  fclose(ifile); /* CLOSE BEFORE CHECKING THE RESULT SO IT NEVER LEAKS */
  if (score==NULL)
    goto free_and_exit;

  CALLOC(seq_cluster,nseq,int); /* MAPS SEQS TO CLUSTER THEY'RE IN */
  LOOP (i,nseq) /* CREATE TRIVIAL MAPPING, EACH SEQ ITS OWN CLUSTER */
    seq_cluster[i]=i;

  for (iscore=0;iscore<nscore;iscore++) {
    /* cluster_i IS ALWAYS THE LOWER CLUSTER INDEX OF THE TWO */
    if (seq_cluster[score[iscore].i] < seq_cluster[score[iscore].j]) {
      cluster_i=seq_cluster[score[iscore].i];
      cluster_j=seq_cluster[score[iscore].j];
    }
    else if (seq_cluster[score[iscore].j] < seq_cluster[score[iscore].i]) {
      cluster_i=seq_cluster[score[iscore].j];
      cluster_j=seq_cluster[score[iscore].i];
    }
    else /* CLUSTERS ALREADY FUSED, SO SKIP THIS PAIR */
      continue;

    fprintf(stderr,"Fusing cluster %s --> %s... score %e,%lf\n",
	    seq[cluster_j].name,seq[cluster_i].name,score[iscore].score,
	    score[iscore].bitscore);
    new_seq=seq+cluster_i; /* THIS WILL BECOME THE NEW MASTER CLUSTER */
    if (seq[cluster_i].letter == NULL) /* NOT INITIALIZED AT ALL YET */
      initialize_seqs_as_lpo(1,seq+cluster_i,score_matrix);
    if (seq[cluster_j].letter == NULL) /* NOT INITIALIZED AT ALL YET */
      initialize_seqs_as_lpo(1,seq+cluster_j,score_matrix);
    total_alloc=new_seq->length*seq[cluster_j].length
      + sizeof(LPOLetter_T)*new_seq->length;
    if (total_alloc>max_alloc) { /* DP RECTANGLE ARRAY SIZE */
      max_alloc=total_alloc;
#ifdef REPORT_MAX_ALLOC
      fprintf(stderr,"max_alloc: %d bytes\n",max_alloc);
#endif
      if (max_alloc>POA_MAX_ALLOC) {
	WARN_MSG(TRAP,(ERRTXT,"Exceeded memory bound: %d\n Exiting!\n\n",max_alloc),"$Revision: 1.2.2.1 $");
	break; /* JUST RETURN AND FINISH */
      }
    }

#ifdef USE_LOCAL_NEUTRALITY_CORRECTION /* NO LONGER USED */
    if (score_matrix->nfreq>0) { /* CALCULATE BALANCED SCORING ON EACH PO */
      balance_matrix_score(new_seq->length,new_seq->letter,score_matrix);
      balance_matrix_score(seq[cluster_j].length,seq[cluster_j].letter,
			   score_matrix);
    }
#endif

    align_lpo_po(new_seq, &seq[cluster_j],
		 score_matrix,&al1,&al2,
		 scoring_function,use_global_alignment); /* ALIGN ONE MORE SEQ */
    if (use_aggressive_fusion)
      fuse_ring_identities(new_seq->length,new_seq->letter,
			   seq[cluster_j].length,seq[cluster_j].letter,
			   al1,al2);
    fuse_lpo(new_seq,seq+cluster_j,al1,al2); /* BUILD COMPOSITE LPO */

    free_lpo_letters(seq[cluster_j].length,seq[cluster_j].letter,TRUE);
    seq[cluster_j].letter=NULL; /*MARK AS FREED. DON'T LEAVE DANGLING POINTER*/
    FREE(al1); /* DUMP TEMPORARY MAPPING ARRAYS */
    FREE(al2);

    LOOP (i,nseq) /* REINDEX ALL MEMBERS OF cluster_j TO JOIN cluster_i */
      if (seq_cluster[i]==cluster_j)
	seq_cluster[i]=cluster_i;
  }

 free_and_exit:
  FREE(score);
  FREE(seq_cluster);
  return new_seq; /* RETURN THE FINAL MASTER CLUSTER */
}
