Revision 7c331b1ee58f504e2f77c8ec64224ad3b674463b authored by Ronny Lorenz on 04 May 2015, 13:34:30 UTC, committed by Ronny Lorenz on 04 May 2015, 13:34:30 UTC
1 parent fa00faf
Raw File
stringdist.c
/*
		String alignment for RNA secondary structures
		      Peter F Stadler, Ivo Hofacker
			   Vienna RNA Package
*/
/* Last changed Time-stamp: <2005-07-23 10:19:40 ivo> */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include  "edit_cost.h"
#include  "dist_vars.h"
#include  "utils.h"

static char rcsid[] = "$Id: stringdist.c,v 1.3 2005/07/24 08:37:15 ivo Exp $";

#define PUBLIC
#define PRIVATE        static

#define INFINITY_DIST  10000

PUBLIC  float      string_edit_distance(swString *T1, swString *T2);
PUBLIC  swString  *Make_swString(char *string);
PUBLIC  void       print_swString(swString *x);
PUBLIC  void       print_alignment_list(void);

PRIVATE void       sprint_aligned_swStrings(swString *T1, swString *T2);
PRIVATE float      StrEditCost(int i, int j, swString *T1, swString *T2);
PRIVATE void       DeCode(char *string, int k, int *tp, float *w);
PRIVATE int        decode(char *id);
PRIVATE void       encode(int type, char label[]);
PRIVATE int       *alignment[2]; /* contains information from backtracking
				    alignment[0][n] is the node in tree2
				    matching node n in tree1               */


/*---------------------------------------------------------------------------*/

PUBLIC float string_edit_distance(swString *T1, swString *T2)

{
    float  **distance;
    short    **i_point, **j_point;

    int           i, j, i1, j1, pos, length1,length2;
    float         minus, plus, change, temp;

    if (cost_matrix==0) EditCost = &UsualCost;
    else EditCost = &ShapiroCost;

    length1 = T1[0].sign;
    length2 = T2[0].sign;

    distance = (float **)  space((length1 +1)*sizeof(float *));
    if(edit_backtrack){
       i_point  = (short **)  space((length1 +1)*sizeof(short *));
       j_point  = (short **)  space((length1 +1)*sizeof(short *));
    }
    for(i=0; i<= length1; i++){
       distance[i] = (float *) space( (length2+1)*sizeof(float));
       if(edit_backtrack){
	  i_point[i]  = (short *) space( (length2+1)*sizeof(short));
	  j_point[i]  = (short *) space( (length2+1)*sizeof(short));
       }
    }

    for(i = 1; i <= length1; i++) {
       if (edit_backtrack){
	  i_point[i][0] = i-1;
	  j_point[i][0] = 0;
       }
       distance[i][0] = distance[i-1][0]+StrEditCost(i,0,T1,T2);
    }
    for(j = 1; j <= length2; j++) {
       if (edit_backtrack){
	  j_point[0][j] = j-1;
	  i_point[0][j] = 0;
       }
       distance[0][j] = distance[0][j-1]+StrEditCost(0,j,T1,T2);
    }

    for (i = 1; i <= length1; i++) {
       for (j = 1; j <= length2 ; j++) {
          minus  = distance[i-1][j]  + StrEditCost(i,0,T1,T2);
          plus   = distance[i][j-1]  + StrEditCost(0,j,T1,T2);
          change = distance[i-1][j-1]+ StrEditCost(i,j,T1,T2);

          distance[i][j] = MIN3(minus, plus, change);
          /* printf("%g ", distance[i][j]); */

          if(edit_backtrack){
             if(distance[i][j] == change) {
                i_point[i][j]=i-1; j_point[i][j]=j-1;  }
             else if(distance[i][j] == plus) {
                i_point[i][j]=i  ; j_point[i][j]=j-1;  }
             else {
                i_point[i][j]=i-1; j_point[i][j]=j  ;  }
          }
       }
       /* printf("\n"); */
    }
    /* printf("\n"); */
    temp = distance[length1][length2];
    for(i=0;i<=length1;i++)
       free(distance[i]);
    free(distance);

    if(edit_backtrack){
       if(alignment[0]!= NULL) free(alignment[0]);
       if(alignment[1]!= NULL) free(alignment[1]);
       alignment[0] = (int *) space((length1+length2+1)*sizeof(int));
       alignment[1] = (int *) space((length1+length2+1)*sizeof(int));

       pos = length1+length2;
       i   = length1;
       j   = length2;
       while( (i>0)||(j>0) ) {
          i1 = i_point[i][j];
          j1 = j_point[i][j];
          if( ((i-i1)==1)&&((j-j1)==1) )  {  /* substitution    */
              alignment[0][pos] = i;
              alignment[1][pos] = j;
          }
          if( ((i-i1)==1)&&(j==j1) )      {  /* Deletion in [1] */
              alignment[0][pos] = i;
              alignment[1][pos] = 0;
          }
          if( (i==i1)&&((j-j1)==1)  )      {  /* Deletion in [0] */
              alignment[0][pos] = 0;
              alignment[1][pos] = j;
          }
          pos--;
          i = i1;
          j = j1;
       }
       for(i=pos+1; i<=length1+length2; i++){
          alignment[0][i-pos] = alignment[0][i];
          alignment[1][i-pos] = alignment[1][i];
       }
       alignment[0][0] = length1+length2-pos;   /* length of alignment */

       for(i=0; i<=length1; i++){
          free(i_point[i]); free(j_point[i]);
       }
       free(i_point); free(j_point);
       sprint_aligned_swStrings(T1,T2);

    }

    return temp;
}


/*---------------------------------------------------------------------------*/

PRIVATE float StrEditCost(int i, int j, swString *T1, swString *T2)
{
    float  c, diff, cd, min, a, b, dist;

    if(i==0) {
       cd   =  (float) (*EditCost)[0][T2[j].type];
       diff =  T2[j].weight;
       dist =  cd*diff;
    }
    else
    if(j==0) {
       cd   =  (float) (*EditCost)[T1[i].type][0];
       diff =  T1[i].weight;
       dist =  cd*diff;
    }
    else
    if( ((T1[i].sign)*(T2[j].sign)) > 0) {
       c = (float) (*EditCost)[T1[i].type][T2[j].type];
       diff = (float) fabs((a=T1[i].weight) - (b=T2[j].weight));
       min = MIN2(a,b);
       if (min == a) cd = (float) (*EditCost)[0][T2[j].type];
       else          cd = (float) (*EditCost)[T1[i].type][0];
       dist = c * min + cd * diff;
    }
    else dist = (float) INFINITY_DIST;
    return dist;
}

/*---------------------------------------------------------------------------*/

PUBLIC swString *Make_swString(char *string)
{
   int i=0, j=0, k=0;
   int tp, len, l, length;
   float w;
   swString  *S;

   length = strlen(string);

   for(i=0; i<length; i++) {
      if( (string[i]=='(') || (string[i]==')') ) j++;
      if(string[i]=='.') j+=2;
   }

   len = j;

   S= (swString *) space(sizeof(swString)*(len+1));
   S[0].sign = j; /* number of entries */
   S[0].weight= 0.0;
   S[0].type=     0;

   i=0;
   j=1;
   while(i<length){
      switch(string[i]){
       case '(' :
          S[j].sign = 1;
          l=1;
          k=i;
          while (l>0) {
             k++;
             if(string[k] == '(' ) l++;
             if(string[k] == ')' ) l--;
          }
          DeCode(string,k,&tp,&w);
          S[j].type   = tp;
          S[j].weight = w/2.0;
	  j++;
          break;
       case ')' :
          k=i;
          S[j].sign = -1;
          DeCode(string,k,&tp,&w);
          S[j].type = tp;
          S[j].weight = w/2.0;
	  j++;
          break;
       case '.' :
          S[j].sign = 1;
          S[j].type = 1;
          S[j].weight = 0.5;
          j++;
          S[j].sign = -1;
          S[j].type =  1;
          S[j].weight = 0.5;
	  j++;
          break;
      }
      i++;
   }
   return S;
}

/*---------------------------------------------------------------------------*/

PRIVATE void DeCode(char *string, int k, int *tp, float *w)
   /* retrieves type and weigth for a node closed  by a bracket at position k */
{
   int i,j,l,m;
   char  label[20], id[20] ;
   i=k;
   label[0] = '\0';
   while(i>=0){
      i--;
      if( (string[i]=='(')||(string[i]==')')||(string[i]=='.') ) break;
      else {
         label[k-i-1] = string[i]; label[k-i] = '\0';
      }
   }
   l=strlen(label);
   if (l==0) {           /* Dot-Bracket notation */
     *w  = 1.0;
     *tp =   2;
   }
   else{
     for (i=0; i<l; i++) {
       if (!isalpha(label[l-i-1])) break;
       id[i] = label[l-i-1];
      }
      id[i] = '\0';
      *tp=decode(id);
      l=l-i-1;
      if(l>=0){
         for(j=0; j<=l; j++)
            id[j] = label[l-j];
         label[l+1] ='\0';
         m=-1;
         sscanf(label,"%d",&m);
         *w= (float) m;
         if(m==-1) {
            fprintf(stderr, "Warning: Non-integer weight in DeCode ignored\n");
            *w=1.0;
         }
      }
      else
         *w=1.0;
   }
}

/*---------------------------------------------------------------------------*/

PRIVATE int decode(char *id)
{
    int   n, quit, i;
    char  label[100], *code;

    n = 0;

    quit = 0;
    code = coding;

    while (!quit) {
        for (i = 0; code[i] != sep; i++) {
            if (code[i] == '\0') {
                quit = 1;
                break;
            }
            label[i] = code[i];
        }
        label[i] = '\0';
        if (strcmp(id, label) == 0) return (n);
        code += (i+1);
        n++;
    }

    fprintf(stderr,"Syntax error: node identifier \"%s\" not found "
		   "in coding string \"%s\"\n", id, coding);
    fprintf(stderr,"Exiting...");
    exit(0);
}


/*---------------------------------------------------------------------------*/

PRIVATE void encode( int type, char label[])

{
    int   i, l;

    l = 0;
    for (i = 0; i < type; i++) {
        while (coding[l] != sep && coding[l]) l++;
        l++;
    }

    for (i = 0; coding[l+i] != sep; i++) {
        if (coding[l+i] == '\0') break;
        label[i] = coding[l+i];
    }
    label[i] = '\0';
}

/*---------------------------------------------------------------------------*/

PRIVATE void sprint_aligned_swStrings(swString *T1, swString *T2)
{
   int i, j, l0, l1, ltmp=0, weights;
   char label[10], *a0, *a1, tmp0[20], tmp1[20];

   weights = 0;
   for (i=1; i<=T1[0].sign; i++) weights = (weights||(T1[i].weight!=0.5));
   for (i=1; i<=T2[0].sign; i++) weights = (weights||(T2[i].weight!=0.5));

   a0 = (char *) space(alignment[0][0]*4+2);
   a1 = (char *) space(alignment[0][0]*4+2);
   for(i=1; i<= alignment[0][0]; i++){
      tmp0[0] = '\0'; l0=0;
      if(alignment[0][i] > 0) {
         encode(T1[alignment[0][i]].type, label);
         if(T1[alignment[0][i]].sign > 0) {
            tmp0[0] = '(';
            tmp0[1] = '\0';
         }
         strcat(tmp0,label);
	 if (weights)
	    sprintf(tmp0+strlen(tmp0), "%d",
		    (int)(2*T1[alignment[0][i]].weight));

         if(T1[alignment[0][i]].sign < 0) strcat(tmp0, ")");
	 l0 = strlen(tmp0);
      }
      tmp1[0]= '\0'; l1=0;
      if(alignment[1][i] > 0) {
         encode(T2[alignment[1][i]].type, label);
         if(T2[alignment[1][i]].sign > 0) {
            tmp1[0] = '(';
            tmp1[1] = '\0';
         }
         strcat(tmp1,label);
	 if (weights)
	    sprintf(tmp1+strlen(tmp1), "%d",
		    (int)(2*T2[alignment[1][i]].weight));

	 if(T2[alignment[1][i]].sign < 0) strcat(tmp1, ")");
         l1 = strlen(tmp1);
      }
      ltmp = MAX2(l0,l1);
      for (j=l0; j<ltmp; j++) tmp0[j] = '_';
      for (j=l1; j<ltmp; j++) tmp1[j] = '_';
      tmp0[ltmp] = '\0'; tmp1[ltmp] = '\0';

      strcat(a0,tmp0); strcat(a1,tmp1);
      ltmp = strlen(a0);
   }
   if (aligned_line[0]!= NULL) { free(aligned_line[0]); aligned_line[0]= NULL;}
   if (aligned_line[1]!= NULL) { free(aligned_line[1]); aligned_line[1]= NULL;}
   aligned_line[0] = strdup(a0);
   free(a0);
   aligned_line[1] = strdup(a1);
   free(a1);
}

/*---------------------------------------------------------------------------*/


PUBLIC void print_swString(swString *x)
{
   int i;
   for (i=0; i<=x[0].sign; i++)
      printf("(%d,%d,%f\n) ", x[i].type, x[i].sign, x[i].weight );
   printf("\n");
}

/*---------------------------------------------------------------------------*/

PUBLIC void print_alignment_list(void)
{
   int i;
   printf("\n");
   for (i=1; i<= alignment[0][0]; i++)
      printf("%3d ",alignment[0][i]);
      printf("\n");
   for (i=1; i<= alignment[0][0]; i++)
      printf("%3d ",alignment[1][i]);
   printf("\n");
}
back to top