Revision 7c331b1ee58f504e2f77c8ec64224ad3b674463b authored by Ronny Lorenz on 04 May 2015, 13:34:30 UTC, committed by Ronny Lorenz on 04 May 2015, 13:34:30 UTC
1 parent fa00faf
Raw File
aln_util.c
/*
			       aln_util.c
	       Helper functions frelated to alignments
*/
/* Last changed Time-stamp: <2006-01-16 11:42:38 ivo> */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <time.h>
#include <string.h>
#include <ctype.h>
#include <config.h>
#include "utils.h"
#include "fold_vars.h"
#include "pair_mat.h"
/*@unused@*/
static char rcsid[] = "$Id: aln_util.c,v 1.4 2007/02/02 15:18:13 ivo Exp $";

#define MAX_NUM_NAMES    500
int read_clustal(FILE *clust, char *AlignedSeqs[], char *names[]) {
   char *line, name[100]="", *seq;
   int  n, nn=0, num_seq = 0, i;

   if ((line=get_line(clust)) == NULL) {
     fprintf(stderr, "Empty CLUSTAL file\n"); return 0;
   }

   if ((strncmp(line,"CLUSTAL", 7) !=0) && (!strstr(line,"STOCKHOLM"))) {
     fprintf(stderr, "This doesn't look like a CLUSTAL/STOCKHOLM file, sorry\n");
     free(line); return 0;
   }
   free(line);
   line = get_line(clust);

   while (line!=NULL) {
    if(strncmp(line, "//", 2) == 0){
      free(line);
      break;
    }

    if (((n=strlen(line))<4) || isspace((int)line[0])) {
      /* skip non-sequence line */
      free(line); line = get_line(clust);
      nn=0; /* reset seqence number */
      continue;
    }
    /* skip comments */
    if(line[0] == '#'){
      free(line);
      line = get_line(clust);
      continue;
    }

     seq = (char *) space( (n+1)*sizeof(char) );
     sscanf(line,"%99s %s", name, seq);

    for(i=0;i<strlen(seq);i++){
      if(seq[i] == '.') seq[i] = '-'; /* replace '.' gaps by '-' */
      /* comment the next line and think about something more difficult to deal with
         lowercase sequence letters if you really want to */
      seq[i] = toupper(seq[i]);
    }

     if (nn == num_seq) { /* first time */
       names[nn] = strdup(name);
       AlignedSeqs[nn] = strdup(seq);
     }
     else {
       if (strcmp(name, names[nn])!=0) {
	 /* name doesn't match */
	 fprintf(stderr,
		 "Sorry, your file is fucked up (inconsitent seq-names)\n");
	 free(line); free(seq);
	 return 0;
       }
       AlignedSeqs[nn] = (char *)
	 xrealloc(AlignedSeqs[nn], strlen(seq)+strlen(AlignedSeqs[nn])+1);
       strcat(AlignedSeqs[nn], seq);
     }
     nn++;
     if (nn>num_seq) num_seq = nn;
     free(seq);
     free(line);
     if (num_seq>=MAX_NUM_NAMES) {
       fprintf(stderr, "Too many sequences in CLUSTAL/STOCKHOLM file");
       return 0;
     }

     line = get_line(clust);
   }

   AlignedSeqs[num_seq] = NULL;
   names[num_seq] = NULL;
   if (num_seq == 0) {
     fprintf(stderr, "No sequences found in CLUSTAL/STOCKHOLM file\n");
     return 0;
   }
   n = strlen(AlignedSeqs[0]);
   for (nn=1; nn<num_seq; nn++) {
     if (strlen(AlignedSeqs[nn])!=n) {
       fprintf(stderr, "Sorry, your file is fucked up.\n"
	       "Unequal lengths!\n\n");
       return 0;
     }
   }

   fprintf(stderr, "%d sequences; length of alignment %d.\n", nn, n);
   return num_seq;
}

char *consensus(const char *AS[]) {
  /* simple consensus sequence (most frequent character) */
  char *string;
  int i,n;
  n = strlen(AS[0]);
  string = (char *) space((n+1)*sizeof(char));
  for (i=0; i<n; i++) {
    int s,c,fm, freq[8] = {0,0,0,0,0,0,0,0};
    for (s=0; AS[s]!=NULL; s++)
      freq[encode_char(AS[s][i])]++;
    for (s=c=fm=0; s<8; s++) /* find the most frequent char */
      if (freq[s]>fm) {c=s, fm=freq[c];}
    if (s>4) s++; /* skip T */
    string[i]=Law_and_Order[c];
  }
  return string;
}

/* IUP nucleotide classes indexed by a bit string of the present bases */
/* A C AC G AG CG ACG U AU CU ACU GU AGU CGU ACGU */
static char IUP[17] = "-ACMGRSVUWYHKDBN";
char *consens_mis(const char*AS[]) {
  /* MIS displays the 'most informative sequence' (Freyhult et al 2004),
     elements in columns with frequency greater than the background
     frequency are projected into iupac notation. Columns where gaps are
     over-represented are in lower case. */

  char *cons;
  int i, s, n, N, c;
  int bgfreq[8] = {0,0,0,0,0,0,0,0};

  n = strlen(AS[0]);
  for (N=0; AS[N]!=NULL; N++);
  cons = (char *) space((n+1)*sizeof(char));

  for (i=0; i<n; i++)
    for (s=0; s<N; s++) {
      c = encode_char(AS[s][i]);
      if (c>4) c=5;
      bgfreq[c]++;
    }

  for (i=0; i<n; i++) {
    int freq[8] = {0,0,0,0,0,0,0,0};
    int code = 0;
    for (s=0; s<N; s++) {
      c = encode_char(AS[s][i]);
      if (c>4) c=5;
      freq[c]++;
    }
    for (c=4; c>0; c--) {
      code <<=1;
      if (freq[c]*n>=bgfreq[c]) code++;
    }
    cons[i] = IUP[code];
    if (freq[0]*n>bgfreq[0])
      cons[i] = tolower(IUP[code]);
  }
  return cons;
}
back to top