Raw File
RNAfold.c
/* Last changed Time-stamp: <2012-02-15 18:20:49 ivo> */
/*
                  Ineractive Access to folding Routines

                  c Ivo L Hofacker
                  Vienna RNA package
*/

/** \file
*** \brief RNAfold program source code
***
*** This code provides an interface for MFE and Partition function folding
*** of single linear or circular RNA molecules.
**/

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <unistd.h>
#include <string.h>
#include "ViennaRNA/fold.h"
#include "ViennaRNA/part_func.h"
#include "ViennaRNA/fold_vars.h"
#include "ViennaRNA/PS_dot.h"
#include "ViennaRNA/utils.h"
#include "ViennaRNA/read_epars.h"
#include "ViennaRNA/centroid.h"
#include "ViennaRNA/MEA.h"
#include "ViennaRNA/params.h"
#include "ViennaRNA/constraints.h"
#include "ViennaRNA/file_formats.h"
#include "RNAfold_cmdl.h"



/*@unused@*/
static char UNUSED rcsid[] = "$Id: RNAfold.c,v 1.25 2009/02/24 14:22:21 ivo Exp $";

/*--------------------------------------------------------------------------*/

static void
add_shape_constraints(vrna_fold_compound *vc,
                      const char *shape_method,
                      const char *shape_conversion,
                      const char *shape_file,
                      int verbose,
                      unsigned int constraint_type){

  float p1, p2;
  char method;
  char *sequence;
  double *values;
  int length = vc->length;

  if(!vrna_sc_SHAPE_parse_method(shape_method, &method, &p1, &p2)){
    vrna_message_warning("Method for SHAPE reactivity data conversion not recognized!");
    return;
  }

  if(verbose){
    fprintf(stderr, "Using SHAPE method '%c'", method);
    if(method != 'W'){
      if(method == 'Z')
        fprintf(stderr, " with parameter p1=%f", p1);
      else
        fprintf(stderr, " with parameters p1=%f and p2=%f", p1, p2);
    }
    fputc('\n', stderr);
  }

  sequence = vrna_alloc(sizeof(char) * (length + 1));
  values = vrna_alloc(sizeof(double) * (length + 1));
  vrna_read_SHAPE_file(shape_file, length, method == 'W' ? 0 : -1, sequence, values);

  if(method == 'D'){
    (void)vrna_sc_SHAPE_add_deigan(vc, (const double *)values, p1, p2, constraint_type);
  }
  else if(method == 'Z'){
    (void)vrna_sc_SHAPE_add_zarringhalam(vc, (const double *)values, p1, 0.5, shape_conversion, constraint_type);
  } else {
    assert(method == 'W');
    vrna_sc_add_up(vc, values, constraint_type);
  }

  free(values);
  free(sequence);
}

int main(int argc, char *argv[]){
  FILE            *input, *output;
  struct          RNAfold_args_info args_info;
  char            *buf, *rec_sequence, *rec_id, **rec_rest, *structure, *cstruc, *orig_sequence;
  char            *constraints_file, *shape_file, *shape_method, *shape_conversion;
  char            fname[FILENAME_MAX_LENGTH], ffname[FILENAME_MAX_LENGTH], *ParamFile;
  char            *ns_bases, *c;
  int             i, length, l, cl, sym, istty, pf, noPS, noconv, do_bpp, enforceConstraints;
  unsigned int    rec_type, read_opt;
  double          energy, min_en, kT, sfact;
  int             doMEA, circular, lucky, with_shapes, verbose;
  double          MEAgamma, bppmThreshold, betaScale;
  char            *infile, *outfile;
;
  int               out_v, out_hx, out_ct, out_bps;
  vrna_param_t      *mfe_parameters;
  vrna_exp_param_t  *pf_parameters;
  vrna_md_t         md;

  rec_type      = read_opt = 0;
  rec_id        = buf = rec_sequence = structure = cstruc = orig_sequence = NULL;
  rec_rest      = NULL;
  ParamFile     = NULL;
  ns_bases      = NULL;
  pf_parameters = NULL;
  do_bpp        = do_backtrack  = 1;  /* set local (do_bpp) and global (do_backtrack) default for bpp computation */
  pf            = 0;
  sfact         = 1.07;
  noPS          = 0;
  noconv        = 0;
  circular      = 0;
  gquad         = 0;
  cl            = l = length = 0;
  dangles       = 2;
  MEAgamma      = 1.;
  bppmThreshold = 1e-5;
  lucky         = 0;
  doMEA         = 0;
  betaScale     = 1.;
  shape_file    = NULL;
  shape_method  = NULL;
  with_shapes   = 0;
  verbose       = 0;
  max_bp_span   = -1;
  constraints_file = NULL;
  enforceConstraints  = 0;

  outfile       = NULL;
  infile        = NULL;
  input         = NULL;
  output        = NULL;
  out_v         = 1;  /* default to thermodynamic properties and dot bracket string */
  out_hx = out_ct = out_bps = 0;

  /* apply default model details */
  set_model_details(&md);


  /*
  #############################################
  # check the command line parameters
  #############################################
  */
  if(RNAfold_cmdline_parser (argc, argv, &args_info) != 0) exit(1);
  /* temperature */
  if(args_info.temp_given)        md.temperature = temperature = args_info.temp_arg;
  /* structure constraint */
  if(args_info.constraint_given){
    fold_constrained=1;
    if(args_info.constraint_arg[0] != '\0')
      constraints_file = strdup(args_info.constraint_arg);
  }
  /* enforce base pairs given in constraint string rather than weak enforce */
  if(args_info.enforceConstraint_given)   enforceConstraints = 1;

  /* do not take special tetra loop energies into account */
  if(args_info.noTetra_given)     md.special_hp = tetra_loop=0;
  /* set dangle model */
  if(args_info.dangles_given){
    if((args_info.dangles_arg < 0) || (args_info.dangles_arg > 3))
      vrna_message_warning("required dangle model not implemented, falling back to default dangles=2");
    else
      md.dangles = dangles = args_info.dangles_arg;
  }
  /* do not allow weak pairs */
  if(args_info.noLP_given)        md.noLP = noLonelyPairs = 1;
  /* do not allow wobble pairs (GU) */
  if(args_info.noGU_given)        md.noGU = noGU = 1;
  /* do not allow weak closing pairs (AU,GU) */
  if(args_info.noClosingGU_given) md.noGUclosure = no_closingGU = 1;
  /* gquadruplex support */
  if(args_info.gquad_given)       md.gquad = gquad = 1;
  /* enforce canonical base pairs in any case? */
  if(args_info.canonicalBPonly_given)       md.canonicalBPonly = canonicalBPonly = 1;
  /* do not convert DNA nucleotide "T" to appropriate RNA "U" */
  if(args_info.noconv_given)      noconv = 1;
  /* set energy model */
  if(args_info.energyModel_given) md.energy_set = energy_set = args_info.energyModel_arg;
  /* take another energy parameter set */
  if(args_info.paramFile_given)   ParamFile = strdup(args_info.paramFile_arg);
  /* Allow other pairs in addition to the usual AU,GC,and GU pairs */
  if(args_info.nsp_given)         ns_bases = strdup(args_info.nsp_arg);
  /* set pf scaling factor */
  if(args_info.pfScale_given)     md.sfact = sfact = args_info.pfScale_arg;
  /* assume RNA sequence to be circular */
  if(args_info.circ_given)        md.circ = circular = 1;
  /* always look on the bright side of life */
  if(args_info.ImFeelingLucky_given)  md.uniq_ML = lucky = pf = st_back = 1;
  /* set the bppm threshold for the dotplot */
  if(args_info.bppmThreshold_given)
    bppmThreshold = MIN2(1., MAX2(0.,args_info.bppmThreshold_arg));
  if(args_info.betaScale_given)   md.betaScale = betaScale = args_info.betaScale_arg;
  /* do not produce postscript output */
  if(args_info.noPS_given)        noPS=1;
  /* partition function settings */
  if(args_info.partfunc_given){
    pf = 1;
    if(args_info.partfunc_arg != 1)
      do_bpp = md.compute_bpp = do_backtrack = args_info.partfunc_arg;
  }
  /* MEA (maximum expected accuracy) settings */
  if(args_info.MEA_given){
    pf = doMEA = 1;
    if(args_info.MEA_arg != -1)
      MEAgamma = args_info.MEA_arg;
  }
  if(args_info.layout_type_given)
    rna_plot_type = args_info.layout_type_arg;
  /* SHAPE reactivity data */
  if(args_info.shape_given){
    with_shapes = 1;
    shape_file = strdup(args_info.shape_arg);
  }

  shape_method = strdup(args_info.shapeMethod_arg);
  shape_conversion = strdup(args_info.shapeConversion_arg);
  if(args_info.verbose_given){
    verbose = 1;
  }
  if(args_info.maxBPspan_given){
    md.max_bp_span = max_bp_span = args_info.maxBPspan_arg;
  }
  if(args_info.outfile_given){
    outfile = strdup(args_info.outfile_arg);
  }
  if(args_info.format_given){
    char *o,*s;
    out_v = out_hx = out_ct = 0; /* reset defaults */
    o = strdup(args_info.format_arg);
    /* simply print to file */
    s = strchr(o, 'V');
    if(s != NULL)
      out_v = 1;
    /* print helix list? */
    s = strchr(o, 'H');
    if(s != NULL)
      out_hx = 1;
    /* print connect file? */
    s = strchr(o, 'C');
    if(s != NULL)
      out_ct = 1;
    /* print bpseq file? */
    s = strchr(o, 'B');
    if(s != NULL)
      out_bps = 1;
  }
  if(args_info.infile_given){
    infile = strdup(args_info.infile_arg);
  }
  

  /* free allocated memory of command line data structure */
  RNAfold_cmdline_parser_free (&args_info);


  /*
  #############################################
  # begin initializing
  #############################################
  */
  if(infile){
    input = fopen((const char *)infile, "r");
    if(!input)
      vrna_message_error("Could not read input file");
  }

  if(circular && gquad){
    vrna_message_error("G-Quadruplex support is currently not available for circular RNA structures");
  }

  if (ParamFile != NULL)
    read_parameter_file(ParamFile);

  if (circular && noLonelyPairs)
    vrna_message_warning("depending on the origin of the circular sequence, some structures may be missed when using -noLP\nTry rotating your sequence a few times");

  if (ns_bases != NULL) {
    /* nonstandards = vrna_alloc(33); */
    c=ns_bases;
    i=sym=0;
    if (*c=='-') {
      sym=1; c++;
    }
    while (*c!='\0') {
      if (*c!=',') {
        md.nonstandards[i++]=*c++;
        md.nonstandards[i++]=*c;
        if ((sym)&&(*c!=*(c-1))) {
          md.nonstandards[i++]=*c;
          md.nonstandards[i++]=*(c-1);
        }
      }
      c++;
    }
  }

  istty = isatty(fileno(stdout))&&isatty(fileno(stdin))&&(!infile);

  /* print user help if we get input from tty */
  if(istty){
    if(fold_constrained){
      vrna_message_constraint_options_all();
      vrna_message_input_seq("Input sequence (upper or lower case) followed by structure constraint");
    }
    else vrna_message_input_seq_simple();
  }

  mfe_parameters = get_scaled_parameters(temperature, md);

  /* set options we wanna pass to vrna_read_fasta_record() */
  if(istty)             read_opt |= VRNA_INPUT_NOSKIP_BLANK_LINES;
  if(!fold_constrained) read_opt |= VRNA_INPUT_NO_REST;

  /*
  #############################################
  # main loop: continue until end of file
  #############################################
  */
  while(
    !((rec_type = vrna_read_fasta_record(&rec_id, &rec_sequence, &rec_rest, input, read_opt))
        & (VRNA_INPUT_ERROR | VRNA_INPUT_QUIT))){

    char *prefix      = NULL;
    char *v_file_name = NULL;
    /*
    ########################################################
    # init everything according to the data we've read
    ########################################################
    */
    if(rec_id){
      if(!istty && !outfile) printf("%s\n", rec_id);
      (void) sscanf(rec_id, ">%" XSTR(FILENAME_ID_LENGTH) "s", fname);
    }
    else fname[0] = '\0';

    if(outfile){
      /* prepare the file prefix */
      if(fname[0] != '\0'){
        prefix = (char *)vrna_alloc(sizeof(char) * (strlen(fname) + strlen(outfile) + 1));
        strcpy(prefix, outfile);
        strcat(prefix, "_");
        strcat(prefix, fname);
      } else {
        prefix = (char *)vrna_alloc(sizeof(char) * (strlen(outfile) + 1));
        strcpy(prefix, outfile);
      }
    }

    /* convert DNA alphabet to RNA if not explicitely switched off */
    if(!noconv) vrna_seq_toRNA(rec_sequence);
    /* store case-unmodified sequence */
    orig_sequence = strdup(rec_sequence);
    /* convert sequence to uppercase letters only */
    vrna_seq_toupper(rec_sequence);

    vrna_fold_compound *vc = vrna_get_fold_compound(rec_sequence, &md, VRNA_OPTION_MFE | ((pf) ? VRNA_OPTION_PF : 0));

    length    = vc->length;

    structure = (char *) vrna_alloc(sizeof(char) * (length+1));
    

    /* parse the rest of the current dataset to obtain a structure constraint */
    if(fold_constrained){
      if(constraints_file){
        vrna_add_constraints(vc, constraints_file, VRNA_CONSTRAINT_FILE | VRNA_CONSTRAINT_SOFT_MFE | ((pf) ? VRNA_CONSTRAINT_SOFT_PF : 0));
      } else {
        cstruc = NULL;
        unsigned int coptions = (rec_id) ? VRNA_CONSTRAINT_MULTILINE : 0;
        coptions |= VRNA_CONSTRAINT_ALL;
        vrna_extract_record_rest_constraint(&cstruc, (const char **)rec_rest, coptions);
        cl = (cstruc) ? (int)strlen(cstruc) : 0;

        if(cl == 0)           vrna_message_warning("structure constraint is missing");
        else if(cl < length)  vrna_message_warning("structure constraint is shorter than sequence");
        else if(cl > length)  vrna_message_error("structure constraint is too long");
        if(cstruc){
          strncpy(structure, cstruc, sizeof(char)*(cl+1));

          unsigned int constraint_options = 0;
          constraint_options |= VRNA_CONSTRAINT_DB
                                | VRNA_CONSTRAINT_DB_PIPE
                                | VRNA_CONSTRAINT_DB_DOT
                                | VRNA_CONSTRAINT_DB_X
                                | VRNA_CONSTRAINT_DB_ANG_BRACK
                                | VRNA_CONSTRAINT_DB_RND_BRACK;
          if(enforceConstraints)
            constraint_options |= VRNA_CONSTRAINT_DB_ENFORCE_BP;
          vrna_add_constraints(vc, (const char *)structure, constraint_options);
        }
      }
    }

    if(with_shapes)
      add_shape_constraints(vc, shape_method, shape_conversion, shape_file, verbose, VRNA_CONSTRAINT_SOFT_MFE | ((pf) ? VRNA_CONSTRAINT_SOFT_PF : 0));

    if(istty) printf("length = %d\n", length);

    if(out_v){
      if(outfile){
        v_file_name = (char *)vrna_alloc(sizeof(char) * (strlen(prefix) + 8));
        strcpy(v_file_name, prefix);
        strcat(v_file_name, ".fold");

        output = fopen((const char *)v_file_name, "a");
        if(!output)
          vrna_message_error("Failed to open file for writing");
      } else {
        output = stdout;
      }
    }

    /*
    ########################################################
    # begin actual computations
    ########################################################
    */



    min_en = (double)vrna_fold(vc, structure);

    char *hx_file_name  = NULL;
    char *ct_file_name  = NULL;
    char *bps_file_name = NULL;

    if(!lucky){
      if(output){
        fprintf(output, "%s\n%s", orig_sequence, structure);
        if(istty)
          fprintf(output, "\n minimum free energy = %6.2f kcal/mol\n", min_en);
        else
          fprintf(output, " (%6.2f)\n", min_en);

        (void) fflush(output);

      }

      if(outfile){

        if(out_hx){
          hx_file_name = (char *)vrna_alloc(sizeof(char) * (strlen(prefix) + 8));
          strcpy(hx_file_name, prefix);
          strcat(hx_file_name, ".hx");

          FILE *fp = fopen((const char *)hx_file_name, "a");
          if(!fp)
            vrna_message_error("Failed to open file for writing");

          vrna_structure_print_hx((const char *)orig_sequence,
                                  (const char *)structure,
                                  min_en,
                                  fp);

          fclose(fp);
        }
        
        if(out_ct){
          ct_file_name = (char *)vrna_alloc(sizeof(char) * (strlen(prefix) + 8));
          strcpy(ct_file_name, prefix);
          strcat(ct_file_name, ".ct");

          FILE *fp = fopen((const char *)ct_file_name, "a");
          if(!fp)
            vrna_message_error("Failed to open file for writing");

          vrna_structure_print_ct((const char *)orig_sequence,
                                  (const char *)structure,
                                  min_en,
                                  (const char *)(fname[0] != '\0' ? fname : "MFE-structure"),
                                  fp);

          fclose(fp);
        }

        if(out_bps){
          bps_file_name = (char *)vrna_alloc(sizeof(char) * (strlen(prefix) + 11));
          strcpy(bps_file_name, prefix);
          strcat(bps_file_name, ".bpseq");

          FILE *fp = fopen((const char *)bps_file_name, "a");
          if(!fp)
            vrna_message_error("Failed to open file for writing");

          vrna_structure_print_bpseq( (const char *)orig_sequence,
                                      (const char *)structure,
                                      fp);

          fclose(fp);
        }
      }

      if(fname[0] != '\0'){
        strcpy(ffname, fname);
        strcat(ffname, "_ss.ps");
      } else strcpy(ffname, "rna.ps");

      if(gquad){
        if (!noPS) (void) PS_rna_plot_a_gquad(orig_sequence, structure, ffname, NULL, NULL);
      } else {
        if (!noPS) (void) PS_rna_plot_a(orig_sequence, structure, ffname, NULL, NULL);
      }
    }

    if (length>2000)
      vrna_free_mfe_matrices(vc);

    if (pf) {
      char *pf_struc = (char *) vrna_alloc((unsigned) length+1);
      if (vc->params->model_details.dangles==1) {
          vc->params->model_details.dangles=2;   /* recompute with dangles as in pf_fold() */
          min_en = vrna_eval_structure(vc, structure);
          vc->params->model_details.dangles=1;
      }

      vrna_exp_params_rescale(vc, &min_en);

      kT = vc->exp_params->kT/1000.;

      if (length>2000) fprintf(stderr, "scaling factor %f\n", pf_scale);

      if (cstruc!=NULL) strncpy(pf_struc, cstruc, length+1);

      energy = vrna_pf_fold(vc, pf_struc);

      /* in case we abort because of floating point errors */
      if (length>1600)
        fprintf(stderr, "free energy = %8.2f\n", energy);

      if(lucky){
        vrna_init_rand();
        char *s = vrna_pbacktrack(vc);
        min_en = vrna_eval_structure(vc, (const char *)s);
        if(output){
          fprintf(output, "%s\n%s", orig_sequence, s);
          if (istty)
            fprintf(output, "\n free energy = %6.2f kcal/mol\n", min_en);
          else
            fprintf(output, " (%6.2f)\n", min_en);

          (void) fflush(output);
        }
        if(fname[0] != '\0'){
          strcpy(ffname, fname);
          strcat(ffname, "_ss.ps");
        } else strcpy(ffname, "rna.ps");

        if (!noPS) (void) PS_rna_plot(orig_sequence, s, ffname);
        free(s);
      }
      else{
      
        if (do_bpp) {
          if(output){
            fprintf(output, "%s", pf_struc);
            if(!istty)
              fprintf(output, " [%6.2f]\n", energy);
            else
              fprintf(output, "\n");
            }
        }
        if(((istty)||(!do_bpp)) && output)
          fprintf(output, " free energy of ensemble = %6.2f kcal/mol\n", energy);


        if (do_bpp) {
          plist *pl1,*pl2;
          char *cent;
          double dist, cent_en;

          pl1     = vrna_pl_get_from_pr(vc, bppmThreshold);
          pl2     = vrna_pl_get(structure, 0.95*0.95);
          cent    = vrna_get_centroid_struct(vc, &dist);
          cent_en = vrna_eval_structure(vc, (const char *)cent);
          if(output)
            fprintf(output, "%s {%6.2f d=%.2f}\n", cent, cent_en, dist);
          free(cent);
          if (fname[0]!='\0') {
            strcpy(ffname, fname);
            strcat(ffname, "_dp.ps");
          } else strcpy(ffname, "dot.ps");
          (void) PS_dot_plot_list(orig_sequence, ffname, pl1, pl2, "");
          free(pl2);
          if (do_bpp==2) {
            pl2 = vrna_stack_prob(vc, 1e-5);
            if (fname[0]!='\0') {
              strcpy(ffname, fname);
              strcat(ffname, "_dp2.ps");
            } else strcpy(ffname, "dot2.ps");
            PS_dot_plot_list(orig_sequence, ffname, pl1, pl2,
                             "Probabilities for stacked pairs (i,j)(i+1,j-1)");
            free(pl2);
          }
          free(pl1);
          free(pf_struc);
          if(doMEA){
            float mea, mea_en;
            plist *pl = vrna_pl_get_from_pr(vc, 1e-4/(1+MEAgamma));

            if(gquad){
              mea = MEA_seq(pl, rec_sequence, structure, MEAgamma, pf_parameters);
            } else {
              mea = MEA(pl, structure, MEAgamma);
            }
            mea_en = vrna_eval_structure(vc, (const char *)structure);
            if(output)
              fprintf(output, "%s {%6.2f MEA=%.2f}\n", structure, mea_en, mea);

            free(pl);
          }
        }
        if(output){
          fprintf(output, " frequency of mfe structure in ensemble %g; ", exp((energy-min_en)/kT));
          if (do_bpp)
            fprintf(output, "ensemble diversity %-6.2f", vrna_mean_bp_distance(vc));
          fprintf(output, "\n");
        }
      }
      free(pf_parameters);
    }
    if(output)
      (void) fflush(output);
    if(outfile && output){
      fclose(output);
      output = NULL;
    }

    /* clean up */
    vrna_free_fold_compound(vc);
    free(cstruc);
    free(rec_id);
    free(rec_sequence);
    free(orig_sequence);
    free(structure);

    /* free the rest of current dataset */
    if(rec_rest){
      for(i=0;rec_rest[i];i++) free(rec_rest[i]);
      free(rec_rest);
    }
    rec_id = rec_sequence = structure = cstruc = NULL;
    rec_rest = NULL;

    if(with_shapes)
      break;

    /* print user help for the next round if we get input from tty */
    if(istty){
      if(fold_constrained){
        vrna_message_constraint_options_all();
        vrna_message_input_seq("Input sequence (upper or lower case) followed by structure constraint");
      }
      else vrna_message_input_seq_simple();
    }
  }
  
  if(input)
    fclose(input);

  free(constraints_file);
  free(mfe_parameters);
  return EXIT_SUCCESS;
}
back to top