https://github.com/jttoivon/MODER
Raw File
Tip revision: c485231e5b468ae509306e1aaeebaa0f3004572d authored by Jarkko Toivonen on 31 March 2020, 18:10:18 UTC
Fixed indexing bug.
Tip revision: c485231
suffix_array_wrapper.hpp
/*

    MODER is a program to learn DNA binding motifs from SELEX datasets.
    Copyright (C) 2016, 2017  Jarkko Toivonen,
    Department of Computer Science, University of Helsinki

    MODER is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    MODER is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

*/
#ifndef SUFFIX_ARRAY_WRAPPER_HPP
#define SUFFIX_ARRAY_WRAPPER_HPP

#include "CPM03/suffixsort.hpp"
#ifndef TIMING
#define TIMING
#endif
#include "timing.hpp"

#include <string>
#include <cassert>
#include <cstring>
#include <cstdio>


class suffix_array
{
public:
  typedef std::vector<unsigned char>::difference_type offset_type;
  typedef unsigned char uchar;

  suffix_array(bool use_rna=false) : text(""), sa(0)
  {
    init_array(use_rna);
  }

  suffix_array(const std::string& str, bool use_rna=false) : text(str), sa(str.length())
  {
    TIME_START(t);
    construct_suffix_array(text.begin(), text.end(),
			   sa.begin(), sa.end());
    TIME_PRINT("Constructing suffix array took %.2f seconds\n", t);
    init_array(use_rna);

    /*    
    int n=text.length();
    for (int i=0; i < n-1; ++i)
      assert(text.substr(sa[i]) < text.substr(sa[i+1]));
    */
  }

  void
  reset(const std::string& str) 
  {
    TIME_START(t);
    text = str;
    sa.resize(str.length());

    construct_suffix_array(text.begin(), text.end(),
			   sa.begin(), sa.end());
    TIME_PRINT("Constructing suffix array took %.2f seconds\n", t);

    /*    
    int n=text.length();
    for (int i=0; i < n-1; ++i)
      assert(text.substr(sa[i]) < text.substr(sa[i+1]));
    */
  }


  int
  count(const std::string& pattern) const
  {
    std::vector<offset_type> dummy;
    return count_helper(pattern, dummy, false);
  }

  int 
  locate(const std::string& pattern, std::vector<offset_type>& positions)
  {
    return count_helper(pattern, positions, true);
  }


  int 
  count_iupac(const std::string& pattern) const
  {
    std::vector<offset_type> dummy;
    return count_iupac_helper(pattern, 0, 0, text.length(), dummy, false);
  }

  int 
  locate_iupac(const std::string& pattern, std::vector<offset_type>& positions) const
  {
    assert(positions.size() == 0);
    return count_iupac_helper(pattern, 0, 0, text.length(), positions, true);
  }


  ~suffix_array() {
    
  }



private:

  offset_type 
  GetPos(offset_type position) const
  {
    return sa[position];
  }

  int ComparePos(offset_type position, const std::string& pattern) const
  {
    offset_type n=text.length();
    offset_type m=pattern.length();
    offset_type j=0;
    
    while (position+j < n && j < m && text[position+j]==pattern[j]) {
      j++;
    }
    if (j >= m)
      return 0;
    else if (position+j >= n)
      return +1;
    else if (text[position+j]>pattern[j])
      return -1;
    else
      return +1;
  }




/*
 -1 text >  pattern
  0 text == pattern
 +1 text <  pattern
*/
  int 
  ComparePos2(offset_type position, uchar pchar, int j) const
  {
    offset_type n=text.length();

    if (position+j >= n) /* text piece is shorter than pattern */
      return +1;
    else if (text[position+j] > pchar)
      return -1;
    else if (text[position+j] == pchar)
      return 0;
    else
      return +1;
  }

  typedef struct
  {
    char c;
    const char* str;
  } char_classes_t;


  static const char_classes_t char_classes_dna[16];
  static const char_classes_t char_classes_rna[16];


  const char* char_classes[256];

  void 
  init_array(bool use_rna = false)
  {
    if (use_rna) {
      int size = sizeof(char_classes_rna)/sizeof(char_classes_t);
      for(int i=0; i < size; ++i)
	char_classes[(unsigned char)char_classes_rna[i].c] = char_classes_rna[i].str;
    } else {
      int size = sizeof(char_classes_dna)/sizeof(char_classes_t);
      for(int i=0; i < size; ++i)
	char_classes[(unsigned char)char_classes_dna[i].c] = char_classes_dna[i].str;
    }
  }

  int 
  count_helper(const std::string& pattern, std::vector<offset_type>& positions, bool get_positions) const
  {
    if (text.length() == 0)
      return 0;
    offset_type L=0;
    offset_type R=text.length();
    offset_type Lraja;

    /* Search for the left boundary */
    if (ComparePos(GetPos(0), pattern) <= 0)  /* text is greater than or equal to pattern */
      Lraja=0;
    else {
      while (L<R-1) {  // while not consequent
        offset_type i = (R+L)/2;
        if (ComparePos(GetPos(i), pattern) <= 0) /* text is greater than or equal to pattern */
	  R = i;
        else
	  L = i;
      }
      Lraja=R;
    }
    // Post-condition: Lraja is the first suffix that is greater than or equal to the pattern, or
    //                 if not such suffix exists, Lraja is text.length()


    /* Search for the right boundary */
    L=0;
    R=text.length();
    offset_type Rraja;
    if (ComparePos(GetPos(0), pattern) < 0) /* text is greater than pattern */
      Rraja=0;
    else {
      while (L < R-1) {
	offset_type i = (R+L)/2;
	if (ComparePos(GetPos(i), pattern) >= 0)  /* text is less than or equal to pattern */
	  L = i;
	else
	  R = i;
      }
      Rraja=R;
    }
    // Post-condition: Rraja is the first suffix that is greater than the pattern, or
    //                 if not such suffix exists, Rraja is text.length()

    //printf("L is %lu, R is %lu\n", Lraja, Rraja); 

    if (get_positions) {
      for (int i=Lraja; i < Rraja; ++i)
	positions.push_back(GetPos(i));
    }

    return Rraja-Lraja;

  }


  /* Input range: [L,R) and [a,b), half-open interval */
  /* Output range: [Lraja,Rraja],  closed interval    */
  int count_iupac_helper(const std::string& pattern, offset_type pos, offset_type a, offset_type b, 
			 std::vector<offset_type>& positions, bool get_positions) const
  {
    if (a == b)
      return 0;

    int old_count = positions.size();

    /* offset_type L=0; */
    /* offset_type R=_index->n; */
    int length = pattern.length();

    uchar pchar = pattern[pos];
    int result = 0;
    int number_of_chars = strlen(char_classes[pchar]);

    for (int r = 0; r < number_of_chars; ++r) {
      uchar current_char = char_classes[pchar][r];


      /* Search for the left boundary */

      offset_type L=a;
      offset_type R=b;

      offset_type Lraja;
      if (ComparePos2(GetPos(L), current_char, pos) <= 0)   /* text is greater than or equal to pattern */
	Lraja=L;
      else {
	while (L<R-1) { // while not adjacent
	  offset_type i = (R+L)/2;
	  if (ComparePos2(GetPos(i), current_char, pos) <= 0) /* text is greater than or equal to pattern */
	    R = i;
	  else
	    L = i;
	}
	Lraja=R;
      }


      /* Search for the right boundary */
      offset_type Rraja;
      L=a;
      R=b;
      if (ComparePos2(GetPos(L), current_char, pos) < 0) /* text is greater than pattern */
	Rraja=L;
      else {
	while (L < R-1) { // while not adjacent
	  offset_type i=(L+R)/2;
	  if (ComparePos2(GetPos(i), current_char, pos) >= 0)  /* text is smaller or equal to pattern */
	    L = i;
	  else
	    R = i;                                           /* text greater than pattern */
	}
	Rraja=R;
      }

      if (pos + 1 == length) {
	result += Rraja-Lraja;
	if (get_positions) {
	  for (int i=Lraja; i < Rraja; ++i)
	    positions.push_back(GetPos(i));
	}
      }
      else
	result += count_iupac_helper(pattern, pos+1, Lraja, Rraja, positions, get_positions);
    
      /*a = Rraja + 1;*/
    }


    if (get_positions)
      assert(result == positions.size() - old_count);

    return result;
  }


  suffix_array(const suffix_array& dummy) {}  // disable
  std::string text;
  std::vector<offset_type> sa;  // suffix array
};


#endif //SUFFIX_ARRAY_WRAPPER_HPP
back to top