https://github.com/jttoivon/MODER
Raw File
Tip revision: c485231e5b468ae509306e1aaeebaa0f3004572d authored by Jarkko Toivonen on 31 March 2020, 18:10:18 UTC
Fixed indexing bug.
Tip revision: c485231
data.cpp
/*

    MODER is a program to learn DNA binding motifs from SELEX datasets.
    Copyright (C) 2016, 2017  Jarkko Toivonen,
    Department of Computer Science, University of Helsinki

    MODER is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    MODER is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

*/
#include "data.hpp"
#include "type.hpp"
#include "common.hpp"
#include "probabilities.hpp"
#include "kmer_tools.hpp"

//#include "common.hpp"

#include <cassert>
#include <iostream>
#include <algorithm>
#include <map>
#include <set>
#include <cstdio>

#include <boost/foreach.hpp>



// interpret string of nucleid acids as a number in base 4
// The first nucleotide of the string is in the most significant end
big_int
dna_to_number(const std::string& s)  
{
  assert(s.length() <= (sizeof(big_int)*4));

  big_int sum=0;
  for (int i=0; i < s.length(); ++i)
    sum = sum*4 + to_int(s[i]);
  
  return sum;
}

// converts integer back to a nucled acid sequence
// Sequence has l nucleotides
// The first nucleotide of the string is in the most significant end
std::string
number_to_dna(big_int i, int l)  
{
  assert(l <= sizeof(big_int)*4);
  assert(i>=0);

  char nucs[] = "ACGT";

  std::string s(l, 'X');
  while (l-- > 0) {
    s[l] = nucs[i & 3];
    i = i / 4;
  }
  return s;
}

// interpret string of nucleid acids A,C,G,T,N as a number in base 5
// The first nucleotide of the string is in the most significant end
big_int
extended_dna_to_number(const std::string& s)  
{
  
  assert(s.size() < 28);

  big_int sum=0;
  for (int i=0;i<s.size(); ++i)
    sum = sum*5 + to_int(s[i]);

  /*
  if (sum < 0) {
    std::cout << s << std::endl;
    throw "sika";
  }
  */
  return sum;
}

// converts integer back to a nucled acid sequence including N
// Sequence has l nucleotides
// The first nucleotide of the string is in the most significant end
std::string
number_to_extended_dna(big_int i, int l)  
{
  assert(i>=0);

  char nas[] = "ACGTN";

  std::string s;
  while (l-- > 0) {
    int m = i % 5;
    s.push_back(nas[m]);
    i = i / 5;
  }
  reverse(s.begin(), s.end());
  return s;
}












// interpret string of nucleid acids as a number in base 4
// The first nucleotide of the string is in the most significant end
big_int
vector_to_number(const std::vector<int>& s)  
{
  
  assert(s.size() < 32);

  big_int sum=0;
  for (int i=0;i<s.size(); ++i)
    sum = sum*4 + s[i];

  /*
  if (sum < 0) {
    throw "sika";
  }
  */
  return sum;
}

// converts integer back to a nucled acid sequence
// Sequence has l nucleotides
std::vector<int>
number_to_vector(big_int i, int l)  
{
  assert(i>=0);

  std::vector<int> s;
  while (l-- > 0) {
    int m = i % 4;
    s.push_back(m);
    i = i / 4;
  }
  reverse(s.begin(), s.end());
  return s;
}

// interpret string of nucleid acids A,C,G,T,N as a number in base 5
// The first nucleotide of the string is in the most significant end
big_int
extended_vector_to_number(const std::vector<int>& s)  
{
  
  assert(s.size() < 28);

  big_int sum=0;
  for (int i=0;i<s.size(); ++i)
    sum = sum*5 + s[i];

  /*
  if (sum < 0) {
    throw "sika";
  }
  */
  return sum;
}

// converts integer back to a nucled acid sequence including N
// Sequence has l nucleotides
std::vector<int>
number_to_extended_vector(big_int i, int l)  
{
  assert(i>=0);

  std::vector<int> s;
  while (l-- > 0) {
    int m = i % 5;
    s.push_back(m);
    i = i / 5;
  }
  reverse(s.begin(), s.end());
  return s;
}

int
N_count(const std::string& s)
{
  return std::count(s.begin(), s.end(), 'N');
}

int
non_N_count(const std::string& s)
{
  return s.length() - N_count(s);
}

bool
has_atmost_one_gap(const std::string& s)
{
  if (s.size() == 0)
    return true;

  if (toupper(s[0]) == 'N' or toupper(s[s.size()-1]) == 'N')
    throw "String contains n in the border";

  bool in_gap = false;
  int number_of_switches = 0;
  for (int i=0; i < s.size(); ++i) {
    if ((toupper(s[i]) == 'N') != in_gap) {
      ++number_of_switches;
      in_gap = !in_gap;
    }
  }

  return number_of_switches <= 2;
}

std::vector<std::string>
remove_duplicate_reads(const std::vector<std::string>& sequences_orig)
{
  std::map<std::string, int> read_count;
  BOOST_FOREACH(std::string s, sequences_orig) {
    std::string key = std::min(s, reverse_complement(s));
    ++read_count[key];
  }

  std::set<std::string> duplicates;
  std::string s;
  int count;
  BOOST_FOREACH(boost::tie(s, count), read_count) {
    if (count > 1)
      duplicates.insert(s);
  }
  printf("Dataset contains %zu duplicates\n", duplicates.size());
  std::vector<std::string> sequences;
  BOOST_FOREACH(std::string s, sequences_orig) {
    // if (duplicates.count(s) or duplicates.count(reverse_complement(s)))
    //   continue;

    std::string sr = reverse_complement(s);
    BOOST_FOREACH(std::string d, duplicates) {
      if (hamming_distance(s, d) <= 1 or hamming_distance(sr, d) <= 1)
	goto end;
    }
    
    sequences.push_back(s);
  end:
    ;
  }

  return sequences;
}






/*
template <typename T>
T
dna_to_number(const std::string& s)  
{
  
  //  assert(s.length() < std::numeric_limits<T>::digits/2);

  T sum=0;
  for (int i=0; i < s.length(); ++i)
    sum = sum*4 + to_int(s[i]);

  return sum;
}
*/



// Take only unique set of sequences
// This implementation is slower, but works for reads longer than 64 bp
std::vector<std::string>
remove_duplicate_reads_generic(const std::vector<std::string>& sequences_orig, int hamming_distance)
{
  std::vector<std::string> sequences;
  if (sequences_orig.size() == 0)
    return std::vector<std::string >();

  std::map<std::string, int> read_count;
  BOOST_FOREACH(std::string s, sequences_orig) {
    ++read_count[s];
  }

  if (hamming_distance == 0) {
    std::string str;
    int count;

    BOOST_FOREACH(boost::tie(str, count), read_count)
      sequences.push_back(str);
    return sequences;
  }

  std::multimap<int, std::string, std::greater<int> > reads_multimap;
  std::string str;
  int count;
  BOOST_FOREACH(boost::tie(str, count), read_count) {
    reads_multimap.insert(std::make_pair(count, str));
  }
  std::vector<std::pair<int, std::string> > reads_vector;
  BOOST_FOREACH(boost::tie(count, str), reads_multimap) {
    reads_vector.push_back(std::make_pair(count, str));
  }

  int size = reads_vector.size();
  std::vector<bool> deleted(size, false);
  for (int i=0; i < size; ++i) {
    if (deleted[i])
      continue;
    std::string current_str = reads_vector[i].second;
    for (int j=i+1; j < size; ++j) {
      if (deleted[j])
	continue;
      if (::hamming_distance(current_str, reads_vector[j].second) <= hamming_distance) {
	deleted[j] = true;
      }
    }
    sequences.push_back(current_str);
  }
  return sequences;
}



// Take only unique set of sequences
std::vector<std::string>
remove_duplicate_reads_faster(const std::vector<std::string>& sequences_orig, int hamming_distance)
{
  
  std::vector<std::string> sequences;
  if (sequences_orig.size() == 0)
    return std::vector<std::string >();
  const int L = sequences_orig[0].length();
  if (L > 64)
    return remove_duplicate_reads_generic(sequences_orig, hamming_distance);
  
  std::map<myuint128, int> read_count;
  BOOST_FOREACH(std::string s, sequences_orig) {
    std::string key = s;
    ++read_count[dna_to_number<myuint128>(key)];
  }

  if (hamming_distance == 0) {
    myuint128 id;
    int count;

    BOOST_FOREACH(boost::tie(id, count), read_count)
      sequences.push_back(number_to_dna<myuint128>(id, L));
    return sequences;
  }

  std::multimap<int, myuint128, std::greater<int> > reads_multimap;
  myuint128 id;
  int count;
  BOOST_FOREACH(boost::tie(id, count), read_count) {
    reads_multimap.insert(std::make_pair(count, id));
  }
  std::vector<std::pair<int, myuint128> > reads_vector;
  BOOST_FOREACH(boost::tie(count, id), reads_multimap) {
    reads_vector.push_back(std::make_pair(count, id));
  }

  int size = reads_vector.size();
  std::vector<bool> deleted(size, false);
  for (int i=0; i < size; ++i) {
    if (deleted[i])
      continue;
    myuint128 current_id = reads_vector[i].second;
    for (int j=i+1; j < size; ++j) {
      if (deleted[j])
	continue;
      if (hamming_distance_with_bits(current_id, reads_vector[j].second) <= hamming_distance) {
	deleted[j] = true;
      }
    }
    sequences.push_back(number_to_dna<myuint128>(current_id, L));
  }
  /*
  std::vector<std::string> sequences;
  std::vector<myint128> duplicates;
  std::vector<myint128> unique;
  myint128 b;
  int count;
  BOOST_FOREACH(boost::tie(b, count), read_count) {
    if (count > 1)
      duplicates.push_back(b);
    else
      unique.push_back(b);
    sequences.push_back(number_to_dna<myint128>(b, L));
  } 
  printf("Dataset contains %zu distinct sequences with multiple occurrences\n", duplicates.size());
  printf("Dataset contains %zu unique sequences\n", read_count.size() - duplicates.size());
  std::vector<std::string> sequences2;
  BOOST_FOREACH(std::string s, sequences_orig) {

    myint128 si  = dna_to_number<myint128>(s);
    BOOST_FOREACH(myint128 d, duplicates) {
      if (hamming_distance_with_bits(si, d) <= 1)
	goto end;
    }
    
    sequences2.push_back(s);
  end:
    ;
  }
  printf("If Hamming duplicates were removed, %zu sequences would remain", sequences2.size());
  */
  /*
  BOOST_FOREACH(myint128 si, unique) {

    std::string s = number_to_dna(si, L);
    std::string sr = reverse_complement(s);
    myint128 sri = dna_to_number<myint128>(sr);
    BOOST_FOREACH(myint128 d, duplicates) {
      if (hamming_distance_with_bits(si, d) <= 1 or hamming_distance_with_bits(sri, d) <= 1)
	goto end;
    }
    
    sequences.push_back(s);
  end:
    ;
  }
  */

  return sequences;
}


// returns number of scoring matrix hits
int
number_of_matches(const std::vector<std::string>& sequences, 
		  const matrix<double>& m, 
		  double threshold)
{
  int lines = sequences.size();
  assert(lines > 0);
  int L = sequences[0].length();
  int k = m.get_columns();
  score_function_t score_function;

  //  if (use_logodds_score)
  if (true)
    score_function = compute_logodds_probability;
  else
    score_function = compute_normal_probability;

  dmatrix matrix = m;
  dmatrix rev_comp_matrix = reverse_complement(matrix);
  int matching_sequences = 0;
  int matching_sites = 0;
  for (int i=0; i<lines; ++i) {
    const std::string& line = sequences[i];
    int limit = L-k+1;
    bool line_hit = false;
    for (int j=0; j < limit; ++j) {  // go through all possible starting positions
      double p1, p2;

      const std::string& site = line.substr(j, k);
      bool is_palindrome = is_palindromic(site);
      p1 = score_function(site, matrix);
      p2 = score_function(site, rev_comp_matrix);

      if (p1 >= threshold) {  // compare to cutoff
	//matches[i].push_back(occurrence(j,1,i));
	++matching_sites;
	line_hit = true;
	if (is_palindrome)  // count palindromes only once
	  continue;
      }
      
      if (p2 >= threshold) {  // compare to cutoff
	//matches[i].push_back(occurrence(j,-1,i));
	++matching_sites;
	line_hit = true;
      }
    }
    if (line_hit)
      ++matching_sequences;
  }

  return matching_sites;
}
back to top