https://github.com/jttoivon/MODER
Raw File
Tip revision: c485231e5b468ae509306e1aaeebaa0f3004572d authored by Jarkko Toivonen on 31 March 2020, 18:10:18 UTC
Fixed indexing bug.
Tip revision: c485231
common.hpp
/*

    MODER is a program to learn DNA binding motifs from SELEX datasets.
    Copyright (C) 2016, 2017  Jarkko Toivonen,
    Department of Computer Science, University of Helsinki

    MODER is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    MODER is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

*/
#ifndef COMMON_HPP
#define COMMON_HPP

#include "matrix.hpp"
#include "data.hpp"
#include "cob.hpp"
#include "orientation.hpp"

#include <cfloat>
#include <vector>
#include <map>
#include <string>
#include <cstring>
#include <boost/multi_array.hpp>
#include <boost/unordered_map.hpp>
#include <boost/tuple/tuple.hpp>
#include <boost/shared_ptr.hpp>

extern std::string header;

typedef std::vector<int> ivector;

double
cut(double x);


void
error(bool b, const std::string& message);

std::string
mybasename(const std::string& path);

int
bits_in_range(const std::bitset<64>& occ, int begin, int end);


std::string
print_bitvector(unsigned int x, int number_of_bits=sizeof(unsigned int)*8);


const char*
yesno(bool b);


template <typename T>
std::string
m(std::string s, T t)
{
  std::ostringstream ss;
  ss << s << ", default: " << t;
  return ss.str();
}

void
print_command_line(int argc, char* argv[]);

void
print_command_line(int argc, char* argv[], FILE* f);

std::vector<int>
count_frequencies(const std::string& s);


std::string
itoa(int i);

int
atoi(const std::string& s);

double
atof(const std::string& s);


std::string
to_string(std::string format, ...);

void
underline(std::string format, ...);

extern matrix<double> positional_background;

template <typename Iter, typename Key>
void
key_sort(Iter begin, Iter end, Key key);

template <typename Value, typename Key>
class key_sort_functor
{
public:

  key_sort_functor(Key k) : k_(k) {} 

  bool
  operator()(Value x, Value y) const
  {
    return k_(x) < k_(y); 
  }

private:
  Key k_;
};

template <typename Iter, typename Key>
void
key_sort(Iter begin, Iter end, Key key)
{
  typedef typename Iter::value_type Value;
  std::sort(begin, end, key_sort_functor<Value, Key>(key));
}

class division_functor
{
public:
  
  division_functor(double d_) : d(d_) {}

  double
  operator()(double x) const {
    return x/d;
  }

private:
  double d;
};

class multiplication_functor
{
public:
  
  multiplication_functor(double d_) : d(d_) {}

  double
  operator()(double x) const {
    return x*d;
  }

private:
  double d;
};

std::string
reverse_complement(const std::string& s);

std::string
reverse_complement_rna(const std::string& s);

std::string
reverse(const std::string& s);

bool
is_palindromic(const std::string& s);

int
palindromic_index(const std::string& s);

bool
is_nucleotide_string(const std::string& str);


matrix<double>
reverse_complement(const matrix<double>& m);

matrix<double>
reverse(const matrix<double>& m);

dmatrix
extend_matrix(const dmatrix& orig, int k);



//char complement(char c);

/*
// convert nucleotides ACGT to 0123
// Other characters are undefined
int
to_int(char c);

void
initialize_to_int();
*/


template <typename T>
boost::tuple<int,int,int,int>
get_ranges(const boost::multi_array<T, 2>& a)
{
  int row_begin, row_end, col_begin, col_end;
  row_begin = a.index_bases()[0];
  col_begin = a.index_bases()[1];
  row_end = row_begin + a.shape()[0];
  col_end = col_begin + a.shape()[1];
  return boost::make_tuple(row_begin, row_end, col_begin, col_end);
}


void
normalize_vector(std::vector<double>& v);

std::vector<double>
normalize_vector_copy(const std::vector<double>& v);

std::vector<double>
normalize_vector_copy(const std::vector<int>& v);

std::vector<double>
to_double_vector(const std::vector<int>& v);

void
normalize_map(std::map<big_int, double>& v);




std::pair<int,int>
read_sequences(const std::string& filename, std::vector<std::string>& seqs, bool allow_iupac=false);

std::pair<int,int>
read_fasta_sequences(const std::string& filename, std::vector<std::string>& seqs, bool allow_iupac=false);

std::pair<int,int>
read_fastq_sequences(const std::string& filename, std::vector<std::string>& seqs, bool allow_iupac=false);

std::string
join(const std::vector<std::string>& v, char c);

std::string
join(const std::vector<std::string>& v, const std::string& sep);

std::string
join_rev(const std::vector<std::string>& v, char c);

std::vector<std::string>
split(const std::string& s, char c);

template <typename T,
	  typename F,
	  typename R>
std::vector<R>
transform(const std::vector<T>& orig, F f)
{
  std::vector<R> result;
  std::transform(orig.begin(), orig.end(), std::back_inserter(result), f);
  return result;
}


class character_translation
{
public:

  character_translation(const std::string& from, const std::string& to) 
    : table(256)
  {
    assert(from.length() == to.length());

    for (int i=0; i < 256; ++i)
      table[i]=i;

    for (int i=0; i < from.length(); ++i)
      table[(unsigned char)from[i]] = to[i];
  }

  char
  operator()(char c) const
  {
    return table[(unsigned char) c];
  }

private:
  std::vector<char> table;
};

template <typename T>
class character_to_values
{
public:

  character_to_values(const std::string& from, const std::vector<T>& to, T default_value=T()) 
    : table(256)
  {
    assert(from.length() == to.size());

    for (int i=0; i < 256; ++i)
      table[i] = default_value;

    for (int i=0; i < from.length(); ++i)
      table[(unsigned char)from[i]] = to[i];
  }

  character_to_values(const std::string& from, T to, T default_value=T()) 
    : table(256)
  {

    for (int i=0; i < 256; ++i)
      table[i] = default_value;

    for (int i=0; i < from.length(); ++i)
      table[(unsigned char)from[i]] = to;
  }

  T
  operator()(char c) const
  {
    return table[(unsigned char) c];
  }

private:
  std::vector<T> table;
};

int
hamming_distance(const std::string& s, const std::string& t);

int
iupac_hamming_dist(const std::string& str, const std::string& pattern, int max_hd);

// finds the minimum Hamming distance between t and the substrings of s
int
min_hamming_distance(const std::string& s, const std::string& t);


void
check_data(const std::vector<std::string>& seqs, const std::string& valid_chars="ACGT");

// recalculate matrix m using frequencies of motif nucleotides 
// in subset [begin,end] of sequences
template <typename Iter>
matrix<double>
calculate_frequencies_in_subset(const std::vector<std::string>& sequences, Iter begin, Iter end, 
				const std::vector<int>& alignments, 
				const std::vector<int>& direction, 
				int k)
{
  matrix<double> m_new(4,k);
  m_new.fill_with(0);

  for (Iter i=begin;  i < end; ++i) {
    const std::string& line = sequences[*i];
    int n = line.length();
    assert(n>=k);
    int l = alignments[*i];
    std::string tmp = line.substr(l,k);
    if (direction[*i]==-1)
      tmp=reverse_complement(tmp);
    for (int j=0; j < k; ++j) {
      ++m_new(to_int(tmp[j]), j);
    }

  }

  // add pseudo counts
//   int pseudo_count=0;
//   const int pad = 1;
//   for (int i=0; i < 4; ++i)
//     for (int j=0; j < k; ++j)
//       if (m_new(i,j)==0) {
// 	m_new(i,j)=pad;
// 	pseudo_count += pad;
//       }

  //printf("pseudocount =%i    charcount=%i\n", pseudo_count, char_count);

  return m_new;

}

template <typename T, size_t N>
T
sum(T (&array)[N])
{
  T sum=0;
  for (size_t i=0; i < N; ++i)
    sum += array[i];

  return sum;
}

template <typename T>
T
sum(const std::vector<T>& array)
{
  T sum=0;
  size_t N = array.size();
  for (size_t i=0; i < N; ++i)
    sum += array[i];

  return sum;
}

template <typename T>
T
sum(const boost::multi_array<T, 2>& a) 
{
  int row_begin, row_end;
  int col_begin, col_end;
  boost::tie(row_begin, row_end, col_begin, col_end) = get_ranges(a);

  T sum = 0.0;
  for (int row=row_begin; row < row_end; ++row) {
    for (int col=col_begin; col < col_end; ++col) {
      sum += a[row][col];
    }
  }
  return sum;
}

/*

template <typename T>
T
sum(const matrix<T>& a) 
{
  int row_end;
  int col_end;
  boost::tie(row_end, col_end) = a.dim();

  T sum = 0.0;
  for (int row=0; row < row_end; ++row) {
    for (int col=0; col < col_end; ++col) {
      sum += a(row,col);
    }
  }
  return sum;
}
*/

template <typename K, typename T>
T
sum(const std::map<K, T>& m)
{
  T sum=0;

  typedef typename std::map<K,T>::const_iterator iterator;
  for (iterator i=m.begin(); i != m.end(); ++i)
    sum += i->second;

  return sum;
}

template <typename K, typename T, typename H>
T
sum(const boost::unordered_map<K, T, H>& m)
{
  T sum=0;

  typedef typename boost::unordered_map<K,T,H>::const_iterator iterator;
  for (iterator i=m.begin(); i != m.end(); ++i)
    sum += i->second;

  return sum;
}

template <typename T>
std::string
print_vector(const std::vector<T>& v)
{
  if (v.size() == 0)
    return std::string("[]");
  std::ostringstream result;
  result << "[";
  for (int i=0; i < v.size()-1; ++i)
    result << v[i] << ", ";
  result << v[v.size()-1] << "]";

  return result.str();
}

template <typename T>
std::string
print_vector(const std::vector<boost::shared_ptr<T> >& v)
{
  if (v.size() == 0)
    return std::string("[]");
  std::ostringstream result;
  result << "[";
  for (int i=0; i < v.size()-1; ++i)
    result << *v[i] << ", ";
  result << *v[v.size()-1] << "]";

  return result.str();
}

template <typename T>
std::string
print_vector(const std::vector<T>& v, const std::string& sep, int precision)
{
  if (v.size() == 0)
    return std::string("");
  std::ostringstream result;
  result.precision(precision);
  for (int i=0; i < v.size()-1; ++i)
    result << v[i] << sep;
  result << v[v.size()-1];

  return result.str();
}


template <>
std::string
print_vector<bool>(const std::vector<bool>& v);

template <typename T>
T
max_element(const std::vector<T>& array)
{
  T max=-std::numeric_limits<T>::max();
  size_t N = array.size();
  for (size_t i=0; i < N; ++i)
    max = std::max(max, array[i]);

  return max;
}

template <typename T>
T
max_element(const boost::multi_array<T, 2>& a)
{
  int row_begin, row_end, col_begin, col_end;
  boost::tie(row_begin, row_end, col_begin, col_end) = get_ranges(a);
  T max = -std::numeric_limits<T>::max();
  for (int r = row_begin; r < row_end; ++r)
    for (int c = col_begin; c < col_end; ++c)
      max = std::max(a[r][c], max);
  return max;
}


template <typename T>
T
min_element(const std::vector<T>& array)
{
  T min=array[0];
  size_t N = array.size();
  for (size_t i=1; i < N; ++i)
    min = std::min(min, array[i]);

  return min;
}

/*
template <typename K, typename V>
V
max_element(const my_unordered_map<K,V>& hash)
{
  return std::max_element(hash.begin(), hash.end())->second;
}
*/

template <typename K, typename V, typename H>
V
max_element(const boost::unordered_map<K, V, H>& m)
{
  if (m.size() == 0)
    throw "sika";

  V max=m.begin()->second;

  typedef typename boost::unordered_map<K,V, H>::const_iterator iterator;
  for (iterator i=m.begin(); i != m.end(); ++i)
    max = std::max(max, i->second);

  return max;
}


template <typename K, typename V, typename H>
K
arg_max(const boost::unordered_map<K, V, H>& m)
{
  if (m.size() == 0)
    throw "Cannot take maximum of an empty container";

  //  K arg_max=m.begin()->first;
  //  V max=m.begin()->second;
  //  std::cout << "Tassa" << std::endl;
  //  std::cout << m.begin()->first << std::endl;
  //  std::cout << m.begin()->second << std::endl;
  K arg_max=m.begin()->first;
  V max=m.begin()->second;

  typedef typename boost::unordered_map<K,V, H>::const_iterator iterator;
  for (iterator i=m.begin(); i != m.end(); ++i)
    if (i->second > max) {
      arg_max = i->first;
      max = i->second;
    }

  return arg_max;
}


/*
template <typename K, typename V>
V
arg_max(const boost::unordered_map<K,V>& hash)
{
  return std::max_element(hash.begin(), hash.end())->first;
}
*/

template <typename T>
typename std::vector<T>::size_type
arg_max(const std::vector<T>& array)
{
  assert(array.size() > 0);
  size_t maxarg=0;
  T max=array[maxarg];
  size_t N = array.size();
  for (size_t i=1; i < N; ++i) {
    if (array[i]>max) {
      max=array[i];
      maxarg=i;
    }
  }
  return maxarg;
}

template <typename T>
double
add_pseudo_counts(matrix<T>& m, const double pad = 1.0)
{
  int rows = m.get_rows();
  int columns = m.get_columns();
  double count = 0;
  for (int i=0; i < rows; ++i)
    for (int j=0; j < columns; ++j) {
	m(i,j)+=pad;
	count+=m(i,j);
    }
  return count;
}

template <typename T>
class prior
{
public:

  prior() : pseudo_counts(4,0) {}
  
  void
  use_add_one(double p = 1.0) 
  {
    for (int i=0; i < 4; ++i)
      pseudo_counts[i] = p;
  }

  void
  use_dirichlet(double b, const std::vector<double>& q)
  {
    assert( q.size() == 4 );
    for (int i=0; i < 4; ++i)
      pseudo_counts[i] = q[i] * b;
  }

  void
  add(matrix<T>& m)
  {
    int rows = m.get_rows();
    int columns = m.get_columns();
    assert(rows == 4);

    for (int i=0; i < rows; ++i)
      for (int j=0; j < columns; ++j)
	m(i,j) += pseudo_counts[i];
  }

  void
  add(std::vector<T>& v)
  {
    int size = v.size();
    
    for (int i=0; i < size; ++i)
      v[i] += pseudo_counts[i];
  }

  std::vector<double>
  get()
  {
    return pseudo_counts;
  }

private:
  std::vector<double> pseudo_counts;
};

// x == base^2 * result[0] + base * result[1] + result[2]
std::vector<int>
decode_base(int base, int x);


int
code_base(int base, int a, int b, int c, int d);


int
code_base(int base, const std::vector<int>& v);


namespace {

  template <typename T>
  void
  print_cell(FILE* fp, const std::string& format, const T& cell)
  {
    fprintf(fp, format.c_str(), cell);
  }

  template <>
  void
  print_cell(FILE* fp, const std::string& format, const std::string& cell)
  {
    fprintf(fp, format.c_str(), cell.c_str());
  }

}

/*
template <typename T, unsigned long N, unsigned long I>
typename boost::multi_array<T, N>::extent_gen::template gen_type<N>::type
xget_shape_helper(const boost::multi_array<T, N>& a, typename boost::multi_array<T, N>::extent_gen::template gen_type<I>::type ext)
{
  typedef typename boost::multi_array<T, N>::extent_range range;
  int row_begin, row_end, col_begin, col_end;
  row_begin = a.index_bases()[I];
  row_end = row_begin + a.shape()[I];
  return xget_shape_helper<T, N, I+1>(a, ext[range(row_begin, row_end)]);
}

template <typename T, unsigned long N>
typename boost::multi_array<T, N>::extent_gen::template gen_type<N>::type
xget_shape_helper<T,N,N>(const boost::multi_array<T, N>& a, typename boost::multi_array<T, N>::extent_gen::template gen_type<N>::type ext)
{
  return ext;
}


template <typename T, unsigned long N>
typename boost::multi_array<T, N>::extent_gen::template gen_type<N>::type
xget_shape(const boost::multi_array<T, N>& a)
{
  typedef typename boost::multi_array<T, N>::extent_range range;
  int row_begin, row_end, col_begin, col_end;
  row_begin = a.index_bases()[0];
  row_end = row_begin + a.shape()[0];
  return xget_shape_helper<T, N, 1>(a, boost::extents[range(row_begin, row_end)]);
}
*/

template <typename T>
typename boost::multi_array<T, 2>::extent_gen::template gen_type<2>::type
get_shape(const boost::multi_array<T, 2>& a)
{
  typedef typename boost::multi_array<T, 1>::extent_range range;
  int row_begin, row_end, col_begin, col_end;
  row_begin = a.index_bases()[0];
  row_end = row_begin + a.shape()[0];
  col_begin = a.index_bases()[1];
  col_end = col_begin + a.shape()[1];
  return boost::extents[range(row_begin, row_end)][range(col_begin,col_end)];
}

template <typename T>
typename boost::multi_array<T, 1>::extent_gen::template gen_type<1>::type
get_shape(const boost::multi_array<T, 1>& a)
{
  typedef typename boost::multi_array<T, 1>::extent_range range;
  int row_begin, row_end;
  row_begin = a.index_bases()[0];
  row_end = row_begin + a.shape()[0];
  return boost::extents[range(row_begin, row_end)];
}

template <typename T>
void
fill_with(boost::multi_array<T, 2>& a, const T& value)
{
  int row_begin, row_end, col_begin, col_end;
  boost::tie(row_begin, row_end, col_begin, col_end) = get_ranges(a);

  for (int r = row_begin; r < row_end; ++r)
    for (int c = col_begin; c < col_end; ++c)
      a[r][c] = value;
}


std::vector<std::string>
integer_range(int begin, int end);



template <typename T>
void
print_array(FILE* fp, const boost::multi_array<T, 2>& a, 
	    const std::vector<std::string>& row_headers = std::vector<std::string>(), 
	    const std::vector<std::string>& col_headers = std::vector<std::string>(),
	    const std::string& format = "%.2f")
{
  int row_begin, row_end;
  int col_begin, col_end;
  boost::tie(row_begin, row_end, col_begin, col_end) = get_ranges(a);

  bool use_row_headers = row_headers.size() == (row_end - row_begin);
  bool use_col_headers = col_headers.size() == (col_end - col_begin);
  if (use_col_headers) {
    if (use_row_headers)
      fprintf(fp, "\t");
    for (int col=col_begin; col < col_end-1; ++col)
      fprintf(fp, "%s\t", col_headers[col-col_begin].c_str());
    fprintf(fp, "%s\n", col_headers[col_end-col_begin-1].c_str());
  }

  
  for (int row=row_begin; row < row_end; ++row) {
    if (use_row_headers)
      fprintf(fp, "%s\t", row_headers[row-row_begin].c_str());
    for (int col=col_begin; col < col_end-1; ++col) {
      print_cell(fp, format, a[row][col]);
      //      printf(format.c_str(), a[row][col]);
      fprintf(fp, "\t");
    }
    print_cell(fp, format, a[row][col_end-1]);
    //    printf(format.c_str(), a[row][col_end-1]);
    fprintf(fp, "\n");
  }
}

template <typename T>
boost::multi_array<T, 2>
read_array(FILE* fp)
{
  typedef typename boost::multi_array<T, 2> cob_type;

  typedef typename cob_type::extent_range range;
  typename cob_type::extent_gen extents;


  char* line = NULL;
  size_t dummy;
  int ret_val = getline(&line, &dummy, fp);
  assert(ret_val > 0);
  std::vector<std::string> parts = split(line, '\t');
  //assert(parts[0].length() == 0);
  free(line);
  int dmin = atoi(parts[0]);
  int dmax = atoi(parts[parts.size()-1]);
  int n=0;
  std::vector<std::string> lines;
  line = NULL;
  ret_val = getline(&line, &dummy, fp);
  while (ret_val > 0 and n < 4) {
    lines.push_back(line);
    free(line);
    line = NULL;
    ret_val = getline(&line, &dummy, fp);
    ++n;
  }
  free(line);

  assert(n == 3 or n == 4);
  cob_type result(extents[n][range(dmin, dmax+1)]);


  for (int i=0; i < n; ++i) {
    parts = split(lines[i], '\t');
    assert(orients[i] == parts[0]);
    for (int j=0; j < (dmax-dmin+1); ++j) {
      result[i][dmin+j] = atof(parts[j+1]);
    }
  }
  return result;
}

template <typename T>
boost::multi_array<T, 2>
read_array_file(const std::string& arrayfile)
{
  FILE* fp = fopen(arrayfile.c_str(), "r");
  if (fp == NULL) {
    fprintf(stderr, "Couldn't open file %s\n", arrayfile.c_str());
    exit(1);
  }
  boost::multi_array<T, 2> M = read_array<T>(fp);
  fclose(fp);
  return M;
}


template <typename T>
void
print_cob(FILE* fp, const boost::multi_array<T, 2>& a, const std::string& title = "", const std::string& format = "%.2f")
{
  int row_begin, row_end, col_begin, col_end;
  boost::tie(row_begin, row_end, col_begin, col_end) = get_ranges(a);  

  assert(row_begin == 0 and (row_end == 3 or row_end == 4 or row_end == 1 or row_end == 2));
  std::vector<std::string> col_headers = integer_range(col_begin, col_end);
  std::vector<std::string> row_headers;
  row_headers.push_back("HT");
  if (row_end == 2)
    row_headers.push_back("TH");
  else if (row_end >= 3) {
    row_headers.push_back("HH");
    row_headers.push_back("TT");
    if (row_end == 4)
      row_headers.push_back("TH");
  }

  if (title.length() > 0)
    fprintf(fp, "%s", title.c_str());
  print_array(fp, a, row_headers, col_headers, format);
}

template <typename T>
void
write_cob_file(const std::string& filename, const boost::multi_array<T, 2>& a)
{
  FILE* fp = fopen(filename.c_str(), "w");
  if (fp == NULL) {
    fprintf(stderr, "Couldn't open file %s for writing\n", filename.c_str());
    exit(1);
  }
  print_cob(fp, a, "", "%.5f");
  fclose(fp);
}

std::string&
SSS(const char* s);


#endif // COMMON_HPP

back to top