https://github.com/jttoivon/MODER
Tip revision: c485231e5b468ae509306e1aaeebaa0f3004572d authored by Jarkko Toivonen on 31 March 2020, 18:10:18 UTC
Fixed indexing bug.
Fixed indexing bug.
Tip revision: c485231
probabilities.hpp
/*
MODER is a program to learn DNA binding motifs from SELEX datasets.
Copyright (C) 2016, 2017 Jarkko Toivonen,
Department of Computer Science, University of Helsinki
MODER is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
MODER is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef PROBABILITIES_HPP
#define PROBABILITIES_HPP
#include "matrix.hpp"
#include "data.hpp"
#include "parameters.hpp"
#include "common.hpp"
#include <string>
#include <vector>
#include <boost/tuple/tuple.hpp>
enum {atleast, atmost, exactly};
typedef double (*score_function_t)(const std::string&, const dmatrix&);
std::vector<double>
binomial_distribution(double p, int n);
std::vector<double>
binomial_distribution2(double p, int n);
std::vector<double>
poisson_binomial_distribution(const std::vector<double>& p);
double
poisson_binomial_expectation(const std::vector<double>& p);
struct condition
{
condition(int n_, int type_) : n(n_), type(type_) {}
int n;
int type;
};
// compute the tail of a random variable X \in 0,1,2,...,n with distribution p
double
tail(const std::vector<double>& p, condition cond);
// compute the expectation of a random variable X \in 0,1,2,...,n with distribution p
double
expectation(const std::vector<double>& p);
double
entropy(const std::vector<double>& v);
double
KL_distance(const std::vector<double>& p, const std::vector<double>& q);
double
symmetric_KL_distance(const std::vector<double>& p, const std::vector<double>& q);
double
information_content(const std::vector<double>& p,
std::vector<double> q = std::vector<double>());
double
average_information_content(const matrix<double>& m, std::vector<double> q = std::vector<double>());
double
matrix_information_content(const matrix<double>& m);
double
matrix_entropy(const matrix<double>& m);
double
matrix_KL_distance(const matrix<double>& m, const std::vector<double>& q);
double
matrix_KL_distance(const std::vector<double>& q, const matrix<double>& m);
double
aic_score(double maximum_log_likelihood, int lines, int k);
matrix<double>
matrix_to_logodds(const matrix<double>& m, const std::vector<double>& bg);
matrix<double>
matrix_to_weighted_logodds(const matrix<double>& m, const std::vector<double>& bg, double m_weight, double bg_weight);
dmatrix
matrix_to_affinity(const dmatrix& m);
double
compute_logodds_probability(const std::string& s, const dmatrix& m);
double
compute_normal_probability(const std::string& s, const dmatrix& m);
double
max_matrix_score(const dmatrix& m);
double
max_matrix_probability(const dmatrix& m);
std::string
string_giving_max_score(const dmatrix& m, bool use_rna);
matrix<double>
count_positional_background(const std::vector<std::string>& sequences);
template <typename T>
T
compute_bernoulli_probability(const std::string& line, const std::vector<double>& q)
{
T prob = 1.0;
for (int i=0; i < line.length(); ++i) {
prob *= q[to_int(line[i])];
assert(prob > 0);
}
return prob;
}
template <typename T, typename S>
T
compute_log_bernoulli_probability(const std::string& line, const std::vector<S>& q)
{
T log_prob = 0.0;
for (int i=0; i < line.length(); ++i) {
log_prob += q[to_int(line[i])];
}
return log_prob;
}
template <typename T>
T
compute_bernoulli_probability(big_int code, int k, const std::vector<double>& q)
{
T prob = 1.0;
for (int i=0; i < k; ++i, code >>= 2)
prob *= q[code & 3];
assert(prob > 0);
return prob;
}
// double
// compute_background_probability(const std::string& line, const std::vector<double>& q,
// const matrix<double>& q2 = dmatrix());
// computes the probability P(line|\gamma)
template <typename T>
T
compute_background_probability(const std::string& line,
const std::vector<double>& q,
const dmatrix& q2)
{
assert(line.length() > 0);
assert(q.size() == 4);
T prob = 1.0;
if (use_markov_background) {
error(true, "Not implemented");
// if (use_two_strands)
// return (compute_markov_probability(line, q, q2) + compute_markov_probability(reverse_complement(line), q, q2)) / 2.0;
// else
// return compute_markov_probability(line, q, q2);
}
else if (use_positional_background) {
for (int i=0; i < line.length(); ++i)
prob *= positional_background(to_int(line[i]),i);
assert(prob > 0);
}
else { // 0th order model
return compute_bernoulli_probability<T>(line, q);
}
return prob;
}
// computes the logarithm of probability P(line|\gamma)
template <typename T, typename S>
T
compute_log_background_probability(const std::string& line,
const std::vector<S>& q,
const dmatrix& q2)
{
assert(line.length() > 0);
assert(q.size() == 4);
T log_prob = 0.0;
if (use_markov_background) {
error(false, "Not implemented!");
// if (use_two_strands)
// return (compute_markov_probability(line, q, q2) + compute_markov_probability(reverse_complement(line), q, q2)) / 2.0;
// else
// return compute_markov_probability(line, q, q2);
}
else if (use_positional_background) {
for (int i=0; i < line.length(); ++i)
log_prob += positional_background(to_int(line[i]),i);
}
else { // 0th order model
return compute_log_bernoulli_probability<T>(line, q);
}
return log_prob;
}
// wrapper for the one below
// double
// compute_probability(const std::string& line, int pos, int direction,
// const matrix<double>& m,
// const std::vector<double>& q,
// const matrix<double>& q2 = dmatrix());
// Parameter direction tells the direction of the factor. It does NOT
// tell whether we chose the upper or lower strand.
// computes the probability P(line|\mathcal M(pos))
template <typename T>
T
compute_probability(const std::string& line_orig, const std::string& line_orig_rev,
int pos, int direction,
const matrix<double>& m,
const std::vector<double>& q,
const matrix<double>& q2)
{
assert(q[0]>0 && q[0]<=1);
assert(q[1]>0 && q[1]<=1);
assert(q[2]>0 && q[2]<=1);
assert(q[3]>0 && q[3]<=1);
assert(q.size() == 4);
int k = m.get_columns();
const std::string& line = direction == 1 ? line_orig : line_orig_rev;
int L = line.length();
if (direction == -1)
pos = L - (pos + k);
T prob = 1.0;
// compute probability of motif part
std::string seq = line.substr(pos,k);
for (int i=0; i < k; ++i) {
assert(pos+i < L);
prob *= m(to_int(line[pos+i]),i);
}
if (use_markov_background) {
// compute probability on the left side of motif
prob *= q[to_int(line[0])];
for (int i=0; i < pos - 1; ++i) {
prob *= q2(to_int(line[i]),to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
prob *= (q[to_int(line[pos-1])] / q[to_int(line[0])]);
// compute probability on the right side of motif
prob *= q[to_int(line[pos+k])];
for (int i=pos+k; i < L-1; ++i) {
prob *= q2(to_int(line[i]), to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
prob *= (q[to_int(line[L-1])] / q[to_int(line[pos+k])]);
}
else if (use_positional_background) {
for (int i=0; i < line.length(); ++i) {
if (i >= pos && i < pos + k)
continue;
prob *= positional_background(to_int(line[i]),i);
}
}
else { // use 0th order model
// compute probability on the left side of motif
for (int i=0; i < pos; ++i) {
prob *= q[to_int(line[i])];
}
// compute probability on the right side of motif
for (int i=pos+k; i < line.length(); ++i) {
prob *= q[to_int(line[i])];
}
}
assert(prob > 0.0);
return prob;
}
// Parameter direction tells the direction of the factor. It does NOT
// tell whether we chose the upper or lower strand.
// computes the logarithm of probability P(line|\mathcal M(pos))
template <typename T, typename S>
T
compute_log_probability(const std::string& line_orig, const std::string& line_orig_rev,
int pos, int direction,
const matrix<T>& m,
const std::vector<S>& q,
const matrix<double>& q2)
{
/*
assert(q[0]>0 && q[0]<=1);
assert(q[1]>0 && q[1]<=1);
assert(q[2]>0 && q[2]<=1);
assert(q[3]>0 && q[3]<=1);
*/
assert(q.size() == 4);
int k = m.get_columns();
const std::string& line = direction == 1 ? line_orig : line_orig_rev;
int L = line.length();
if (direction == -1)
pos = L - (pos + k);
T log_prob = 0.0;
// compute probability of motif part
std::string seq = line.substr(pos,k);
for (int i=0; i < k; ++i) {
assert(pos+i < L);
log_prob += m(to_int(line[pos+i]),i);
}
if (use_markov_background) {
// compute probability on the left side of motif
log_prob += q[to_int(line[0])];
for (int i=0; i < pos - 1; ++i) {
log_prob += q2(to_int(line[i]),to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
log_prob += (q[to_int(line[pos-1])] / q[to_int(line[0])]);
// compute probability on the right side of motif
log_prob += q[to_int(line[pos+k])];
for (int i=pos+k; i < L-1; ++i) {
log_prob += q2(to_int(line[i]), to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
log_prob += (q[to_int(line[L-1])] / q[to_int(line[pos+k])]);
}
else if (use_positional_background) {
for (int i=0; i < line.length(); ++i) {
if (i >= pos && i < pos + k)
continue;
log_prob += positional_background(to_int(line[i]),i);
}
}
else { // use 0th order model
// compute probability on the left side of motif
for (int i=0; i < pos; ++i) {
log_prob += q[to_int(line[i])];
}
// compute probability on the right side of motif
for (int i=pos+k; i < line.length(); ++i) {
log_prob += q[to_int(line[i])];
}
}
return log_prob;
}
template <typename T>
T
compute_probability(const std::string& line_orig, int pos, int direction,
const matrix<double>& m,
const std::vector<double>& q,
const matrix<double>& q2)
{
return compute_probability<T>(line_orig, reverse_complement(line_orig), pos, direction, m, q, q2);
}
template <typename T>
T
compute_dimer_probability(const std::string& line_orig, const std::string& line_orig_rev,
int pos, int direction,
const matrix<double>& m1,
const matrix<double>& m2,
int d,
const std::vector<double>& q,
const matrix<double>& q2)
{
assert(q.size() == 4);
assert(q[0]>0 && q[0]<=1);
assert(q[1]>0 && q[1]<=1);
assert(q[2]>0 && q[2]<=1);
assert(q[3]>0 && q[3]<=1);
assert(d >= 0);
int L = line_orig.length();
T prob = 1.0;
int k1 = m1.get_columns();
int k2 = m2.get_columns();
int dimer_len = k1 + k2 + d;
const std::string& line = direction == 1 ? line_orig : line_orig_rev;
if (direction == -1)
pos = L - (pos + dimer_len);
int pos2 = pos + dimer_len - k2; // Starting position of the second motif
// compute probability of the first motif part
for (int i=0; i < k1; ++i) {
assert(pos+i < line.length());
prob *= m1(to_int(line[pos+i]), i);
}
// compute probability of the second motif part
for (int i=0; i < k2; ++i) {
assert(pos2+i < L);
prob *= m2(to_int(line[pos2+i]), i);
}
if (use_markov_background) {
// compute probability on the left side of motif
prob *= q[to_int(line[0])];
for (int i=0; i < pos-1; ++i) {
prob *= q2(to_int(line[i]),to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
prob *= (q[to_int(line[pos-1])] / q[to_int(line[0])]);
// compute probability between the motifs
if (d > 0) {
prob *= q[to_int(line[pos+k1])];
for (int i=pos+k1; i < pos2-1; ++i) {
prob *= q2(to_int(line[i]), to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
prob *= (q[to_int(line[pos2-1])] / q[to_int(line[pos+k1])]);
}
// compute probability on the right side of motif
prob *= q[to_int(line[pos2+k2])];
for (int i=pos2+k2; i < L-1; ++i) {
prob *= q2(to_int(line[i]),to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
prob *= (q[to_int(line[L-1])] / q[to_int(line[pos2+k2])]);
}
else if (use_positional_background) {
// compute probability on the left side of the first motif
for (int i=0; i < pos; ++i)
prob *= positional_background(to_int(line[i]), i);
// compute probability between the motifs
for (int i=pos+k1; i < pos2; ++i)
prob *= positional_background(to_int(line[i]), i);
// compute probability on the right side of the second motif
for (int i=pos2+k2; i < L; ++i)
prob *= positional_background(to_int(line[i]), i);
}
else { // use 0th order model
// compute probability on the left side of the first motif
for (int i=0; i < pos; ++i)
prob *= q[to_int(line[i])];
// compute probability between the motifs
for (int i=pos+k1; i < pos2; ++i)
prob *= q[to_int(line[i])];
// compute probability on the right side of the second motif
for (int i=pos2+k2; i < L; ++i)
prob *= q[to_int(line[i])];
}
assert(prob > 0.0);
return prob;
} // compute_dimer_probability
// Parameter direction tells the direction of the dimer. It does NOT
// tell whether we chose the upper or lower strand.
// computes the spaced dimer probability P(line|\mathcal dimer(pos))
// Assumes that the PWMs are already correctly oriented
template <typename T>
T
compute_log_dimer_probability(const std::string& line_orig, const std::string& line_orig_rev,
int pos, int direction,
const matrix<T>& m1,
const matrix<T>& m2,
int d,
const std::vector<T>& q,
const matrix<double>& q2)
{
assert(q.size() == 4);
/*
assert(q[0]>0 && q[0]<=1);
assert(q[1]>0 && q[1]<=1);
assert(q[2]>0 && q[2]<=1);
assert(q[3]>0 && q[3]<=1);
*/
assert(d >= 0);
int L = line_orig.length();
T log_prob = 0.0;
int k1 = m1.get_columns();
int k2 = m2.get_columns();
int dimer_len = k1 + k2 + d;
const std::string& line = direction == 1 ? line_orig : line_orig_rev;
if (direction == -1)
pos = L - (pos + dimer_len);
int pos2 = pos + dimer_len - k2; // Starting position of the second motif
// compute probability of the first motif part
for (int i=0; i < k1; ++i) {
assert(pos+i < line.length());
log_prob += m1(to_int(line[pos+i]), i);
}
// compute probability of the second motif part
for (int i=0; i < k2; ++i) {
assert(pos2+i < L);
log_prob += m2(to_int(line[pos2+i]), i);
}
if (use_markov_background) {
// compute probability on the left side of motif
log_prob += q[to_int(line[0])];
for (int i=0; i < pos-1; ++i) {
log_prob += q2(to_int(line[i]),to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
log_prob += (q[to_int(line[pos-1])] / q[to_int(line[0])]);
// compute probability between the motifs
if (d > 0) {
log_prob += q[to_int(line[pos+k1])];
for (int i=pos+k1; i < pos2-1; ++i) {
log_prob += q2(to_int(line[i]), to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
log_prob += (q[to_int(line[pos2-1])] / q[to_int(line[pos+k1])]);
}
// compute probability on the right side of motif
log_prob += q[to_int(line[pos2+k2])];
for (int i=pos2+k2; i < L-1; ++i) {
log_prob += q2(to_int(line[i]),to_int(line[i+1]));
}
if (direction == -1) // This correction is because ADM(rev(X)) != rev(ADM)(X)
log_prob += (q[to_int(line[L-1])] / q[to_int(line[pos2+k2])]);
}
else if (use_positional_background) {
// compute log_probability on the left side of the first motif
for (int i=0; i < pos; ++i)
log_prob += positional_background(to_int(line[i]), i);
// compute log_probability between the motifs
for (int i=pos+k1; i < pos2; ++i)
log_prob += positional_background(to_int(line[i]), i);
// compute log_probability on the right side of the second motif
for (int i=pos2+k2; i < L; ++i)
log_prob += positional_background(to_int(line[i]), i);
}
else { // use 0th order model
// compute log_probability on the left side of the first motif
for (int i=0; i < pos; ++i)
log_prob += q[to_int(line[i])];
// compute log_probability between the motifs
for (int i=pos+k1; i < pos2; ++i)
log_prob += q[to_int(line[i])];
// compute log_probability on the right side of the second motif
for (int i=pos2+k2; i < L; ++i)
log_prob += q[to_int(line[i])];
}
return log_prob;
} // compute_dimer_log_probability
template <typename T>
T
compute_dimer_probability(const std::string& line_orig, int pos, int direction,
const matrix<double>& m1,
const matrix<double>& m2,
int d,
const std::vector<double>& q,
const matrix<double>& q2)
{
return compute_dimer_probability<T>(line_orig, reverse_complement(line_orig), pos, direction, m1, m2, d, q, q2);
}
// double
// compute_probability(const std::string& line_orig, const std::string& line_orig_rev,
// int pos, int direction,
// const matrix<double>& m,
// const std::vector<double>& q,
// const matrix<double>& q2 = dmatrix());
/*
// wrapper for the one below
double
compute_dimer_probability(const std::string& line_orig,
int pos, int direction,
const matrix<double>& m1,
const matrix<double>& m2,
int d,
const std::vector<double>& q,
const matrix<double>& q2 = dmatrix());
double
compute_dimer_probability(const std::string& line_orig, const std::string& line_orig_rev,
int pos, int direction,
const matrix<double>& m1,
const matrix<double>& m2,
int d,
const std::vector<double>& q,
const matrix<double>& q2 = dmatrix());
*/
// double
// compute_bernoulli_probability(const std::string& line, const std::vector<double>& q);
// double
// compute_bernoulli_probability(big_int code, int k, const std::vector<double>& q);
boost::tuple<std::vector<double>, dmatrix, std::vector<int> >
count_background(const std::vector<std::string>& sequences, bool use_rna=false);
#endif // PROBABILITIES_HPP