https://github.com/jttoivon/MODER
Tip revision: c485231e5b468ae509306e1aaeebaa0f3004572d authored by Jarkko Toivonen on 31 March 2020, 18:10:18 UTC
Fixed indexing bug.
Fixed indexing bug.
Tip revision: c485231
huddinge.cpp
/*
MODER is a program to learn DNA binding motifs from SELEX datasets.
Copyright (C) 2016, 2017 Jarkko Toivonen,
Department of Computer Science, University of Helsinki
MODER is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
MODER is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "huddinge.hpp"
#include "data.hpp"
#include "common.hpp"
#include "matrix.hpp"
#include "suffix_array_wrapper.hpp"
#include <functional>
bool require_contiguous_gap=false;
bool use_counts=false;
bool quiet = true;
bool print_maximum_only = false;
bool get_paths = false;
// These are for bit operations
enum {up = 1,
left = 2,
diag = 4};
std::string
mypad(const std::string& s, int pad, char c, int total_length)
{
int right = total_length - pad - s.length();
return std::string(pad, c) + s + std::string(right, c);
}
int
min3(int a, int b, int c)
{
return std::min(a, std::min(b, c));
}
int
max3(int a, int b, int c)
{
return std::max(a, std::max(b, c));
}
boost::tuple<dmatrix, imatrix>
huddinge_distance_helper(const std::string& x, const std::string& y)
{
int xlen = x.length();
int ylen = y.length();
dmatrix d(ylen+1, xlen+1);
imatrix path(ylen+1, xlen+1);
for (int i=1; i < ylen; ++i) {
for (int j=1; j < xlen; ++j) {
d(i, j) = d(i-1, j-1) + (y[i-1] == x[j-1] and y[i-1] != 'n');
}
}
// Lowest row
for (int j=1; j < xlen; ++j) {
d(ylen, j) = std::max(d(ylen, j-1), d(ylen-1, j-1) + (y[ylen-1] == x[j-1] and y[ylen-1] != 'n') );
if (d(ylen, j) == d(ylen, j-1))
path(ylen, j) |= left;
if (d(ylen, j) == d(ylen-1, j-1) + (y[ylen-1] == x[j-1] and y[ylen-1] != 'n'))
path(ylen, j) |= diag;
}
// rightmost column
for (int i=1; i < ylen; ++i) {
d(i, xlen) = std::max(d(i-1, xlen), d(i-1, xlen-1) + (y[i-1] == x[xlen-1] and x[xlen-1] != 'n') );
if (d(i, xlen) == d(i-1, xlen))
path(i, xlen) |= up;
if (d(i, xlen) == d(i-1, xlen-1) + (y[i-1] == x[xlen-1] and x[xlen-1] != 'n'))
path(i, xlen) |= diag;
}
d(ylen, xlen) = max3(d(ylen-1, xlen), d(ylen, xlen-1), d(ylen-1, xlen-1) + (y[ylen-1] == x[xlen-1] and x[xlen-1] != 'n'));
// Which routes gave maximum value
if (d(ylen, xlen) == d(ylen-1, xlen))
path(ylen, xlen) |= up;
if (d(ylen, xlen) == d(ylen, xlen-1))
path(ylen, xlen) |= left;
if (d(ylen, xlen) == d(ylen-1, xlen-1) + (y[ylen-1] == x[xlen-1] and x[xlen-1] != 'n'))
path(ylen, xlen) |= diag;
return boost::make_tuple(d, path);
}
character_to_values<bool> is_defined_base("ACGTacgt", true);
int
defined_bases(const std::string& x)
{
int count = 0;
int len = x.length();
for (int i=0; i < len; ++i)
count += is_defined_base((unsigned char)x[i]);
return count;
}
bool
one_contiguous_gap(std::string s)
{
assert(s[0] != 'n' and s[s.length()-1] != 'n');
int number_of_switches=0;
bool was_previous_defined = true;
for (int i=0; i < s.length(); ++i) {
char c = s[i];
if (was_previous_defined != is_defined_base(c))
number_of_switches += 1;
was_previous_defined = is_defined_base(c);
}
return number_of_switches <= 2;
}
int
huddinge_distance(const std::string& x, const std::string& y)
{
int xlen = x.length();
int ylen = y.length();
dmatrix d = huddinge_distance_helper(x, y).get<0>();
return std::max(defined_bases(x), defined_bases(y)) - d(ylen, xlen);
}
void
check(int i, int j, std::vector<int>& result, const imatrix& path)
{
if (path(i, j) & diag) {
int r = std::min(i, j);
if (i >= j) // Last row, y starts before x
result.push_back(-(i-r));
else // Last column, x starts before y
result.push_back((j-r));
}
}
std::vector<int>
huddinge_alignment(const std::string& x, const std::string& y)
{
int xlen = x.length();
int ylen = y.length();
dmatrix d;
imatrix path;
boost::tie(d, path) = huddinge_distance_helper(x, y);
std::vector<int> result; // In alignment a \in result, y starts after 'a' positions after x
check(ylen, xlen, result, path); // Check the lower right corner
// Check the last column
int i = ylen;
while (path(i, xlen) & up and i >= 0) {
i -= 1;
check(i, xlen, result, path);
}
// Check the last row
int j = xlen;
while (path(ylen, j) & left and j >= 0) {
j -= 1;
check(ylen, j, result, path);
}
return result;
}
// Align consequent strings in the input list
std::vector<std::string>
huddinge_align_all(const std::vector<std::string>& l)
{
int n=l.size();
std::vector<int> aligns(1, 0);
std::vector<int> lengths;
BOOST_FOREACH(const std::string& x, l) {
lengths.push_back(x.length());
}
int current_align=0;
for (int i=0; i < n-1; ++i) {
std::vector<int> a = huddinge_alignment(l[i], l[i+1]);
current_align=current_align+a[0];
aligns.push_back(current_align);
}
int max_align=-min_element(aligns);
int total_length = 0;
for (int i=0; i < n; ++i) {
total_length = std::max(total_length, max_align + aligns[i] + lengths[i]);
}
std::vector<std::string> result;
for (int i=0; i < n; ++i) {
result.push_back(mypad(l[i], max_align+aligns[i], 'n', total_length));
}
return result;
}
// The next algorithm is from page
// https://alistairisrael.wordpress.com/2009/09/22/simple-efficient-pnk-algorithm/
void
next_k_permutation(std::vector<int>& a, int k)
{
int n = a.size();
assert(n >= k);
int edge = k - 1;
// find j in (k to n-1) where a_j > a_edge
int j = k;
while (j < n and a[edge] >= a[j])
j += 1;
if (j < n)
std::swap(a[edge], a[j]);
//a[edge], a[j] = a[j], a[edge];
else {
// reverse a[k] to a[n-1]
std::reverse(a.begin()+k , a.end());
// find rightmost ascent to left of edge
int i = edge - 1;
while (i >= 0 and a[i] >= a[i+1])
i -= 1;
if (i < 0) //no more permutations
return;
// find j in (n-1 to i+1) where a[j] > a[i]
j = n - 1;
while (j > i and a[i] >= a[j])
j -= 1;
std::swap(a[i], a[j]);
//a[i], a[j] = a[j], a[i] # swap
//reverse a[i+1] to a[n-1]
std::reverse(a.begin()+(i+1), a.end());
// a[i+1:n] = reversed(a[i+1:n])
}
// return a
}
class operation
{
public:
virtual
void
execute(std::string& s, std::string& t) const=0;
virtual
std::ostream&
print(std::ostream& f) const=0;
};
std::ostream&
operator<<(std::ostream& f, const operation& o)
{
return o.print(f);
}
class swap : public operation
{
public:
swap(int i_, int j_) : i(i_), j(j_) {}
void
execute(std::string& s, std::string& t) const
{
s[i] = t[i];
s[j] = 'n';
}
std::ostream&
print(std::ostream& f) const
{
return f << to_string("Swap(%i,%i)", i, j);
}
private:
int i;
int j;
};
class doswitch : public operation
{
public:
doswitch(int i_) : i(i_) {}
void
execute(std::string& s, std::string& t) const
{
s[i] = t[i];
}
std::ostream&
print(std::ostream& f) const
{
return f << to_string("Switch(%i)", i);
}
private:
int i;
};
class change : public operation
{
public:
change(int i_) : i(i_) {}
void
execute(std::string& s, std::string& t) const
{
s[i] = t[i];
}
std::ostream&
print(std::ostream& f) const
{
return f << to_string("Change(%i)", i);
}
private:
int i;
};
suffix_array sa;
int
get_count(const std::string& x)
{
int count = sa.count_iupac(x);
if (is_palindromic(x))
return count / 2;
else
return count;
}
double
get_scaled_count(const std::string& x, int reference_length)
{
return get_count(x) * pow(4.0, defined_bases(x) - reference_length);
}
typedef std::pair<std::string, double> my_comp_param;
bool
mycomp(const my_comp_param& a, const my_comp_param& b)
{
return a.second < b.second;
}
std::string
make_header(int l)
{
std::string result(l, '-');
for (int i=0; i < l; ++i)
result[i] = i % 10;
return result;
}
counted_path_type
get_counted_path(const path_type& path, int reference_length)
{
counted_path_type result;
BOOST_FOREACH(std::string w, path) {
double count = use_counts ? get_scaled_count(w, reference_length) : 0;
result.push_back(boost::make_tuple(w, count));
}
return result;
}
// This is more or less Dijkstra's algorithm except vertices are added to the priority queue lazily
std::vector<counted_path_type>
check_for_longer_paths(std::string x, std::string y, int dmax, int wmin, int L, int reference_length)
{
if (not quiet)
printf("Finding route with highest weight, but at least weight %i, and length at most %d\n", wmin, dmax);
typedef std::map<std::string, int> int_map_type;
typedef std::map<std::string, double> double_map_type;
typedef std::map<std::string, std::vector<std::string> > list_map_type;
int_map_type d;
double_map_type w;
list_map_type prev;
prev[x]=std::vector<std::string>();
d[x]=0;
w[x]=get_scaled_count(x, reference_length); // non-final weights
double_map_type S; // final weights
while (w.size() > 0) {
std::string u;
double w1;
boost::tie(u, w1) = *std::max_element(w.begin(), w.end(), mycomp);
//u, w1 = sorted(w.items(), key = lambda x: x[1])[-1]; // get the maximum
S[u] = w1;
if (not quiet) {
printf("Current string is %s with weight %f", u.c_str(), w1);
printf(" and count %f and path length %i\n", get_scaled_count(u, reference_length), d[u]);
}
w.erase(u);
if (d[u] == dmax) // Path is already too long to be extended
continue;
huddinge_neighbourhood neighbourhood(u, 1, L, 1, L);
std::vector<boost::tuple<std::string, int> > neigh=neighbourhood.compute(require_contiguous_gap); // The neighbourhood contains u as well
std::string v;
BOOST_FOREACH(boost::tie(v, boost::tuples::ignore), neigh) {
if (S.count(v))
continue;
double c = get_scaled_count(v, reference_length);
if (c < wmin)
continue;
if (not w.count(v)) {
w[v] = 0;
d[v] = 1000000;
prev[v] = std::vector<std::string>();
}
double temp = std::min(S[u], c);
if (w[v] < temp or (w[v] == temp and d[v] >= d[u] + 1)) { // update weight of a node
if (w[v] < temp or d[v] > d[u] + 1)
prev[v] = std::vector<std::string>(1, u);
else
prev[v].push_back(u);
w[v] = temp;
d[v] = d[u] + 1;
}
}
}
std::vector<counted_path_type> result;
if (not quiet)
printf("Weight of y is %f\n", S[y]);
if (S.count(y)) {
std::string temp=y;
int depth=d[y];
std::vector<std::string> path(depth+1, "");
std::vector<std::string> nodes(1, y);
while (nodes.size() > 0) {
temp=nodes.back();
nodes.pop_back();
path[d[temp]]=temp;
BOOST_FOREACH(std::string child, prev[temp])
nodes.push_back(child);
if (temp == x)
result.push_back(get_counted_path(path, reference_length));
}
}
if (not quiet)
printf("Size of result set from find_longer_paths is %zu\n", result.size());
return result;
}
std::string
strip(const std::string& s, char c)
{
int start, end;
for (start=0; start < s.length() and s[start] == c; ++start)
;
for (end=s.length(); end > 0 and s[end-1] == c; --end)
;
if (start >= end)
return std::string();
else
return std::string(start, end - start);
}
bool
is_valid_path(const std::vector<std::string>& path)
{
if (not require_contiguous_gap)
return true;
BOOST_FOREACH(std::string s, path) {
if (not one_contiguous_gap(strip(s, 'n'))) {
return false;
}
}
return true;
}
bool
has_positive_count(const counted_path_type& counted_path, int reference_length)
{
std::string s;
double count;
BOOST_FOREACH(boost::tie(s, count), counted_path) {
if (count == 0.0)
return false;
}
return true;
}
// Finds paths of length Huddinge(x,y)
std::vector<counted_path_type>
find_paths(const std::string& x, const std::string& y)
{
int xlen = x.length();
int ylen = y.length();
int xd = defined_bases(x);
int yd = defined_bases(y);
int reference_length=std::max(xd,yd);
std::vector<int> alignments=huddinge_alignment(x, y);
if (not quiet)
printf ("Alignments are %s\n", print_vector(alignments).c_str());
std::vector<counted_path_type> result;
BOOST_FOREACH(int h, alignments) { // Iterate over all alignments
int l=0;
if (not quiet)
printf("%s\n", std::string(40, '-').c_str());
std::string s;
std::string t;
if (h >= 0) {
l = std::max(xlen, h+ylen);
s = mypad(x, 0, 'n', l);
t = mypad(y, h, 'n', l);
}
else{
int l = std::max(xlen - h, ylen);
s = mypad(x, -h, 'n', l);
t = mypad(y, 0, 'n', l);
}
std::string header = make_header(l);
if (not quiet) {
printf("%s\n", header.c_str());
printf("%s\n", s.c_str());
printf("%s\n", t.c_str());
printf("\n");
}
std::vector<int> Jzero;
std::vector<int> Jplus;
std::vector<int> Jminus;
for (int i=0; i < l; ++i) {
if (s[i] != t[i]) {
if (s[i] == 'n')
Jplus.push_back(i);
else if (t[i] == 'n')
Jminus.push_back(i);
else
Jzero.push_back(i);
}
}
if (not quiet) {
printf("Jzero %s\n", print_vector(Jzero).c_str());
printf("Jplus %s\n", print_vector(Jplus).c_str());
printf("Jminus %s\n", print_vector(Jminus).c_str());
}
int k = std::min(Jplus.size(), Jminus.size());
int n = std::max(Jplus.size(), Jminus.size());
int number_of_k_permutations=1; //reduce(lambda x, y: x*y, xrange(n, n-k, -1), 1);
for (int i=n; i > n-k; --i)
number_of_k_permutations *= i;
if (not quiet) {
printf("n is %i\n", n);
printf("k is %i\n", k);
printf("Number of k-permutations is %i\n", number_of_k_permutations);
}
for (int counter = 0; counter < number_of_k_permutations; ++counter) { // go through all sets of operations
std::vector<boost::shared_ptr<operation> > operations;
for (int i=0; i < k; ++i)
operations.push_back(boost::shared_ptr<operation>(new swap(Jplus[i], Jminus[i])));
if (xd >= yd)
for (int i=k; i < Jminus.size(); ++i) // Deletions from x
operations.push_back(boost::shared_ptr<operation>(new doswitch(Jminus[i])));
else
for (int i=k; i < Jplus.size(); ++i) // Insertions to x
operations.push_back(boost::shared_ptr<operation>(new doswitch(Jplus[i])));
BOOST_FOREACH(int i, Jzero)
operations.push_back(boost::shared_ptr<operation>(new change(i)));
if (not quiet)
printf("Operations: %s\n", print_vector(operations).c_str());
int no = operations.size();
std::vector<int> order(no);
for (int i=0; i < no; ++i)
order[i] = i;
int number_of_orders = 1; // reduce(lambda x, y: x*y, xrange(1, no+1), 1);
for (int i=1; i < no + 1; ++i)
number_of_orders *= i;
for (int order_counter = 0; order_counter < number_of_orders; ++order_counter) { // go through orderings of operations
std::vector<boost::shared_ptr<operation> > sorted_operations;
BOOST_FOREACH(int i, order)
sorted_operations.push_back(operations[i]);
std::string a = s;
std::string b = t;
path_type path;
path.push_back(a); // first sequence of the path
BOOST_FOREACH(boost::shared_ptr<operation> op, sorted_operations) {
op->execute(a,b);
path.push_back(a);
}
bool valid_path = is_valid_path(path);
if (valid_path) { // all strings in the path have at most one gap, if require_contiguous_gap
counted_path_type counted_path = get_counted_path(path, reference_length);
if (not use_counts or has_positive_count(counted_path, reference_length)) {
result.push_back(counted_path);
if (not quiet) {
printf("\n");
printf("%s\n", print_vector(sorted_operations).c_str());
printf("%s\n", header.c_str()); // header of indices for easier reading
std::string s;
double count;
BOOST_FOREACH(boost::tie(s, count), counted_path)
printf("%s\t%f\n", s.c_str(), count);
printf("\n");
}
}
} // if is_valid_path
next_k_permutation(order, no);
} // for order_counter
if (xd >= yd)
next_k_permutation(Jminus, k);
else
next_k_permutation(Jplus, k);
}
} // FOREACH h in alignments
if (not quiet)
printf("%s\n", std::string(40, '=').c_str());
return result;
}
std::vector<boost::tuple<std::string,int> >
huddinge_neighbourhood::compute(bool allow_only_one_gap)
{
for (int j=1; j < x.length() + 1; ++j)
U[0][j] = insert(0, j);
// Compute array U
for (int i=1; i < L+1; ++i) {
for (int j=0; j < x.length()+1; ++j) {
str_dist_container ins = insert(i, j);
str_dist_container de = del(i, j);
str_dist_container ma = match(i, j);
std::set<std::string> keys;
std::string s;
BOOST_FOREACH(boost::tie(s, boost::tuples::ignore), ins)
keys.insert(s);
BOOST_FOREACH(boost::tie(s, boost::tuples::ignore), de)
keys.insert(s);
BOOST_FOREACH(boost::tie(s, boost::tuples::ignore), ma)
keys.insert(s);
BOOST_FOREACH(s, keys) {
int xerr = min3(my_get(ins, s).get<0>(), my_get(de, s).get<0>(), my_get(ma, s).get<0>());
int yerr = min3(my_get(ins, s).get<1>(), my_get(de, s).get<1>(), my_get(ma, s).get<1>());
assert(xerr != INT_MAX and yerr != INT_MAX);
U[i][j][s] = boost::make_tuple(xerr, yerr);
/*
int xpad=0, ypad = 0;
if (i > j)
xpad = i - j;
else
ypad = j - i;
printf("j=%i i=%i xerr=%i yerr=%i:\n", j, i, xerr, yerr);
std::string pad(xpad, ' ');
printf("%s%s\n", pad.c_str(), x.substr(0, j).c_str());
pad = std::string(ypad, ' ');
printf("%s%s\n", pad.c_str(), s.c_str());
*/
}
}
}
// Compute array T
boost::multi_array<str_dist_container, 2> T(boost::extents[L+1][x.length()+1]);
for (int i = 1; i < L+1; ++i) {
std::string s;
err_t e;
BOOST_FOREACH(boost::tie(s, e), U[i][0]) {
if (s[i-1] != 'N')
T[i][0][s] = e;
}
for (int j=1; j < x.length()+1; ++j) {
std::set<std::string> keys;
BOOST_FOREACH(boost::tie(s, boost::tuples::ignore), U[i][j])
keys.insert(s);
BOOST_FOREACH(boost::tie(s, boost::tuples::ignore), T[i][j-1])
keys.insert(s);
BOOST_FOREACH(s, keys) {
if (s[i-1] != 'N') {
err_t e1 = my_get(U[i][j], s);
err_t e2 = my_get(T[i][j-1], s);
int xerr_temp = e2.get<0>();
if (xerr_temp != INT_MAX) // This makes sure that the value doesn't overflow
xerr_temp += (x[j-1] != 'N');
err_t temp = boost::make_tuple(std::min(e1.get<0>(), xerr_temp),
std::min(e1.get<1>(), e2.get<1>()));
if (std::max(temp.get<0>(), temp.get<1>()) <= h)
T[i][j][s] = temp;
}
}
}
}
std::vector<boost::tuple<std::string, int> > result;
for (int i=1; i < L+1; ++i) {
std::string s;
err_t err;
BOOST_FOREACH(boost::tie(s, err), T[i][x.length()]) {
if (non_N_count(s) >= min_kmer_len and non_N_count(s) <= max_kmer_len and (not allow_only_one_gap or has_atmost_one_gap(s))) {
int distance = std::max(err.get<0>(), err.get<1>());
//printf("%%%s %i %i\n", s.c_str(), err.get<0>(), err.get<1>());
//printf("%%%s %i\n", s.c_str(), std::max(err.get<0>(), err.get<1>()));
result.push_back(boost::make_tuple(s, distance));
}
}
}
return result;
}
str_dist_container
huddinge_neighbourhood::insert(int i, int j)
{
str_dist_container result;
if (i==0 and j > 0) {
std::string s;
err_t err;
//printf("Size of U[%i][%i] is %zu\n", i, j-1, U[i][j-1].size());
BOOST_FOREACH(boost::tie(s, err), U[i][j-1]) {
// if (std::max(err.get<0>(), err.get<1>()) == INT_MAX)
// continue;
err_t temp = boost::make_tuple(err.get<0>() + (x[j-1] != 'N'), err.get<1>());
if (std::max(temp.get<0>(), temp.get<1>()) <= h) {
result[s] = temp;
// printf("j=%i i=%i xerr=%i yerr=%i:\n", j, 0, temp.get<0>(), temp.get<1>());
// printf("%s\n", x.substr(0, j).c_str());
// printf("\n");
}
}
}
return result;
}
str_dist_container
huddinge_neighbourhood::del(int i, int j)
{
str_dist_container result;
std::string sigma = (i==1 or i==L) ? "ACGT" : "ACGTN";
if (i > 0 and (j==0 or j == x.length())) {
std::string s;
err_t err;
BOOST_FOREACH(boost::tie(s, err), U[i-1][j]) {
// if (std::max(err.get<0>(), err.get<1>()) == INT_MAX)
// continue;
BOOST_FOREACH(char w, sigma) {
err_t temp = boost::make_tuple(err.get<0>(), err.get<1>() + (w != 'N'));
if (std::max(temp.get<0>(), temp.get<1>()) <= h)
result[s+w] = temp;
}
}
}
return result;
}
bool
cerror(char a, char b)
{
return a != 'N' and a != b;
}
str_dist_container
huddinge_neighbourhood::match(int i, int j)
{
str_dist_container result;
std::string sigma = (i==1 or i==L) ? "ACGT" : "ACGTN";
if (i > 0 and j > 0) {
std::string s;
err_t err;
BOOST_FOREACH(boost::tie(s, err), U[i-1][j-1]) {
BOOST_FOREACH(char w, sigma) {
// if (std::max(err.get<0>(), err.get<1>()) == INT_MAX)
// continue;
err_t temp = boost::make_tuple(err.get<0>() + cerror(x[j-1], w),
err.get<1>() + cerror(w, x[j-1]));
if (std::max(temp.get<0>(), temp.get<1>()) <= h)
result[s+w] = temp;
}
}
}
return result;
}
err_t
huddinge_neighbourhood::my_get(const str_dist_container& h, const std::string& key)
{
typedef str_dist_container::const_iterator iterator;
iterator i = h.find(key);
if (i == h.end())
return boost::make_tuple(INT_MAX, INT_MAX);
else
return i->second;
}