https://github.com/jttoivon/MODER
Tip revision: c485231e5b468ae509306e1aaeebaa0f3004572d authored by Jarkko Toivonen on 31 March 2020, 18:10:18 UTC
Fixed indexing bug.
Fixed indexing bug.
Tip revision: c485231
kmer_tools.cpp
/*
MODER is a program to learn DNA binding motifs from SELEX datasets.
Copyright (C) 2016, 2017 Jarkko Toivonen,
Department of Computer Science, University of Helsinki
MODER is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
MODER is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "kmer_tools.hpp"
#include "common.hpp"
#include "data.hpp"
#include "parameters.hpp"
#include <cmath>
#include <cassert>
bool use_middle_gap = false;
namespace detail {
reverse_4_2bit_string_type my_reverse_4_2bit_string;
}
//namespace boost {
// Hash function to make int128 work with boost::hash.
std::size_t
hash_value(const myuint128& input)
{
boost::hash<unsigned long long> hasher;
unsigned long long hashVal = 0;
unsigned long long mask = (1ull<<32) - 1;
for(int i = 0; i < 3; ++i)
{
hashVal *= 37;
hashVal += (mask & (input>>(32*i)));
}
return hasher(hashVal);
}
//}
/*
uint32_t
reverse32_2bitstring(uint32_t u, int k)
{
unsigned char* p=reinterpret_cast<unsigned char*>(&u);
for (int i=0; i<4; ++i)
p[i]=my_reverse_4_2bit_string(p[i]);
std::swap(p[0], p[3]);
std::swap(p[1], p[2]);
return u>>(32-2*k);
}
uint64_t
reverse64_2bitstring(uint64_t u, int k)
{
unsigned char* p=reinterpret_cast<unsigned char*>(&u);
for (int i=0; i<8; ++i)
p[i]=my_reverse_4_2bit_string(p[i]);
std::swap(p[0], p[7]);
std::swap(p[1], p[6]);
std::swap(p[2], p[5]);
std::swap(p[3], p[4]);
return u>>(64-2*k);
}
*/
/* Not needed anymore
big_int
reverse_complement_2bitstring_big_int(big_int c, int k)
{
const int number_of_bits = sizeof(big_int)*8;
assert(k <= number_of_bits/2);
assert(k > 0);
big_int result = reverse_2bitstring(c, k);
big_int one = 1;
big_int mask = (one<<(2*k))-1; // Don't replace one by 1 !!!!!!!!!!!!!!!!
result = result^mask; // do the complementation
//assert(number_to_dna(result, l) == reverse_complement(number_to_dna(c, l)));
return result;
}
*/
// get counts for each k-mer in data
void
get_kmer_counts(const std::vector<std::string>& sequences, int k,
count_container_t& count, bool use_two_strands, bool count_palindromes_twice)
{
assert(sequences.size() > 0);
assert(k > 0 and k <= sizeof(big_int)*4);
#ifndef USE_HASH
code_t size = pow(4,k);
count.assign(size, 0);
#endif
big_int limit = pow(4, k);
//printf("k=%i limit=%i\n", k, limit);
for (int i = 0; i < sequences.size(); ++i) {
const std::string& line = sequences[i];
const int L = line.length();
const int positions = L-k+1;
std::string s = line.substr(0, k);
big_int temp=dna_to_number(s);
bool palindrome = is_palindromic(s);
assert(temp < limit);
++count[temp];
if (use_two_strands and (not palindrome or count_palindromes_twice))
++count[reverse_complement_2bitstring(temp, k)];
for (int j=1; j < positions; ++j) {
temp = (temp<<2) + to_int(line[k+j-1]);
temp &= (limit-1);
assert(temp < limit);
++count[temp];
palindrome = (temp == reverse_complement_2bitstring(temp, k));
if (use_two_strands and (not palindrome or count_palindromes_twice))
++count[reverse_complement_2bitstring(temp, k)];
}
}
//printf("Sizeof(count)=%lu\n", count.size());
}
kmers::kmers(const std::vector<std::string>& sequences, int maxk_) :
maxk(maxk_), counts(maxk_+1), total_counts(maxk_+1)
{
for (int k=1; k <= maxk; ++k) {
get_kmer_counts(sequences, k, counts[k], use_two_strands);
total_counts[k] = sum(counts[k]);
printf("Total_counts[%i]=%i\n", k, total_counts[k]);
}
}
unsigned int
kmers::count(int k, big_int code) const
{
assert(k >= 1 && k <= maxk);
return counts[k][code];
}
unsigned int
kmers::count(int k, const std::string& str) const
{
return count(k, dna_to_number(str));
}
double
kmers::probability(int k, big_int code) const
{
assert(k >= 1 && k <= maxk);
return counts[k][code] / (double) total_counts[k];
}
double
kmers::probability(int k, const std::string& str) const
{
return probability(k, dna_to_number(str));
}
namespace {
big_int
first_part(big_int c, int k, int gap, int pos)
{
const int len1=pos;
const int len2=k-pos;
const big_int mask1 = (1<<(2*len1))-1;
big_int first = (c >> (2*(gap+len2)) & mask1);
return first;
}
big_int
second_part(big_int c, int k, int gap, int pos)
{
const int len2=k-pos;
const big_int mask2 = (1<<(2*len2))-1;
big_int second = c & mask2;
return second;
}
}
big_int
remove_gap_from_bitstring(big_int c, int k, int gap, int pos)
{
const big_int mask = (1<<(2*k))-1;
if (gap==0)
return c & mask;
big_int first = first_part(c, k, gap, pos);
big_int second = second_part(c, k, gap, pos);
big_int limit = 1 << (2*k);
const int len2=k-pos;
big_int result = (first << (2*len2)) | second;
//printf("orig: %lli first: %lli second: %lli result: %lli\n", c, first, second, result);
assert(result < limit);
return result;
}
namespace {
template<typename T>
T
first_part_template(T c, int k, int gap, int pos)
{
const int len1=pos;
const int len2=k-pos;
T one = 1;
const big_int mask1 = (one<<(2*len1))-1;
T first = (c >> (2*(gap+len2)) & mask1);
return first;
}
template<typename T>
T
second_part_template(T c, int k, int gap, int pos)
{
const int len2=k-pos;
T one = 1;
const big_int mask2 = (one<<(2*len2))-1;
T second = c & mask2;
return second;
}
}
template<typename T>
T
remove_gap_from_bitstring_template(T c, int k, int gap, int pos)
{
T one = 1;
const big_int mask = (one<<(2*k))-1;
if (gap==0)
return c & mask;
big_int first = first_part_template(c, k, gap, pos);
big_int second = second_part_template(c, k, gap, pos);
big_int limit = one << (2*k);
const int len2=k-pos;
big_int result = (first << (2*len2)) | second;
assert(result < limit);
return result;
}
// this is almost equivalent to helper3, except that 'pos' argument works here
void
get_gapped_kmer_counts(const std::vector<std::string>& sequences, int k, int max_gap,
gapped_count_container_t& count)
{
assert(sequences.size() > 0);
assert(k > 0);
assert(max_gap >= 0);
const int L = sequences[0].length();
assert(k + max_gap <= L);
count.resize(boost::extents[max_gap+1][k+1]);
for (int gap_len=0; gap_len <= max_gap; ++gap_len) {
const int l = k+gap_len;
const int positions = L-l+1;
for (int i = 0; i < sequences.size(); ++i) {
const std::string& line = sequences[i];
const std::string& line2 = reverse_complement(line);
for (int gap_pos=1; gap_pos < (gap_len == 0 ? 2 : k); ++gap_pos) {
big_int id1=dna_to_number(line.substr(0, l));
big_int id2=dna_to_number(line2.substr(0, l));
++count[gap_len][gap_pos][remove_gap_from_bitstring(id1, k, gap_len, gap_pos)];
++count[gap_len][gap_pos][remove_gap_from_bitstring(id2, k, gap_len, gap_pos)];
for (int j=1; j < positions; ++j) {
id1 = (id1<<2) + to_int(line[l+j-1]);
id2 = (id2<<2) + to_int(line2[l+j-1]);
++count[gap_len][gap_pos][remove_gap_from_bitstring(id1, k, gap_len, gap_pos)];
++count[gap_len][gap_pos][remove_gap_from_bitstring(id2, k, gap_len, gap_pos)];
}
} // gap_pos
}
}
}
template <typename T>
inline
T
bit_substr(T code, int L, int pos, int len)
{
// assert(L >= pos + len);
T one = 1;
T mask = (one << (2*len)) - 1;
return (code >> (2*(L - (pos + len)))) & mask;
}
/*
template <typename T>
T
dna_to_number(const std::string& s)
{
// assert(s.length() < std::numeric_limits<T>::digits/2);
T sum=0;
for (int i=0; i < s.length(); ++i)
sum = sum*4 + to_int(s[i]);
return sum;
}
*/
void
get_gapped_kmer_counts_fast(const std::vector<std::string>& sequences, int k, int max_gap,
gapped_count_container_t& count)
{
int lines = sequences.size();
assert(lines > 0);
assert(k > 0);
assert(max_gap >= 0);
const int L = sequences[0].length();
assert(k + max_gap <= L);
count.resize(boost::extents[max_gap+1][k]);
#ifndef USE_HASH
for (int gap=0; gap <= max_gap; ++gap) // Initialize containers if not using hashing
for (int pos=1; pos < k; ++pos)
count[gap][pos].assign(pow(4, k), 0);
#endif
typedef myuint128 int_type;
std::vector<int_type> bits(lines);
std::vector<int_type> bits2(lines);
#pragma omp parallel for schedule(static)
for (int i=0; i<lines; ++i) {
bits[i] = dna_to_number<int_type>(sequences[i]);
bits2[i] = dna_to_number<int_type>(reverse_complement(sequences[i]));
}
#pragma omp parallel for schedule(static)
for (int gap_len=0; gap_len <= max_gap; ++gap_len) {
const int l = k+gap_len;
const int positions = L-l+1;
int first, last;
if (gap_len == 0)
first = last = 1;
else {
if (use_middle_gap) {
if (k % 2 == 0)
first = last = k / 2;
else {
first = k / 2;
last = first + 1;
}
} else {
first = 1;
last = k-1;
}
}
for (int i = 0; i < lines; ++i) {
for (int gap_pos=first; gap_pos <= last; ++gap_pos) {
for (int j=0; j < positions; ++j) {
int_type id1 = bit_substr(bits[i], L, j, l);
int_type id2 = bit_substr(bits2[i], L, j, l);
++count[gap_len][gap_pos][remove_gap_from_bitstring_template(id1, k, gap_len, gap_pos)];
++count[gap_len][gap_pos][remove_gap_from_bitstring_template(id2, k, gap_len, gap_pos)];
}
} // gap_pos
}
}
}