https://github.com/jttoivon/MODER
Tip revision: c485231e5b468ae509306e1aaeebaa0f3004572d authored by Jarkko Toivonen on 31 March 2020, 18:10:18 UTC
Fixed indexing bug.
Fixed indexing bug.
Tip revision: c485231
kmer_tools.hpp
/*
MODER is a program to learn DNA binding motifs from SELEX datasets.
Copyright (C) 2016, 2017 Jarkko Toivonen,
Department of Computer Science, University of Helsinki
MODER is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
MODER is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef KMER_TOOLS_HPP
#define KMER_TOOLS_HPP
#include "type.hpp"
#include <vector>
#include <string>
#include <boost/unordered_map.hpp>
#include <boost/multi_array.hpp>
#include "unordered_map.hpp"
extern bool use_middle_gap;
/*
namespace std {
std::size_t
hash_value(const myuint128& input);
}
*/
// Hash function to make int128 work with boost::hash.
struct hash128
: std::unary_function<myuint128, std::size_t>
{
std::size_t
operator()(const myuint128& input) const
{
boost::hash<unsigned long long> hasher;
unsigned long long hashVal = 0;
unsigned long long mask = (1ull<<32) - 1;
for(int i = 0; i < 3; ++i)
{
hashVal *= 37;
hashVal += (mask & (input>>(32*i)));
}
return hasher(hashVal);
}
};
#define USE_HASH 1
//typedef unsigned long long int code_t; // These code bit representations of DNA sequences
//typedef myuint128 code_t;
typedef uint64_t code_t;
typedef unsigned int count_t;
///////////////////////////
//
// Define count_container_t
//
#ifdef USE_HASH
//typedef boost::unordered_map<code_t, count_t> count_container_t;
typedef my_unordered_map<code_t, count_t, hash128> count_container_t;
#else
typedef std::vector<count_t> count_container_t; // index type of std::vector is size_t
#endif
typedef boost::multi_array<count_container_t, 2> gapped_count_container_t;
void
get_kmer_counts(const std::vector<std::string>& sequences, int k,
count_container_t& count, bool use_two_strands = true, bool count_palindromes_twice = false);
void
get_gapped_kmer_counts(const std::vector<std::string>& sequences, int k, int gaps,
gapped_count_container_t& count);
void
get_gapped_kmer_counts_fast(const std::vector<std::string>& sequences, int k, int max_gap,
gapped_count_container_t& count);
namespace detail {
class reverse_4_2bit_string_type // this functor reverses the 4-mer stored in a byte
{
public:
reverse_4_2bit_string_type() : v(256) { // initialise the v vector
unsigned char mask[4] = {0xc0, 0x30, 0x0c, 0x03};
for (int i=0; i < 256; ++i) {
unsigned char x0=(i & mask[0])>>6;
unsigned char x1=(i & mask[1])>>2;
unsigned char x2=(i & mask[2])<<2;
unsigned char x3=(i & mask[3])<<6;
v[i] = x0 | x1 | x2 | x3;
}
}
unsigned char
operator()(unsigned char i) { return v[i]; }
private:
std::vector<unsigned char> v;
};
extern reverse_4_2bit_string_type my_reverse_4_2bit_string;
} // end namespace detail
/*
uint32_t
reverse32_2bitstring(uint32_t u, int k);
uint64_t
reverse64_2bitstring(uint64_t u, int k);
*/
template <typename T>
T
reverse_2bitstring(T u, int k)
{
static_assert(std::is_unsigned<T>::value,
"reverse_2bitstring requires unsigned parameter type");
unsigned char* p=reinterpret_cast<unsigned char*>(&u);
int bytes = sizeof(T);
int bits = bytes * 8;
for (int i=0; i < bytes; ++i)
p[i]=detail::my_reverse_4_2bit_string(p[i]);
for (int i=0; i < bytes/2; ++i)
std::swap(p[i], p[bytes-i-1]);
return u>>(bits-2*k);
}
template <typename T>
T
reverse_complement_2bitstring(T u, int k)
{
static_assert(std::is_unsigned<T>::value,
"reverse_complement_2bitstring requires unsigned parameter type");
const int number_of_bits = sizeof(T)*8;
assert(k <= number_of_bits/2);
assert(k > 0);
T result = reverse_2bitstring(u, k);
const T one = 1;
T mask = (one<<(2*k))-1; // Don't replace one by 1 !!!!!!!!!!!!!!!!
result = result^mask; // do the complementation
//assert(number_to_dna(result, l) == reverse_complement(number_to_dna(c, l)));
return result;
}
//big_int
//reverse_complement_2bitstring_big_int(big_int c, int l);
big_int
remove_gap_from_bitstring(big_int c, int k, int gap, int pos);
namespace {
template <typename T>
int
mypopcount(T u)
{
static_assert(std::is_unsigned<T>::value, "mypopcount requires unsigned parameter type");
unsigned char* p = reinterpret_cast<unsigned char*>(&u); // Break down into bytes
int result = 0;
for (int i=0; i < sizeof(u); ++i)
result += __builtin_popcount(p[i]);
return result;
}
template <>
int
mypopcount<uint32_t>(uint32_t u)
{
return __builtin_popcount(u);
}
template <>
int
mypopcount<uint64_t>(uint64_t u)
{
return __builtin_popcountll(u);
}
template <>
int
mypopcount<myuint128>(myuint128 u)
{
unsigned long long* p = reinterpret_cast<unsigned long long*>(&u); // Break down into two 64 bit words
int c = __builtin_popcountll(p[0]) + __builtin_popcountll(p[1]); // number of one bits in w
return c;
}
} // end nameless namespace
namespace detail {
template <typename T>
T
get_even_mask()
{
T x=0;
int number_of_bits = sizeof(T) * 8;
for (int i=0; i < number_of_bits / 2; ++i) {
x <<= 2;
x += 1;
}
return x;
}
} // end namespace detail
template <typename T>
int
hamming_distance_with_bits(T x, T y)
{
// clang compiler does not support this on (u)int128 type
#ifndef __clang__
static_assert(std::is_unsigned<T>::value,
"hamming_distance_with_bits requires unsigned parameter type");
#endif
// Note! Here we assume that the leftmost bits are cleared and contain no thrash.
// We could also get length of string as parameter and mask out the extra bits, just to be sure, but this would slow things down.
static const T evenmask = detail::get_even_mask<T>();
static const T oddmask = evenmask << 1;
T w = x ^ y;
T result = ((w & oddmask) >> 1) | (w & evenmask);
return mypopcount<T>(result);
}
// returns bitstring 000...000111...111 with k trailing ones
template <typename T>
T
trailing_ones(int k)
{
static_assert(std::is_unsigned<T>::value,
"trailing_ones requires unsigned parameter type");
int number_of_bits = sizeof(T) * 8;
assert(k <= number_of_bits);
assert(k >= 0);
return (static_cast<T>(1) << k) - 1;
}
// returns bitstring 111...111000...000 with k leading ones
template <typename T>
T
leading_ones(int k)
{
static_assert(std::is_unsigned<T>::value,
"leading_ones requires unsigned parameter type");
int number_of_bits = sizeof(T) * 8;
assert(k <= number_of_bits);
assert(k >= 0);
return ((static_cast<T>(1) << k) - 1) << (number_of_bits-k);
}
class kmers
{
public:
kmers(const std::vector<std::string>& sequences, int maxk_);
unsigned
count(int k, big_int code) const ;
unsigned
count(int k, const std::string& str) const ;
double
probability(int k, big_int code) const ;
double
probability(int k, const std::string& str) const;
int get_maxk() const
{ return maxk; }
private:
int maxk;
std::vector<count_container_t> counts;
std::vector<unsigned int> total_counts;
};
#endif // KMER_TOOLS_HPP