https://github.com/bcgsc/ntCard
Tip revision: 2040559b5e0f413bd7cfcea0ffbd2c19c2ccee24 authored by mohamadi on 17 May 2018, 22:57:14 UTC
ntCard with spaced seeds
ntCard with spaced seeds
Tip revision: 2040559
ntHashIterator.hpp
#ifndef NTHASH__ITERATOR_H
#define NTHASH__ITERATOR_H 1
#include <string>
#include <limits>
#include "nthash.hpp"
/**
* Iterate over hash values for k-mers in a
* given DNA sequence.
*
* This implementation uses ntHash
* function to efficiently calculate
* hash values for successive k-mers.
*/
class ntHashIterator
{
public:
/**
* Default constructor. Creates an iterator pointing to
* the end of the iterator range.
*/
ntHashIterator():
m_hVec(NULL),
m_pos(std::numeric_limits<std::size_t>::max())
{}
/**
* Constructor.
* @param seq address of DNA sequence to be hashed
* @param k k-mer size
* @param h number of hashes
*/
ntHashIterator(const std::string& seq, unsigned h, unsigned k):
m_seq(seq), m_h(h), m_k(k), m_hVec(new uint64_t[h]), m_pos(0)
{
init();
}
/**
* Constructor.
* @param seq address of DNA sequence to be hashed
* @param seed address of spaced seed
* @param k k-mer size
* @param h number of hashes
*/
ntHashIterator(const std::string& seq, const std::vector<std::string> &seed, unsigned h, unsigned k):
m_seq(seq), m_seed(seed), m_h(h), m_k(k), m_hVec(new uint64_t[h]), m_pos(0)
{
init_seed();
}
/** Initialize internal state of iterator */
void init()
{
if (m_k > m_seq.length()) {
m_pos = std::numeric_limits<std::size_t>::max();
return;
}
unsigned locN=0;
while (m_pos<m_seq.length()-m_k+1 && !NTMC64(m_seq.data()+m_pos, m_k, m_h, m_fhVal, m_rhVal, locN, m_hVec))
m_pos+=locN+1;
if (m_pos >= m_seq.length()-m_k+1)
m_pos = std::numeric_limits<std::size_t>::max();
}
/** Initialize internal state of iterator */
void init_seed()
{
if (m_k > m_seq.length()) {
m_pos = std::numeric_limits<std::size_t>::max();
return;
}
unsigned locN=0;
while (m_pos<m_seq.length()-m_k+1 && !NTMS64(m_seq.data()+m_pos, m_seed, m_k, m_h, m_fhVal, m_rhVal, locN, m_hVec))
m_pos+=locN+1;
if (m_pos >= m_seq.length()-m_k+1)
m_pos = std::numeric_limits<std::size_t>::max();
}
/** Advance iterator right to the next valid k-mer */
void next()
{
++m_pos;
if (m_pos >= m_seq.length()-m_k+1) {
m_pos = std::numeric_limits<std::size_t>::max();
return;
}
if(seedTab[(unsigned char)(m_seq.at(m_pos+m_k-1))]==seedN) {
m_pos+=m_k;
init();
}
else
NTMC64(m_seq.at(m_pos-1), m_seq.at(m_pos-1+m_k), m_k, m_h, m_fhVal, m_rhVal, m_hVec);
}
void next_seed()
{
++m_pos;
if (m_pos >= m_seq.length()-m_k+1) {
m_pos = std::numeric_limits<std::size_t>::max();
return;
}
if(seedTab[(unsigned char)(m_seq.at(m_pos+m_k-1))]==seedN) {
m_pos+=m_k;
init_seed();
}
else
NTMS64(m_seq.data()+m_pos, m_seed, m_seq.at(m_pos-1), m_seq.at(m_pos-1+m_k), m_k, m_h, m_fhVal, m_rhVal, m_hVec);
}
size_t pos() const{
return m_pos;
}
/** get pointer to hash values for current k-mer */
const uint64_t* operator*() const
{
return m_hVec;
}
/** test equality with another iterator */
bool operator==(const ntHashIterator& it) const
{
return m_pos == it.m_pos;
}
/** test inequality with another iterator */
bool operator!=(const ntHashIterator& it) const
{
return !(*this == it);
}
/** pre-increment operator */
ntHashIterator& operator++()
{
next();
return *this;
}
/** pre-decrement operator */
ntHashIterator& operator--()
{
next_seed();
return *this;
}
/** iterator pointing to one past last element */
static const ntHashIterator end()
{
return ntHashIterator();
}
/** destructor */
~ntHashIterator() {
if(m_hVec!=NULL)
delete [] m_hVec;
}
private:
/** DNA sequence */
std::string m_seq;
/** Spaced Seed sequence */
std::vector<std::string> m_seed;
/** number of hashes */
unsigned m_h;
/** k-mer size */
unsigned m_k;
/** hash values */
uint64_t *m_hVec;
/** position of current k-mer */
size_t m_pos;
/** forward-strand k-mer hash value */
uint64_t m_fhVal;
/** reverse-complement k-mer hash value */
uint64_t m_rhVal;
};
#endif