stHashIterator.hpp
#ifndef STHASH__ITERATOR_H
#define STHASH__ITERATOR_H 1
#include <string>
#include <limits>
#include "nthash.hpp"
/**
* Iterate over hash values for k-mers in a
* given DNA sequence.
*
* This implementation uses ntHash
* function to efficiently calculate
* hash values for successive k-mers.
*/
class stHashIterator
{
public:
static std::vector<std::vector<unsigned> > parseSeed(const std::vector<std::string> &seedString) {
std::vector<std::vector<unsigned> > seedSet;
for (unsigned i=0; i< seedString.size(); i++) {
std::vector<unsigned> sSeed;
for (unsigned j=0; j < seedString[i].size(); j++) {
if(seedString[i][j]!='1') sSeed.push_back(j);
}
seedSet.push_back(sSeed);
}
return seedSet;
}
/**
* Default constructor. Creates an iterator pointing to
* the end of the iterator range.
*/
stHashIterator():
m_hVec(NULL),
m_hStn(NULL),
m_pos(std::numeric_limits<std::size_t>::max())
{}
/**
* Constructor.
* @param seq address of DNA sequence to be hashed
* @param seed address of spaced seed
* @param k k-mer size
* @param h number of seeds
* @param h2 number of hashes per seed
*/
stHashIterator(const std::string& seq, const std::vector<std::vector<unsigned> >& seed, unsigned h, unsigned h2, unsigned k):
m_seq(seq), m_seed(seed), m_h(h), m_h2(h2), m_k(k), m_hVec(new uint64_t[h * h2]), m_hStn(new bool[h * h2]), m_pos(0)
{
init();
}
/** Initialize internal state of iterator */
void init()
{
if (m_k > m_seq.length()) {
m_pos = std::numeric_limits<std::size_t>::max();
return;
}
unsigned locN=0;
while (m_pos<m_seq.length()-m_k+1 && !NTMSM64(m_seq.data()+m_pos, m_seed, m_k, m_h, m_h2, m_fhVal, m_rhVal, locN, m_hVec, m_hStn))
m_pos+=locN+1;
if (m_pos >= m_seq.length()-m_k+1)
m_pos = std::numeric_limits<std::size_t>::max();
}
/** Advance iterator right to the next valid k-mer */
void next()
{
++m_pos;
if (m_pos >= m_seq.length()-m_k+1) {
m_pos = std::numeric_limits<std::size_t>::max();
return;
}
if(seedTab[(unsigned char)(m_seq.at(m_pos+m_k-1))]==seedN) {
m_pos+=m_k;
init();
}
else
NTMSM64(m_seq.data()+m_pos, m_seed, m_seq.at(m_pos-1), m_seq.at(m_pos-1+m_k), m_k, m_h, m_h2, m_fhVal, m_rhVal, m_hVec, m_hStn);
}
size_t pos() const{
return m_pos;
}
/** get pointer to hash values for current k-mer */
const bool* strandArray() const
{
return m_hStn;
}
/** get pointer to hash values for current k-mer */
const uint64_t* operator*() const
{
return m_hVec;
}
/** test equality with another iterator */
bool operator==(const stHashIterator& it) const
{
return m_pos == it.m_pos;
}
/** test inequality with another iterator */
bool operator!=(const stHashIterator& it) const
{
return !(*this == it);
}
/** pre-increment operator */
stHashIterator& operator++()
{
next();
return *this;
}
/** iterator pointing to one past last element */
static const stHashIterator end()
{
return stHashIterator();
}
/** destructor */
~stHashIterator() {
if(m_hVec!=NULL) {
delete [] m_hVec;
delete [] m_hStn;
}
}
private:
/** DNA sequence */
std::string m_seq;
/** Spaced Seed sequence */
std::vector<std::vector<unsigned> > m_seed;
/** number of seeds */
unsigned m_h;
/** number of hashes per seed */
unsigned m_h2;
/** k-mer size */
unsigned m_k;
/** hash values
* For m_h = n and m_h2 = m:
* [seed1Hash1, seed1Hash2 ... seed(n)Hash(m-1), seed(n)Hash(m)]
*/
uint64_t *m_hVec;
/** hash strands, forward = 0, reverse-complement = 1 */
bool *m_hStn;
/** position of current k-mer */
size_t m_pos;
/** forward-strand k-mer hash value */
uint64_t m_fhVal;
/** reverse-complement k-mer hash value */
uint64_t m_rhVal;
};
#endif