/* * * nthash.hpp * Author: Hamid Mohamadi * Genome Sciences Centre, * British Columbia Cancer Agency */ #ifndef NT_HASH_H #define NT_HASH_H #include // offset for the complement base in the random seeds table const uint8_t cpOff = 0x07; // shift for gerenerating multiple hash values const int multiShift = 27; // seed for gerenerating multiple hash values static const uint64_t multiSeed = 0x90b45d39fb6da1fa; // 64-bit random seeds corresponding to bases and their complements static const uint64_t seedA = 0x3c8bfbb395c60474; static const uint64_t seedC = 0x3193c18562a02b4c; static const uint64_t seedG = 0x20323ed082572324; static const uint64_t seedT = 0x295549f54be24456; static const uint64_t seedN = 0x0000000000000000; static const uint64_t vecA[64] = { 0x3c8bfbb395c60474,0x7917f7672b8c08e8,0xf22feece571811d0,0xe45fdd9cae3023a1,0xc8bfbb395c604743,0x917f7672b8c08e87,0x22feece571811d0f,0x45fdd9cae3023a1e, 0x8bfbb395c604743c,0x17f7672b8c08e879,0x2feece571811d0f2,0x5fdd9cae3023a1e4,0xbfbb395c604743c8,0x7f7672b8c08e8791,0xfeece571811d0f22,0xfdd9cae3023a1e45, 0xfbb395c604743c8b,0xf7672b8c08e87917,0xeece571811d0f22f,0xdd9cae3023a1e45f,0xbb395c604743c8bf,0x7672b8c08e87917f,0xece571811d0f22fe,0xd9cae3023a1e45fd, 0xb395c604743c8bfb,0x672b8c08e87917f7,0xce571811d0f22fee,0x9cae3023a1e45fdd,0x395c604743c8bfbb,0x72b8c08e87917f76,0xe571811d0f22feec,0xcae3023a1e45fdd9, 0x95c604743c8bfbb3,0x2b8c08e87917f767,0x571811d0f22feece,0xae3023a1e45fdd9c,0x5c604743c8bfbb39,0xb8c08e87917f7672,0x71811d0f22feece5,0xe3023a1e45fdd9ca, 0xc604743c8bfbb395,0x8c08e87917f7672b,0x1811d0f22feece57,0x3023a1e45fdd9cae,0x604743c8bfbb395c,0xc08e87917f7672b8,0x811d0f22feece571,0x023a1e45fdd9cae3, 0x04743c8bfbb395c6,0x08e87917f7672b8c,0x11d0f22feece5718,0x23a1e45fdd9cae30,0x4743c8bfbb395c60,0x8e87917f7672b8c0,0x1d0f22feece57181,0x3a1e45fdd9cae302, 0x743c8bfbb395c604,0xe87917f7672b8c08,0xd0f22feece571811,0xa1e45fdd9cae3023,0x43c8bfbb395c6047,0x87917f7672b8c08e,0x0f22feece571811d,0x1e45fdd9cae3023a }; static const uint64_t vecC[64] = { 0x3193c18562a02b4c,0x6327830ac5405698,0xc64f06158a80ad30,0x8c9e0c2b15015a61,0x193c18562a02b4c3,0x327830ac54056986,0x64f06158a80ad30c,0xc9e0c2b15015a618, 0x93c18562a02b4c31,0x27830ac540569863,0x4f06158a80ad30c6,0x9e0c2b15015a618c,0x3c18562a02b4c319,0x7830ac5405698632,0xf06158a80ad30c64,0xe0c2b15015a618c9, 0xc18562a02b4c3193,0x830ac54056986327,0x06158a80ad30c64f,0x0c2b15015a618c9e,0x18562a02b4c3193c,0x30ac540569863278,0x6158a80ad30c64f0,0xc2b15015a618c9e0, 0x8562a02b4c3193c1,0x0ac5405698632783,0x158a80ad30c64f06,0x2b15015a618c9e0c,0x562a02b4c3193c18,0xac54056986327830,0x58a80ad30c64f061,0xb15015a618c9e0c2, 0x62a02b4c3193c185,0xc54056986327830a,0x8a80ad30c64f0615,0x15015a618c9e0c2b,0x2a02b4c3193c1856,0x54056986327830ac,0xa80ad30c64f06158,0x5015a618c9e0c2b1, 0xa02b4c3193c18562,0x4056986327830ac5,0x80ad30c64f06158a,0x015a618c9e0c2b15,0x02b4c3193c18562a,0x056986327830ac54,0x0ad30c64f06158a8,0x15a618c9e0c2b150, 0x2b4c3193c18562a0,0x56986327830ac540,0xad30c64f06158a80,0x5a618c9e0c2b1501,0xb4c3193c18562a02,0x6986327830ac5405,0xd30c64f06158a80a,0xa618c9e0c2b15015, 0x4c3193c18562a02b,0x986327830ac54056,0x30c64f06158a80ad,0x618c9e0c2b15015a,0xc3193c18562a02b4,0x86327830ac540569,0x0c64f06158a80ad3,0x18c9e0c2b15015a6 }; static const uint64_t vecG[64] = { 0x20323ed082572324,0x40647da104ae4648,0x80c8fb42095c8c90,0x0191f68412b91921,0x0323ed0825723242,0x0647da104ae46484,0x0c8fb42095c8c908,0x191f68412b919210, 0x323ed08257232420,0x647da104ae464840,0xc8fb42095c8c9080,0x91f68412b9192101,0x23ed082572324203,0x47da104ae4648406,0x8fb42095c8c9080c,0x1f68412b91921019, 0x3ed0825723242032,0x7da104ae46484064,0xfb42095c8c9080c8,0xf68412b919210191,0xed08257232420323,0xda104ae464840647,0xb42095c8c9080c8f,0x68412b919210191f, 0xd08257232420323e,0xa104ae464840647d,0x42095c8c9080c8fb,0x8412b919210191f6,0x08257232420323ed,0x104ae464840647da,0x2095c8c9080c8fb4,0x412b919210191f68, 0x8257232420323ed0,0x04ae464840647da1,0x095c8c9080c8fb42,0x12b919210191f684,0x257232420323ed08,0x4ae464840647da10,0x95c8c9080c8fb420,0x2b919210191f6841, 0x57232420323ed082,0xae464840647da104,0x5c8c9080c8fb4209,0xb919210191f68412,0x7232420323ed0825,0xe464840647da104a,0xc8c9080c8fb42095,0x919210191f68412b, 0x232420323ed08257,0x464840647da104ae,0x8c9080c8fb42095c,0x19210191f68412b9,0x32420323ed082572,0x64840647da104ae4,0xc9080c8fb42095c8,0x9210191f68412b91, 0x2420323ed0825723,0x4840647da104ae46,0x9080c8fb42095c8c,0x210191f68412b919,0x420323ed08257232,0x840647da104ae464,0x080c8fb42095c8c9,0x10191f68412b9192 }; static const uint64_t vecT[64] = { 0x295549f54be24456,0x52aa93ea97c488ac,0xa55527d52f891158,0x4aaa4faa5f1222b1,0x95549f54be244562,0x2aa93ea97c488ac5,0x55527d52f891158a,0xaaa4faa5f1222b14, 0x5549f54be2445629,0xaa93ea97c488ac52,0x5527d52f891158a5,0xaa4faa5f1222b14a,0x549f54be24456295,0xa93ea97c488ac52a,0x527d52f891158a55,0xa4faa5f1222b14aa, 0x49f54be244562955,0x93ea97c488ac52aa,0x27d52f891158a555,0x4faa5f1222b14aaa,0x9f54be2445629554,0x3ea97c488ac52aa9,0x7d52f891158a5552,0xfaa5f1222b14aaa4, 0xf54be24456295549,0xea97c488ac52aa93,0xd52f891158a55527,0xaa5f1222b14aaa4f,0x54be24456295549f,0xa97c488ac52aa93e,0x52f891158a55527d,0xa5f1222b14aaa4fa, 0x4be24456295549f5,0x97c488ac52aa93ea,0x2f891158a55527d5,0x5f1222b14aaa4faa,0xbe24456295549f54,0x7c488ac52aa93ea9,0xf891158a55527d52,0xf1222b14aaa4faa5, 0xe24456295549f54b,0xc488ac52aa93ea97,0x891158a55527d52f,0x1222b14aaa4faa5f,0x24456295549f54be,0x488ac52aa93ea97c,0x91158a55527d52f8,0x222b14aaa4faa5f1, 0x4456295549f54be2,0x88ac52aa93ea97c4,0x1158a55527d52f89,0x22b14aaa4faa5f12,0x456295549f54be24,0x8ac52aa93ea97c48,0x158a55527d52f891,0x2b14aaa4faa5f122, 0x56295549f54be244,0xac52aa93ea97c488,0x58a55527d52f8911,0xb14aaa4faa5f1222,0x6295549f54be2445,0xc52aa93ea97c488a,0x8a55527d52f89115,0x14aaa4faa5f1222b }; static const uint64_t vecN[64] = { seedN,seedN,seedN,seedN,seedN,seedN,seedN,seedN, seedN,seedN,seedN,seedN,seedN,seedN,seedN,seedN, seedN,seedN,seedN,seedN,seedN,seedN,seedN,seedN, seedN,seedN,seedN,seedN,seedN,seedN,seedN,seedN, seedN,seedN,seedN,seedN,seedN,seedN,seedN,seedN, seedN,seedN,seedN,seedN,seedN,seedN,seedN,seedN, seedN,seedN,seedN,seedN,seedN,seedN,seedN,seedN, seedN,seedN,seedN,seedN,seedN,seedN,seedN,seedN }; static const uint64_t *msTab[256] = { vecN, vecT, vecN, vecG, vecA, vecN, vecN, vecC, // 0..7 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 8..15 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 16..23 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 24..31 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 32..39 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 40..47 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 48..55 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 56..63 vecN, vecA, vecN, vecC, vecN, vecN, vecN, vecG, // 64..71 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 72..79 vecN, vecN, vecN, vecN, vecT, vecN, vecN, vecN, // 80..87 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 88..95 vecN, vecA, vecN, vecC, vecN, vecN, vecN, vecG, // 96..103 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 104..111 vecN, vecN, vecN, vecN, vecT, vecN, vecN, vecN, // 112..119 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 120..127 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 128..135 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 136..143 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 144..151 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 152..159 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 160..167 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 168..175 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 176..183 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 184..191 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 192..199 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 200..207 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 208..215 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 216..223 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 224..231 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 232..239 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN, // 240..247 vecN, vecN, vecN, vecN, vecN, vecN, vecN, vecN // 248..255 }; static const uint64_t seedTab[256] = { seedN, seedT, seedN, seedG, seedA, seedN, seedN, seedC, // 0..7 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 8..15 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 16..23 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 24..31 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 32..39 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 40..47 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 48..55 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 56..63 seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 64..71 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 72..79 seedN, seedN, seedN, seedN, seedT, seedN, seedN, seedN, // 80..87 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 88..95 seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 96..103 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 104..111 seedN, seedN, seedN, seedN, seedT, seedN, seedN, seedN, // 112..119 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 120..127 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 128..135 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 136..143 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 144..151 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 152..159 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 160..167 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 168..175 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 176..183 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 184..191 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 192..199 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 200..207 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 208..215 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 216..223 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 224..231 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 232..239 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 240..247 seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN // 248..255 }; /*// assembly rol inline uint64_t rol (uint64_t v, size_t n) { asm("rol %b1, %0":"+r,r"(v):"i,c"(n)); return (v); } // assembly ror inline uint64_t ror (uint64_t v, size_t n) { asm("ror %b1, %0":"+r,r"(v):"i,c"(n)); return (v); }*/ // rotate "v" to the left 1 position inline uint64_t rol1(const uint64_t v) { return (v << 1) | (v >> 63); } // rotate "v" to the left by "s" positions inline uint64_t rol(const uint64_t v, int s) { if ((s &= 63) == 0) return v; return (v << s) | (v >> (64 - s)); } // rotate "v" to the right by 1 position inline uint64_t ror1(const uint64_t v) { return (v >> 1) | (v << 63); } // rotate "v" to the right by "s" positions inline uint64_t ror(const uint64_t v, int s) { if ((s &= 63) == 0) return v; return (v >> s) | (v << (64 - s)); } // forward-strand hash value of the base kmer, i.e. fhval(kmer_0) inline uint64_t getFhval(const char * kmerSeq, const unsigned k) { uint64_t hVal=0; for(unsigned i=0; i> multiShift; return hVal; } // canonical ntBase, using rotate ops inline uint64_t NTC64(const char * kmerSeq, const unsigned k) { uint64_t fhVal=0, rhVal=0; fhVal=getFhval(kmerSeq, k); rhVal=getRhval(kmerSeq, k); return (rhVal> multiShift; return hVal; } /* * Using pre-computed seed matrix msTab instead of rotate opts */ // ntHash inline uint64_t NTP64(const char * kmerSeq, const unsigned k) { uint64_t hVal=0; for(unsigned i=0; i> multiShift; return hVal; } // canonical ntBase inline uint64_t NTPC64(const char * kmerSeq, const unsigned k) { uint64_t fhVal=0, rhVal=0; for(unsigned i=0; i> multiShift; return hVal; } // multihash ntHash, ntBase inline void NTM64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t *hVal) { uint64_t bVal=0, tVal=0; bVal = NTP64(kmerSeq, k); hVal[0] = bVal; for(unsigned i=1; i> multiShift; hVal[i] = tVal; } } // one extra hash for given base hash inline uint64_t NTE64(const uint64_t hVal, const unsigned k, const unsigned i) { uint64_t tVal = hVal; tVal *= (i ^ k * multiSeed); tVal ^= tVal >> multiShift; return tVal; } // multihash ntHash for sliding k-mers inline void NTM64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t *hVal) { uint64_t bVal=0, tVal=0; bVal = rol1(hVal[0]) ^ msTab[charOut][k%64] ^ msTab[charIn][0]; hVal[0] = bVal; for(unsigned i=1; i> multiShift; hVal[i] = tVal; } } // canonical multihash ntBase inline void NTMC64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t *hVal) { uint64_t bVal=0, tVal=0; bVal = NTPC64(kmerSeq, k); hVal[0] = bVal; for(unsigned i=1; i> multiShift; hVal[i] = tVal; } } // canonical multihash ntHash inline void NTMC64(const char * kmerSeq, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal) { uint64_t bVal=0, tVal=0; bVal = NTPC64(kmerSeq, k, fhVal, rhVal); hVal[0] = bVal; for(unsigned i=1; i> multiShift; hVal[i] = tVal; } } // canonical multihash ntHash for sliding k-mers inline void NTMC64(const unsigned char charOut, const unsigned char charIn, const unsigned k, const unsigned m, uint64_t& fhVal, uint64_t& rhVal, uint64_t *hVal) { uint64_t bVal=0, tVal=0; bVal = NTPC64(charOut, charIn, k, fhVal, rhVal); hVal[0] = bVal; for(unsigned i=1; i> multiShift; hVal[i] = tVal; } } /* * ignoring k-mers containing nonACGT using ntHash function */ // canonical ntBase inline bool NTPC64(const char *kmerSeq, const unsigned k, uint64_t& hVal, unsigned& locN) { hVal=0; locN=0; uint64_t fhVal=0,rhVal=0; for(int i=k-1; i>=0; i--) { if(msTab[(unsigned char)kmerSeq[i]][(k-1-i)%64]==seedN) { locN=i; return false; } fhVal ^= msTab[(unsigned char)kmerSeq[i]][(k-1-i)%64]; rhVal ^= msTab[(unsigned char)kmerSeq[i]&cpOff][i%64]; } hVal = (rhVal=0; i--) { if(msTab[(unsigned char)kmerSeq[i]][(k-1-i)%64]==seedN) { locN=i; return false; } fhVal ^= msTab[(unsigned char)kmerSeq[i]][(k-1-i)%64]; rhVal ^= msTab[(unsigned char)kmerSeq[i]&cpOff][i%64]; } bVal = (rhVal> multiShift; hVal[i] = tVal; } return true; } // canonical ntHash inline bool NTPC64(const char *kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal, uint64_t& hVal, unsigned& locN) { hVal=fhVal=rhVal=0; locN=0; for(int i=k-1; i>=0; i--) { if(msTab[(unsigned char)kmerSeq[i]][(k-1-i)%64]==seedN) { locN=i; return false; } fhVal ^= msTab[(unsigned char)kmerSeq[i]][(k-1-i)%64]; rhVal ^= msTab[(unsigned char)kmerSeq[i]&cpOff][i%64]; } hVal = (rhVal=0; i--) { if(msTab[(unsigned char)kmerSeq[i]][(k-1-i)%64]==seedN) { locN=i; return false; } fhVal ^= msTab[(unsigned char)kmerSeq[i]][(k-1-i)%64]; rhVal ^= msTab[(unsigned char)kmerSeq[i]&cpOff][i%64]; } bVal = (rhVal> multiShift; hVal[i] = tVal; } return true; } #endif