/* * * nttest.hpp * Author: Hamid Mohamadi * Genome Sciences Centre, * British Columbia Cancer Agency */ #include #include #include #include #include #include #include #include #include #include #include #include "seqgen.hpp" #include "BloomFilter.hpp" #include "nthash.hpp" #ifdef _OPENMP #include #endif #define PROGRAM "nttest" static const char VERSION_MESSAGE[] = PROGRAM " Version 1.0.0 \n" "Written by Hamid Mohamadi.\n" "Copyright 2016 Canada's Michael Smith Genome Science Centre\n"; static const char USAGE_MESSAGE[] = "Usage: " PROGRAM " [OPTION]... QUERY\n" "Report bugs to hmohamadi@bcgsc.ca.\n"; namespace opt { unsigned threads=1; unsigned kmerLen=50; unsigned ibits = 8; unsigned nhash=1; unsigned nz; size_t nquery; size_t squery; size_t ngene; size_t sgene; unsigned method; unsigned maxitr; bool fastq = false; bool inpFlag = false; bool uniformity = false; } using namespace std; static const char shortopts[] = "k:b:h:j:q:l:t:g:m:a:iu"; enum { OPT_HELP = 1, OPT_VERSION }; static const struct option longopts[] = { { "threads", required_argument, NULL, 'j' }, { "kmer", required_argument, NULL, 'k' }, { "qnum", required_argument, NULL, 'q' }, { "qlen", required_argument, NULL, 'l' }, { "bit", required_argument, NULL, 'b' }, { "hash", required_argument, NULL, 'h' }, { "tnum", required_argument, NULL, 't' }, { "tlen", required_argument, NULL, 'g' }, { "max", required_argument, NULL, 'm' }, { "alg", required_argument, NULL, 'a' }, { "input", no_argument, NULL, 'i' }, { "uniformity", no_argument, NULL, 'u' }, { "help", no_argument, NULL, OPT_HELP }, { "version", no_argument, NULL, OPT_VERSION }, { NULL, 0, NULL, 0 } }; static const string itm[]= {"ntbase","nthash","city"}; void getFtype(const char *fName) { std::ifstream in(fName); std::string hLine; bool good=static_cast(getline(in,hLine)); in.close(); if(!good) { std::cerr<<"Error in reading file: "<') opt::fastq=false; else if (hLine[0]=='@') opt::fastq=true; else { std::cerr<<"Error in file format: "<(getline(uFile, hline)); good=static_cast(getline(uFile, line)); good=static_cast(getline(uFile, hline)); good=static_cast(getline(uFile, hline)); } else { do { good=static_cast(getline(uFile, hline)); if(hline[0]=='>'&&!line.empty()) break;// !line.empty() for the first rec if(hline[0]!='>')line+=hline; } while(good); if(!good&&!line.empty()) good=true; } return good; } static const unsigned char b2r[256] = { 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //0 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //1 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //2 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //3 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', //4 'A' 'C' 'G' 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', //5 'T' 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', //6 'a' 'c' 'g' 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', //7 't' 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //8 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //9 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //10 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //11 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //12 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //13 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //14 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //15 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N' }; void getCanon(std::string &bMer) { int p=0, hLen=(opt::kmerLen-1)/2; while (bMer[p] == b2r[(unsigned char)bMer[opt::kmerLen-1-p]]) { ++p; if(p>=hLen) break; } if (bMer[p] > b2r[(unsigned char)bMer[opt::kmerLen-1-p]]) { for (int lIndex = p, rIndex = opt::kmerLen-1-p; lIndex<=rIndex; ++lIndex,--rIndex) { char tmp = b2r[(unsigned char)bMer[rIndex]]; bMer[rIndex] = b2r[(unsigned char)bMer[lIndex]]; bMer[lIndex] = tmp; } } } void loadSeq(BloomFilter & myFilter, const string & seq) { if (seq.size() < opt::kmerLen) return; for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) { myFilter.insert(seq.c_str()+i); } } void loadSeqr(BloomFilter & myFilter, const string & seq) { if (seq.size() < opt::kmerLen) return; uint64_t fhVal, rhVal; myFilter.insert(seq.c_str(), fhVal, rhVal); for (size_t i = 1; i < seq.size() - opt::kmerLen + 1; i++) { myFilter.insert(fhVal, rhVal, seq[i-1], seq[i+opt::kmerLen-1]); } } void loadSeqm(BloomFilter & myFilter, const string & seq) { if (seq.size() < opt::kmerLen) return; for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); myFilter.insertMur(kmer.c_str()); } } void loadSeqc(BloomFilter & myFilter, const string & seq) { if (seq.size() < opt::kmerLen) return; for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); myFilter.insertCit(kmer.c_str()); } } void loadSeqx(BloomFilter & myFilter, const string & seq) { if (seq.size() < opt::kmerLen) return; for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); myFilter.insertXxh(kmer.c_str()); } } void loadBf(BloomFilter &myFilter, const char* faqFile) { getFtype(faqFile); ifstream uFile(faqFile); bool good = true; #ifdef _OPENMP #pragma omp parallel #endif for(string line; good;) { #ifdef _OPENMP #pragma omp critical(uFile) #endif good = getSeq(uFile, line); if(good) { if(itm[opt::method]=="city") loadSeqc(myFilter, line); else if(itm[opt::method]=="murmur") loadSeqm(myFilter, line); else if(itm[opt::method]=="xxhash") loadSeqx(myFilter, line); else if(itm[opt::method]=="ntbase") loadSeq(myFilter, line); else if(itm[opt::method]=="nthash") loadSeqr(myFilter, line); } } uFile.close(); } void querySeqr(BloomFilter & myFilter, const string & seq, size_t & fHit) { if (seq.size() < opt::kmerLen) return; uint64_t fhVal, rhVal; if(myFilter.contains(seq.c_str(), fhVal, rhVal)) { #ifdef _OPENMP #pragma omp atomic #endif ++fHit; } for (size_t i = 1; i < seq.size() - opt::kmerLen + 1; i++) { if(myFilter.contains(fhVal, rhVal, seq[i-1], seq[i+opt::kmerLen-1])) { #ifdef _OPENMP #pragma omp atomic #endif ++fHit; } } } void querySeq(BloomFilter & myFilter, const string & seq, size_t & fHit) { if (seq.size() < opt::kmerLen) return; for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) { if(myFilter.contains(seq.c_str()+i)) { #ifdef _OPENMP #pragma omp atomic #endif ++fHit; } } } void querySeqm(BloomFilter & myFilter, const string & seq, size_t & fHit) { if (seq.size() < opt::kmerLen) return; for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); if(myFilter.containsMur(kmer.c_str())) { #ifdef _OPENMP #pragma omp atomic #endif ++fHit; } } } void querySeqc(BloomFilter & myFilter, const string & seq, size_t & fHit) { if (seq.size() < opt::kmerLen) return; for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); if(myFilter.containsCit(kmer.c_str())) { #ifdef _OPENMP #pragma omp atomic #endif ++fHit; } } } void querySeqx(BloomFilter & myFilter, const string & seq, size_t & fHit) { if (seq.size() < opt::kmerLen) return; for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); if(myFilter.containsXxh(kmer.c_str())) { #ifdef _OPENMP #pragma omp atomic #endif ++fHit; } } } void queryBf(BloomFilter &myFilter, const char* faqFile) { getFtype(faqFile); ifstream uFile(faqFile); size_t fHit=0,totKmer=0; bool good = true; #ifdef _OPENMP #pragma omp parallel #endif for(string line; good;) { #ifdef _OPENMP #pragma omp critical(uFile) #endif good = getSeq(uFile, line); if(good) { if(itm[opt::method]=="city") querySeqc(myFilter, line, fHit); else if(itm[opt::method]=="murmur") querySeqm(myFilter, line, fHit); else if(itm[opt::method]=="xxhash") querySeqx(myFilter, line, fHit); else if(itm[opt::method]=="ntbase") querySeq(myFilter, line, fHit); else if(itm[opt::method]=="nthash") querySeqr(myFilter, line, fHit); #ifdef _OPENMP #pragma omp atomic #endif totKmer+=opt::squery-opt::kmerLen+1; } } uFile.close(); cerr << "tkmer=" << totKmer << " "; cerr << "fhits=" << fHit << " %" << setprecision(4) << fixed << (double)fHit/(double)totKmer << " "; } void hashSeqb(const string & seq) { for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) { if(NTC64(seq.c_str()+i, opt::kmerLen)) opt::nz++; } } void hashSeqr(const string & seq) { uint64_t fhVal,rhVal,hVal; hVal = NTC64(seq.c_str(), opt::kmerLen, fhVal, rhVal); if(hVal)opt::nz++; for (size_t i = 1; i < seq.length() - opt::kmerLen + 1; i++) { hVal = NTC64(seq[i-1], seq[i-1+opt::kmerLen], opt::kmerLen, fhVal, rhVal); if(hVal)opt::nz++; } } void hashSeqx(const string & seq) { for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); if(XXH64(kmer.c_str(), opt::kmerLen, 0))++opt::nz; } } void hashSeqc(const string & seq) { for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); if (CityHash64(kmer.c_str(), opt::kmerLen))++opt::nz; } } void hashSeqm(const string & seq) { for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) { string kmer = seq.substr(i, opt::kmerLen); getCanon(kmer); if(MurmurHash64A(kmer.c_str(), opt::kmerLen, 0))++opt::nz; } } void hashSeqbM(const string & seq) { uint64_t hVal[opt::nhash]; for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) { NTMC64(seq.c_str()+i, opt::kmerLen, opt::nhash, hVal); for(unsigned i=0; i1) { for(unsigned method=0; method<2; method++) { opt::nz=0; ifstream uFile(readName); string line; clock_t sTime = clock(); while(getSeq(uFile,line)) { if(itm[method]=="city") hashSeqcM(line); else if(itm[method]=="murmur") hashSeqmM(line); else if(itm[method]=="nthash") hashSeqrM(line); else if(itm[method]=="ntbase") hashSeqbM(line); else if(itm[method]=="xxhash") hashSeqxM(line); } cerr << (double)(clock() - sTime)/CLOCKS_PER_SEC << "\t"; uFile.close(); } cerr << "\n"; } else { for(unsigned method=0; method<2; method++) { opt::nz=0; ifstream uFile(readName); string line; clock_t sTime = clock(); while(getSeq(uFile, line)) { if(itm[method]=="city") hashSeqc(line); else if(itm[method]=="murmur") hashSeqm(line); else if(itm[method]=="nthash") hashSeqr(line); else if(itm[method]=="ntbase") hashSeqb(line); else if(itm[method]=="xxhash") hashSeqx(line); } cerr << (double)(clock() - sTime)/CLOCKS_PER_SEC << "\t"; uFile.close(); } cerr << "\n"; } } int main(int argc, char** argv) { bool die = false; for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { case '?': die = true; break; case 'j': arg >> opt::threads; break; case 'b': arg >> opt::ibits; break; case 'q': arg >> opt::nquery; break; case 'l': arg >> opt::squery; break; case 't': arg >> opt::ngene; break; case 'g': arg >> opt::sgene; break; case 'h': arg >> opt::nhash; break; case 'k': arg >> opt::kmerLen; //init_kmod(opt::kmerLen); break; case 'm': arg >> opt::maxitr; break; case 'a': arg >> opt::method; break; case 'i': opt::inpFlag=true; break; case 'u': opt::uniformity=true; break; case OPT_HELP: std::cerr << USAGE_MESSAGE; exit(EXIT_SUCCESS); case OPT_VERSION: std::cerr << VERSION_MESSAGE; exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { std::cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (argc - optind != 1 && argc - optind != 2) { std::cerr << PROGRAM ": missing arguments\n"; die = true; } if (die) { std::cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } const char *readName(argv[argc-1]); if (opt::uniformity) { std::cerr<<"bit/i="<