Raw File
/*
 *
 * nttest.hpp
 * Author: Hamid Mohamadi
 * Genome Sciences Centre,
 * British Columbia Cancer Agency
 */


#include <string>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <vector>
#include <algorithm>
#include <stdint.h>
#include <unistd.h>
#include <getopt.h>
#include "seqgen.hpp"
#include "BloomFilter.hpp"
#include "nthash.hpp"

#ifdef _OPENMP
#include <omp.h>
#endif

#define PROGRAM "nttest"

static const char VERSION_MESSAGE[] =
    PROGRAM " Version 1.0.0 \n"
    "Written by Hamid Mohamadi.\n"
    "Copyright 2016 Canada's Michael Smith Genome Science Centre\n";

static const char USAGE_MESSAGE[] =
    "Usage: " PROGRAM " [OPTION]... QUERY\n"
    "Report bugs to hmohamadi@bcgsc.ca.\n";

namespace opt {
unsigned threads=1;
unsigned kmerLen=50;
unsigned ibits = 8;
unsigned nhash=1;
unsigned nz;
size_t nquery;
size_t squery;
size_t ngene;
size_t sgene;
unsigned method;
unsigned maxitr;
bool fastq = false;
bool inpFlag = false;
bool uniformity = false;
}

using namespace std;

static const char shortopts[] = "k:b:h:j:q:l:t:g:m:a:iu";

enum { OPT_HELP = 1, OPT_VERSION };

static const struct option longopts[] = {
    { "threads",	required_argument, NULL, 'j' },
    { "kmer",	required_argument, NULL, 'k' },
    { "qnum",	required_argument, NULL, 'q' },
    { "qlen",	required_argument, NULL, 'l' },
    { "bit",	required_argument, NULL, 'b' },
    { "hash",	required_argument, NULL, 'h' },
    { "tnum",	required_argument, NULL, 't' },
    { "tlen",	required_argument, NULL, 'g' },
    { "max",	required_argument, NULL, 'm' },
    { "alg",	required_argument, NULL, 'a' },
    { "input",	no_argument, NULL, 'i' },
    { "uniformity",	no_argument, NULL, 'u' },
    { "help",	no_argument, NULL, OPT_HELP },
    { "version",	no_argument, NULL, OPT_VERSION },
    { NULL, 0, NULL, 0 }
};

static const string itm[]= {"ntbase","nthash","city"};

void getFtype(const char *fName) {
    std::ifstream in(fName);
    std::string hLine;
    bool good=static_cast<bool>(getline(in,hLine));
    in.close();
    if(!good) {
        std::cerr<<"Error in reading file: "<<fName<<"\n";
        exit(EXIT_FAILURE);
    }
    if(hLine[0]=='>')
        opt::fastq=false;
    else if (hLine[0]=='@')
        opt::fastq=true;
    else {
        std::cerr<<"Error in file format: "<<fName<<"\n";
        exit(EXIT_FAILURE);
    }
}

bool getSeq(std::ifstream &uFile, std::string &line) {
    bool good=false;
    std::string hline;
    line.clear();
    if(opt::fastq) {
        good=static_cast<bool>(getline(uFile, hline));
        good=static_cast<bool>(getline(uFile, line));
        good=static_cast<bool>(getline(uFile, hline));
        good=static_cast<bool>(getline(uFile, hline));
    }
    else {
        do {
            good=static_cast<bool>(getline(uFile, hline));
            if(hline[0]=='>'&&!line.empty()) break;// !line.empty() for the first rec
            if(hline[0]!='>')line+=hline;
        } while(good);
        if(!good&&!line.empty())
            good=true;
    }
    return good;
}

static const unsigned char b2r[256] = {
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //0
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //1
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //2
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //3
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', //4   'A' 'C' 'G'
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', //5   'T'
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', //6   'a' 'c' 'g'
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', //7   't'
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //8
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //9
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //10
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //11
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //12
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //13
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //14
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', //15
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'
};

void getCanon(std::string &bMer) {
    int p=0, hLen=(opt::kmerLen-1)/2;
    while (bMer[p] == b2r[(unsigned char)bMer[opt::kmerLen-1-p]]) {
        ++p;
        if(p>=hLen) break;
    }
    if (bMer[p] > b2r[(unsigned char)bMer[opt::kmerLen-1-p]]) {
        for (int lIndex = p, rIndex = opt::kmerLen-1-p; lIndex<=rIndex; ++lIndex,--rIndex) {
            char tmp = b2r[(unsigned char)bMer[rIndex]];
            bMer[rIndex] = b2r[(unsigned char)bMer[lIndex]];
            bMer[lIndex] = tmp;
        }
    }
}

void loadSeq(BloomFilter & myFilter, const string & seq) {
    if (seq.size() < opt::kmerLen) return;
    for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) {
        myFilter.insert(seq.c_str()+i);
    }
}

void loadSeqr(BloomFilter & myFilter, const string & seq) {
    if (seq.size() < opt::kmerLen) return;
    uint64_t fhVal, rhVal;
    myFilter.insert(seq.c_str(), fhVal, rhVal);
    for (size_t i = 1; i < seq.size() - opt::kmerLen + 1; i++) {
        myFilter.insert(fhVal, rhVal, seq[i-1], seq[i+opt::kmerLen-1]);
    }
}

void loadSeqm(BloomFilter & myFilter, const string & seq) {
    if (seq.size() < opt::kmerLen) return;
    for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        myFilter.insertMur(kmer.c_str());
    }
}

void loadSeqc(BloomFilter & myFilter, const string & seq) {
    if (seq.size() < opt::kmerLen) return;
    for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        myFilter.insertCit(kmer.c_str());
    }
}

void loadSeqx(BloomFilter & myFilter, const string & seq) {
    if (seq.size() < opt::kmerLen) return;
    for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        myFilter.insertXxh(kmer.c_str());
    }
}

void loadBf(BloomFilter &myFilter, const char* faqFile) {
    getFtype(faqFile);
    ifstream uFile(faqFile);
    bool good = true;
    
    #ifdef _OPENMP
    #pragma omp parallel
    #endif
    
    for(string line; good;) {

        #ifdef _OPENMP
        #pragma omp critical(uFile)
        #endif

        good = getSeq(uFile, line);
        if(good) {
            if(itm[opt::method]=="city")
                loadSeqc(myFilter, line);
            else if(itm[opt::method]=="murmur")
                loadSeqm(myFilter, line);
            else if(itm[opt::method]=="xxhash")
                loadSeqx(myFilter, line);
            else if(itm[opt::method]=="ntbase")
                loadSeq(myFilter, line);
            else if(itm[opt::method]=="nthash")
                loadSeqr(myFilter, line);
        }
    }
    uFile.close();
}

void querySeqr(BloomFilter & myFilter, const string & seq, size_t & fHit) {
    if (seq.size() < opt::kmerLen) return;
    uint64_t fhVal, rhVal;
    if(myFilter.contains(seq.c_str(), fhVal, rhVal)) {
        #ifdef _OPENMP
        #pragma omp atomic
        #endif
        ++fHit;
    }
    for (size_t i = 1; i < seq.size() - opt::kmerLen + 1; i++) {
        if(myFilter.contains(fhVal, rhVal, seq[i-1], seq[i+opt::kmerLen-1])) {
            #ifdef _OPENMP
            #pragma omp atomic
            #endif
            ++fHit;
        }
    }
}

void querySeq(BloomFilter & myFilter, const string & seq, size_t & fHit) {
    if (seq.size() < opt::kmerLen) return;
    for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) {
        if(myFilter.contains(seq.c_str()+i)) {
            #ifdef _OPENMP
            #pragma omp atomic
            #endif
            ++fHit;
        }
    }
}

void querySeqm(BloomFilter & myFilter, const string & seq, size_t & fHit) {
    if (seq.size() < opt::kmerLen) return;
    for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        if(myFilter.containsMur(kmer.c_str())) {
            #ifdef _OPENMP
            #pragma omp atomic
            #endif
            ++fHit;
        }
    }
}

void querySeqc(BloomFilter & myFilter, const string & seq, size_t & fHit) {
    if (seq.size() < opt::kmerLen) return;
    for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        if(myFilter.containsCit(kmer.c_str())) {
            #ifdef _OPENMP
            #pragma omp atomic
            #endif
            ++fHit;
        }
    }
}

void querySeqx(BloomFilter & myFilter, const string & seq, size_t & fHit) {
    if (seq.size() < opt::kmerLen) return;
    for (size_t i = 0; i < seq.size() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        if(myFilter.containsXxh(kmer.c_str())) {
            #ifdef _OPENMP
            #pragma omp atomic
            #endif
            ++fHit;
        }
    }
}

void queryBf(BloomFilter &myFilter, const char* faqFile) {
    getFtype(faqFile);
    ifstream uFile(faqFile);
    size_t fHit=0,totKmer=0;
    bool good = true;
    #ifdef _OPENMP
    #pragma omp parallel
    #endif
    for(string line; good;) {
        #ifdef _OPENMP
        #pragma omp critical(uFile)
        #endif
        good = getSeq(uFile, line);
        if(good) {
            if(itm[opt::method]=="city")
                querySeqc(myFilter, line, fHit);
            else if(itm[opt::method]=="murmur")
                querySeqm(myFilter, line, fHit);
            else if(itm[opt::method]=="xxhash")
                querySeqx(myFilter, line, fHit);
            else if(itm[opt::method]=="ntbase")
                querySeq(myFilter, line, fHit);
            else if(itm[opt::method]=="nthash")
                querySeqr(myFilter, line, fHit);
            #ifdef _OPENMP
            #pragma omp atomic
            #endif
            totKmer+=opt::squery-opt::kmerLen+1;
        }
    }
    uFile.close();
    cerr << "tkmer=" << totKmer << " ";
    cerr << "fhits=" << fHit << " %" << setprecision(4) << fixed << (double)fHit/(double)totKmer << " ";
}

void hashSeqb(const string & seq) {
    for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) {
        if(NTC64(seq.c_str()+i, opt::kmerLen)) opt::nz++;
    }
}

void hashSeqr(const string & seq) {
    uint64_t fhVal,rhVal,hVal;
    hVal = NTC64(seq.c_str(), opt::kmerLen, fhVal, rhVal);
    if(hVal)opt::nz++;
    for (size_t i = 1; i < seq.length() - opt::kmerLen + 1; i++) {
        hVal = NTC64(seq[i-1], seq[i-1+opt::kmerLen], opt::kmerLen, fhVal, rhVal);
        if(hVal)opt::nz++;
    }
}

void hashSeqx(const string & seq) {
    for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        if(XXH64(kmer.c_str(), opt::kmerLen, 0))++opt::nz;
    }
}

void hashSeqc(const string & seq) {
    for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        if (CityHash64(kmer.c_str(), opt::kmerLen))++opt::nz;
    }
}

void hashSeqm(const string & seq) {
    for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        if(MurmurHash64A(kmer.c_str(), opt::kmerLen, 0))++opt::nz;
    }
}

void hashSeqbM(const string & seq) {
    uint64_t hVal[opt::nhash];
    for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) {
        NTMC64(seq.c_str()+i, opt::kmerLen, opt::nhash, hVal);
        for(unsigned i=0; i<opt::nhash; i++)
            if(hVal[i])opt::nz++;
    }
}

void hashSeqrM(const string & seq) {
    uint64_t hVal[opt::nhash], fhVal, rhVal;
    NTMC64(seq.c_str(), opt::kmerLen, opt::nhash, fhVal, rhVal, hVal);
    for(unsigned h=0; h<opt::nhash; h++) if(hVal[h])opt::nz++;
    for (size_t i = 1; i < seq.length() - opt::kmerLen + 1; i++) {
        NTMC64(seq[i-1], seq[i-1+opt::kmerLen], opt::kmerLen, opt::nhash, fhVal, rhVal, hVal);
        for(unsigned h=0; h<opt::nhash; h++) if(hVal[h])opt::nz++;
    }
}

void hashSeqxM(const string & seq) {
    for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        for(unsigned h=0; h<opt::nhash; h++)
            if(XXH64(kmer.c_str(), opt::kmerLen, h))++opt::nz;
    }
}

void hashSeqcM(const string & seq) {
    for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        for(unsigned h=0; h<opt::nhash; h++)
            if (CityHash64WithSeed(kmer.c_str(), opt::kmerLen, h))++opt::nz;
    }
}

void hashSeqmM(const string & seq) {
    for (size_t i = 0; i < seq.length() - opt::kmerLen + 1; i++) {
        string kmer = seq.substr(i, opt::kmerLen);
        getCanon(kmer);
        for(unsigned h=0; h<opt::nhash; h++)
            if(MurmurHash64A(kmer.c_str(), opt::kmerLen, h))++opt::nz;
    }
}

void nthashBF(const char *geneName, const char *readName) {

#ifdef _OPENMP
    omp_set_num_threads(opt::threads);
#endif
    std::cerr<<"#threads="<<opt::threads << "\n";
    for(opt::method=0; opt::method<3; opt::method++) {
        std::cerr<<"method="<<itm[opt::method]<<" ";
        for (unsigned k=50; k<=opt::squery; k+=100) {
            opt::kmerLen = k;
            //init_kmod(opt::kmerLen);
            std::cerr<<"kmerl="<<opt::kmerLen<<"\n";
            for (unsigned i=1; i<6; i+=2) {
                opt::nhash = i;
                std::cerr<<"nhash="<<opt::nhash<<" ";
                

                #ifdef _OPENMP
                double sTime = omp_get_wtime();
                #else
                clock_t start = clock();
                #endif

                
                BloomFilter myFilter(opt::ibits*opt::ngene*opt::sgene , opt::nhash, opt::kmerLen);
                loadBf(myFilter, geneName);
                cerr << "|popBF|=" << myFilter.getPop() << " ";

                #ifdef _OPENMP
                std::cerr << "load_time=" <<setprecision(4) << fixed << omp_get_wtime() - sTime << "\n";
                #else
                std::cerr << "load_time=" <<setprecision(4) << (double)(clock() - start)/CLOCKS_PER_SEC << "\n";
                #endif
                
                #ifdef _OPENMP
                sTime = omp_get_wtime();
                #else
                start = clock();
                #endif
                
                queryBf(myFilter, readName);
                
                #ifdef _OPENMP
                std::cerr << "query_time=" <<setprecision(4) << fixed << omp_get_wtime() - sTime << "\n";
                #else
                std::cerr << "query_time=" <<setprecision(4) << (double)(clock() - start)/CLOCKS_PER_SEC << "\n";
                #endif
            }
        }
        cerr << "\n";
    }
}

void nthashRT(const char *readName) {
    getFtype(readName);
    cerr << "CPU time (sec) for hash algorithms for ";
    cerr << "kmer="<<opt::kmerLen<< "\n";
    cerr << "nhash="<<opt::nhash<< "\n";
    for(unsigned method=0; method<2; method++)
        cerr << itm[method] << "\t";
    cerr << "\n";
    if(opt::nhash>1) {
        for(unsigned method=0; method<2; method++) {
            opt::nz=0;
            ifstream uFile(readName);
            string line;
            clock_t sTime = clock();
            while(getSeq(uFile,line)) {
                if(itm[method]=="city")
                    hashSeqcM(line);
                else if(itm[method]=="murmur")
                    hashSeqmM(line);
                else if(itm[method]=="nthash")
                    hashSeqrM(line);
                else if(itm[method]=="ntbase")
                    hashSeqbM(line);
                else if(itm[method]=="xxhash")
                    hashSeqxM(line);
            }
            cerr << (double)(clock() - sTime)/CLOCKS_PER_SEC << "\t";
            uFile.close();
        }
        cerr << "\n";
    }
    else {
        for(unsigned method=0; method<2; method++) {
            opt::nz=0;
            ifstream uFile(readName);
            string line;
            clock_t sTime = clock();
            while(getSeq(uFile, line)) {
                if(itm[method]=="city")
                    hashSeqc(line);
                else if(itm[method]=="murmur")
                    hashSeqm(line);
                else if(itm[method]=="nthash")
                    hashSeqr(line);
                else if(itm[method]=="ntbase")
                    hashSeqb(line);
                else if(itm[method]=="xxhash")
                    hashSeqx(line);
            }
            cerr << (double)(clock() - sTime)/CLOCKS_PER_SEC << "\t";
            uFile.close();
        }
        cerr << "\n";
    }
}

int main(int argc, char** argv) {

    bool die = false;
    for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
        std::istringstream arg(optarg != NULL ? optarg : "");
        switch (c) {
        case '?':
            die = true;
            break;
        case 'j':
            arg >> opt::threads;
            break;
        case 'b':
            arg >> opt::ibits;
            break;
        case 'q':
            arg >> opt::nquery;
            break;
        case 'l':
            arg >> opt::squery;
            break;
        case 't':
            arg >> opt::ngene;
            break;
        case 'g':
            arg >> opt::sgene;
            break;
        case 'h':
            arg >> opt::nhash;
            break;
        case 'k':
            arg >> opt::kmerLen;
            //init_kmod(opt::kmerLen);
            break;
        case 'm':
            arg >> opt::maxitr;
            break;
        case 'a':
            arg >> opt::method;
            break;
        case 'i':
            opt::inpFlag=true;
            break;
        case 'u':
            opt::uniformity=true;
            break;
        case OPT_HELP:
            std::cerr << USAGE_MESSAGE;
            exit(EXIT_SUCCESS);
        case OPT_VERSION:
            std::cerr << VERSION_MESSAGE;
            exit(EXIT_SUCCESS);
        }
        if (optarg != NULL && !arg.eof()) {
            std::cerr << PROGRAM ": invalid option: `-"
                      << (char)c << optarg << "'\n";
            exit(EXIT_FAILURE);
        }
    }
    if (argc - optind != 1 && argc - optind != 2) {
        std::cerr << PROGRAM ": missing arguments\n";
        die = true;
    }

    if (die) {
        std::cerr << "Try `" << PROGRAM
                  << " --help' for more information.\n";
        exit(EXIT_FAILURE);
    }

    const char *readName(argv[argc-1]);

    if (opt::uniformity) {
        std::cerr<<"bit/i="<<opt::ibits<<"\n";
        std::cerr<<"nquery="<<opt::nquery<<"\n";
        std::cerr<<"squery="<<opt::squery<<"\n";
        std::cerr<<"ngene="<<opt::ngene<<"\n";
        std::cerr<<"sgene="<<opt::sgene<<"\n\n";

        if(opt::inpFlag) {
            makeGene(opt::ngene, opt::sgene, opt::nquery, opt::squery);
            makeRead(opt::nquery, opt::squery);
            nthashBF("genes.fa", "reads.fa");
        }
        else {
            const char *geneName(argv[argc-2]);
            nthashBF(geneName, readName);
        }
    }
    else {
        if(opt::inpFlag) {
            makeRead(opt::nquery, opt::squery);
            nthashRT("reads.fa");
        }
        else
            nthashRT(readName);
    }

    return 0;
}
back to top