#include #include #include #include #include #include #include #include #include #include #include "ntHashIterator.hpp" #include "Uncompress.h" #ifdef _OPENMP # include #endif #define PROGRAM "ntCard" static const char VERSION_MESSAGE[] = PROGRAM " 1.1.0 \n" "Written by Hamid Mohamadi.\n" "Copyright 2018 Hamid Mohamadi, Licensed under MIT License\n"; static const char USAGE_MESSAGE[] = "Usage: " PROGRAM " [OPTION]... FILE(S)...\n" "Estimates k-mer coverage histogram in FILE(S).\n\n" "Acceptable file formats: fastq, fasta, sam, bam and in compressed formats gz, bz, zip, xz.\n" "A list of files containing file names in each row can be passed with @ prefix.\n" "\n" " Options:\n" "\n" " -t, --threads=N use N parallel threads [1] (N>=2 should be used when input files are >=2)\n" " -k, --kmer=N the length of kmer \n" " -c, --cov=N the maximum coverage of kmer in output [1000]\n" " -p, --pref=STRING the prefix for output file name(s)\n" " -o, --output=STRING the name for output file name (used when output should be a single file)\n" " --help display this help and exit\n" " --version output version information and exit\n" "\n" "Report bugs to https://github.com/bcgsc/ntCard/issues\n"; using namespace std; namespace opt { unsigned nThrd=1; unsigned kmLen=64; size_t rBuck; unsigned rBits=27; unsigned sBits=11; unsigned sMask; unsigned covMax=1000; size_t nSamp=2; size_t nK=0; string prefix; string output; bool samH=true; } static const char shortopts[] = "t:s:r:k:c:l:p:f:o:"; enum { OPT_HELP = 1, OPT_VERSION }; static const struct option longopts[] = { { "threads", required_argument, NULL, 't' }, { "kmer", required_argument, NULL, 'k' }, { "cov", required_argument, NULL, 'c' }, { "rbit", required_argument, NULL, 'r' }, { "sbit", required_argument, NULL, 's' }, { "output", required_argument, NULL, 'o' }, { "pref", required_argument, NULL, 'p' }, { "help", no_argument, NULL, OPT_HELP }, { "version", no_argument, NULL, OPT_VERSION }, { NULL, 0, NULL, 0 } }; size_t getInf(const char* inFile) { std::ifstream in(inFile, std::ifstream::ate | std::ifstream::binary); return in.tellg(); } bool isNumber(const string &seq) { string::const_iterator itr = seq.begin(); while(itr!= seq.end() && isdigit(*itr)) ++itr; return (itr==seq.end() && !seq.empty()); } unsigned getftype(std::ifstream &in, std::string &samSeq) { std::string hseq; getline(in,hseq); if(hseq[0]=='>') { return 1; } if(hseq[0]=='@') { if( (hseq[1]=='H'&& hseq[2]=='D') || (hseq[1]=='S'&& hseq[2]=='Q') || (hseq[1]=='R'&& hseq[2]=='G') || (hseq[1]=='P'&& hseq[2]=='G') || (hseq[1]=='C'&& hseq[2]=='O') ) { return 2; } else return 0; } std::istringstream alnSec(hseq); std::string s1,s2,s3,s4,s5,s6,s7,s8,s9,s10, s11; alnSec>>s1>>s2>>s3>>s4>>s5>>s6>>s7>>s8>>s9>>s10>>s11; if(isNumber(s2) && isNumber(s5)) { opt::samH=false; samSeq=hseq; return 2; } return 3; } inline void ntComp(const uint64_t hVal, uint16_t *t_Counter) { uint64_t indBit=opt::nSamp; if(hVal>>(63-opt::sBits) == 1) indBit=0; if(hVal>>(64-opt::sBits) == opt::sMask) indBit=1; if(indBit < opt::nSamp) { size_t shVal=hVal&(opt::rBuck-1); #pragma omp atomic ++t_Counter[indBit*opt::rBuck+shVal]; } } inline void ntRead(const string &seq, const std::vector &kList, uint16_t *t_Counter, size_t totKmer[]) { for(unsigned k=0; k &kList, uint16_t *t_Counter, size_t totKmer[]) { bool good = true; for(string seq, hseq; good;) { good = static_cast(getline(in, seq)); good = static_cast(getline(in, hseq)); good = static_cast(getline(in, hseq)); if(good) ntRead(seq, kList, t_Counter, totKmer); good = static_cast(getline(in, hseq)); } } void getEfa(std::ifstream &in, const std::vector &kList, uint16_t *t_Counter, size_t totKmer[]) { bool good = true; for(string seq, hseq; good;) { string line; good = static_cast(getline(in, seq)); while(good&&seq[0]!='>') { line+=seq; good = static_cast(getline(in, seq)); } ntRead(line, kList, t_Counter, totKmer); } } void getEsm(std::ifstream &in, const std::vector &kList, const std::string &samSeq, uint16_t *t_Counter, size_t totKmer[]) { std::string samLine,seq; std::string s1,s2,s3,s4,s5,s6,s7,s8,s9,s11; if(opt::samH) { while(getline(in,samLine)) if (samLine[0]!='@') break; } else samLine=samSeq; do { std::istringstream iss(samLine); iss>>s1>>s2>>s3>>s4>>s5>>s6>>s7>>s8>>s9>>seq>>s11; ntRead(seq, kList, t_Counter, totKmer); } while(getline(in,samLine)); } void compEst(const uint16_t *t_Counter, double &F0Mean, double fMean[]) { unsigned p[opt::nSamp][65536]; for(size_t i=0; i &kList, const size_t totalKmers[], const uint16_t *t_Counter) { std::ofstream histFiles[opt::nK]; for (unsigned k=0; k &kList, const size_t totalKmers[], const uint16_t *t_Counter) { ofstream histFile(opt::output.c_str()); histFile << "k\tf\tn\n"; for(unsigned k=0; k kList; bool die = false; for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { case '?': die = true; break; case 't': arg >> opt::nThrd; break; case 's': arg >> opt::sBits; break; case 'r': arg >> opt::rBits; break; case 'c': arg >> opt::covMax; if(opt::covMax>65535) opt::covMax = 65535; break; case 'p': arg >> opt::prefix; break; case 'o': arg >> opt::output; break; case 'k': { std::string token; while(getline(arg, token, ',')) { unsigned myK; std::stringstream ss(token); ss >> myK; kList.push_back(myK); ++opt::nK; } break; } case OPT_HELP: std::cerr << USAGE_MESSAGE; exit(EXIT_SUCCESS); case OPT_VERSION: std::cerr << VERSION_MESSAGE; exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { std::cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (argc - optind < 1) { std::cerr << PROGRAM ": missing arguments\n"; die = true; } if (opt::nK == 0) { std::cerr << PROGRAM ": missing argument -k ... \n"; die = true; } if (opt::prefix.empty() && opt::output.empty()) { std::cerr << PROGRAM ": missing argument -p/-o ... \n"; die = true; } if (die) { std::cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } vector inFiles; for (int i = optind; i < argc; ++i) { string file(argv[i]); if(file[0]=='@') { string inName; ifstream inList(file.substr(1,file.length()).c_str()); while(getline(inList,inName)) inFiles.push_back(inName); } else inFiles.push_back(file); } size_t totalSize=0; for (unsigned file_i = 0; file_i < inFiles.size(); ++file_i) totalSize += getInf(inFiles[file_i].c_str()); if(totalSize<50000000000) opt::sBits=7; size_t totalKmers[kList.size()]; for(unsigned k=0; k