https://github.com/bcgsc/ntCard
Tip revision: 0135d929301d24c464d843573e4ea56cd0f3e139 authored by Johnathan Wong on 22 April 2021, 00:47:28 UTC
ntcard.cpp: update low kmer sample check to evaluate entire expression
ntcard.cpp: update low kmer sample check to evaluate entire expression
Tip revision: 0135d92
ntcard.cpp
#include "Uncompress.h"
#include "vendor/ntHash/ntHashIterator.hpp"
#include "vendor/ntHash/stHashIterator.hpp"
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <getopt.h>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif
#define PROGRAM "ntCard"
static const char VERSION_MESSAGE[] =
PROGRAM " 1.2.2 \n"
"Written by Hamid Mohamadi.\n"
"Copyright 2018 Hamid Mohamadi, Licensed under MIT License\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " [OPTION]... FILE(S)...\n"
"Estimates k-mer coverage histogram in FILE(S).\n\n"
"Acceptable file formats: fastq, fasta, sam, bam and in compressed formats gz, bz, zip, xz.\n"
"A list of files containing file names in each row can be passed with @ prefix.\n"
"\n"
" Options:\n"
"\n"
" -t, --threads=N use N parallel threads [1] (N>=2 should be used when input files are >=2)\n"
" -k, --kmer=N the length of kmer \n"
" -g, --gap=N the length of gap in the gap seed [0]. g mod 2 must equal k mod 2 unless g == "
"0\n"
" -g does not support multiple k currently.\n"
" -c, --cov=N the maximum coverage of kmer in output [1000]\n"
" -p, --pref=STRING the prefix for output file name(s)\n"
" -o, --output=STRING the name for output file name (used when output should be a single "
"file)\n"
" --help display this help and exit\n"
" --version output version information and exit\n"
"\n"
"Report bugs to https://github.com/bcgsc/ntCard/issues\n";
using namespace std;
namespace opt {
unsigned nThrd = 1;
unsigned kmLen = 64;
unsigned gap = 0;
size_t rBuck;
unsigned rBits = 27;
unsigned sBits = 11;
unsigned sMask;
unsigned covMax = 1000;
size_t nSamp = 2;
size_t nK = 0;
string prefix;
string output;
bool samH = true;
std::vector<std::vector<unsigned> > seedSet;
}
static const char shortopts[] = "t:s:r:k:c:l:p:f:o:g:";
enum
{
OPT_HELP = 1,
OPT_VERSION
};
static const struct option longopts[] = { { "threads", required_argument, NULL, 't' },
{ "kmer", required_argument, NULL, 'k' },
{ "gap", required_argument, NULL, 'g' },
{ "cov", required_argument, NULL, 'c' },
{ "rbit", required_argument, NULL, 'r' },
{ "sbit", required_argument, NULL, 's' },
{ "output", required_argument, NULL, 'o' },
{ "pref", required_argument, NULL, 'p' },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
{ NULL, 0, NULL, 0 } };
size_t
getInf(const char* inFile)
{
std::ifstream in(inFile, std::ifstream::ate | std::ifstream::binary);
return in.tellg();
}
bool
isNumber(const string& seq)
{
string::const_iterator itr = seq.begin();
while (itr != seq.end() && isdigit(*itr))
++itr;
return (itr == seq.end() && !seq.empty());
}
unsigned
getftype(std::ifstream& in, std::string& samSeq)
{
std::string hseq;
getline(in, hseq);
if (hseq[0] == '>') {
return 1;
}
if (hseq[0] == '@') {
if ((hseq[1] == 'H' && hseq[2] == 'D') || (hseq[1] == 'S' && hseq[2] == 'Q') ||
(hseq[1] == 'R' && hseq[2] == 'G') || (hseq[1] == 'P' && hseq[2] == 'G') ||
(hseq[1] == 'C' && hseq[2] == 'O')) {
return 2;
} else
return 0;
}
std::istringstream alnSec(hseq);
std::string s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
alnSec >> s1 >> s2 >> s3 >> s4 >> s5 >> s6 >> s7 >> s8 >> s9 >> s10 >> s11;
if (isNumber(s2) && isNumber(s5)) {
opt::samH = false;
samSeq = hseq;
return 2;
}
return 3;
}
inline void
ntComp(const uint64_t hVal, uint16_t* t_Counter)
{
uint64_t indBit = opt::nSamp;
if (hVal >> (63 - opt::sBits) == 1)
indBit = 0;
if (hVal >> (64 - opt::sBits) == opt::sMask)
indBit = 1;
if (indBit < opt::nSamp) {
size_t shVal = hVal & (opt::rBuck - 1);
#pragma omp atomic
++t_Counter[indBit * opt::rBuck + shVal];
}
}
inline void
ntRead(const string& seq, const std::vector<unsigned>& kList, uint16_t* t_Counter, size_t totKmer[])
{
for (unsigned k = 0; k < kList.size(); k++) {
ntHashIterator itr(seq, 1, kList[k]);
while (itr != itr.end()) {
ntComp((*itr)[0], t_Counter + k * opt::nSamp * opt::rBuck);
++itr;
++totKmer[k];
}
}
}
inline void
stRead(const string& seq, const std::vector<unsigned>& kList, uint16_t* t_Counter, size_t totKmer[])
{
for (unsigned k = 0; k < kList.size(); k++) {
stHashIterator itr(seq, opt::seedSet, 1, 1, kList[k]);
while (itr != itr.end()) {
ntComp((*itr)[0], t_Counter + k * opt::nSamp * opt::rBuck);
++itr;
++totKmer[k];
}
}
}
void
getEfq(std::ifstream& in, const std::vector<unsigned>& kList, uint16_t* t_Counter, size_t totKmer[])
{
bool good = true;
for (string seq, hseq; good;) {
good = static_cast<bool>(getline(in, seq));
good = static_cast<bool>(getline(in, hseq));
good = static_cast<bool>(getline(in, hseq));
if (good && opt::gap == 0) {
ntRead(seq, kList, t_Counter, totKmer);
}
if (good && opt::gap != 0) {
stRead(seq, kList, t_Counter, totKmer);
}
good = static_cast<bool>(getline(in, hseq));
}
}
void
getEfa(std::ifstream& in, const std::vector<unsigned>& kList, uint16_t* t_Counter, size_t totKmer[])
{
bool good = true;
for (string seq, hseq; good;) {
string line;
good = static_cast<bool>(getline(in, seq));
while (good && seq[0] != '>') {
line += seq;
good = static_cast<bool>(getline(in, seq));
}
if (opt::gap == 0) {
ntRead(line, kList, t_Counter, totKmer);
} else {
stRead(line, kList, t_Counter, totKmer);
}
}
}
void
getEsm(
std::ifstream& in,
const std::vector<unsigned>& kList,
const std::string& samSeq,
uint16_t* t_Counter,
size_t totKmer[])
{
std::string samLine, seq;
std::string s1, s2, s3, s4, s5, s6, s7, s8, s9, s11;
if (opt::samH) {
while (getline(in, samLine))
if (samLine[0] != '@')
break;
} else
samLine = samSeq;
do {
std::istringstream iss(samLine);
iss >> s1 >> s2 >> s3 >> s4 >> s5 >> s6 >> s7 >> s8 >> s9 >> seq >> s11;
if (opt::gap == 0) {
ntRead(seq, kList, t_Counter, totKmer);
} else {
stRead(seq, kList, t_Counter, totKmer);
}
} while (getline(in, samLine));
}
void
compEst(const uint16_t* t_Counter, double& F0Mean, double fMean[])
{
unsigned p[opt::nSamp][65536];
for (size_t i = 0; i < opt::nSamp; i++)
for (size_t j = 0; j < 65536; j++)
p[i][j] = 0;
for (size_t i = 0; i < opt::nSamp; i++)
for (size_t j = 0; j < opt::rBuck; j++)
++p[i][t_Counter[i * opt::rBuck + j]];
double pMean[65536];
for (size_t i = 0; i < 65536; i++)
pMean[i] = 0.0;
for (size_t i = 0; i < 65536; i++) {
for (size_t j = 0; j < opt::nSamp; j++)
pMean[i] += p[j][i];
pMean[i] /= 1.0 * opt::nSamp;
}
F0Mean = (ssize_t)(
(opt::rBits * log(2) - log(pMean[0])) * 1.0 * ((size_t)1 << (opt::sBits + opt::rBits)));
for (size_t i = 0; i < 65536; i++)
fMean[i] = 0;
if (pMean[0] * (log(pMean[0]) - opt::rBits * log(2)) == 0) {
return;
}
fMean[1] = -1.0 * pMean[1] / (pMean[0] * (log(pMean[0]) - opt::rBits * log(2)));
for (size_t i = 2; i < 65536; i++) {
double sum = 0.0;
for (size_t j = 1; j < i; j++)
sum += j * pMean[i - j] * fMean[j];
fMean[i] = -1.0 * pMean[i] / (pMean[0] * (log(pMean[0]) - opt::rBits * log(2))) -
sum / (i * pMean[0]);
}
for (size_t i = 1; i < 65536; i++)
fMean[i] = abs((ssize_t)(fMean[i] * F0Mean));
}
void
outDefault(const std::vector<unsigned>& kList, const size_t totalKmers[], const uint16_t* t_Counter)
{
std::ofstream histFiles[opt::nK];
for (unsigned k = 0; k < opt::nK; k++) {
std::stringstream hstm;
hstm << opt::prefix << "_k" << kList[k] << ".hist";
histFiles[k].open((hstm.str()).c_str());
}
#pragma omp parallel for schedule(dynamic)
for (unsigned k = 0; k < kList.size(); k++) {
double F0Mean = 0.0;
double fMean[65536];
compEst(t_Counter + k * opt::nSamp * opt::rBuck, F0Mean, fMean);
histFiles[k] << "F1\t" << totalKmers[k] << "\n";
histFiles[k] << "F0\t" << (size_t)F0Mean << "\n";
for (size_t i = 1; i <= opt::covMax; i++)
histFiles[k] << i << "\t" << (size_t)fMean[i] << "\n";
}
for (unsigned k = 0; k < opt::nK; k++)
histFiles[k].close();
}
void
outCompact(const std::vector<unsigned>& kList, const size_t totalKmers[], const uint16_t* t_Counter)
{
ofstream histFile(opt::output.c_str());
histFile << "k\tf\tn\n";
for (unsigned k = 0; k < kList.size(); k++) {
double F0Mean = 0.0;
double fMean[65536];
compEst(t_Counter + k * opt::nSamp * opt::rBuck, F0Mean, fMean);
std::cerr << "k=" << kList[k] << "\tF1\t" << totalKmers[k] << "\n";
std::cerr << "k=" << kList[k] << "\tF0\t" << (size_t)F0Mean << "\n";
for (size_t i = 1; i <= opt::covMax; i++)
histFile << kList[k] << "\t" << i << "\t" << (size_t)fMean[i] << "\n";
}
histFile.close();
}
int
main(int argc, char** argv)
{
double sTime = omp_get_wtime();
vector<unsigned> kList;
bool die = false;
for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
std::istringstream arg(optarg != NULL ? optarg : "");
switch (c) {
case '?':
die = true;
break;
case 't':
arg >> opt::nThrd;
break;
case 's':
arg >> opt::sBits;
break;
case 'r':
arg >> opt::rBits;
break;
case 'c':
arg >> opt::covMax;
if (opt::covMax > 65535)
opt::covMax = 65535;
break;
case 'p':
arg >> opt::prefix;
break;
case 'o':
arg >> opt::output;
break;
case 'g':
arg >> opt::gap;
break;
case 'k': {
std::string token;
while (getline(arg, token, ',')) {
unsigned myK;
std::stringstream ss(token);
ss >> myK;
kList.push_back(myK);
++opt::nK;
}
break;
}
case OPT_HELP:
std::cerr << USAGE_MESSAGE;
exit(EXIT_SUCCESS);
case OPT_VERSION:
std::cerr << VERSION_MESSAGE;
exit(EXIT_SUCCESS);
}
if (optarg != NULL && !arg.eof()) {
std::cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
exit(EXIT_FAILURE);
}
}
if (argc - optind < 1) {
std::cerr << PROGRAM ": missing arguments\n";
die = true;
}
if (opt::gap != 0 && (opt::gap % 2 != kList[0] % 2)) {
std::cerr << PROGRAM "Gap size and kmer must have the same modulus\n";
die = true;
}
if (opt::nK == 0) {
std::cerr << PROGRAM ": missing argument -k ... \n";
die = true;
}
if (opt::prefix.empty() && opt::output.empty()) {
std::cerr << PROGRAM ": missing argument -p/-o ... \n";
die = true;
}
if (opt::gap != 0 && kList.size() != 1) {
std::cerr << PROGRAM ": -g does not support multiple k currently.\n";
die = true;
}
if (die) {
std::cerr << "Try `" << PROGRAM << " --help' for more information.\n";
exit(EXIT_FAILURE);
}
if (opt::gap != 0) {
std::string gap(opt::gap, '0');
std::string nonGap((kList[0] - opt::gap) / 2, '1');
std::vector<std::string> seedString;
seedString.push_back(nonGap + gap + nonGap);
opt::seedSet = stHashIterator::parseSeed(seedString);
}
vector<string> inFiles;
for (int i = optind; i < argc; ++i) {
string file(argv[i]);
if (file[0] == '@') {
string inName;
ifstream inList(file.substr(1, file.length()).c_str());
while (getline(inList, inName))
inFiles.push_back(inName);
} else
inFiles.push_back(file);
}
size_t totalSize = 0;
for (unsigned file_i = 0; file_i < inFiles.size(); ++file_i)
totalSize += getInf(inFiles[file_i].c_str());
if (totalSize < 50000000000)
opt::sBits = 7;
size_t totalKmers[kList.size()];
for (unsigned k = 0; k < kList.size(); k++)
totalKmers[k] = 0;
opt::rBuck = ((size_t)1) << opt::rBits;
opt::sMask = (((size_t)1) << (opt::sBits - 1)) - 1;
uint16_t* t_Counter = new uint16_t[opt::nK * opt::nSamp * opt::rBuck]();
#ifdef _OPENMP
omp_set_num_threads(opt::nThrd);
#endif
#pragma omp parallel for schedule(dynamic)
for (unsigned file_i = 0; file_i < inFiles.size(); ++file_i) {
size_t totKmer[kList.size()];
for (unsigned k = 0; k < kList.size(); k++)
totKmer[k] = 0;
std::ifstream in(inFiles[file_i].c_str());
std::string samSeq;
unsigned ftype = getftype(in, samSeq);
if (ftype == 0)
getEfq(in, kList, t_Counter, totKmer);
else if (ftype == 1)
getEfa(in, kList, t_Counter, totKmer);
else if (ftype == 2)
getEsm(in, kList, samSeq, t_Counter, totKmer);
else {
std::cerr << "Error in reading file: " << inFiles[file_i] << std::endl;
exit(EXIT_FAILURE);
}
in.close();
for (unsigned k = 0; k < kList.size(); k++)
#pragma omp atomic
totalKmers[k] += totKmer[k];
}
if (opt::output.empty())
outDefault(kList, totalKmers, t_Counter);
else
outCompact(kList, totalKmers, t_Counter);
delete[] t_Counter;
std::cerr << "Runtime(sec): " << setprecision(4) << fixed << omp_get_wtime() - sTime << "\n";
return 0;
}