https://github.com/ekg/freebayes
Tip revision: 04d702cdb8098449bade25e27ca17df983fa3ead authored by Erik Garrison on 25 May 2010, 21:44:14 UTC
added bamBayes back into make all
added bamBayes back into make all
Tip revision: 04d702c
aggregateHapmap.cpp
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// aggregateHapmap.cpp
// aggregates HapMap genotypes across multiple HapMap genotype files
// Copyright 2009 Gabor T. Marth, Boston College
// All rights reserved.
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// includes
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// standard includes
#include <cstdio>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <iterator>
#include <algorithm>
#include <cmath>
// "tclap" commandline parsing library
#include <tclap/CmdLine.h>
// "boost" regular expression library
#include <boost/regex.hpp>
// private libraries
#include "Function-Generic.h"
#include "Class-BedReader.h"
#include "ReferenceSequenceReader.h"
// uses
using namespace std;
using namespace TCLAP;
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// typedefs
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
struct ArgStruct {
string shortId;
string longId;
string description;
bool required;
string defaultValueString;
int defaultValueInt;
double defaultValueDouble;
bool defaultValueBool;
string type;
bool multi;
vector<string> constraint;
};
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// template headers
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
template< typename keyType, typename valueType >
vector<keyType> sortKeysByValue(map<keyType, valueType, std::less<keyType> > hash, bool descend);
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// subroutine headers
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// static variables
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
static string ProgramName("rptStat");
static string ProgramDescription("Program to create data coverage and SNP stats.");
static string ProgramVersion("0.5.2");
static string ProgramDate("2009-04-29");
static string ProgramDeveloper("Gabor T. Marth");
static string ProgramInstitution("Boston College");
static string ProgramCopyrightDates("2009.");
static vector<ArgStruct> ArgList;
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// classes
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
class MyOutput : public StdOutput {
public:
virtual void failure(CmdLineInterface& c, ArgException& e)
{
cerr << "################################################################################" << endl;
cerr << "### Program: " << ProgramName << " Version: " << ProgramVersion << " Release date: " << ProgramDate << endl;
cerr << "### " << ProgramDescription << endl;
cerr << "### " << "Copyright: " << ProgramCopyrightDates << " " << ProgramDeveloper << ", " << ProgramInstitution << endl;
cerr << "### All rights reserved." << endl;
cerr << "###" << endl;
cerr << "### Command line error:" << e.what() << endl;
cerr << "### For usage please type: " << c.getProgramName() << " --help" << endl;
cerr << "################################################################################" << endl;
}
virtual void usage(CmdLineInterface& c)
{
cout << "################################################################################" << endl;
cout << "### Program: " << ProgramName << " Version: " << ProgramVersion << " Release date: " << ProgramDate << endl;
cout << "### " << ProgramDescription << endl;
cout << "### " << "Copyright: " << ProgramCopyrightDates << " " << ProgramDeveloper << ", " << ProgramInstitution << endl;
cout << "### All rights reserved." << endl;
cout << "###" << endl;
cout << "### Usage: " << c.getProgramName() << " [arguments], where:" << endl;
for(vector<ArgStruct>::const_iterator it = ArgList.begin();
it != ArgList.end(); it++) {
ArgStruct arg = *it;
string idString = "";
if (arg.longId != "") {idString += "--" + arg.longId;}
if (arg.shortId != "") {idString += ", -" + arg.shortId;}
string multiString = "single-valued";
if (arg.multi) {multiString = "multi-valued";}
if (arg.required) {
cout << "### " << idString << " [" << arg.type << ", required, no default, " << multiString << "]" << endl;
}
else {
cout << "### " << idString << " [" << arg.type << ", optional, default=" << arg.defaultValueString << ", " << multiString << "]" << endl;
}
if (arg.constraint.size() > 0) {
cout << "### Permitted values: (";
bool first = true;
for (vector<string>::const_iterator iter = arg.constraint.begin();
iter != arg.constraint.end(); iter++) {
string value = *iter;
if (! first) {
cout << "|";
}
first = false;
cout << value;
}
cout << ")" << endl;
}
cout << "### Description: " << arg.description << endl;
}
cout << "################################################################################" << endl;
}
virtual void version(CmdLineInterface& c)
{
cerr << "################################################################################" << endl;
cerr << "### Program: " << ProgramName << " Version: " << ProgramVersion << " Release date: " << ProgramDate << endl;
cout << "### " << ProgramDescription << endl;
cout << "### " << "Copyright: " << ProgramCopyrightDates << " " << ProgramDeveloper << ", " << ProgramInstitution << endl;
cout << "### All rights reserved." << endl;
cout << "###" << endl;
cerr << "################################################################################" << endl;
}
};
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// main
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
int main (int argc, char *argv[]) {
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// constants
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// white space
boost::regex re("\\s+");
// HapMap file header line
boost::regex patternHapmapHeaderLine("^rs#");
// HapMap file data line
boost::regex patternHapmapDataLine("^(rs\\d+)\\s+(\\S)/(\\S)\\s+chr(\\S+)\\s+(\\d+)");
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// variables
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Regex match
boost::smatch match;
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// command line
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Create new CmdLine object
//----------------------------------------------------------------------------
CmdLine cmd("", ' ', ProgramVersion);
//----------------------------------------------------------------------------
// add program-specific command line arguments
//----------------------------------------------------------------------------
// initialize arg
ArgStruct arg;
// input file: MOSAIK binary reference sequence archive
ArgStruct argMbr;
arg = argMbr;
arg.shortId = "";
arg.longId = "mbr";
arg.description = "Reference sequence archive file (MOSAIK binary format)";
arg.required = true;
arg.defaultValueString = "";
arg.type = "string";
arg.multi = false;
ArgList.push_back(arg);
ValueArg<string> cmd_mbr(arg.shortId, arg.longId, arg.description, arg.required, arg.defaultValueString, arg.type, cmd);
// I: report interval for debugging printouts
ArgStruct argI;
arg = argI;
arg.shortId = "";
arg.longId = "I";
arg.description = "Report interval for debugging printouts";
arg.required = false;
arg.defaultValueString = "10000";
arg.type = "int";
arg.multi = false;
ArgList.push_back(arg);
ValueArg<int> cmd_I(arg.shortId, arg.longId, arg.description, arg.required, 10000, arg.type, cmd);
// debug
ArgStruct argDebug;
arg = argDebug;
arg.shortId = "";
arg.longId = "debug";
arg.description = "Print debugging messages?";
arg.required = false;
arg.defaultValueString = "false";
arg.type = "switch";
arg.multi = false;
ArgList.push_back(arg);
SwitchArg cmd_debug(arg.shortId, arg.longId, arg.description, cmd, false);
// debug2
ArgStruct argDebug2;
arg = argDebug2;
arg.shortId = "";
arg.longId = "debug2";
arg.description = "Print very detailed debugging messages?";
arg.required = false;
arg.defaultValueString = "false";
arg.type = "switch";
arg.multi = false;
ArgList.push_back(arg);
SwitchArg cmd_debug2(arg.shortId, arg.longId, arg.description, cmd, false);
//----------------------------------------------------------------------------
// register (but not add to cmd) special arguments that are automatically
// added to cmd
//----------------------------------------------------------------------------
// help
ArgStruct argHelp;
arg = argHelp;
arg.shortId = "h";
arg.longId = "help";
arg.description = "Print usage statement?";
arg.required = false;
arg.defaultValueString = "false";
arg.type = "switch";
arg.multi = false;
ArgList.push_back(arg);
// version
ArgStruct argVersion;
arg = argVersion;
arg.shortId = "";
arg.longId = "version";
arg.description = "Print program version?";
arg.required = false;
arg.defaultValueString = "false";
arg.type = "switch";
arg.multi = false;
ArgList.push_back(arg);
//----------------------------------------------------------------------------
// add custom output handler
//----------------------------------------------------------------------------
MyOutput my;
cmd.setOutput(&my);
//----------------------------------------------------------------------------
// parse command line and catch possible errors
//----------------------------------------------------------------------------
try {
cmd.parse(argc,argv);
}
catch ( ArgException& e ) {
cerr << "ERROR: " << e.error() << " " << e.argId() << endl;
}
//----------------------------------------------------------------------------
// assign command line parameters
//----------------------------------------------------------------------------
string mbr = cmd_mbr.getValue();
int I = cmd_I.getValue();
bool debug = cmd_debug.getValue();
bool debug2 = cmd_debug2.getValue();
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// report command line and parameters used
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
map<bool, string, less<bool> > bool2String;
bool2String[false] = "false";
bool2String[true] = "true";
if (debug) {
cerr << "Command line:";
for (int i=0; i<argc; i++) {
cerr << " " << argv[i];
}
cerr << endl;
cerr << endl;
cerr << "Complete list of parameter values:" << endl;
cerr << " --mbr = " << mbr << endl;
cerr << " --I = " << I << endl;
cerr << " --debug = " << bool2String[debug] << endl;
cerr << " --debug2 = " << bool2String[debug2] << endl;
cerr << endl;
}
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
// process MBR input file
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
// report
//--------------------------------------------------------------------------
if (debug) {cerr << "Opening MOSAIK binary format reference sequence archive input file: " << mbr << " ...";}
//--------------------------------------------------------------------------
// if this is not a valid MOSAIK reference file, complain and bomb
//--------------------------------------------------------------------------
if (! Mosaik::CReferenceSequenceReader::CheckFile(mbr, false)) {
cerr << "ERROR: MOSAIK reference file not valid: " << mbr << ". Exiting..." << endl;
exit(1);
}
//--------------------------------------------------------------------------
// open the MOSAIK alignments file
//--------------------------------------------------------------------------
Mosaik::CReferenceSequenceReader rsr;
rsr.Open(mbr);
//--------------------------------------------------------------------------
// retrieve all of the reference sequence metadata
//--------------------------------------------------------------------------
vector<Mosaik::ReferenceSequence> referenceSequences;
rsr.GetReferenceSequences(referenceSequences);
//--------------------------------------------------------------------------
// load ref seq names into hash
//--------------------------------------------------------------------------
map<string, unsigned int, less<string> > refseqLength;
map<string, string, less<string> > refseqDna;
for (vector<Mosaik::ReferenceSequence>::const_iterator rsIter = referenceSequences.begin();
rsIter != referenceSequences.end(); ++rsIter) {
Mosaik::ReferenceSequence rs = *rsIter;
string ref = rs.Name;
refseqLength[ref] = rs.NumBases;
// get the reference base sequence
string bases;
rsr.GetReferenceSequence(ref, bases);
refseqDna[ref] = bases;
}
// close the reference sequence reader
rsr.Close();
// report success
if (debug) {cerr << " done." << endl;}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// read and process concatenated population-specific HapMap file
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
map<string, map<int, map<string, string, less<string> >, less<int> >, less<string> > G;
map<string, map<int, string, less<int> >, less<string> > RS, A1, A2;
map<string, bool, less<string> > sampleExists;
vector<string> samples, allSamples;
if (debug) {cerr << "Reading concatenated population-specific HapMap genotype file: " << endl;}
//----------------------------------------------------------------------------
// iterate through entries
//----------------------------------------------------------------------------
string line;
unsigned long int lc = 1;
unsigned int numberFields = 0;
while (getline(cin, line)) {
//--------------------------------------------------------------------------
// increment line counter
//--------------------------------------------------------------------------
++lc;
//--------------------------------------------------------------------------
// give life sign
//--------------------------------------------------------------------------
if (debug && lc % I == 0){
cerr << " processing line: " << lc << endl;
}
//--------------------------------------------------------------------------
// parse line
//--------------------------------------------------------------------------
// if file header
if (boost::regex_search(line, match, patternHapmapHeaderLine)) {
//------------------------------------------------------------------------
// tokenize fields
//------------------------------------------------------------------------
boost::sregex_token_iterator rj;
boost::sregex_token_iterator ri(line.begin(), line.end(), re, -1);
vector<string> fields;
while(ri != rj) {
fields.push_back(*ri++);
}
//------------------------------------------------------------------------
// set expected number of fields for data lines
//------------------------------------------------------------------------
numberFields = fields.size();
//------------------------------------------------------------------------
// get and register sample names
//------------------------------------------------------------------------
samples.clear();
for (int i=11; i<numberFields; ++i) {
string sample = fields[i];
samples.push_back(sample);
if (sampleExists.count(sample) <= 0) {
allSamples.push_back(sample);
sampleExists[sample] = true;
}
}
//------------------------------------------------------------------------
// report number samples if needed
//------------------------------------------------------------------------
if (debug) {
cerr << " number of samples: " << samples.size() << endl;
}
}
// otherwise data line
else if (boost::regex_search(line, match, patternHapmapDataLine)) {
//------------------------------------------------------------------------
// parse chromosome id, position, rs, and alleles
//------------------------------------------------------------------------
string rs = match[1];
string allele1 = match[2];
string allele2 = match[3];
string ref = match[4];
int pos = string2Int(match[5]);
//------------------------------------------------------------------------
// store rs, and alleles
//------------------------------------------------------------------------
RS[ref][pos] = rs;
A1[ref][pos] = allele1;
A2[ref][pos] = allele2;
//------------------------------------------------------------------------
// tokenize fields
//------------------------------------------------------------------------
boost::sregex_token_iterator rj;
boost::sregex_token_iterator ri(line.begin(), line.end(), re, -1);
vector<string> fields;
while(ri != rj) {
fields.push_back(*ri++);
}
//------------------------------------------------------------------------
// check number of fields
//------------------------------------------------------------------------
if (fields.size() != numberFields) {
cerr << "HapMap data line has incorrect number of fields: " << fields.size()
<< " instead of the expected number: " << numberFields<< endl;
continue;
}
//------------------------------------------------------------------------
// parse and register genotypes
//------------------------------------------------------------------------
int sc = 0;
for (int i=11; i<numberFields; ++i) {
string g = fields[i];
string sample = samples[sc];
G[ref][pos][sample] = g;
++sc;
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// produce output
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
if (debug) {cerr << "Printing aggregate HapMap genotype file" << endl;}
//----------------------------------------------------------------------------
// print header line
//----------------------------------------------------------------------------
if (debug) {cerr << " Printing header" << endl;}
// print fixed fields
cout << "refId\tpos\trsId\trefAllele\tallele1\tallele2\t#samples";
// print sample names
for (vector<string>::const_iterator sampleIter = allSamples.begin();
sampleIter != allSamples.end(); ++sampleIter) {
string sample = *sampleIter;
cout << "\t" << sample;
}
cout << endl;
//----------------------------------------------------------------------------
// print metadata and genotypes for each ref position
//----------------------------------------------------------------------------
if (debug) {cerr << " Printing genotypes" << endl;}
unsigned int numSamples = allSamples.size();
for (map<string, map<int, map<string, string, less<string> >, less<int> >, less<string> >::const_iterator refIter = G.begin();
refIter != G.end(); ++refIter) {
string ref = refIter->first;
for (map<int, map<string, string, less<string> > >::const_iterator posIter = G[ref].begin();
posIter != G[ref].end(); ++posIter) {
int pos = posIter->first;
// get reference allele
string refAllele = refseqDna[ref].substr(pos-1, 1);
cout << ref
<< "\t" << pos
<< "\t" << RS[ref][pos]
<< "\t" << refAllele
<< "\t" << A1[ref][pos]
<< "\t" << A2[ref][pos]
<< "\t" << numSamples;
// iterate through every sample in allSamples
for (vector<string>::const_iterator sampleIter = allSamples.begin();
sampleIter != allSamples.end(); ++sampleIter) {
string sample = *sampleIter;
string g = "NE";
if (G[ref][pos].count(sample) > 0) {
g = G[ref][pos][sample];
}
cout << "\t" << g;
}
cout << endl;
}
}
}
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// templates
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// sorts keys of a hash in order of associated value
//------------------------------------------------------------------------------
template< typename keyType, typename valueType >
vector<keyType> sortKeysByValue(map<keyType, valueType, std::less<keyType> > hash, bool descend) {
// instantiate inverse hash as a multimap
multimap<valueType, keyType, std::less<valueType> > inverseHash;
// load elements of hash into inverseHash
for (typename map<keyType, valueType, std::less<keyType> >::const_iterator iter = hash.begin(); iter != hash.end(); iter++) {
keyType key = iter->first;
valueType value = iter->second;
inverseHash.insert(typename multimap<valueType, keyType, std::less<valueType> >::value_type(value, key));
}
// compose vector of original keys sorted by original values
vector<keyType> sortedKeys;
for(typename multimap<valueType, keyType, std::less<valueType> >::const_iterator iter = inverseHash.begin();
iter != inverseHash.end(); iter++) {
keyType key = iter->second;
sortedKeys.push_back(key);
}
// reverse if descending order was required
if (descend) {
reverse(sortedKeys.begin(), sortedKeys.end());
}
// return
return sortedKeys;
}
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// subroutines
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------