https://github.com/AlexanderDilthey/MHC-PRG
Tip revision: e59943adb8855532573a6c276651efad1e18a6b1 authored by Alexander Dilthey on 18 December 2018, 10:20:48 UTC
Update HLA-PRG.md
Update HLA-PRG.md
Tip revision: e59943a
Utilities.cpp
/*
* Utilities.cpp
*
* Created on: 23.05.2011
* Author: Alexander Dilthey
*/
#include "Utilities.h"
#include <iostream>
#include <sstream>
#include "MHC-PRG.h"
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <fstream>
#include <math.h>
#include <omp.h>
#include <exception>
#include <stdexcept>
#include <boost/filesystem.hpp>
#include <boost/random/uniform_01.hpp>
// todo at some point activate...
// unsigned int globalRandRSeed = time(NULL);
unsigned int globalRandRSeed = 0;
using namespace boost::filesystem;
Utilities::Utilities() {
}
Utilities::~Utilities() {
}
std::string Utilities::removeGaps(std::string in)
{
std::string out;
out.reserve(in.size());
for(size_t i = 0; i < in.size(); i++)
{
if((in.at(i) != '_') && (in.at(i) != '-'))
{
out.push_back(in.at(i));
}
}
return out;
}
bool Utilities::directoryExists(std::string dir)
{
path p(dir);
return (exists(p) && is_directory(p));
}
void Utilities::makeDir(std::string dir)
{
if(! directoryExists(dir))
{
path p(dir);
bool success = create_directory(p);
if(! success)
{
throw std::runtime_error("Cannot create directory "+dir);
}
}
}
bool Utilities::fileExists(std::string filepath)
{
std::ifstream ifile(filepath);
if(ifile.good())
{
ifile.close();
return true;
}
else
{
return false;
}
}
bool Utilities::fileReadable(std::string file)
{
std::ifstream fS;
fS.open(file.c_str());
bool forReturn = fS.is_open();
fS.close();
return forReturn;
}
std::vector<int> Utilities::StrtoI(std::vector<std::string> s)
{
std::vector<int> forReturn;
for(unsigned int i = 0; i < s.size(); i++)
{
forReturn.push_back(StrtoI(s.at(i)));
}
return forReturn;
}
unsigned char Utilities::PCorrectToPhred(double PCorrect)
{
assert(PCorrect >= 0);
assert(PCorrect <= 1);
double pWrong = 1 - PCorrect;
if(pWrong == 0)
{
pWrong = 1e-100;
}
double phred1 = -10.0 * log10(pWrong);
if((phred1 + 33) > 255)
{
phred1 = 255 - 33;
}
assert((phred1 + 33) >= 0);
int forReturn = round(phred1 + 33);
assert(forReturn >= 0);
assert(forReturn <= 255);
return (unsigned char)(forReturn);
}
std::string Utilities::JoinMapUInt2Str(std::map<std::string, unsigned int> M)
{
std::vector<std::string> forReturn_parts;
for(std::map<std::string, unsigned int>::iterator mIt = M.begin(); mIt != M.end(); mIt++)
{
std::string thisE = mIt->first + ":" + ItoStr(mIt->second);
forReturn_parts.push_back(thisE);
}
return join(forReturn_parts, ",");
}
double Utilities::LogSumLogPs (const std::vector<double>& v)
{
std::pair<double, unsigned int> maxPair = findVectorMax(v);
double max = maxPair.first;
double secondTerm = 0;
for(size_t i = 0; i < v.size(); i++)
{
if(i == maxPair.second)
{
continue;
}
else
{
secondTerm += exp(v.at(i) - max);
}
}
double forReturn = (max + log(1 + secondTerm));
return forReturn;
}
std::pair<double, int> Utilities::findIntMapMax(std::map<int, double>& m)
{
assert(m.size() > 0);
double max;
unsigned iMax;
for(std::map<int, double>::iterator mIt = m.begin(); mIt != m.end(); mIt++)
{
if((mIt == m.begin()) || (mIt->second > max))
{
max = mIt->second;
iMax = mIt->first;
}
}
return std::pair<double, unsigned int>(max, iMax);
}
std::pair<double, int> Utilities::findIntMapMaxP_nonCritical(std::map<int, double>& m, unsigned int* thisSeed)
{
assert(m.size() > 0);
double max;
std::vector<unsigned int> iMax;
for(std::map<int, double>::iterator mIt = m.begin(); mIt != m.end(); mIt++)
{
if((mIt == m.begin()) || (mIt->second > max))
{
max = mIt->second;
iMax.clear();
iMax.push_back(mIt->first);
}
}
if(iMax.size() > 1)
{
int selectedI = randomNumber_nonCritical(iMax.size() - 1, thisSeed);
assert((selectedI >= 0) && (selectedI < iMax.size()));
return std::pair<double, unsigned int>(max, iMax.at(selectedI));
}
else
{
return std::pair<double, unsigned int>(max, iMax.at(0));
}
}
bool Utilities::extractBit(unsigned int number, unsigned int bit)
{
unsigned int bitmask = pow((unsigned int)2, bit);
bool bitValue = ((number & bitmask) == bitmask);
return bitValue;
}
std::vector<std::string> Utilities::get_map_keys_sorted_by_value(const std::map<std::string, int>& m)
{
std::vector<std::string> keys;
for(auto e : m)
{
keys.push_back(e.first);
}
std::sort( keys.begin( ), keys.end( ), [&]( const std::string& lhs, const std::string& rhs )
{
return (m.at(lhs) < m.at(rhs));
});
std::reverse(keys.begin(), keys.end());
if(keys.size() > 1)
{
assert(m.at(keys.at(0)) >= m.at(keys.at(1)));
}
return keys;
}
std::pair<double, unsigned int> Utilities::findVectorMax(const std::vector<double>& v)
{
assert(v.size() > 0);
double max;
unsigned iMax;
for(unsigned int i = 0; i < v.size(); i++)
{
if((i == 0) || (v.at(i) > max))
{
iMax = i;
max = v.at(i);
}
}
return std::pair<double, unsigned int>(max, iMax);
}
std::pair<double, unsigned int> Utilities::findVectorMaxP(std::vector<double>& v)
{
assert(v.size() > 0);
double max;
// unsigned iMax;
std::vector<unsigned int> iMaxs;
for(unsigned int i = 0; i < v.size(); i++)
{
if((i == 0) || (v.at(i) > max))
{
iMaxs.clear();
iMaxs.push_back(i);
max = v.at(i);
}
}
unsigned int selectedI;
if(iMaxs.size() == 1)
{
selectedI = iMaxs.at(0);
}
else
{
selectedI = iMaxs.at(randomNumber(iMaxs.size() - 1));
}
return std::pair<double, unsigned int>(max, selectedI);
}
// Converts char to probability of correct genotype, according to Illumina scheme
double Utilities::PhredToPCorrect(unsigned char nucleotideQuality)
{
if(nucleotideQuality == 0)
{
return -1;
}
int illuminaPhred = (int)nucleotideQuality - 33;
assert(illuminaPhred >= 0);
//char illuminaPhred = nucleotideQuality;
double log10_pWrong = (double)illuminaPhred / (double)-10;
double pWrong = exp(log(10) * log10_pWrong);
assert(pWrong >= 0);
assert(pWrong <= 1);
double pRight = 1 - pWrong;
assert(pRight >= 0);
assert(pRight <= 1);
return pRight;
}
std::pair<double, unsigned int> Utilities::findVectorMaxP_nonCritical(std::vector<double>& v, unsigned int* thisSeed)
{
assert(v.size() > 0);
double max;
// unsigned iMax;
std::vector<unsigned int> iMaxs;
for(unsigned int i = 0; i < v.size(); i++)
{
if((i == 0) || (v.at(i) > max))
{
iMaxs.clear();
iMaxs.push_back(i);
max = v.at(i);
}
}
unsigned int selectedI;
if(iMaxs.size() == 1)
{
selectedI = iMaxs.at(0);
}
else
{
selectedI = iMaxs.at(randomNumber_nonCritical(iMaxs.size() - 1, thisSeed));
}
return std::pair<double, unsigned int>(max, selectedI);
}
vector<string> Utilities::ItoStr(vector<int> i)
{
vector<string> forReturn;
for(int I = 0; I < (int)i.size(); I++)
{
forReturn.push_back(ItoStr(i.at(I)));
}
return forReturn;
}
double Utilities::randomDouble()
{
assert(RAND_MAX != 0);
double f = (double)rand() / RAND_MAX;
assert(f >= 0);
assert(f <= 1);
return f;
}
int Utilities::chooseFromVector(vector<double>& v)
{
//TODO activate at later point
// srand ( time(NULL) );
assert(RAND_MAX != 0);
double f = (double)rand() / RAND_MAX;
double resolution = 1 / RAND_MAX;
assert(f >= 0);
assert(f <= 1);
double sum = 0.0;
for(unsigned int i = 0; i < v.size(); i++)
{
sum += v.at(i);
}
assert(sum != 0);
int problematic_items = 0;
double problematic_proportion = 0.0;
double running_sum = 0.0;
int selected_i = -1;
assert(v.size() > 0);
for(unsigned int i = 0; i < v.size(); i++)
{
double proportion = v.at(i)/sum;
running_sum += proportion;
if(proportion < resolution)
{
problematic_items++;
problematic_proportion += proportion;
}
if(f <= running_sum)
{
selected_i = i;
break;
}
if(i == (v.size() - 1))
{
selected_i = i;
cout << "WARNING!!\n";
cout << "selected_i: " << selected_i << "\n";
cout << "f: " << f << "\n";
cout << "running_sum: " << running_sum << "\n";
cout << "resolution: " << resolution << "\n";
cout << "problematic_items: " << problematic_items << "\n";
cout << "sum: " << sum << "\n";
double deb_running_sum = 0.00;
for(unsigned int k = 0; k < v.size(); k++)
{
double deb_proportion = v.at(k)/sum;
cout << "Vector v element " << k << "\n";
cout << "\tv.at(k): " << v.at(k) << "\n";
cout << "\tdeb_proportion: "<< deb_proportion << "\n";
cout << "\tdeb_running_sum before: "<< deb_running_sum << "\n";
deb_running_sum += deb_proportion;
cout << "\tdeb_running_sum after: "<< deb_running_sum << "\n";
}
}
}
assert(selected_i != -1);
assert(problematic_proportion < 0.25);
return selected_i;
}
vector<string> Utilities::split(const string &s, char delim, vector<string> &elems)
{
std::stringstream ss(s);
std::string item;
while(std::getline(ss, item, delim))
{
elems.push_back(item);
}
return elems;
}
vector<string> Utilities::split(string input, string delimiter)
{
vector<string> output;
if(input.length() == 0)
{
return output;
}
if(delimiter == "")
{
output.reserve(input.size());
for(unsigned int i = 0; i < input.length(); i++)
{
output.push_back(input.substr(i, 1));
}
}
else
{
if(input.find(delimiter) == string::npos)
{
output.push_back(input);
}
else
{
int s = 0;
int p = input.find(delimiter);
do {
output.push_back(input.substr(s, p - s));
s = p + delimiter.size();
p = input.find(delimiter, s);
} while (p != (int)string::npos);
output.push_back(input.substr(s));
}
}
return output;
}
vector<string> Utilities::split(const string &s, char delim) {
vector<string> elems;
return split(s, delim, elems);
}
string Utilities::ItoStr(int i)
{
std::stringstream sstm;
sstm << i;
return sstm.str();
}
string Utilities::PtoStr(void* p)
{
std::stringstream sstm;
sstm << p;
return sstm.str();
}
string Utilities::DtoStr(double d)
{
std::stringstream sstm;
sstm << d;
return sstm.str();
}
string Utilities::join(vector<string> parts, string delim)
{
if(parts.size() == 0)
return "";
string ret = parts.at(0);
for(unsigned int i = 1; i < parts.size(); i++)
{
ret.append(delim);
ret.append(parts.at(i));
}
return ret;
}
void Utilities::eraseNL(string& s)
{
if (!s.empty() && s[s.length()-1] == '\r') {
s.erase(s.length()-1);
}
if (!s.empty() && s[s.length()-1] == '\n') {
s.erase(s.length()-1);
}
}
long long Utilities::StrtoLongLong(string s)
{
stringstream ss(s);
long long i;
ss >> i;
return i;
}
int Utilities::StrtoI(string s)
{
stringstream ss(s);
int i;
ss >> i;
return i;
}
double Utilities::StrtoD(string s)
{
stringstream ss(s);
double d;
ss >> d;
return d;
}
bool Utilities::StrtoB(string s)
{
stringstream ss(s);
bool b;
ss >> b;
return b;
}
std::string Utilities::timestamp()
{
time_t ltime; /* calendar time */
ltime=time(NULL); /* get current cal time */
char* timeCString = asctime( localtime(<ime) );
std::string forReturn(timeCString);
return " [ "+ forReturn.substr(0, forReturn.length() - 1)+" ] ";
}
map<string, string> Utilities::readFASTA(std::string file, bool fullIdentifier)
{
map<string, string> forReturn;
std::ifstream FASTAstream;
FASTAstream.open(file.c_str());
if(! FASTAstream.is_open())
{
throw std::runtime_error("readFASTA(): Cannot open file "+file);
}
while(FASTAstream.good())
{
std::string line;
size_t lineCounter = 0;
std::string currentSequenceIdentifier;
while(FASTAstream.good())
{
std::getline(FASTAstream, line);
Utilities::eraseNL(line);
lineCounter++;
if(line.substr(0, 1) == ">")
{
std::string ident = line.substr(1);
if(! fullIdentifier)
{
for(int i = 0; i < ident.size(); i++)
{
if(ident.at(i) == ' ')
{
ident = ident.substr(0, i);
break;
}
}
}
currentSequenceIdentifier = ident;
assert(forReturn.count(ident) == 0);
}
else
{
forReturn[currentSequenceIdentifier] += line;
}
}
}
return forReturn;
}
std::string Utilities::repeatString(std::string s, int repeatNumber)
{
std::string forReturn;
assert(repeatNumber >= 0);
for(int i = 1; i <= repeatNumber; i++)
{
forReturn.append(s);
}
return forReturn;
}
std::pair<std::string,std::vector<int>> Utilities::modifySequence(std::string sequence, std::vector<int> positionOrigin, double mutationFrequence, double insertionFrequence, int insertionMaxLength, double deletionFrequence, int deletionMaxLength)
{
assert(sequence.size() == positionOrigin.size());
assert((mutationFrequence >= 0) && (mutationFrequence <= 1));
assert((insertionFrequence >= 0) && (insertionFrequence <= 1));
assert((deletionFrequence >= 0) && (deletionFrequence <= 1));
assert(insertionMaxLength > 1);
assert(deletionMaxLength > 1);
std::pair<std::string,std::vector<int>> forReturn;
for(int i = 0; i < (int)sequence.length(); i++)
{
if(randomDouble() <= mutationFrequence)
{
char newNuc = randomNucleotide();
forReturn.first.push_back(newNuc);
if(newNuc == sequence.at(i))
{
forReturn.second.push_back(positionOrigin.at(i));
}
else
{
forReturn.second.push_back(-1);
}
}
else if(randomDouble() <= deletionFrequence)
{
int deletionLength = randomNumber(deletionMaxLength - 1);
for(int dI = 0; dI < deletionLength; dI++)
{
forReturn.first.push_back('_');
forReturn.second.push_back(-1);
}
i += deletionLength;
}
else if(randomDouble() <= insertionFrequence)
{
int insertionLength = randomNumber(insertionMaxLength - 1);
forReturn.first.append(generateRandomSequence(insertionLength));
for(int dI = 0; dI < insertionLength; dI++)
{
forReturn.second.push_back(-1);
}
}
else
{
forReturn.first.push_back(sequence.at(i));
forReturn.second.push_back(positionOrigin.at(i));
}
}
assert(forReturn.first.size() == forReturn.second.size());
return forReturn;
}
std::string Utilities::generateRandomSequenceWithGaps(int length, double gapFrequency)
{
std::string forReturn;
forReturn.resize(length);
for(int i = 0; i < length; i++)
{
if(randomDouble() <= gapFrequency)
{
forReturn.at(i) = '_';
}
else
{
forReturn.at(i) = randomNucleotide();
}
}
return forReturn;
}
std::string Utilities::generateRandomSequence(int length)
{
std::string forReturn;
forReturn.resize(length);
for(int i = 0; i < length; i++)
{
forReturn.at(i) = randomNucleotide();
}
return forReturn;
}
int Utilities::randomNumber(int max)
{
assert(max >= 0);
int n;
#pragma omp critical
{
// n = rand() % (max + 1);
n = rand_r(&globalRandRSeed) % (max + 1);
}
assert((n >= 0) && (n <= max));
return n;
}
int Utilities::randomNumber_nonCritical(int max, unsigned int* thisSeed)
{
int n = rand_r(thisSeed) % (max + 1);
assert((n >= 0) && (n <= max));
return n;
}
char Utilities::randomNucleotide()
{
char nucleotides[4] = {'A', 'C', 'G', 'T'};
int n = rand() % 4;
assert((n >= 0) && (n <= 3));
return nucleotides[n];
}
void Utilities::writeStatus(std::string statusFile, int status)
{
std::ofstream statusStream;
statusStream.open(statusFile.c_str(), std::ios::out);
if(! statusStream.is_open())
{
throw std::runtime_error("Cannot open file for writing: "+statusFile);
}
statusStream << status << "\n";
statusStream.close();
}
int Utilities::readStatus(std::string statusFile)
{
std::ifstream statusStream;
statusStream.open(statusFile.c_str(), std::ios::in);
if(! statusStream.is_open())
{
return 0;
}
assert(statusStream.good());
std::string line;
getline (statusStream, line);
Utilities::eraseNL(line);
if(line.length() == 0)
{
return 0;
}
int status = Utilities::StrtoI(line);
assert(status >= 0);
return status;
}
void Utilities::check_map_is_normalized(std::map<char, double> m)
{
double sum = 0;
for(std::map<char, double>::iterator cIt = m.begin(); cIt != m.end(); cIt++)
{
assert(cIt->second >= 0);
sum += cIt->second;
}
assert(abs(sum - 1) < 1e-5);
}
std::map<char, double> Utilities::normalize_map(std::map<char, double> m)
{
std::map<char, double> forReturn;
double sum = 0;
for(std::map<char, double>::iterator cIt = m.begin(); cIt != m.end(); cIt++)
{
assert(cIt->second >= 0);
sum += cIt->second;
}
assert(sum > 0);
for(std::map<char, double>::iterator cIt = m.begin(); cIt != m.end(); cIt++)
{
forReturn[cIt->first] = cIt->second/sum;
}
return forReturn;
}
char Utilities::choose_from_normalized_map(std::map<char, double> m)
{
//TODO activate at later point
// srand ( time(NULL) );
assert(RAND_MAX != 0);
double f = (double)rand() / RAND_MAX;
double resolution = 1 / RAND_MAX;
assert(f >= 0);
assert(f <= 1);
char forReturn;
double current_sum = 0;
for(std::map<char, double>::iterator mIt = m.begin(); mIt != m.end(); mIt++)
{
current_sum += mIt->second;
if(f <= current_sum)
{
forReturn = mIt->first;
current_sum = -1;
break;
}
}
assert(current_sum = -1);
return forReturn;
}
char Utilities::choose_from_normalized_map(std::map<char, double> m, boost::mt19937& rng)
{
boost::random::uniform_01<boost::mt19937&> f_gen(rng);
double f = f_gen();
assert(f >= 0);
assert(f <= 1);
char forReturn;
double current_sum = 0;
for(std::map<char, double>::iterator mIt = m.begin(); mIt != m.end(); mIt++)
{
current_sum += mIt->second;
if(f <= current_sum)
{
forReturn = mIt->first;
current_sum = -1;
break;
}
}
assert(current_sum = -1);
return forReturn;
}
std::string Utilities::seq_reverse_complement(std::string sequence)
{
int length = sequence.size();
std::string forReturn;
forReturn.resize(length);
for(int k=0; k < length; k++)
{
forReturn[k] = reverse_char_nucleotide(sequence.at(length-k-1));
}
return forReturn;
}
char Utilities::reverse_char_nucleotide(char c)
{
switch (c)
{
case 'A':
return 'T';
case 'C':
return 'G';
case 'G':
return 'C';
case 'T':
return 'A';
case 'N':
return 'N';
case 'a':
return 't';
case 'c':
return 'g';
case 'g':
return 'c';
case 't':
return 'a';
case 'n':
return 'n';
case '*':
return '*';
default:
std::string errorString = "Utilities::reverse_char_nucleotide: nucleotide not existing: '";
errorString.push_back(c);
errorString.push_back('\'');
std::cerr << errorString << "\n" << std::flush;
return 'N';
throw std::runtime_error(errorString);
}
}
bool Utilities::oneBernoulliTrial(double p)
{
//TODO activate at later point
// srand ( time(NULL) );
assert(RAND_MAX != 0);
double f = (double)rand() / RAND_MAX;
double resolution = 1 / RAND_MAX;
assert(f >= 0);
assert(f <= 1);
if(f <= p)
{
return true;
}
else
{
return false;
}
}
bool Utilities::oneBernoulliTrial(double p, boost::mt19937& rng)
{
boost::random::uniform_01<boost::mt19937&> f_gen(rng);
double f = f_gen();
assert(f >= 0);
assert(f <= 1);
if(f <= p)
{
return true;
}
else
{
return false;
}
}
char Utilities::randomNucleotide(boost::mt19937& rng)
{
boost::random::uniform_int_distribution<> nucleotide_gen (0,3);
char nucleotides[4] = {'A', 'C', 'G', 'T'};
int n = nucleotide_gen(rng);
assert((n >= 0) && (n <= 3));
return nucleotides[n];
}
std::string Utilities::removeFROM(std::string readID)
{
if(readID.find(":FROM:") != std::string::npos)
{
size_t cutFrom = readID.find(":FROM:");
return readID.substr(0, cutFrom);
}
else
{
return readID;
}
}
double Utilities::logAvg(double a, double b)
{
if(a > b)
{
return(log(0.5) + (log(1 + exp(b - a)) + a));
} else
{
return(log(0.5) + (log(1 + exp(a - b)) + b));
}
}