https://github.com/jttoivon/MODER
Tip revision: c485231e5b468ae509306e1aaeebaa0f3004572d authored by Jarkko Toivonen on 31 March 2020, 18:10:18 UTC
Fixed indexing bug.
Fixed indexing bug.
Tip revision: c485231
common.cpp
/*
MODER is a program to learn DNA binding motifs from SELEX datasets.
Copyright (C) 2016, 2017 Jarkko Toivonen,
Department of Computer Science, University of Helsinki
MODER is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
MODER is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "common.hpp"
#include "iupac.hpp"
#include <libgen.h> // for dirname and basename
#include <fstream>
#include <bitset>
#include <sstream>
#include <cstdarg>
#include <cstring>
#include <boost/tuple/tuple.hpp>
//std::vector<std::string> sequences;
std::string header;
/*
template
boost::tuple<int,int,int,int>
get_ranges<>(const boost::multi_array<double, 2>& a);
*/
double
cut(double x)
{
if (x < 0)
return 0;
else
return x;
}
const char*
yesno(bool b)
{
return b ? "yes" : "no";
}
void
error(bool b, const std::string& message)
{
if (b) {
fprintf(stderr, "Error: %s\n", message.c_str());
exit(1);
}
}
// Number of 1-bits in the range
int
bits_in_range(const std::bitset<64>& occ, int begin, int end)
{
int count=0;
for (int j=begin; j < end; ++j)
count += occ[j];
return count;
}
template <>
std::string
print_vector<bool>(const std::vector<bool>& v)
{
if (v.size() == 0)
return std::string("[]");
std::ostringstream result;
result << "[";
for (int i=0; i < v.size()-1; ++i)
result << yesno(v[i]) << ", ";
result << yesno(v[v.size()-1]) << "]";
return result.str();
}
std::string
print_bitvector(unsigned int x, int number_of_bits)
{
std::string bit_repr(number_of_bits, '-');
for (int pos=0; pos < number_of_bits; ++pos) {
unsigned char mask = 1u << (number_of_bits-pos-1);
bit_repr[pos] = x & mask ? '1' : '0';
}
return std::string(bit_repr);
}
std::string
mybasename(const std::string& path)
{
char *basec = strdup(path.c_str());
std::string result = basename(basec);
free(basec);
return result;
}
template <>
std::string
m(std::string s, bool t)
{
std::ostringstream ss;
ss << s << ", default: ";
if (t)
ss << "yes";
else
ss << "no";
return ss.str();
}
void
print_command_line(int argc, char* argv[], FILE* f)
{
fprintf(f, "%s", argv[0]);
for (int i=1; i < argc; ++i)
fprintf(f, " %s", argv[i]);
fprintf(f, "\n");
}
void
print_command_line(int argc, char* argv[])
{
print_command_line(argc, argv, stdout);
}
matrix<double> positional_background(0,0);
// template <typename Value, typename Key>
// bool
// key_sort_functor_asc(Value x, Value y)
// {
// Key k;
// return k(x) < k(y);
// }
std::vector<int>
count_frequencies(const std::string& s)
{
std::vector<int> result(4, 0);
for (int i=0; i < s.size(); ++i)
++result[to_int(s[i])];
return result;
}
std::string
to_string(std::string format, ...)
{
char* tmpstr;
va_list args;
va_start( args, format );
int count = vasprintf(&tmpstr, format.c_str(), args);
va_end( args );
assert(count != -1);
std::string result = tmpstr;
free(tmpstr);
return result;
}
std::string
vto_string(const std::string& format, va_list args)
{
char* tmpstr;
int count = vasprintf(&tmpstr, format.c_str(), args);
assert(count != -1);
std::string result = tmpstr;
free(tmpstr);
return result;
}
void
underline(std::string format, ...)
{
va_list args;
va_start( args, format );
std::string temp = vto_string(format, args);
va_end( args );
std::cout << temp << std::endl;
std::cout << std::string(temp.length(), '-') << std::endl;
}
inline
void
add_to_vector(std::vector<int>& to, const std::vector<int>& from)
{
assert(to.size() == from.size());
for (size_t i=0; i < to.size(); ++i)
to[i] += from[i];
}
std::vector<int>
count_frequencies(const std::vector<std::string>& sequences)
{
std::vector<int> result(4, 0);
for (int i=0; i < sequences.size(); ++i)
add_to_vector(result, count_frequencies(sequences[i]));
return result;
}
std::string
itoa(int i)
{
std::ostringstream s;
s << i;
return s.str();
}
int
atoi(const std::string& s)
{
return atoi(s.c_str());
}
double
atof(const std::string& s)
{
return atof(s.c_str());
}
void
normalize_vector(std::vector<double>& v)
{
double s = sum(v);
for (int i=0; i < v.size(); ++i)
v[i] = v[i]/s;
return;
}
std::vector<double>
normalize_vector_copy(const std::vector<double>& v)
{
double s = sum(v);
std::vector<double> result(v.size());
for (size_t i=0; i < v.size(); ++i)
result[i] = v[i]/s;
return result;
}
std::vector<double>
normalize_vector_copy(const std::vector<int>& v)
{
double s = sum(v);
std::vector<double> result(v.size());
for (size_t i=0; i < v.size(); ++i)
result[i] = v[i]/s;
return result;
}
std::vector<double>
to_double_vector(const std::vector<int>& v)
{
std::vector<double> result(v.size());
for (size_t i=0; i < v.size(); ++i)
result[i] = v[i];
return result;
}
void
normalize_map(std::map<big_int, double>& v)
{
double s = sum(v);
typedef std::map<big_int, double>::iterator iterator;
for (iterator i=v.begin(); i != v.end(); ++i)
i->second = i->second/s;
return;
}
bool
is_valid_string(const std::string& str, const character_to_values<bool>& is_valid)
{
for (int i=0; i < str.length(); ++i) {
if (not is_valid(str[i]))
return false;
}
return true;
}
void
check_data(const std::vector<std::string>& seqs, const std::string& valid_chars)
{
assert(seqs.size() != 0);
character_to_values<bool> is_valid(valid_chars, true);
for (int i=0; i < seqs.size(); ++i) {
const std::string& line = seqs[i];
assert(is_valid_string(line, is_valid));
}
}
bool
is_nucleotide_string(const std::string& str)
{
//std::string nucs = "ACGT";
static character_to_values<bool> isnuc("ACGTU", true);
for (int i=0; i < str.length(); ++i) {
if (not isnuc(str[i]))
return false;
}
return true;
}
std::string
reverse_complement(const std::string& s)
{
std::string t(s.size(), '-');
int j=s.size()-1;
for (int i=0; i < s.size(); ++i, --j)
t[j] = complement(s[i]);
return t;
}
std::string
reverse_complement_rna(const std::string& s)
{
std::string t(s.size(), '-');
int j=s.size()-1;
for (int i=0; i < s.size(); ++i, --j)
t[j] = complement_rna(s[i]);
return t;
}
std::string
reverse(const std::string& s)
{
std::string t(s.size(), ' ');
int j=s.size()-1;
for (int i=0; i < s.size(); ++i, --j)
t[j] = s[i];
return t;
}
bool
is_palindromic(const std::string& s)
{
int len =s.length();
if (len % 2 == 1) // odd string cannot be palindromic
return false;
int middle = len/2;
for (int i=0; i<middle; ++i) {
if (s[i] != complement(s[len-i-1]))
return false;
}
return true;
}
int
palindromic_index(const std::string& s)
{
return hamming_distance(s, reverse_complement(s));
}
// Reflects over both diagonals, that is rotate 180 degrees. This is NOT the transpose of the matrix
matrix<double>
reverse_complement(const matrix<double>& m)
{
assert( m.get_rows() == 4 );
int c = m.get_columns();
matrix<double> result(4, c);
for (int i = 0; i < 4; ++i)
for (int j = 0; j < c; ++j) {
result(i, j) = m(4-i-1, c-j-1);
}
return result;
}
matrix<double>
reverse(const matrix<double>& m)
{
assert( m.get_rows() == 4 );
int c = m.get_columns();
matrix<double> result(4, c);
for (int i = 0; i < 4; ++i)
for (int j = 0; j < c; ++j) {
result(i, j) = m(i, c-j-1);
}
return result;
}
// extend the matrix by k-1 columns of even distribution to both sides of the matrix
dmatrix
extend_matrix(const dmatrix& orig, int k)
{
int width = orig.get_columns();
int new_width = 2*(k-1) + width;
dmatrix result(4, new_width);
result.fill_with(0.25);
result.inject(orig, 0, k-1);
return result;
}
int
hamming_distance(const std::string& s, const std::string& t)
{
assert(s.length() == t.length());
int count = 0;
for (int i=0; i < s.length(); ++i)
if (s[i] != t[i])
++count;
// if (count == 1)
// std::cout << t << std::endl;
return count;
}
int
iupac_hamming_dist(const std::string& str, const std::string& pattern, int max_hd)
{
assert(str.length() == pattern.length());
int hd=0;
for (int i=0; i < str.length(); ++i) {
if (not iupac_match(str[i], pattern[i])) {
++hd;
if (hd > max_hd)
return hd;
}
}
return hd;
}
// finds the minimum Hamming distance between t and the substrings of s
int
min_hamming_distance(const std::string& s, const std::string& t)
{
assert(s.length() >= t.length());
int len = s.length();
int k = t.length();
int dist = k;
for (int i = 0; i < len-k+1; ++i) {
int temp = hamming_distance(s.substr(i,k), t);
if (temp < dist)
dist = temp;
}
return dist;
}
std::string
join(const std::vector<std::string>& v, char c)
{
int L = v[0].length();
int n = v.size();
std::string temp;
temp.reserve((L+1)*n);
for (int i=0; i < n-1; ++i) {
temp.append(v[i]);
temp.push_back(c);
}
temp.append(v[n-1]);
return temp;
}
std::string
join(const std::vector<std::string>& v, const std::string& sep)
{
int l = v.size();
std::string temp;
for (int i=0; i < l-1; ++i) {
temp.append(v[i]);
temp.append(sep);
}
temp.append(v.back());
return temp;
}
// reverse complement of above catenation
// only separators are correctly placed
std::string
join_rev(const std::vector<std::string>& v, char c)
{
int L = v[0].length();
int lines = v.size();
assert(lines>0);
std::string temp;
temp.reserve((L+1)*lines);
for (int i=lines; i>1;) {
--i;
temp.append(reverse_complement(v[i]));
temp.push_back(c);
}
temp.append(reverse_complement(v[0]));
return temp;
}
std::vector<std::string>
split(const std::string& s, char c)
{
size_t b=0;
size_t e;
std::vector<std::string> result;
while (b != s.size() && (e=s.find(c, b)) != std::string::npos) {
if (e>b)
result.push_back(s.substr(b, e-b));
b=e+1;
}
if (b != s.size())
result.push_back(s.substr(b, s.size()-b));
return result;
}
std::pair<int,int>
read_sequences(const std::string& filename, std::vector<std::string>& seqs, bool allow_iupac)
{
assert(seqs.size() == 0);
std::ifstream f;
std::string line;
int bad_lines = 0;
f.open(filename.c_str(), std::ios_base::in);
if (not f.is_open()) {
std::cerr << "Couldn't open file " << filename << std::endl;
exit(1);
}
while (getline(f, line)) {
if (line.length() == 0) {
++bad_lines;
continue;
}
while (not line.empty() and line.back() == '\r')
line.pop_back();
if ((allow_iupac and is_iupac_string(line)) || is_nucleotide_string(line))
seqs.push_back(line);
else
++bad_lines;
}
f.close();
error(seqs.size() == 0, "No valid sequences found! Exiting.\n");
int lines = seqs.size();
return std::make_pair(lines, bad_lines);
}
std::istream&
mygetline(std::istream& f, std::string& line)
{
std::getline(f, line);
while (not line.empty() and line.back() == '\r')
line.pop_back();
return f;
}
std::pair<int,int>
read_fastq_sequences(const std::string& filename, std::vector<std::string>& seqs, bool allow_iupac)
{
assert(seqs.size() == 0);
std::ifstream f;
std::string line;
int bad_lines = 0;
f.open(filename.c_str(), std::ios_base::in);
if (not f.is_open()) {
std::cerr << "Couldn't open file " << filename << std::endl;
exit(1);
}
std::string msg = "Does not appear to be a fastq file\n";
std::string sequence;
int block=0;
int offset=0;
bool e=false;
while (mygetline(f, line)) {
if (line.empty() or line[0] != '@') {
e=true;
offset=0;
break;
}
if (not mygetline(f, sequence) or sequence.empty() or not is_iupac_string(sequence)) {
e=true;
offset=1;
break;
}
if (not mygetline(f, line) or line.empty() or line[0] != '+') {
e=true;
offset=2;
break;
}
if (not mygetline(f, line) or line.length() != sequence.length()) {
e=true;
offset=3;
break;
}
++block;
if (sequence.length() == 0) {
++bad_lines;
continue;
}
if ((allow_iupac and is_iupac_string(sequence)) || is_nucleotide_string(sequence))
seqs.push_back(sequence);
else
++bad_lines;
}
f.close();
if (e) {
fprintf(stderr, "Read error on line %i\n", 4*block+offset+1);
switch (offset) {
case 0:
fprintf(stderr, "Expected a line beginning with @\n");
break;
case 1:
fprintf(stderr, "Expected a sequence of nucleotides\n");
line = sequence;
break;
case 2:
fprintf(stderr, "Expected a line beginning with +\n");
break;
case 3:
fprintf(stderr, "Expected a sequence of quality codes, one for each nucleotide\n");
break;
}
fprintf(stderr, "Instead got:\n%s\n", line.c_str());
exit(1);
}
error(seqs.size() == 0, "No valid sequences found! Exiting.\n");
int lines = seqs.size();
return std::make_pair(lines, bad_lines);
}
std::pair<int,int>
read_fasta_sequences(const std::string& filename, std::vector<std::string>& seqs, bool allow_iupac)
{
assert(seqs.size() == 0);
std::ifstream f;
std::string line;
int bad_lines = 0;
f.open(filename.c_str(), std::ios_base::in);
if (not f.is_open()) {
std::cerr << "Couldn't open file " << filename << std::endl;
exit(1);
}
bool header_read=false;
std::string current;
while (getline(f, line)) {
while (not line.empty() and line.back() == '\r')
line.pop_back();
if (not header_read) { // wait till we find the next header line
if (line.length() > 0 and line[0] == '>')
header_read = true;
else
++bad_lines;
continue;
}
if (line.length() == 0) {
++bad_lines;
current.clear();
header_read=false;
continue;
}
if (line[0] == '>') {
if (current.length() == 0)
++bad_lines;
else {
seqs.push_back(current);
}
current.clear();
continue;
}
if ((allow_iupac and is_iupac_string(line)) || is_nucleotide_string(line))
current += line;
else {
++bad_lines;
current.clear();
header_read=false;
}
}
f.close();
if (current.length() != 0)
seqs.push_back(current);
error(seqs.size() == 0, "No valid sequences found! Exiting.\n");
int lines = seqs.size();
return std::make_pair(lines, bad_lines);
}
// x == base^3 * result[0] + base^2 * result[1] + base * result[2] + result[3]
std::vector<int>
decode_base(int base, int x)
{
std::vector<int> result(4);
result[3] = x % base;
x /= base;
result[2] = x % base;
x /= base;
result[1] = x % base;
x /= base;
result[0] = x;
assert(result[0] < base);
return result;
}
int
code_base(int base, int a, int b, int c, int d)
{
assert(0 <= a && a < base);
assert(0 <= b && b < base);
assert(0 <= c && c < base);
assert(0 <= d && c < base);
return a*base*base*base + b*base*base + c*base + d;
}
int
code_base(int base, const std::vector<int>& v)
{
assert(v.size() == 4);
assert(0 <= v[0] && v[0] < base);
assert(0 <= v[1] && v[1] < base);
assert(0 <= v[2] && v[2] < base);
assert(0 <= v[3] && v[3] < base);
return v[0]*base*base*base + v[1]*base*base + v[2]*base + v[3];
}
std::vector<std::string>
integer_range(int begin, int end)
{
std::vector<std::string> result;
for (int i=begin; i < end; ++i)
result.push_back(to_string("%i", i));
return result;
}
// This is a helper function to be used inside gdb debugger
std::string&
SSS(const char* s)
{
return *(new std::string(s));
}