https://github.com/Malfoy/BCALM
Raw File
Tip revision: 07c86e0dccb6ea271d530c1f66e3483c31ddc935 authored by Malfoy on 15 January 2016, 15:32:36 UTC
Update README.md
Tip revision: 07c86e0
lm.cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <iterator>
#include <ctime>
#include <unordered_map>
#include <algorithm>
#include <sys/stat.h>
#include <sys/types.h>
#include <cmath>
#include <algorithm>
#include <cassert>
#include <chrono>

#include "lm.h"

/*
 * Constuct the compacted de bruijn graph from list of distinct kmers
 */

using namespace std;

// 10 chars to store the integer hash of a minimizer
#define MINIMIZER_STR_SIZE 10

// keep largest bucket seen, disregarding small ones
unsigned long nb_elts_in_largest_bucket = 10000;

string minimizer2string(int input_int){
    long long i = input_int;
    string str = to_string(i);
    assert(str.size() <= MINIMIZER_STR_SIZE);
    if(MINIMIZER_STR_SIZE > str.size())
        str.insert(0, MINIMIZER_STR_SIZE- str.size(), '0');
    return str;
}

bool nextchar(char * c){
	switch(*c){
		case 'a':
		*c='c';
		return true;
		case 'c':
		*c='g';
		return true;
		case 'g':
		*c='t';
		return true;
		case 't':
		*c='a';
		return false;
		default :
		cout<<"Problem with nextchar: "<<c;
		assert(0);
		return false;
	}
}


// reads k-mers and Put kmers in superbuckets
// (other behavior: just count m-mers)
void sortentry(string namefile, const int k, const int m, bool create_buckets, bool m_mer_count){
	int numbersuperbucket(pow(4,m));
	ofstream out[1100];
    ofstream checkpoint;
//    InputDot ind;
    if (create_buckets)
    {
    	for(long long i(0);i<numbersuperbucket;i++)
    		out[i].open(".bcalmtmp/z"+to_string(i),ofstream::app);
    }

//    ind.init_input(namefile,k);
	ifstream in(namefile);
	string kmer,waste;

    while (1)
    {
//        string kmer = ind.next_input(k);
		getline(in,kmer,' ');
		getline(in,waste);

		if (kmer.size()<(unsigned int)k){
            break;
		}
		transform(kmer.begin(),kmer.end(),kmer.begin(),::tolower);

        if (m_mer_count){
            count_m_mers(kmer, 2*m, k);
            continue;
        }

        int middlemin, leftmostmin, rightmostmin, min;

        middlemin=minimiserrc(kmer.substr(1,k-2),2*m);
        leftmostmin=minimiserrc(kmer.substr(0,2*m),2*m);
        rightmostmin=minimiserrc(kmer.substr(kmer.size()-2*m,2*m),2*m);

        int leftmin = (leftmostmin < middlemin) ? leftmostmin : middlemin;

        int rightmin = (rightmostmin < middlemin) ? rightmostmin : middlemin ;

        min = (leftmin < rightmin) ? leftmin : rightmin;

        uint64_t h = min/numbersuperbucket;

        if (create_buckets)
            out[h]<<kmer<< minimizer2string(leftmin) << minimizer2string(rightmin) <<";";
        else
            cout << h << ":" << kmer<< minimizer2string(leftmin) << minimizer2string(rightmin) << ";\n";

        //checkpoint << kmer << minimizer2string(leftmin) << minimizer2string(rightmin) <<";\n";
}
    if (create_buckets)
        cout << "initial partitioning done" << endl;
}



//copy n characters from in to out
void copylm(ifstream* in,int64_t n,ofstream* out){
	int buffsize(1000000),nbbuffer(n/buffsize);
	string str;
	for(int j(0);j<nbbuffer;j++)	{
		str=readn(in,buffsize);
		*out<<str;
	}
	str=readn(in,n-nbbuffer*buffsize);
	*out<<str;
}

void copylmrv(ifstream* in,int64_t n,ofstream* out){
	int buffsize(1000000),nbbuffer(n/buffsize);
	int64_t pos(in->tellg());
	string str;
	int j;
	for(j=1;j<=nbbuffer;j++){
		in->seekg(pos+n-j*buffsize,ios::beg);
		str=readn(in,buffsize);
		*out<<reversecompletment(str);
	}
	int64_t rest=n-nbbuffer*buffsize;
	if(rest!=0){
		in->seekg(pos,ios::beg);
		str=readn(in,rest);
		*out<<reversecompletment(str);
	}
}


//Put nodes from superbuckets to buckets
void createbucket(const string superbucketname,const int m){
	int superbucketnum(stoi(superbucketname));
	ifstream in(".bcalmtmp/z"+superbucketname);
	if(!in.is_open()){
		cerr<<"Problem with Createbucket"<<endl;
		return;
	}
	in.seekg(0, ios_base::end);
	int64_t size(in.tellg()), buffsize(1000000), numberbuffer(size/buffsize),nb(pow(4,m)),suffix;
	if(size==0){
		remove((".bcalmtmp/z"+superbucketname).c_str());
		return;
	}
	in.seekg(0,ios::beg);
	ofstream out[5000];
	for(long long i(0);i<nb;i++){
		out[i].open(".bcalmtmp/"+to_string(superbucketnum*nb+i),ofstream::app);
	}
	int64_t lastposition(-1),position(0),point(0),mini;
	string buffer;
	vector<string> miniv;
	in.seekg(0);

	for(int j(0);j<=numberbuffer;j++){
		if(j==numberbuffer){
			if(size-numberbuffer*buffsize-1!=-1){
				buffer=readn(&in,size-numberbuffer*buffsize-1);
				buffer+=";";
			}
			else
				buffer="";
        }
		else
		{
			buffer=readn(&in,buffsize);
			point+=buffsize;
		}
		for (uint64_t i(0); i<buffer.size(); i++,position++)
        {
			if((buffer)[i]==';'){
                int leftmin, rightmin;
				if(i>=(uint64_t)(MINIMIZER_STR_SIZE * 2))
                {
					leftmin = stoi(buffer.substr(i-(MINIMIZER_STR_SIZE * 2),MINIMIZER_STR_SIZE));
					rightmin = stoi(buffer.substr(i-MINIMIZER_STR_SIZE,MINIMIZER_STR_SIZE));
                }
				else{
					in.seekg(position-(MINIMIZER_STR_SIZE * 2));
					leftmin = stoi(readn(&in,MINIMIZER_STR_SIZE));
					rightmin = stoi(readn(&in,MINIMIZER_STR_SIZE));
                }
				string first_bucket_of_superbucket(to_string((long long)(superbucketnum*nb-1)));
				mini = minbutbiggerthan(leftmin, rightmin, first_bucket_of_superbucket);
				suffix=mini%nb;
				in.seekg(lastposition+1,ios_base::beg);
				copylm(&in,position-lastposition,&out[suffix]);
				lastposition=position;
			}
        }
		in.seekg(point);
	}
	remove((".bcalmtmp/z"+superbucketname).c_str());
}



//count the length of each node
vector<int64_t> countbucket(const string& name){
	vector<int64_t> count;
	ifstream in(".bcalmtmp/"+name);
	if(in){
		in.seekg( 0 , ios_base::end );
		int64_t size(in.tellg()),buffsize(10),numberbuffer(size/buffsize),lastposition(-1),position(0);
		if(size<2)
			return count;
		in.seekg(0,ios::beg);
		string buffer;

		for(int j(0);j<numberbuffer;j++){
			buffer=readn(&in,buffsize);
			for (uint64_t i(0); i<buffer.size(); i++,position++)
				if((buffer)[i]==';'){
					count.push_back(position-lastposition-1);
					lastposition=position;
				}
		}

		buffer=readn(&in,size-numberbuffer*buffsize);
		for (uint64_t i(0); i<buffer.size(); i++,position++)
			if((buffer)[i]==';'){
				count.push_back(position-lastposition-1);
				lastposition=position;
			}
	}
	return count;
}



//true iff node does not contain tag
bool notag(const string& node,const int64_t start,int64_t* n){
	for(uint64_t i(start);i<node.size();i++){
		if (node[i] >= '0' && node[i] <= '9'){
			*n=i;
			return false;
		}
	}
	return true;
}



//return length of tag
int taglength(const string& node, int64_t j){
	int n=1;
	for(uint64_t i(j+1);i<node.size();i++)
		if ((node[i] >= '0' && node[i] <= '9') || node[i]=='+' || node[i]=='-')
			n++;
		else
			return n;
	return n;
}



//Write a node remplacing tags by their sequences
void writeit(const string& outfile,const string& node, int leftmin, int rightmin, vector<pair<int64_t,int64_t>>* tagsposition,ifstream* tagfile,int64_t j,const string& fout){
	ofstream out(".bcalmtmp/"+outfile,ios::app);
	char rc;
	if(out){
		int64_t lastposition(0),tag,tagl,position,length;
		pair<int64_t,int64_t> pair;
		do{
			out<<node.substr(lastposition,j-lastposition-1);
			tagl=taglength(node,j);
			rc=node[j-1];
			if(rc=='+'){
				tag=stoi(node.substr(j,tagl));
			}
			else{
				tag=stoi(reversecompletment(node.substr(j,tagl)));
			}
			lastposition=j+tagl;
			pair=(*tagsposition)[tag];
			position=pair.first;
			length= pair.second;
			tagfile->seekg(position,ios_base::beg);
			if(rc=='+')
				copylm(tagfile,length,&out);
			if(rc=='-')
				copylmrv(tagfile,length,&out);
		}
		while(!notag(node,lastposition,&j));
		if(outfile!=fout)
			out<<node.substr(lastposition)<< minimizer2string(leftmin) << minimizer2string(rightmin) <<";";
		else
			out<<node.substr(lastposition) <<";"<<endl;
	}
	else
		cerr<<"writeitbug"<<endl;
}

void put(const string& outfile,const string& node, int leftmin, int rightmin, const string& fout){
	ofstream out(".bcalmtmp/"+outfile,ios::app);
	if(outfile==fout)
	    out<<node <<";" << endl;
    else
	    out<<node<< minimizer2string(leftmin) << minimizer2string(rightmin) <<";";
}

void putorwrite(const string& outfile, const string& node, int leftmin, int rightmin, vector<pair<int64_t,int64_t>>* tagsposition , ifstream* tagfile,const string& fout){
	int64_t i;
	if(notag(node,0,&i))
		put(outfile,node,leftmin, rightmin, fout);
	else
		writeit(outfile,node, leftmin, rightmin, tagsposition,tagfile,i,fout);
}

//Decide where to put a node
void goodplace(const string& node, int leftmin, int rightmin, const string& bucketname,vector<pair<int64_t,int64_t>>* tagsposition,ifstream* tagfile,const int m,const string& nameout){
	int nb(pow(4,m)),prefixnumber(stoi(bucketname)/nb+1);
	long long mini(minbutbiggerthan(leftmin, rightmin, bucketname));
	if(mini==-1)
		putorwrite( nameout, node, leftmin, rightmin, tagsposition,tagfile,nameout);
	else{
		long long minipre(mini/nb);
		string miniprefix('z'+to_string(minipre));
		putorwrite( ((minipre >= prefixnumber) ?  miniprefix: to_string(mini)), node, leftmin, rightmin, tagsposition,tagfile,nameout);
	}
}



//Compact a bucket and put the nodes on the right place
void compactbucket(const int& prefix,const int& suffix,const int k,const char *nameout,const int m){
	int64_t buffsize(k),postags(0),length,nb(pow(4,m));
	long long tagnumber(0),numberbucket(prefix*nb+suffix);
	string fullname(to_string(numberbucket)),node,tag,end;
	auto count(countbucket(fullname));
	if(count.size()==0)	{
		remove((".bcalmtmp/"+fullname).c_str());
		return;
	}

    // keep largest bucket seen
    if (count.size() > nb_elts_in_largest_bucket)
    {
        nb_elts_in_largest_bucket = count.size();
	    system(("cp .bcalmtmp/" + fullname + " largest_bucket.dot").c_str());
    }

	ifstream in(".bcalmtmp/"+fullname);
	ofstream tagfile(".bcalmtmp/tags"),out(".bcalmtmp/"+(string)nameout,ios_base::app);
	graph g(k);
	vector<pair<int64_t,int64_t>> tagsposition;

    // add nodes to graph
	if(in && tagfile && out){
		for(auto it=count.begin();it!=count.end();it++){
			length=*it;
			if(length-(MINIMIZER_STR_SIZE*2)<=2*buffsize){
				node=readn(&in,length+1);
				g.addvertex(node.substr(0,length-(MINIMIZER_STR_SIZE*2)));
                g.addleftmin(stoi(node.substr(length-(MINIMIZER_STR_SIZE*2),MINIMIZER_STR_SIZE)));
                g.addrightmin(stoi(node.substr(length-MINIMIZER_STR_SIZE,MINIMIZER_STR_SIZE)));
			}
			else{
				node=readn(&in,buffsize);
				tag=to_string(tagnumber);
				node+="+"+tag+"+";
				tagnumber++;
				copylm(&in,length-(MINIMIZER_STR_SIZE*2)-2*buffsize,&tagfile);
				tagsposition.push_back(make_pair(postags,length-(MINIMIZER_STR_SIZE*2)-2*buffsize));
				postags+=length-(MINIMIZER_STR_SIZE*2)-2*buffsize;
				end=readn(&in,buffsize);
				node+=end.substr(0,buffsize);
				g.addvertex(node);
                g.addleftmin(stoi(readn(&in,MINIMIZER_STR_SIZE)));
                g.addrightmin(stoi(readn(&in,MINIMIZER_STR_SIZE)));
                readn(&in,1); // the ';'
			}
		}
		tagfile.close();
	}

	remove((".bcalmtmp/"+fullname).c_str());
	g.debruijn();

	g.compressh(stoi(fullname));
	ifstream fichiertagin(".bcalmtmp/tags");
    int node_index = 0;

	for(auto it(g.nodes.begin());it!=g.nodes.end();it++)
    {
		if(it->size()!=0)
        {
            int leftmin = g.leftmins[node_index];
            int rightmin = g.rightmins[node_index];
			goodplace(*it, leftmin, rightmin,fullname,&tagsposition,&fichiertagin,m,nameout);
        }
        node_index++;
    }

	remove(".bcalmtmp/tags");
	return;
}

//Create a file with the nodes of the compacted graph
void createoutfile(const char *namein,const char *nameout,const int k,const int m){
	auto start=chrono::system_clock::now();
	HashMap hm(build_hash_map(2*m));
	int64_t nbsuperbucket(pow(4,m)),sys(0);
	mkdir(".bcalmtmp",0777);
	sys+=system("rm -rf .bcalmtmp/*");
	remove(nameout);

    // create the hash function
    init_m_mers_table(2*m);
    sortentry(namein,k,m, false, true);
    create_hash_function_from_m_mers(2*m);

	sortentry(namein,k,m);
	for(long long i(0);i<nbsuperbucket;i++){
		createbucket(to_string(i),m);
		for(int j(0);j<pow(4,m);j++)
			compactbucket(i,j,k,nameout,m);
	}
	sys+=system(("mv .bcalmtmp/"+(string)nameout+" "+(string)nameout).c_str());
	if(sys!=0)
		cerr<<"system call failed"<<endl;
	 auto end=chrono::system_clock::now();
	 auto waitedFor=end-start;
	 cout<<"Last for "<<chrono::duration_cast<chrono::seconds>(waitedFor).count()<<" seconds"<<endl;
}

back to top