https://github.com/grenaud/libbam
Raw File
Tip revision: 064f9fe2cdee94fd6b78304a29da2686ebc51cdb authored by grenaud on 13 February 2024, 12:30:24 UTC
no nonDNA
Tip revision: 064f9fe
errorQCScores.cpp
#include <iostream>
#include <vector>
#include <set>
#include <ctype.h>
#include <stdlib.h>

#include "api/BamMultiReader.h"
#include "api/BamReader.h"
#include "api/BamWriter.h"
#include "api/BamAux.h"
#include "api/SamSequenceDictionary.h"

#include "libgab.h"
#include "ReconsReferenceBAM.h"

using namespace std;
using namespace BamTools;

const int offset=33;


bool isTransition(char reference,char read){

    if(  (reference == 'A' && read == 'G') ||
	 (reference == 'G' && read == 'A') ||
	 (reference == 'C' && read == 'T') ||
	 (reference == 'T' && read == 'C') ){
	return true;
    }
	 
    return false;
}



bool isTransversions(char reference,char read){
    if(  (reference == 'A' && read == 'C') ||
	 (reference == 'A' && read == 'T') ||
	 
	 (reference == 'G' && read == 'C') ||
	 (reference == 'G' && read == 'T') ||
	 
	 (reference == 'C' && read == 'A') ||
	 (reference == 'C' && read == 'G') ||

	 (reference == 'T' && read == 'A') ||
	 (reference == 'T' && read == 'G') ){
	return true;
    }

    return false;	
}





int main (int argc, char *argv[]) {
    //    bool pairedEnd=false;
    unsigned int  binSize        = 1000;
    // int  minBaseQuality = 0;
    // string fastaIndex = "/mnt/solexa/bin/gabriel/index.hg19.fai";
    bool printM=false;
    bool printTV=false;

    string usage=string(""+string(argv[0])+"  [aligned BAM file] "+
			"\nThis program computes the frequency of transitions\n"+
			"\nacross read length\n"+
			"arguments:\n"+

			// "\t"+"--fai [fasta index] : Fasta index (Default: "+stringify(fastaIndex)+")\n"+
			// "\t"+"--bin [bin sizes]   : Bin size to use (Default: "+stringify(binSize)+")\n"+
			// "\t"+"--bq  [base qual]   : Minimum base quality (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc);i++){ 

        //COMMON TO ALL
        //BEGIN loci selection

        if(string(argv[i]) == "-m"){
	    printM=true;
            continue;
	}

        if(string(argv[i]) == "-tv"){
	    printTV=true;
            continue;
	}

        // if(string(argv[i]) == "--bq"){
	//     minBaseQuality=destringify<unsigned int>(argv[i+1]);
        //     i++;
        //     continue;
	// }

    }

    string bamfiletopen = string( argv[ argc-1 ] );


    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }

    // if ( !reader.LocateIndex()  ) {
    // 	cerr << "The index for the BAM file cannot be located" << endl;
    // 	return 1;
    // }

    // if ( !reader.HasIndex()  ) {
    // 	cerr << "The BAM file has not been indexed." << endl;
    // 	return 1;
    // }


    // SamSequenceDictionary  sequencesFound=reader.GetHeader().Sequences;

    // if(sequencesFound.Size() == 0){
    // 	cerr << "The BAM file does not have any @SQ fields" << endl;
    // 	return 1;
    // }
    

    // //iterates over every sequence found in the header
    // for( SamSequenceConstIterator   ssit=sequencesFound.ConstBegin();
    // 	 ssit<sequencesFound.ConstEnd();
    // 	 ssit++){
    // 	string referenceName=  ssit->Name;


    // 	unsigned int    referenceLength=destringify<unsigned int>(ssit->Length);
	
    // 	unsigned int coordinate=1;

    // 	//for each sequence found in the header, we will iterate over each window
    // 	while( (coordinate+binSize)<referenceLength){	    
    // 	    int refid=reader.GetReferenceID(referenceName);

    // 	    if(refid < 0){
    // 		cerr << "Cannot retrieve the reference ID for "<<referenceName << endl;
    // 		return 1;
    // 	    }
	    
    // 	    //setting the BAM reader at that position
    // 	    reader.SetRegion(refid,
    // 			     coordinate,
    // 			     refid,
    // 			     coordinate+binSize); 	






    //iterating over the alignments for these regions
    BamAlignment al;
    unsigned int readCounter  =0;
    unsigned int sumReadLength=0;
    unsigned int numberTransitions=0;
    unsigned int numberTransversions=0;

    unsigned int numberA=0;
    unsigned int numberC=0;
    unsigned int numberG=0;
    unsigned int numberT=0;


    while ( reader.GetNextAlignment(al) ) {
	// cout<<"hello1"<<endl;
	//skip unmapped
	if(!al.IsMapped())
	    continue;

	if( al.IsReverseStrand() ){//would need to code
	    continue;
	}
	// cout<<"hello2"<<endl;

	readCounter++;
	sumReadLength+=al.QueryBases.size();
		
	string reconstructedReference = reconstructRef(&al);
	
		

	//iterate over each character
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	// for(unsigned int i=0;i<1;i++){
	for(int i=0;i<int(al.QueryBases.size());i++){
	// for(int i=10;i<(int(al.QueryBases.size())-10);i++){

	    //skip over matches and soft clipped bases, unresolved bases and deletion in the reference 
	    if(reconstructedReference[i] == 'S' ||
	       //reconstructedReference[i] == 'M' || 
	       reconstructedReference[i] == 'I' || 
	       al.QueryBases[i]          == 'N' ){
		continue;
	    }

	    //Skip bases with low base quality
	    //this is done specifically to avoid unresolved base pairs
	    //that were called as 'A's with qual = 0 by Martin's BCL reader 
	    // if(int(al.Qualities[i]-offset) < minBaseQuality ){
	    // 	continue;
	    // }
	    int  qcScore = int(al.Qualities[i]-offset);
	    char refeBase=toupper(reconstructedReference[i]);
	    char readBase=toupper(al.QueryBases[i]);
	    if(refeBase == 'N' || readBase == 'N' )
		continue;

	    if(refeBase == 'A'){
		numberA++;
	    }

	    if(refeBase == 'C'){
		numberC++;
	    }

	    if(refeBase == 'G'){
		numberG++;
	    }

	    if(refeBase == 'T'){
		numberT++;
	    }
	    
	    // if(refeBase== 'C' &&  
	    //    readBase== 'T' ){
	    // 	cout<<(al.Position+i)<<"\t"<<refeBase<<"\t"<<readBase<<"\t"<<qcScore<<endl;
	    // }
	    // if(refeBase== 'G' &&  
	    //    readBase== 'A' ){
	    // 	cout<<(al.Position+i)<<"\t"<<refeBase<<"\t"<<readBase<<"\t"<<qcScore<<endl;
	    // }

	    if(int(al.Position) <= 2)
		continue;
	    if(refeBase == 'M'){
		if(printM)
		    cout<<"gi|251831106|ref|NC_012920.1|\t"<<(int(al.Position)+i-2)<<"\t"<<(int(al.Position)+i+2)<<"\t"<<refeBase<<"\t"<<readBase<<"\t"<<qcScore<<endl;	
		    //cout<<(al.Position+i)<<"\t"<<refeBase<<"\t"<<readBase<<"\t"<<qcScore<<endl;			

	    }else{ //substitution
		if(isTransition( refeBase, readBase)){
				

		    numberTransitions++;
		}else{
		    if(isTransversions( refeBase, readBase)){
			if(printTV)
			    cout<<"gi|251831106|ref|NC_012920.1|\t"<<(int(al.Position)+i-2)<<"\t"<<(int(al.Position)+i+2)<<"\t"<<refeBase<<"\t"<<readBase<<"\t"<<qcScore<<endl;	

			numberTransversions++;
		    }else{
			cerr<<"Wrong substitution from "<<refeBase<<" to "<<readBase<<" in "<<al.Name<<endl<<"refe:"<<reconstructedReference<<endl<<"read:"<<al.QueryBases<<endl;
			return 1;
		    }
		}
	    }		       
		       
    

	}   
    
    }//while reader


    // cout//<<referenceName<<":"<<":"<<coordinate<<"-"<<(coordinate+binSize)
    // 	//<<"\t"<<referenceLength
    // 	<<"\t"<<readCounter
    // 	<<"\t"<<sumReadLength 
    // 	<<"\t"<<double(sumReadLength)/double(readCounter)
    // 	<<"\t"<<numberTransitions
    // 	<<"\t"<<numberTransversions
    // 	<<"\t"<<double(numberTransitions)/(double(numberTransitions)+double(numberTransversions) )
    // 	<<"\t"<<double(numberC+numberG)/(double( numberA+numberC+numberG+numberT ) )
    // 	<<endl;



    // cout<<endl;
    // cout<<double(sumReadLength)/double(readCounter)<<endl;




    //     coordinate+=binSize;
    // }





    // }




    reader.Close();



    // ofstream outfile;

    // outfile.open(string(bamfiletopen+".baseobspred1").c_str());
    // outfile<<qual<<"\t"<<editDist2Count[qual].match<<"\t"<<editDist2Count[qual].mismatch<<endl;
    
    // outfile.close();


   
    return 0;
}

back to top