https://github.com/cjri/a2bnetwork
Raw File
Tip revision: 2c08d1a789b7f1a9ce758a86db27fc3d78b9d003 authored by cjri on 22 June 2021, 14:36:01 UTC
Example data files containing threshold information
Tip revision: 2c08d1a
variants.cpp
using namespace std;
#include "basicmodel.h"
#include "diagnostics.h"
#include "variants.h"

void ProcessVariantInformation (run_params p, const vector<string>& seqs, vector<sparseseq>& variants, vector<pat>& pdat) {
    string all_consensus;
    FindConsensus(all_consensus,seqs);
    //Variants with respect to general consensus.
    FindVariants (variants,all_consensus,pdat);
    //Find sequences with an 'N' at sites with called variants
    vector<allele> allvar;
    ListAllVariantPositions(variants,allvar);
    if (p.diagnostic==1) {
        PrintVariantLoci(allvar);
    }
    //Next:
    //a) Find internal allele frequencies at loci for which there is at least one N
    //b) Remove sequences with too many uncertain loci in variant positions
    //c) Discard unneeded frequencies
    vector<int> nloci;
    vector<int> nloc_count;
    FindAmbiguousVarPositions (allvar,pdat,nloci,nloc_count);
    vector<ipair> freqs;
    FindAmbiguousVarFreqs (all_consensus,allvar,variants,pdat,nloci,freqs);
    ProcessAmbiguousFixedVariants(freqs,nloc_count,variants,pdat);
    //Delete sequences with too many Ns
    RemoveIndividualsMultipleN(p,nloc_count,pdat,variants);
    if (p.noseq==0&&p.diagnostic==1) {
        PrintVariants(variants,pdat);
    }
}

void FindConsensus (string& consensus, const vector<string>& seqs) {
    consensus=seqs[0];
    cout << "Find consensus of all input sequences\n";
    int nA=0;
    int nC=0;
    int nG=0;
    int nT=0;
    for (int pos=0;pos<seqs[0].size();pos++) {
        nA=0;
        nC=0;
        nG=0;
        nT=0;
        for (int seq=0;seq<seqs.size();seq++) {
            if (seqs[seq][pos]=='A') {
                nA++;
            }
            if (seqs[seq][pos]=='C') {
                nC++;
            }
            if (seqs[seq][pos]=='G') {
                nG++;
            }
            if (seqs[seq][pos]=='T') {
                nT++;
            }
        }
        int max=nA;
        consensus[pos]='A';
        if (nC>max) {
            max=nC;
            consensus[pos]='C';
        }
        if (nG>max) {
            max=nG;
            consensus[pos]='G';
        }
        if (nT>max) {
            consensus[pos]='T';
            max=nT;
        }
        if (max==0) {
            consensus[pos]='-';
        }
    }
}

void FindVariants (vector<sparseseq>& variants, string& consensus, vector<pat>& pdat) {
	for (int i=0;i<pdat.size();i++) {
        sparseseq s;
        for (int pos=0;pos<pdat[i].seq.size();pos++) {
			if (pdat[i].seq.compare(pos,1,consensus,pos,1)!=0) {
				if (pdat[i].seq.compare(pos,1,"A")==0||pdat[i].seq.compare(pos,1,"C")==0||pdat[i].seq.compare(pos,1,"G")==0||pdat[i].seq.compare(pos,1,"T")==0) {
					//cout << "Found variant " << pdat[i].code_match << " " << pos << " " << consensus[pos] << " " << pdat[i].seq[pos] << "\n";
					if (pos!=28827&&pos!=28828) {
						s.locus.push_back(pos);
						s.allele.push_back(pdat[i].seq[pos]);
					}
                }
            }
        }
        variants.push_back(s);
    }
}

bool compare_allele(allele v1, allele v2) {
    return (v1.loc < v2.loc);
}

void ListAllVariantPositions (const vector<sparseseq>& variants, vector<allele>& allvar) {
    //Put into the vector allvar
    for (int i=0;i<variants.size();i++) {
        for (int j=0;j<variants[i].locus.size();j++) {
            allele a;
            a.loc=variants[i].locus[j];
            a.nuc=variants[i].allele[j];
            allvar.push_back(a);
        }
    }
    sort(allvar.begin(),allvar.end(),compare_allele);
    vector<int> to_rem;
    for (int i=allvar.size()-1;i>0;i--) {
        if (allvar[i].loc==allvar[i-1].loc&&allvar[i].nuc==allvar[i-1].nuc) {
            to_rem.push_back(i);
            //cout << "Remove " << i << "\n";
        }
    }
    for (int i=0;i<to_rem.size();i++) {
        allvar.erase(allvar.begin()+to_rem[i]);
    }
}

void FindAmbiguousVarPositions (const vector<allele>& allvar, vector<pat>& pdat, vector<int>& nloci, vector<int>& nloc_count) {
    //Variant positions for which at least one sequence has an N
    for (int i=0;i<pdat.size();i++) {
		cout << pdat[i].code << " ";
        int nc=0;
        for (int j=0;j<allvar.size();j++) {
            if (pdat[i].seq.compare(allvar[j].loc,1,"N")==0) {
                idat id;
                id.loc=allvar[j].loc;
                id.q=0;
				cout << allvar[j].loc << " ";
                pdat[i].seq_uncertainty.push_back(id);
                nc++;
                nloci.push_back(allvar[j].loc);
            }
        }
		cout << "\n";
        nloc_count.push_back(nc);
    }
    sort(nloci.begin(),nloci.end());
    nloci.erase(unique(nloci.begin(),nloci.end()),nloci.end());
}

void FindAmbiguousVarFreqs (const string all_consensus, const vector<allele>& allvar, const vector<sparseseq>& variants, vector<pat>& pdat, const vector<int>& nloci, vector<ipair>& freqs) {
    //Variant allele frequencies from the population of the variants for which Ns exist
    
    //Find consensus and variant loci at each position
    vector<char> allnuc;
    vector<char> cons;
    for (int i=0;i<nloci.size();i++) {
        int done=0;
        cons.push_back(all_consensus[nloci[i]]);
        for (int j=0;j<pdat.size();j++) {
            if (done==0) {
                for (int k=0;k<variants[j].locus.size();k++) {
                    if (variants[j].locus[k]==nloci[i]) {
                        allnuc.push_back(variants[j].allele[k]);
                        done=1;
                        break;
                    }
                }
            }
        }
    }

    for (int i=0;i<nloci.size();i++) {
        ipair ip;
        ip.loc=nloci[i];
        double countc=0;
        double countv=0;
        for (int j=0;j<pdat.size();j++) {
            string c(1,cons[i]);
            string v(1,allnuc[i]);
            if (pdat[j].seq.compare(nloci[i],1,c,0,1)==0) {
                countc++;
            }
            if (pdat[j].seq.compare(nloci[i],1,v,0,1)==0) {
                countv++;
            }
        }
        ip.q=countv/(countc+countv);
        freqs.push_back(ip);
    }
    
    //Add variant frequencies and alleles into pdat
    for (int i=0;i<pdat.size();i++) {
        for (int j=0;j<pdat[i].seq_uncertainty.size();j++) {
            for (int k=0;k<freqs.size();k++) {
                if (freqs[k].loc==pdat[i].seq_uncertainty[j].loc) {
                    pdat[i].seq_uncertainty[j].q=freqs[k].q;
                    pdat[i].seq_uncertainty[j].allele=allnuc[k];
                }
            }
            
        }
    }
}

void ProcessAmbiguousFixedVariants(vector<ipair>& freqs, vector<int>& nloc_count, vector<sparseseq>& variants, vector<pat>& pdat) {
    //Repair uncertain variants that are seen in all other sequences i.e. frequency 1
    for (int i=freqs.size()-1;i>=0;i--) {
        if (freqs[i].q==1) {
            //Remove this as an uncertain site from each sequence
            for (int j=0;j<pdat.size();j++) {
                for (int k=0;k<pdat[j].seq_uncertainty.size();k++) {
                    if (pdat[j].seq_uncertainty[k].loc==freqs[i].loc) {
                        pdat[j].seq_uncertainty.erase(pdat[j].seq_uncertainty.begin()+k);
                        nloc_count[j]--;
                        break;
                    }
                }
            }
            //Add this to the list of variants
            char v;
            for (int j=0;j<variants.size();j++) {
                for (int k=0;k<variants[j].locus.size();k++) {
                    if (variants[j].locus[k]==freqs[i].loc) {
                        v=variants[j].allele[k];
                        break;
                    }
                }
            }
            //cout << "Here " << v << "\n";
            for (int j=0;j<variants.size();j++) {
                for (int k=0;k<=variants[j].locus.size();k++) {
                    if (k==0&&variants[j].locus[k]>freqs[i].loc) {
                        //cout << "Site 1\n";
                        variants[j].locus.insert(variants[j].locus.begin(),freqs[i].loc);
                        variants[j].allele.insert(variants[j].allele.begin(),v);
                        break;
                    }
                    if (k>=0&&k<variants[j].locus.size()) {
                        if (variants[j].locus[k]<freqs[i].loc&&variants[j].locus[k+1]>freqs[i].loc) {
                            //cout << "Site 2\n";
                            variants[j].locus.insert(variants[j].locus.begin()+k+1,freqs[i].loc);
                            variants[j].allele.insert(variants[j].allele.begin()+k+1,v);
                            break;

                        }
                    }
                    if (k==variants[j].locus.size()&&variants[j].locus[k-1]<freqs[i].loc) {
                        //cout << "Site 3 " << k << " " << variants[j].locus.size() << "\n";
                        variants[j].locus.insert(variants[j].locus.begin()+k,freqs[i].loc);
                        variants[j].allele.insert(variants[j].allele.begin()+k,v);
                        break;
                    }
                }
            }
            //Remove this from freqs as now dealt with
            freqs.erase(freqs.begin()+i);
        }
    }
}

void RemoveIndividualsMultipleN (const run_params p, vector<int>& nloc_count, vector<pat>& pdat, vector<sparseseq>& variants) {
    vector<int> rem;
    for (int i=pdat.size()-1;i>=0;i--) {
        if (nloc_count[i]>p.max_n) {
            rem.push_back(i);
        }
    }
	if (p.diagnostic==1) {
		cout << "Variants with N:\n";
		for (int i=0;i<pdat.size();i++) {
			cout << pdat[i].code << " " << nloc_count[i] << "\n";
		}
	}
    for (int i=0;i<rem.size();i++) {
        pdat.erase(pdat.begin()+rem[i]);
        variants.erase(variants.begin()+rem[i]);
        nloc_count.erase(nloc_count.begin()+rem[i]);
    }
}

void FindPairwiseDistances (run_params p, vector< vector<int> >& seqdists, vector<sparseseq>& variants, vector<pat>& pdat) {
    vector<int> zeros(pdat.size(),0);
    for (int i=0;i<pdat.size();i++) {
        seqdists.push_back(zeros);
    }
    for (int i=0;i<pdat.size();i++) {
        for (int j=i+1;j<pdat.size();j++) {
            int dist=0;
            //Find unique difference positions;
            vector<int> uniq;
            for (int k=0;k<variants[i].locus.size();k++) {
                uniq.push_back(variants[i].locus[k]);
            }
            for (int k=0;k<variants[j].locus.size();k++) {
                uniq.push_back(variants[j].locus[k]);
            }
            sort(uniq.begin(),uniq.end());
            uniq.erase(unique(uniq.begin(),uniq.end()),uniq.end());
            for (int k=0;k<uniq.size();k++) {
                if (pdat[i].seq[uniq[k]]=='A'||pdat[i].seq[uniq[k]]=='C'||pdat[i].seq[uniq[k]]=='G'||pdat[i].seq[uniq[k]]=='T') {
                    if (pdat[j].seq[uniq[k]]=='A'||pdat[j].seq[uniq[k]]=='C'||pdat[j].seq[uniq[k]]=='G'||pdat[j].seq[uniq[k]]=='T') {
                        if (pdat[i].seq[uniq[k]]!=pdat[j].seq[uniq[k]]) {
                            dist++;
                        }
                    }
                }
            }
            seqdists[i][j]=dist;
            seqdists[j][i]=dist;
        }
    }
    for (int i=0;i<pdat.size();i++) {
        if (pdat[i].seq.size()==0) {
            for (int j=0;j<pdat.size();j++){
                seqdists[i][j]=-1;
                seqdists[j][i]=-1;
            }
            seqdists[i][i]=0;
        }
    }
    if (p.diagnostic==1) {
        PrintSeqDistances(seqdists);
    }
}

void FromConsensusDistances (const vector<sparseseq>& variants, vector< vector<tpair> >& seqdists_c) {
    //Construct a pairwise consensus between i and j as having a list of variants present in both sequences.
    //Find the number of variants each sequence would need to gain in order to be constructed from that consensus.
    //Used in the calculation of the amount of evolution occurring in each sequence via noise or error after the transmission event
    for (int i=0;i<variants.size();i++) {
        vector<tpair> vc;
        for (int j=0;j<variants.size();j++) {
            tpair p;
            p.from=0;
            p.to=0;
            //We want the number of variants in i not in j and vice versa.
            for (int k=0;k<variants[i].locus.size();k++) {
                int mycount = count(variants[j].locus.begin(), variants[j].locus.end(), variants[i].locus[k]);
                if (mycount==0) {
                    p.from++;
                }
            }
            for (int k=0;k<variants[j].locus.size();k++) {
                int mycount = count(variants[i].locus.begin(), variants[i].locus.end(), variants[j].locus[k]);
                if (mycount==0) {
                    p.to++;
                }
            }
            vc.push_back(p);
        }
        seqdists_c.push_back(vc);
    }
}













void ModifyVariantsByBin(int b, const vector< vector<int> >& bin, double& lLvar, const vector<pat>& pdat, vector<sparseseq>& variants) {
	int index=0;
	for (int i=0;i<pdat.size();i++) {
		for (int j=0;j<pdat[i].seq_uncertainty.size();j++) {
			if (bin[b][index]==1) {
				//Fix variantby adding to variants[i]
				variants[i].locus.push_back(pdat[i].seq_uncertainty[j].loc);
				variants[i].allele.push_back(pdat[i].seq_uncertainty[j].allele);
				lLvar=lLvar+log(pdat[i].seq_uncertainty[j].q);
			} else {
				//Don't fix - just change probability
				lLvar=lLvar+log(1-pdat[i].seq_uncertainty[j].q);
			}
			index++;
		}
	}
}

void FindSharedVariants(int set, vector<varcount>& vc_multi, const vector< vector<sparseseq> >& variants_sets) {
    //cout << "FindSharedVariants\n";
	vector<varcount> vc;
	for (int i=0;i<variants_sets[set].size();i++) {
		for (int j=0;j<variants_sets[set][i].locus.size();j++) {
			int found=0;
			for (int k=0;k<vc.size();k++) {
				if (variants_sets[set][i].locus[j]==vc[k].locus&&variants_sets[set][i].allele[j]==vc[k].allele) {
					vc[k].individual.push_back(i);
					found=1;
				}
			}
			if (found==0) {
				//cout << "Add " << variants_sets[set][i].locus[j] << " " << variants_sets[set][i].allele[j] << "\n";
				varcount v;
				v.locus=variants_sets[set][i].locus[j];
				v.allele=variants_sets[set][i].allele[j];
				v.individual.push_back(i);
				vc.push_back(v);
			}
		}
	}
    //cout << "Now here\n";
	for (int i=0;i<vc.size();i++) {
		if (vc[i].individual.size()>1) {
			vc_multi.push_back(vc[i]);
			/*cout << vc[i].locus << vc[i].allele <<" ";
			for (int j=0;j<vc[i].individual.size();j++) {
				cout << vc[i].individual[j] << " ";
			}
			cout << "\n";*/
		}
	}
	//Delete duplicate sets
	vector<int> to_rem;
	for (int i=0;i<vc_multi.size()-1;i++) {
		for (int j=i+1;j<vc_multi.size();j++) {
			if (vc_multi[i].individual==vc_multi[j].individual) {
				to_rem.push_back(j);
			}
		}
	}
	sort(to_rem.begin(),to_rem.end());
	to_rem.erase(unique(to_rem.begin(),to_rem.end()),to_rem.end());
	reverse(to_rem.begin(),to_rem.end());
	for (int i=0;i<to_rem.size();i++){
		vc_multi.erase(vc_multi.begin()+to_rem[i]);
	}
}




/*void GetSharedInOut (int c, const vector<tpairs>& trans_sets, const vector<varcount>& vc_multi, vector< vector<int> >& inout) {
    for (int v=0;v<vc_multi.size();v++) {
        vector<int> ino;
        for (int j=0;j<trans_sets[c].pair.size();j++) {
            int to_in=0;
            int from_in=0;
            for (int k=0;k<vc_multi[v].individual.size();k++) {
                if (trans_sets[c].pair[j].to==vc_multi[v].individual[k]) {
                    to_in=1;
                }
                if (trans_sets[c].pair[j].from==vc_multi[v].individual[k]) {
                    from_in=1;
                }
            }
            if (to_in==1&&from_in==0) { //This goes into the set.  Can only be one...
                ino.push_back(trans_sets[c].pair[j].from);
                ino.push_back(trans_sets[c].pair[j].to);

            }
        }
        inout.push_back(ino);
    }
}*/

/*void GetNonSharedInOut (int c, const vector<tpairs>& trans_sets, const vector<varcount>& vc_single, vector< vector<int> >& inout_s) {
    for (int v=0;v<vc_single.size();v++) {
        vector<int> ino;
        int found=0;
        for (int j=0;j<trans_sets[c].pair.size();j++) {
            if (trans_sets[c].pair[j].to==vc_single[v].individual[0]) {
                found=1;
                ino.push_back(trans_sets[c].pair[j].from);
                ino.push_back(trans_sets[c].pair[j].to);
            }
        }
        if (found==1) {
            inout_s.push_back(ino);
        } else {
            //This variant is in the root individual
            ino.push_back(-1);
            ino.push_back(-1);
            inout_s.push_back(ino);
        }
    }
}*/


back to top