https://github.com/BenLangmead/bowtie2
Raw File
Tip revision: b989f0fbed6bbe3ea9e0bb237250297148e2acf5 authored by BenLangmead on 20 April 2017, 20:50:05 UTC
put shared_ back in the conditions
Tip revision: b989f0f
aligner_sw_driver.h
/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *  aligner_sw_driver.h
 *
 * REDUNDANT SEED HITS
 *
 * We say that two seed hits are redundant if they trigger identical
 * seed-extend dynamic programming problems.  Put another way, they both lie on
 * the same diagonal of the overall read/reference dynamic programming matrix.
 * Detecting redundant seed hits is simple when the seed hits are ungapped.  We
 * do this after offset resolution but before the offset is converted to genome
 * coordinates (see uses of the seenDiags1_/seenDiags2_ fields for examples).
 *
 * REDUNDANT ALIGNMENTS
 *
 * In an unpaired context, we say that two alignments are redundant if they
 * share any cells in the global DP table.  Roughly speaking, this is like
 * saying that two alignments are redundant if any read character aligns to the
 * same reference character (same reference sequence, same strand, same offset)
 * in both alignments.
 *
 * In a paired-end context, we say that two paired-end alignments are redundant
 * if the mate #1s are redundant and the mate #2s are redundant.
 *
 * How do we enforce this?  In the unpaired context, this is relatively simple:
 * the cells from each alignment are checked against a set containing all cells
 * from all previous alignments.  Given a new alignment, for each cell in the
 * new alignment we check whether it is in the set.  If there is any overlap,
 * the new alignment is rejected as redundant.  Otherwise, the new alignment is
 * accepted and its cells are added to the set.
 *
 * Enforcement in a paired context is a little trickier.  Consider the
 * following approaches:
 *
 * 1. Skip anchors that are redundant with any previous anchor or opposite
 *    alignment.  This is sufficient to ensure no two concordant alignments
 *    found are redundant.
 *
 * 2. Same as scheme 1, but with a "transitive closure" scheme for finding all
 *    concordant pairs in the vicinity of an anchor.  Consider the AB/AC
 *    scenario from the previous paragraph.  If B is the anchor alignment, we
 *    will find AB but not AC.  But under this scheme, once we find AB we then
 *    let B be a new anchor and immediately look for its opposites.  Likewise,
 *    if we find any opposite, we make them anchors and continue searching.  We
 *    don't stop searching until every opposite is used as an anchor.
 *
 * 3. Skip anchors that are redundant with any previous anchor alignment (but
 *    allow anchors that are redundant with previous opposite alignments).
 *    This isn't sufficient to avoid redundant concordant alignments.  To avoid
 *    redundant concordants, we need an additional procedure that checks each
 *    new concordant alignment one-by-one against a list of previous concordant
 *    alignments to see if it is redundant.
 *
 * We take approach 1.
 */

#ifndef ALIGNER_SW_DRIVER_H_
#define ALIGNER_SW_DRIVER_H_

#include <utility>
#include "ds.h"
#include "aligner_seed.h"
#include "aligner_sw.h"
#include "aligner_cache.h"
#include "reference.h"
#include "group_walk.h"
#include "bt2_idx.h"
#include "mem_ids.h"
#include "aln_sink.h"
#include "pe.h"
#include "ival_list.h"
#include "simple_func.h"
#include "random_util.h"

struct SeedPos {

	SeedPos() : fw(false), offidx(0), rdoff(0), seedlen(0) { }

	SeedPos(
		bool fw_,
		uint32_t offidx_,
		uint32_t rdoff_,
		uint32_t seedlen_)
	{
		init(fw_, offidx_, rdoff_, seedlen_);
	}
	
	void init(
		bool fw_,
		uint32_t offidx_,
		uint32_t rdoff_,
		uint32_t seedlen_)
	{
		fw      = fw_;
		offidx  = offidx_;
		rdoff   = rdoff_;
		seedlen = seedlen_;
	}
	
	bool operator<(const SeedPos& o) const {
		if(offidx < o.offidx)   return true;
		if(offidx > o.offidx)   return false;
		if(rdoff < o.rdoff)     return true;
		if(rdoff > o.rdoff)     return false;
		if(seedlen < o.seedlen) return true;
		if(seedlen > o.seedlen) return false;
		if(fw && !o.fw)         return true;
		if(!fw && o.fw)         return false;
		return false;
	}
	
	bool operator>(const SeedPos& o) const {
		if(offidx < o.offidx)   return false;
		if(offidx > o.offidx)   return true;
		if(rdoff < o.rdoff)     return false;
		if(rdoff > o.rdoff)     return true;
		if(seedlen < o.seedlen) return false;
		if(seedlen > o.seedlen) return true;
		if(fw && !o.fw)         return false;
		if(!fw && o.fw)         return true;
		return false;
	}
	
	bool operator==(const SeedPos& o) const {
		return fw == o.fw && offidx == o.offidx &&
		       rdoff == o.rdoff && seedlen == o.seedlen;
	}

	bool fw;
	uint32_t offidx;
	uint32_t rdoff;
	uint32_t seedlen;
};

/**
 * An SATuple along with the associated seed position.
 */
struct SATupleAndPos {
	
	SATuple sat;    // result for this seed hit
	SeedPos pos;    // seed position that yielded the range this was taken from
	size_t  origSz; // size of range this was taken from
	size_t  nlex;   // # position we can extend seed hit to left w/o edit
	size_t  nrex;   // # position we can extend seed hit to right w/o edit
	
	bool operator<(const SATupleAndPos& o) const {
		if(sat < o.sat) return true;
		if(sat > o.sat) return false;
		return pos < o.pos;
	}

	bool operator==(const SATupleAndPos& o) const {
		return sat == o.sat && pos == o.pos;
	}
};

/**
 * Encapsulates the weighted random sampling scheme we want to use to pick
 * which seed hit range to sample a row from.
 */
class RowSampler {

public:

	RowSampler(int cat = 0) : elim_(cat), masses_(cat) { 
		mass_ = 0.0f;
	}
	
	/**
	 * Initialze sampler with respect to a range of elements in a list of
	 * SATupleAndPos's.
	 */
	void init(
		const EList<SATupleAndPos, 16>& salist,
		size_t sai,
		size_t saf,
		bool lensq, // whether to square the numerator, which = extended length
		bool szsq)  // whether to square denominator, which = 
	{
		assert_gt(saf, sai);
		elim_.resize(saf - sai);
		elim_.fill(false);
		// Initialize mass
		mass_ = 0.0f;
		masses_.resize(saf - sai);
		for(size_t i = sai; i < saf; i++) {
			size_t len = salist[i].nlex + salist[i].nrex + 1; // + salist[i].sat.key.len;
			double num = (double)len;
			if(lensq) {
				num *= num;
			}
			double denom = (double)salist[i].sat.size();
			if(szsq) {
				denom *= denom;
			}
			masses_[i - sai] = num / denom;
			mass_ += masses_[i - sai];
		}
	}
	
	/**
	 * Caller is indicating that the bin at index i is exhausted and we should
	 * exclude it from our sampling from now on.
	 */
	void finishedRange(size_t i) {
		assert_lt(i, masses_.size());
		elim_[i] = true;
		mass_ -= masses_[i];
	}
	
	/**
	 * Sample randomly from the mass.
	 */
	size_t next(RandomSource& rnd) {
		// Throw the dart
		double rd = rnd.nextFloat() * mass_;
		double mass_sofar = 0.0f;
		size_t sz = masses_.size();
		size_t last_unelim = std::numeric_limits<size_t>::max();
		for(size_t i = 0; i < sz; i++) {
			if(!elim_[i]) {
				last_unelim = i;
				mass_sofar += masses_[i];
				if(rd < mass_sofar) {
					// This is the one we hit
					return i;
				}
			}
		}
		assert_neq(std::numeric_limits<size_t>::max(), last_unelim);
		return last_unelim;
	}

protected:
	double        mass_;    // total probability mass to throw darts at
	EList<bool>   elim_;    // whether the range is eliminated
	EList<double> masses_;  // mass of each range
};

/**
 * Return values from extendSeeds and extendSeedsPaired.
 */
enum {
	// All end-to-end and seed hits were examined
	// The policy does not need us to look any further
	EXTEND_EXHAUSTED_CANDIDATES = 1,
	EXTEND_POLICY_FULFILLED,
	// We stopped because we reached a point where the only remaining
	// alignments of interest have perfect scores, but we already investigated
	// perfect alignments
	EXTEND_PERFECT_SCORE,
	// We stopped because we ran up against a limit on how much work we should
	// do for one set of seed ranges, e.g. the limit on number of consecutive
	// unproductive DP extensions
	EXTEND_EXCEEDED_SOFT_LIMIT,
	// We stopped because we ran up against a limit on how much work we should
	// do for overall before giving up on a mate
	EXTEND_EXCEEDED_HARD_LIMIT
};

/**
 * Data structure encapsulating a range that's been extended out in two
 * directions.
 */
struct ExtendRange {

	void init(size_t off_, size_t len_, size_t sz_) {
		off = off_; len = len_; sz = sz_;
	}

	size_t off; // offset of extended region
	size_t len; // length between extremes of extended region
	size_t sz;  // # of elements in SA range
};

class SwDriver {

	typedef PList<TIndexOffU, CACHE_PAGE_SZ> TSAList;

public:

	SwDriver(size_t bytes) :
		satups_(DP_CAT),
		gws_(DP_CAT),
		seenDiags1_(DP_CAT),
		seenDiags2_(DP_CAT),
		redAnchor_(DP_CAT),
		redMate1_(DP_CAT),
		redMate2_(DP_CAT),
		pool_(bytes, CACHE_PAGE_SZ, DP_CAT),
		salistEe_(DP_CAT),
		gwstate_(GW_CAT) { }

	/**
	 * Given a collection of SeedHits for a single read, extend seed alignments
	 * into full alignments.  Where possible, try to avoid redundant offset
	 * lookups and dynamic programming problems.  Optionally report alignments
	 * to a AlnSinkWrap object as they are discovered.
	 *
	 * If 'reportImmediately' is true, returns true iff a call to
	 * mhs->report() returned true (indicating that the reporting
	 * policy is satisfied and we can stop).  Otherwise, returns false.
	 */
	int extendSeeds(
		Read& rd,                    // read to align
		bool mate1,                  // true iff rd is mate #1
		SeedResults& sh,             // seed hits to extend into full alignments
		const Ebwt& ebwtFw,          // BWT
		const Ebwt* ebwtBw,          // BWT'
		const BitPairReference& ref, // Reference strings
		SwAligner& swa,              // dynamic programming aligner
		const Scoring& sc,           // scoring scheme
		int seedmms,                 // # mismatches allowed in seed
		int seedlen,                 // length of seed
		int seedival,                // interval between seeds
		TAlScore& minsc,             // minimum score for anchor
		int nceil,                   // maximum # Ns permitted in ref portion
		size_t maxhalf,              // maximum width on one side of DP table
		bool doUngapped,             // do ungapped alignment
		size_t maxIters,             // stop after this many seed-extend loop iters
		size_t maxUg,                // max # ungapped extends
		size_t maxDp,                // max # DPs
		size_t maxUgStreak,          // stop after streak of this many ungap fails
		size_t maxDpStreak,          // stop after streak of this many dp fails
		bool doExtend,               // do seed extension
		bool enable8,                // use 8-bit SSE where possible
		size_t cminlen,              // use checkpointer if read longer than this
		size_t cpow2,                // interval between diagonals to checkpoint
		bool doTri,                  // triangular mini-fills
		int tighten,                 // -M score tightening mode
		AlignmentCacheIface& ca,     // alignment cache for seed hits
		RandomSource& rnd,           // pseudo-random source
		WalkMetrics& wlm,            // group walk left metrics
		SwMetrics& swmSeed,          // DP metrics for seed-extend
		PerReadMetrics& prm,         // per-read metrics
		AlnSinkWrap* mhs,            // HitSink for multiseed-style aligner
		bool reportImmediately,      // whether to report hits immediately to mhs
		bool& exhaustive);

	/**
	 * Given a collection of SeedHits for a read pair, extend seed
	 * alignments into full alignments and then look for the opposite
	 * mate using dynamic programming.  Where possible, try to avoid
	 * redundant offset lookups.  Optionally report alignments to a
	 * AlnSinkWrap object as they are discovered.
	 *
	 * If 'reportImmediately' is true, returns true iff a call to
	 * mhs->report() returned true (indicating that the reporting
	 * policy is satisfied and we can stop).  Otherwise, returns false.
	 */
	int extendSeedsPaired(
		Read& rd,                    // mate to align as anchor
		Read& ord,                   // mate to align as opposite
		bool anchor1,                // true iff anchor mate is mate1
		bool oppFilt,                // true iff opposite mate was filtered out
		SeedResults& sh,             // seed hits for anchor
		const Ebwt& ebwtFw,          // BWT
		const Ebwt* ebwtBw,          // BWT'
		const BitPairReference& ref, // Reference strings
		SwAligner& swa,              // dyn programming aligner for anchor
		SwAligner& swao,             // dyn programming aligner for opposite
		const Scoring& sc,           // scoring scheme
		const PairedEndPolicy& pepol,// paired-end policy
		int seedmms,                 // # mismatches allowed in seed
		int seedlen,                 // length of seed
		int seedival,                // interval between seeds
		TAlScore& minsc,             // minimum score for anchor
		TAlScore& ominsc,            // minimum score for opposite
		int nceil,                   // max # Ns permitted in ref for anchor
		int onceil,                  // max # Ns permitted in ref for opposite
		bool nofw,                   // don't align forward read
		bool norc,                   // don't align revcomp read
		size_t maxhalf,              // maximum width on one side of DP table
		bool doUngapped,             // do ungapped alignment
		size_t maxIters,             // stop after this many seed-extend loop iters
		size_t maxUg,                // max # ungapped extends
		size_t maxDp,                // max # DPs
		size_t maxEeStreak,          // stop after streak of this many end-to-end fails
		size_t maxUgStreak,          // stop after streak of this many ungap fails
		size_t maxDpStreak,          // stop after streak of this many dp fails
		size_t maxMateStreak,        // stop seed range after N mate-find fails
		bool doExtend,               // do seed extension
		bool enable8,                // use 8-bit SSE where possible
		size_t cminlen,              // use checkpointer if read longer than this
		size_t cpow2,                // interval between diagonals to checkpoint
		bool doTri,                  // triangular mini-fills
		int tighten,                 // -M score tightening mode
		AlignmentCacheIface& cs,     // alignment cache for seed hits
		RandomSource& rnd,           // pseudo-random source
		WalkMetrics& wlm,            // group walk left metrics
		SwMetrics& swmSeed,          // DP metrics for seed-extend
		SwMetrics& swmMate,          // DP metrics for mate finidng
		PerReadMetrics& prm,         // per-read metrics for anchor
		AlnSinkWrap* msink,          // AlnSink wrapper for multiseed-style aligner
		bool swMateImmediately,      // whether to look for mate immediately
		bool reportImmediately,      // whether to report hits immediately to msink
		bool discord,                // look for discordant alignments?
		bool mixed,                  // look for unpaired as well as paired alns?
		bool& exhaustive);

	/**
	 * Prepare for a new read.
	 */
	void nextRead(bool paired, size_t mate1len, size_t mate2len) {
		redAnchor_.reset();
		seenDiags1_.reset();
		seenDiags2_.reset();
		seedExRangeFw_[0].clear(); // mate 1 fw
		seedExRangeFw_[1].clear(); // mate 2 fw
		seedExRangeRc_[0].clear(); // mate 1 rc
		seedExRangeRc_[1].clear(); // mate 2 rc
		size_t maxlen = mate1len;
		if(paired) {
			redMate1_.reset();
			redMate1_.init(mate1len);
			redMate2_.reset();
			redMate2_.init(mate2len);
			if(mate2len > maxlen) {
				maxlen = mate2len;
			}
		}
		redAnchor_.init(maxlen);
	}

protected:

	bool eeSaTups(
		const Read& rd,              // read
		SeedResults& sh,             // seed hits to extend into full alignments
		const Ebwt& ebwt,            // BWT
		const BitPairReference& ref, // Reference strings
		RandomSource& rnd,           // pseudo-random generator
		WalkMetrics& wlm,            // group walk left metrics
		SwMetrics& swmSeed,          // metrics for seed extensions
		size_t& nelt_out,            // out: # elements total
        size_t maxelts,              // max # elts to report
		bool all);                   // report all hits?

	void extend(
		const Read& rd,       // read
		const Ebwt& ebwtFw,   // Forward Bowtie index
		const Ebwt* ebwtBw,   // Backward Bowtie index
		TIndexOffU topf,        // top in fw index
		TIndexOffU botf,        // bot in fw index
		TIndexOffU topb,        // top in bw index
		TIndexOffU botb,        // bot in bw index
		bool fw,              // seed orientation
		size_t off,           // seed offset from 5' end
		size_t len,           // seed length
		PerReadMetrics& prm,  // per-read metrics
		size_t& nlex,         // # positions we can extend to left w/o edit
		size_t& nrex);        // # positions we can extend to right w/o edit

	void prioritizeSATups(
		const Read& rd,              // read
		SeedResults& sh,             // seed hits to extend into full alignments
		const Ebwt& ebwtFw,          // BWT
		const Ebwt* ebwtBw,          // BWT'
		const BitPairReference& ref, // Reference strings
		int seedmms,                 // # seed mismatches allowed
		size_t maxelt,               // max elts we'll consider
		bool doExtend,               // extend out seeds
		bool lensq,                  // square extended length
		bool szsq,                   // square SA range size
		size_t nsm,                  // if range as <= nsm elts, it's "small"
		AlignmentCacheIface& ca,     // alignment cache for seed hits
		RandomSource& rnd,           // pseudo-random generator
		WalkMetrics& wlm,            // group walk left metrics
		PerReadMetrics& prm,         // per-read metrics
		size_t& nelt_out,            // out: # elements total
		bool all);                   // report all hits?

	Random1toN               rand_;    // random number generators
	EList<Random1toN, 16>    rands_;   // random number generators
	EList<Random1toN, 16>    rands2_;  // random number generators
	EList<EEHit, 16>         eehits_;  // holds end-to-end hits
	EList<SATupleAndPos, 16> satpos_;  // holds SATuple, SeedPos pairs
	EList<SATupleAndPos, 16> satpos2_; // holds SATuple, SeedPos pairs
	EList<SATuple, 16>       satups_;  // holds SATuples to explore elements from
	EList<GroupWalk2S<TSlice, 16> > gws_;   // list of GroupWalks; no particular order
	EList<size_t>            mateStreaks_; // mate-find fail streaks
	RowSampler               rowsamp_;     // row sampler
	
	// Ranges that we've extended through when extending seed hits
	EList<ExtendRange> seedExRangeFw_[2];
	EList<ExtendRange> seedExRangeRc_[2];

	// Data structures encapsulating the diagonals that have already been used
	// to seed alignment for mate 1 and mate 2.
	EIvalMergeListBinned seenDiags1_;
	EIvalMergeListBinned seenDiags2_;

	// For weeding out redundant alignments
	RedundantAlns  redAnchor_;  // database of cells used for anchor alignments
	RedundantAlns  redMate1_;   // database of cells used for mate 1 alignments
	RedundantAlns  redMate2_;   // database of cells used for mate 2 alignments

	// For holding results for anchor (res_) and opposite (ores_) mates
	SwResult       resGap_;    // temp holder for alignment result
	SwResult       oresGap_;   // temp holder for alignment result, opp mate
	SwResult       resUngap_;  // temp holder for ungapped alignment result
	SwResult       oresUngap_; // temp holder for ungap. aln. opp mate
	SwResult       resEe_;     // temp holder for ungapped alignment result
	SwResult       oresEe_;    // temp holder for ungap. aln. opp mate
	
	Pool           pool_;      // memory pages for salistExact_
	TSAList        salistEe_;  // PList for offsets for end-to-end hits
	GroupWalkState gwstate_;   // some per-thread state shared by all GroupWalks
	
	// For AlnRes::matchesRef:
	ASSERT_ONLY(SStringExpandable<char>     raw_refbuf_);
	ASSERT_ONLY(SStringExpandable<uint32_t> raw_destU32_);
	ASSERT_ONLY(EList<bool>                 raw_matches_);
	ASSERT_ONLY(BTDnaString                 tmp_rf_);
	ASSERT_ONLY(BTDnaString                 tmp_rdseq_);
	ASSERT_ONLY(BTString                    tmp_qseq_);
};

#endif /*ALIGNER_SW_DRIVER_H_*/
back to top