Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

Revision 49bf399a9d1ceae722c0c8c4aeb8376f2644d40f authored by Asher Preska Steinberg on 07 February 2022, 16:12:20 UTC, committed by Asher Preska Steinberg on 07 February 2022, 16:12:20 UTC
Commit for ReduceCoreGenome
1 parent 86759f7
  • Files
  • Changes
  • c9e6ce9
  • /
  • MeasureGaps
  • /
  • main.go
Raw File Download
Permalinks

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • revision
  • directory
  • content
revision badge
swh:1:rev:49bf399a9d1ceae722c0c8c4aeb8376f2644d40f
directory badge Iframe embedding
swh:1:dir:7dbb18a3abff96bbee4a88ede75b8d11d5022893
content badge Iframe embedding
swh:1:cnt:cadb371470c3a9b1986d9c4353baecba84ea02e5
Citations

This interface enables to generate software citations, provided that the root directory of browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • revision
  • directory
  • content
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
main.go
package main

// author: Asher Preska Steinberg
import (
	"bufio"
	"fmt"
	"github.com/kussell-lab/biogo/seq"
	"gopkg.in/alecthomas/kingpin.v2"
	"io"
	"log"
	"os"
	"runtime"
	"strings"
	"sync"
)

func main() {
	app := kingpin.New("measureGaps", "measure fraction of MSA that's been aligned to the reference genome")
	app.Version("v20210302")
	alnFile := app.Arg("MSA", "multi-sequence alignment file").Required().String()
	ncpu := app.Flag("num-cpu", "Number of CPUs (default: using all available cores)").Default("0").Int()
	numDigesters := app.Flag("threads", "Number of alignments to process at a time (default: 8)").Default("8").Int()

	kingpin.MustParse(app.Parse(os.Args[1:]))
	if *ncpu == 0 {
		*ncpu = runtime.NumCPU()
	}

	runtime.GOMAXPROCS(*ncpu)

	//alnFile := "/Volumes/aps_timemachine/recombo/APS160_splitGenome/1224_properheader"
	//numDigesters := 20

	//start := time.Now()
	//done channel will close when we're done reading alignments
	done := make(chan struct{})
	//read in alignments
	alignments, errc := readAlignments(done, *alnFile)

	//start a fixed number of goroutines to read alignments and split into core/flex
	c := make(chan result)
	var wg sync.WaitGroup
	for i := 0; i < *numDigesters; i++ {
		wg.Add(1)
		go MeasureGaps(done, alignments, c, i, &wg)
	}

	go func() {
		wg.Wait()
		close(c)
	}()
	//end of pipeline; write files
	//total gap length
	var totGaps float64
	//total sequence length
	var totLength float64
	//sum of aligned fractions
	var sumFrac float64
	//number of genes
	var numGenes float64
	//initializing counters ...
	totGaps = 0
	totLength = 0
	sumFrac = 0
	for gene := range c {
		totGaps = totGaps + gene.numGaps
		totLength = totLength + gene.seqLength
		sumFrac = sumFrac + gene.frac
		numGenes++
	}
	if err := <-errc; err != nil { // HLerrc
		panic(err)
	}
	//total aligned fraction
	fracAligned := (totLength - totGaps) / totLength
	//average aligned fraction for each gene
	avgAligned := sumFrac / numGenes
	fmt.Printf("Statistics for MSA file: %s\n", *alnFile)
	fmt.Print("-------------------------------\n")
	fmt.Printf("Total aligned fraction: %f\n", fracAligned)
	fmt.Printf("Aligned fraction per gene: %f\n", avgAligned)
	fmt.Print("-------------------------------\n")
	//duration := time.Since(start)
	//fmt.Println("Time to measure sequence gaps:", duration)
}

// readAlignments reads sequence alignment from a extended Multi-FASTA file,
// and return a channel of alignment, which is a list of seq.Sequence
func readAlignments(done <-chan struct{}, file string) (<-chan Alignment, <-chan error) {
	alignments := make(chan Alignment)
	errc := make(chan error, 1)
	go func() {
		defer close(alignments)

		f, err := os.Open(file)
		if err != nil {
			panic(err)
		}
		defer f.Close()
		xmfaReader := seq.NewXMFAReader(f)
		numAln := 0
		for {
			alignment, err := xmfaReader.Read()
			if err != nil {
				if err != io.EOF {
					panic(err)
				}
				break
			}
			if len(alignment) > 0 {
				numAln++
				alnID := strings.Split(alignment[0].Id, " ")[0]
				select {
				case alignments <- Alignment{alnID, numAln, alignment}:
					// fmt.Printf("\rRead %d alignments.", numAln)
					// fmt.Printf("\r alignment ID: %s", alnID)
				case <-done:
					// fmt.Printf(" Total alignments %d\n", numAln)
				}
			}
		}
		errc <- err
	}()
	return alignments, errc
}

// Alignment is an array of multiple sequences with same length.
type Alignment struct {
	ID        string
	num       int
	Sequences []seq.Sequence
}

// A result is a single gene alignment belonging to the core or flexible genome
type result struct {
	Alignment Alignment
	frac      float64 //fraction of gene that's aligned
	seqLength float64 //total gene sequence length
	numGaps   float64 //number of gaps in sequence length
}

// MeasureGaps reads gene alignments from the master MSA, figures out what portion of the gene is aligned
// then sends these processed results on alnChan until either the master MSA or done channel is closed.
func MeasureGaps(done <-chan struct{}, alignments <-chan Alignment, genes chan<- result, id int, wg *sync.WaitGroup) {
	defer wg.Done()
	//fmt.Printf("Worker %d starting\n", id)
	for aln := range alignments { // HLpaths
		//total number of gaps for a gene sequence
		var totGaps float64
		totGaps = 0
		//total length of the gene alignment
		var totLength float64

		for _, s := range aln.Sequences {
			NumGaps := countGaps(s)
			seqLength := float64(len(s.Seq))
			//don't take sequences from strains which don't have the gene
			//(depending on how the alignment was done, these show up as sequences with all gaps)
			if NumGaps == seqLength {
				continue
			} else {
				totGaps = NumGaps + totGaps
				totLength = totLength + seqLength
			}

		}
		//fraction aligned for that gene
		frac := (totLength - totGaps) / totLength
		gene := result{aln, frac, totLength, totGaps}
		//writeAln(aln, outdir)
		select {
		//case c <- aln.num:
		case genes <- gene:
		//	writeAln(aln, outdir)
		case <-done:
			return
		}
	}
	//fmt.Printf("Worker %d done\n", id)

}

// readSamples return a list of samples from a sample file.
func readSamples(filename string) (samples []string) {
	f, err := os.Open(filename)
	if err != nil {
		log.Fatalf("Error when reading file %s:%v", filename, err)
	}
	defer f.Close()

	r := bufio.NewReader(f)
	for {
		line, err := r.ReadString('\n')

		if err != nil {
			if err != io.EOF {
				log.Fatalf("Error when reading file %s: %v", filename, err)
			}
			break
		}
		samples = append(samples, strings.TrimSpace(line))
	}
	return
}

//check for errors
func check(e error) {
	if e != nil {
		panic(e)
	}
}

// extractFullCodons returns the number of full codons
//there needs to be at least 1 full codon for us to say the strain "has the gene"
func extractFullCodons(s seq.Sequence) (NumFullCodons int) {
	var codons []Codon
	for i := 0; i+3 <= len(s.Seq); i += 3 {
		c := s.Seq[i:(i + 3)]
		//check for gaps
		containsGap := false
		for k := 0; k < 3; k++ {
			if c[k] == '-' || c[k] == 'N' {
				containsGap = true
				break
			}
		}
		if containsGap {
			continue
		} else {
			codons = append(codons, c)
		}

	}
	NumFullCodons = len(codons)
	return
}

// countGaps counts the number of gaps in a gene sequence
func countGaps(s seq.Sequence) (NumGaps float64) {
	for i := 0; i < len(s.Seq); i++ {
		b := s.Seq[i]
		if b == '-' || b == 'N' {
			NumGaps++
		}
	}
	return
}

// Codon is a byte list of length 3
type Codon []byte
The diff you're trying to view is too large. Only the first 1000 changed files have been loaded.
Showing with 0 additions and 0 deletions (0 / 0 diffs computed)
swh spinner

Computing file changes ...

Software Heritage — Copyright (C) 2015–2025, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Contact— JavaScript license information— Web API

back to top