Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

https://github.com/kussell-lab/ReferenceAlignmentGenerator
18 February 2022, 11:37:47 UTC
  • Code
  • Branches (3)
  • Releases (0)
  • Visits
    • Branches
    • Releases
    • HEAD
    • refs/heads/2019natmethods
    • refs/heads/2021biorxiv
    • refs/heads/master
    • 49bf399a9d1ceae722c0c8c4aeb8376f2644d40f
    No releases to show
  • c9e6ce9
  • /
  • FilterGaps
  • /
  • main.go
Raw File Download
Take a new snapshot of a software origin

If the archived software origin currently browsed is not synchronized with its upstream version (for instance when new commits have been issued), you can explicitly request Software Heritage to take a new snapshot of it.

Use the form below to proceed. Once a request has been submitted and accepted, it will be processed as soon as possible. You can then check its processing state by visiting this dedicated page.
swh spinner

Processing "take a new snapshot" request ...

Permalinks

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • content
  • directory
  • revision
  • snapshot
origin badgecontent badge Iframe embedding
swh:1:cnt:d46dbc64b2f347ec36ac4eb3acd9d0ca44e411cf
origin badgedirectory badge Iframe embedding
swh:1:dir:878bb50b5531b39f786bf100b913390aec08c34a
origin badgerevision badge
swh:1:rev:49bf399a9d1ceae722c0c8c4aeb8376f2644d40f
origin badgesnapshot badge
swh:1:snp:14ae65545ac5d89be3e3e69d3ac6b8b27db9c01e
Citations

This interface enables to generate software citations, provided that the root directory of browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • content
  • directory
  • revision
  • snapshot
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Tip revision: 49bf399a9d1ceae722c0c8c4aeb8376f2644d40f authored by Asher Preska Steinberg on 07 February 2022, 16:12:20 UTC
Commit for ReduceCoreGenome
Tip revision: 49bf399
main.go
package main

import (
	"fmt"
	"github.com/kussell-lab/biogo/seq"
	"gopkg.in/alecthomas/kingpin.v2"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"
)

func main() {

	app := kingpin.New("FilterGaps", "Filters gene alignments with >=2% gaps")
	app.Version("v20210305")
	alnFile := app.Arg("master_MSA", "multi-sequence alignment file for all genes").Required().String()
	outdir := app.Arg("outdir", "output directory for the filtered MSA").Required().String()
	ncpu := app.Flag("num-cpu", "Number of CPUs (default: using all available cores)").Default("0").Int()
	numFilters := app.Flag("threads", "Number of alignments to process at a time (default: 8)").Default("8").Int()

	kingpin.MustParse(app.Parse(os.Args[1:]))
	if *ncpu == 0 {
		*ncpu = runtime.NumCPU()
	}

	runtime.GOMAXPROCS(*ncpu)

	//alnFile := "/Volumes/aps_timemachine/recombo/APS168_gapfiltered/test_3"
	//outdir := "/Volumes/aps_timemachine/recombo/APS168_gapfiltered/gapfiltered"
	//numFilters := 4
	//cutoff := 99
	//timer
	start := time.Now()
	makeFilteredMSA(*outdir, *alnFile)
	done := make(chan struct{})
	//read in alignments
	alignments, errc := readAlignments(done, *alnFile)
	//start a fixed number of goroutines to read alignments and split into core/flex
	c := make(chan Alignment)
	var wg sync.WaitGroup
	for i := 0; i < *numFilters; i++ {
		wg.Add(1)
		go FilterGappedAlns(done, alignments, c, i, &wg)
	}

	go func() {
		wg.Wait()
		close(c)
	}()
	//end of pipeline; write files
	for filteredAln := range c {
		if len(filteredAln.Sequences) > 0 {
			writeMSA(filteredAln, *outdir, *alnFile)
		}
	}
	if err := <-errc; err != nil { // HLerrc
		panic(err)
	}
	//add the number of core and flex to the bottom of the spreadsheet

	duration := time.Since(start)
	fmt.Println("Time to filter fgapped alignments:", duration)
}

// readAlignments reads sequence alignment from a extended Multi-FASTA file,
// and return a channel of alignment, which is a list of seq.Sequence
func readAlignments(done <-chan struct{}, file string) (<-chan Alignment, <-chan error) {
	alignments := make(chan Alignment)
	errc := make(chan error, 1)
	go func() {
		defer close(alignments)

		f, err := os.Open(file)
		if err != nil {
			panic(err)
		}
		defer f.Close()
		xmfaReader := seq.NewXMFAReader(f)
		numAln := 0
		for {
			alignment, err := xmfaReader.Read()
			if err != nil {
				if err != io.EOF {
					panic(err)
				}
				break
			}
			if len(alignment) > 0 {
				numAln++
				alnID := strings.Split(alignment[0].Id, " ")[0]
				select {
				case alignments <- Alignment{alnID, numAln, alignment}:
					fmt.Printf("\rRead %d alignments.", numAln)
					fmt.Printf("\r alignment ID: %s", alnID)
				case <-done:
					fmt.Printf(" Total alignments %d\n", numAln)
				}
			}
		}
		errc <- err
	}()
	return alignments, errc
}

// Alignment is an array of multiple sequences with same length.
type Alignment struct {
	ID        string
	num       int
	Sequences []seq.Sequence
}

// FilterGappedAlns reads gene alignments from the master MSA, filters out sequences with >=2% gaps,
// then sends these processed results on alnChan until either the master MSA or done channel is closed.
func FilterGappedAlns(done <-chan struct{}, alignments <-chan Alignment, filteredAlns chan<- Alignment, id int, wg *sync.WaitGroup) {
	defer wg.Done()
	//fmt.Printf("Worker %d starting\n", id)
	for aln := range alignments { // HLpaths
		//collect those sequences with < 2% gaps
		var filteredSeqs []seq.Sequence
		for _, s := range aln.Sequences {
			//count gaps in the gene alignment
			gaps := countGaps(s)
			//gene alignment length
			seqLength := float64(len(s.Seq))
			percentGaps := gaps / seqLength
			if percentGaps < 0.02 {
				filteredSeqs = append(filteredSeqs, s)
			}
		}
		//just include those gene alignments with <2% gaps
		var filteredAln Alignment
		filteredAln = Alignment{aln.ID, aln.num, filteredSeqs}

		select {
		case filteredAlns <- filteredAln:
		case <-done:
			return
		}
	}
	//fmt.Printf("Worker %d done\n", id)

}

// countGaps counts the number of gaps in a gene sequence
func countGaps(s seq.Sequence) (NumGaps float64) {
	for i := 0; i < len(s.Seq); i++ {
		b := s.Seq[i]
		if b == '-' || b == 'N' {
			NumGaps++
		}
	}
	return
}

//makeFilteredMSA makes the outdir and initializes the MSA files for core and flexible genomes
func makeFilteredMSA(outdir string, alnFile string) {
	if _, err := os.Stat(outdir); os.IsNotExist(err) {
		os.Mkdir(outdir, 0755)
	}
	terms := strings.Split(alnFile, "/")
	alnFileName := terms[len(terms)-1]
	MSAname := alnFileName + "_GAPFILTERED"
	MSA := filepath.Join(outdir, MSAname)
	f, err := os.Create(MSA)
	check(err)
	f.Close()
	f, err = os.Create(MSA)
	check(err)
	f.Close()
}

//check for errors
func check(e error) {
	if e != nil {
		panic(e)
	}
}

//writeMSA write the gene to the correct MSA (core or flex)
func writeMSA(c Alignment, outdir string, alnFile string) {
	terms := strings.Split(alnFile, "/")
	alnFileName := terms[len(terms)-1]
	MSAname := alnFileName + "_GAPFILTERED"
	MSA := filepath.Join(outdir, MSAname)
	//f, err := os.OpenFile(MSA, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0600)
	//f, err := os.Create(MSA)
	f, err := os.OpenFile(MSA, os.O_APPEND|os.O_WRONLY, 0600)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	aln := c
	for _, s := range aln.Sequences {
		f.WriteString(">" + s.Id + "\n")
		f.Write(s.Seq)
		f.WriteString("\n")
	}
	f.WriteString("=\n")
}

Software Heritage — Copyright (C) 2015–2025, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Contact— JavaScript license information— Web API

back to top