https://github.com/kussell-lab/ReferenceAlignmentGenerator
Tip revision: 49bf399a9d1ceae722c0c8c4aeb8376f2644d40f authored by Asher Preska Steinberg on 07 February 2022, 16:12:20 UTC
Commit for ReduceCoreGenome
Commit for ReduceCoreGenome
Tip revision: 49bf399
spliter.go
package main
import (
"fmt"
"sort"
"strings"
"github.com/kussell-lab/biogo/feat/gff"
"github.com/kussell-lab/biogo/seq"
)
// Splitter is a interface for splitting genome sequences into multiple parts.
type Splitter interface {
Split(s *seq.Sequence) []*seq.Sequence
}
// GeneSplitter splites a genome into multiple genes.
type GeneSplitter struct {
features []*gff.Record
}
// NewGeneSplitter returns a new GeneSplitter
func NewGeneSplitter(features []*gff.Record) *GeneSplitter {
cdsFeatures := []*gff.Record{}
for _, feat := range features {
if feat.Feature == "CDS" {
cdsFeatures = append(cdsFeatures, feat)
}
}
return &GeneSplitter{features: cdsFeatures}
}
// Split splits a genomic sequence into multiple genes.
func (spliter *GeneSplitter) Split(s *seq.Sequence) []*seq.Sequence {
return split(s, spliter.features)
}
func split(s *seq.Sequence, features []*gff.Record) []*seq.Sequence {
subSeqs := []*seq.Sequence{}
for _, rec := range features {
if s.Id != rec.SeqName {
// continue
}
start := rec.Start - 1
if start < len(s.Seq) {
end := rec.End
if end > len(s.Seq) {
end = len(s.Seq)
}
if end > len(s.Seq) {
end = len(s.Seq)
}
ss := &seq.Sequence{}
ss.Id = getGffID(rec)
ss.Seq = s.Seq[start:end]
// add gaps into the end if the sequence is of short.
length := rec.End - rec.Start + 1 // desired length.
for len(ss.Seq) < length {
ss.Seq = append(ss.Seq, '-')
}
if rec.Strand == gff.ReverseStrand {
ss.Seq = seq.Reverse(ss.Seq)
if rec.Feature == "CDS" {
ss.Seq = seq.Complement(ss.Seq)
}
}
subSeqs = append(subSeqs, ss)
}
}
return subSeqs
}
func getGffID(rec *gff.Record) string {
attrs := strings.Split(rec.Attribute, ";")
for _, att := range attrs {
keyvalue := strings.Split(att, "=")
key := keyvalue[0]
value := keyvalue[1]
if key == "ID" {
strand := "+"
if rec.Strand == gff.ReverseStrand {
strand = "-"
}
return fmt.Sprintf("%s|%s %d%s%d", rec.SeqName, value, rec.Start, strand, rec.End)
}
}
return fmt.Sprintf("%s|start:%d_end:%d", rec.SeqName, rec.Start, rec.End)
}
// UpstreamSplitter split a genome sequence into mutiple upstream region.
type UpstreamSplitter struct {
UpstreamLen int
features []*gff.Record
}
// NewUpstreamSplitter return a new UpstreamSplitter
func NewUpstreamSplitter(features []*gff.Record, upstreamLen int) *UpstreamSplitter {
us := &UpstreamSplitter{UpstreamLen: upstreamLen}
genomeFeatures := [][]*gff.Record{}
current := ""
currentFeatures := []*gff.Record{}
for _, feat := range features {
if current == "" {
current = feat.SeqName
}
if current != feat.SeqName {
genomeFeatures = append(genomeFeatures, currentFeatures)
current = feat.SeqName
currentFeatures = []*gff.Record{}
}
currentFeatures = append(currentFeatures, feat)
}
genomeFeatures = append(genomeFeatures, currentFeatures)
upstreamRegions := []*gff.Record{}
for _, genomeFeats := range genomeFeatures {
upstreamRegions = append(upstreamRegions, extractUpstreamRegions(genomeFeats, upstreamLen)...)
}
us.features = upstreamRegions
return us
}
// Split splits a genomic sequence into multiple genes.
func (spliter *UpstreamSplitter) Split(s *seq.Sequence) []*seq.Sequence {
return split(s, spliter.features)
}
// ByStart is a sorting function to sort gff records by start position.
type ByStart []*gff.Record
func (s ByStart) Len() int {
return len(s)
}
func (s ByStart) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
func (s ByStart) Less(i, j int) bool {
return s[i].Start < s[j].Start
}
// extractUpstreamRegions return upstream regions for each gene.
func extractUpstreamRegions(features []*gff.Record, upstreamLen int) []*gff.Record {
cdsGffs := []*gff.Record{}
for _, feat := range features {
if feat.Feature == "CDS" {
cdsGffs = append(cdsGffs, feat)
}
}
sort.Sort(ByStart(cdsGffs))
regions := []*gff.Record{}
for i, feat := range cdsGffs {
start := 0
end := -1
if feat.Strand == gff.ForwardStrand {
start = feat.Start - upstreamLen
end = feat.Start - 1
if start < 0 {
start = 0
}
previousEnd := 1
if i > 0 {
previousEnd = cdsGffs[i-1].End + 1
}
if start < previousEnd {
start = previousEnd
}
} else {
start = feat.End + 1
end = feat.End + upstreamLen
if i < len(cdsGffs)-1 {
nextStart := cdsGffs[i+1].Start
if nextStart < end {
end = nextStart - 2
}
}
}
if end > start {
rec := gff.Record{}
rec.SeqName = feat.SeqName
rec.Source = feat.Source
rec.Attribute = feat.Attribute
rec.End = end
rec.Start = start + 1
rec.Strand = feat.Strand
rec.Feature = "upstream"
rec.Frame = feat.Frame
rec.Score = feat.Score
regions = append(regions, &rec)
}
}
return regions
}