https://github.com/csw/bioruby-maf
Revision f443fdc142c5992c47b6639f3a613d2496350f7e authored by Clayton Wheeler on 04 August 2012, 22:34:14 UTC, committed by Clayton Wheeler on 04 August 2012, 22:34:14 UTC
1 parent 4fc31e2
Tip revision: f443fdc142c5992c47b6639f3a613d2496350f7e authored by Clayton Wheeler on 04 August 2012, 22:34:14 UTC
Use GZipReader instead of a gzip pipe. Performs better.
Use GZipReader instead of a gzip pipe. Performs better.
Tip revision: f443fdc
maf_tile
#!/usr/bin/env ruby
require 'optparse'
require 'ostruct'
require 'bio-maf'
require 'bio-genomic-interval'
def parse_interval(line)
src, r_start_s, r_end_s, _ = line.split(nil, 4)
r_start = r_start_s.to_i
r_end = r_end_s.to_i
return Bio::GenomicInterval.zero_based(src, r_start, r_end)
end
def target_for(base, interval, &blk)
path = "#{base}_#{interval.zero_start}-#{interval.zero_end}.fa"
File.open(path, 'w', &blk)
end
def apply_options(options, tiler)
tiler.reference = options.ref if options.ref
tiler.species = options.species
tiler.species_map = options.species_map
end
options = OpenStruct.new
options.p = { :threads => 1 }
options.species = []
options.species_map = {}
options.usage = false
o_parser = OptionParser.new do |opts|
opts.banner = "Usage: maf_tile [options] <maf> [index]"
opts.separator ""
opts.separator "Options:"
opts.on("-r", "--reference SEQ", "FASTA reference sequence") do |ref|
options.ref = ref
end
opts.on("-i", "--interval [CHR:]BEGIN:END", "Genomic interval, zero-based") do |int|
if int =~ /(.+):(\d+):(\d+)/
gi = Bio::GenomicInterval.zero_based($1, ($2.to_i), ($3.to_i))
options.genomic_interval = gi
elsif int =~ /(\d+):(\d+)/
options.interval = ($1.to_i)...($2.to_i)
else
$stderr.puts "Invalid interval specification #{int}!"
options.usage = true
end
end
opts.on("-s", "--species SPECIES[:NAME]", "Species to use (with mapped name)") do |sp|
if sp =~ /:/
species, mapped = sp.split(/:/)
options.species << species
options.species_map[species] = mapped
else
options.species << sp
end
end
opts.on("-o", "--output-base BASE", "Base name for output files",
"Use stdout for a single interval if not given") do |base|
options.output_base = base
end
opts.on("--bed BED", "BED file specifying intervals",
"(requires --output-base)") do |bed|
options.bed = bed
end
Bio::MAF::handle_logging_options(opts)
end
o_parser.parse!(ARGV)
Bio::Log::CLI.configure('bio-maf')
maf_p = ARGV.shift
index_p = ARGV.shift
unless (! options.usage) \
&& maf_p && (! options.species.empty?) \
&& (options.output_base \
? options.bed \
: options.interval || options.genomic_interval)
$stderr.puts o_parser
exit 2
end
access = if File.directory? maf_p
Bio::MAF::Access.maf_dir(maf_p, options.p)
else
Bio::MAF::Access.file(maf_p, index_p, options.p)
end
if options.bed
intervals = []
File.open(options.bed) do |bed_f|
bed_f.each_line { |line| intervals << parse_interval(line) }
end
intervals.sort_by! { |int| int.zero_start }
intervals.each do |int|
access.tile(int) do |tiler|
apply_options(options, tiler)
target_for(options.output_base, int) do |target|
tiler.write_fasta(target)
end
end
end
else
# single interval
if options.genomic_interval
interval = options.genomic_interval
else
if access.indices.size != 1
raise "Must explicitly specify sequence in --interval argument with multiple candidate MAF files!"
end
ref_seq = access.indices.keys.first
interval = Bio::GenomicInterval.zero_based(ref_seq,
options.interval.begin,
options.interval.end)
end
access.tile(interval) do |tiler|
apply_options(options, tiler)
if options.output_base
target = target_for(options.output_base, tiler.interval)
else
target = $stdout
end
tiler.write_fasta(target)
target.close
end
end
Computing file changes ...