https://github.com/csw/bioruby-maf
Revision f443fdc142c5992c47b6639f3a613d2496350f7e authored by Clayton Wheeler on 04 August 2012, 22:34:14 UTC, committed by Clayton Wheeler on 04 August 2012, 22:34:14 UTC
1 parent 4fc31e2
Raw File
Tip revision: f443fdc142c5992c47b6639f3a613d2496350f7e authored by Clayton Wheeler on 04 August 2012, 22:34:14 UTC
Use GZipReader instead of a gzip pipe. Performs better.
Tip revision: f443fdc
maf_tile
#!/usr/bin/env ruby

require 'optparse'
require 'ostruct'

require 'bio-maf'
require 'bio-genomic-interval'

def parse_interval(line)
  src, r_start_s, r_end_s, _ = line.split(nil, 4)
  r_start = r_start_s.to_i
  r_end = r_end_s.to_i
  return Bio::GenomicInterval.zero_based(src, r_start, r_end)
end

def target_for(base, interval, &blk)
  path = "#{base}_#{interval.zero_start}-#{interval.zero_end}.fa"
  File.open(path, 'w', &blk)
end

def apply_options(options, tiler)
  tiler.reference = options.ref if options.ref
  tiler.species = options.species
  tiler.species_map = options.species_map
end

options = OpenStruct.new
options.p = { :threads => 1 }
options.species = []
options.species_map = {}
options.usage = false

o_parser = OptionParser.new do |opts|
  opts.banner = "Usage: maf_tile [options] <maf> [index]"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-r", "--reference SEQ", "FASTA reference sequence") do |ref|
    options.ref = ref
  end
  opts.on("-i", "--interval [CHR:]BEGIN:END", "Genomic interval, zero-based") do |int|
    if int =~ /(.+):(\d+):(\d+)/
      gi = Bio::GenomicInterval.zero_based($1, ($2.to_i), ($3.to_i))
      options.genomic_interval = gi
    elsif int =~ /(\d+):(\d+)/
      options.interval = ($1.to_i)...($2.to_i)
    else
      $stderr.puts "Invalid interval specification #{int}!"
      options.usage = true
    end
  end
  opts.on("-s", "--species SPECIES[:NAME]", "Species to use (with mapped name)") do |sp|
    if sp =~ /:/
      species, mapped = sp.split(/:/)
      options.species << species
      options.species_map[species] = mapped
    else
      options.species << sp
    end
  end
  opts.on("-o", "--output-base BASE", "Base name for output files",
          "Use stdout for a single interval if not given") do |base|
    options.output_base = base
  end
  opts.on("--bed BED", "BED file specifying intervals",
          "(requires --output-base)") do |bed|
    options.bed = bed
  end
  Bio::MAF::handle_logging_options(opts)
end

o_parser.parse!(ARGV)
Bio::Log::CLI.configure('bio-maf')

maf_p = ARGV.shift
index_p = ARGV.shift

unless (! options.usage) \
  && maf_p && (! options.species.empty?) \
  && (options.output_base \
      ? options.bed \
      : options.interval || options.genomic_interval)
  $stderr.puts o_parser
  exit 2
end

access = if File.directory? maf_p
           Bio::MAF::Access.maf_dir(maf_p, options.p)
         else
           Bio::MAF::Access.file(maf_p, index_p, options.p)
         end

if options.bed
  intervals = []
  File.open(options.bed) do |bed_f|
    bed_f.each_line { |line| intervals << parse_interval(line) }
  end
  intervals.sort_by! { |int| int.zero_start }
  intervals.each do |int|
    access.tile(int) do |tiler|
      apply_options(options, tiler)
      target_for(options.output_base, int) do |target|
        tiler.write_fasta(target)
      end
    end
  end
else
  # single interval
  if options.genomic_interval
    interval = options.genomic_interval
  else
    if access.indices.size != 1
      raise "Must explicitly specify sequence in --interval argument with multiple candidate MAF files!"
    end
    ref_seq = access.indices.keys.first
    interval = Bio::GenomicInterval.zero_based(ref_seq,
                                               options.interval.begin,
                                               options.interval.end)
  end
  access.tile(interval) do |tiler|
    apply_options(options, tiler)
    if options.output_base
      target = target_for(options.output_base, tiler.interval)
    else
      target = $stdout
    end
    tiler.write_fasta(target)
    target.close
  end
end
back to top