https://github.com/csw/bioruby-maf
Revision f443fdc142c5992c47b6639f3a613d2496350f7e authored by Clayton Wheeler on 04 August 2012, 22:34:14 UTC, committed by Clayton Wheeler on 04 August 2012, 22:34:14 UTC
1 parent 4fc31e2
Raw File
Tip revision: f443fdc142c5992c47b6639f3a613d2496350f7e authored by Clayton Wheeler on 04 August 2012, 22:34:14 UTC
Use GZipReader instead of a gzip pipe. Performs better.
Tip revision: f443fdc
maf_extract
#!/usr/bin/env ruby

require 'bio-maf'
require 'optparse'
require 'ostruct'

include Bio::MAF

options = OpenStruct.new
options.mode = :intersect
options.format = :maf
options.seq_filter = {}
options.block_filter = {}
options.parse_options = {}

def handle_list_spec(spec)
  if spec =~ /^@(.+)/
    File.read($1).split
  else
    spec.split(',')
  end
end

def handle_interval_spec(int)
  if int =~ /(.+):(\d+)-(\d+)/
    Bio::GenomicInterval.zero_based($1, $2.to_i, $3.to_i)
  else
    raise "Invalid interval specification: #{int}"
  end
end

$op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_extract (-m MAF [-i INDEX] | -d MAFDIR) [options]"
  opts.separator ""
  opts.separator "MAF source options (either --maf or --maf-dir must be given):"
  opts.on("-m", "--maf MAF", "MAF file") do |maf|
    options.maf = maf
  end
  opts.on("-i", "--index INDEX", "MAF index") do |idx|
    options.idx = idx
  end
  opts.on("-d", "--maf-dir DIR", "MAF directory") do |dir|
    options.maf_dir = dir
  end
  opts.separator ""
  opts.separator "Extraction options:"
  opts.on("--mode MODE", [:intersect, :slice],
          "Extraction mode; 'intersect' to match ",
          "blocks intersecting the given region,",
          "or 'slice' to extract subsets covering ",
          "given regions") do |mode|
    options.mode = mode
  end
  opts.on("--bed BED", "Use intervals from the given BED file") do |bed|
    options.bed = bed
  end
  opts.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |int|
    options.interval = handle_interval_spec(int)
  end
  opts.separator ""
  opts.separator "Output options:"
  opts.on("-f", "--format FMT", [:maf, :fasta], "Output format") do |fmt|
    options.format = fmt
  end
  opts.on("-o", "--output OUT", "Write output to file OUT") do |out|
    options.out_path = out
  end
  opts.separator ""
  opts.separator "Filtering options:"
  opts.on("--only-species SPECIES",
          "Filter out all but the species in the",
          "given comma-separated list",
          "(or @FILE to read from a file)") do |spec|
    options.seq_filter[:only_species] = handle_list_spec(spec)
  end
  opts.on("--with-all-species SPECIES",
          "Only match blocks with all the given",
          "species, comma-separated",
          "(or @FILE to read from a file)") do |spec|
    options.block_filter[:with_all_species] = handle_list_spec(spec)
  end
  opts.on("--min-sequences N", Integer,
          "Match only blocks with at least N sequences") do |n|
    options.block_filter[:at_least_n_sequences] = n
  end
  opts.on("--min-text-size N", Integer,
          "Match only blocks with minimum text size N") do |n|
    options.block_filter[:min_size] = n
  end
  opts.on("--max-text-size N", Integer,
          "Match only blocks with maximum text size N") do |n|
    options.block_filter[:max_size] = n
  end
  opts.separator ""
  opts.separator "Block processing options:"
  opts.on("--join-blocks",
          "Join blocks if appropriate after filtering",
          "out sequences") do
    options.parse_options[:join_blocks] = true
  end
  opts.on("--remove-gaps", "Remove gaps after filtering out sequences") do
    options.parse_options[:remove_gaps] = true
  end
  opts.on("--parse-extended", "Parse 'extended' MAF data (i, q lines)") do
    options.parse_options[:parse_extended] = true
  end
  opts.on("--parse-empty", "Parse empty (e) lines of MAF data") do
    options.parse_options[:parse_empty] = true
  end
  opts.separator ""
  opts.separator "Logging options:"
  Bio::MAF::handle_logging_options(opts)
end
$op.parse!(ARGV)
Bio::Log::CLI.configure('bio-maf')

def usage(msg)
  $stderr.puts msg
  $stderr.puts $op
  exit 2
end

if options.maf
  access = Access.file(options.maf, options.idx, options.parse_options)
elsif options.maf_dir
  access = Access.maf_dir(options.maf_dir, options.parse_options)
else
  usage "Must supply --maf or --maf-dir!"
end

begin
  access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
  access.block_filter = options.block_filter unless options.block_filter.empty?
  if options.out_path
    outf = File.open(options.out_path, 'w')
  else
    outf = $stdout
  end

  case options.format
  when :maf
    writer = Writer.new(outf)
  when :fasta
    writer = FASTAWriter.new(outf)
  else
    raise "unsupported output format #{format}!"
  end

  if options.bed
    intervals = read_bed_intervals(options.bed)
  elsif options.interval
    intervals = [options.interval]
  else
    usage "Must supply --interval or --bed!"
  end

  # TODO: provide access to original MAF header?
  if options.format == :maf
    writer.write_header(Header.default)
  end

  case options.mode
  when :intersect
    access.find(intervals) do |block|
      writer.write_block(block)
    end
  when :slice
    # TODO: multiple files if intervals.size > 1?
    intervals.each do |interval|
      access.slice(interval) do |block|
        writer.write_block(block)
      end
    end
  else
    raise "Unsupported mode #{options.mode}!"
  end
  
ensure
  access.close
end
back to top