https://github.com/jhbadger/APIS
Revision cedb1bbca42dd06094db03ac0b517a889d4b8b4c authored by Jonathan Badger on 17 August 2011, 19:32:14 UTC, committed by Jonathan Badger on 17 August 2011, 19:32:14 UTC
1 parent 67d3efe
Raw File
Tip revision: cedb1bbca42dd06094db03ac0b517a889d4b8b4c authored by Jonathan Badger on 17 August 2011, 19:32:14 UTC
remove apisRun, _input from dataset name on grid
Tip revision: cedb1bb
simulateReads
#!/usr/bin/env ruby

require 'optparse'
require 'ostruct'
require 'bio'

opt = OpenStruct.new
opt.datasets = 1
opt.num = 500000
opt.prefix = "simreads"
opt.evolve = 0.0
opt.avLen = 400
opt.stdDev = 100

o = OptionParser.new

o.banner << " fasta-file percentages.txt"
o.on("-a ", "--avg ", Integer, 
    "average read length (#{opt.avLen})") {|t| opt.avLen = t}
o.on("-d ", "--datasets ", Integer, 
    "number of datasets to simulate (#{opt.datasets})") {|t| opt.datasets = t}
o.on("-e ", "--evolve ", Float, "probability of each base to evolve (#{opt.evolve})") {|t| opt.evolve = t}
o.on("-m ", "--method ", String, 
  "sequencing method (#{opt.seqmethod} default)") {|t| opt.seqmethod = t}
o.on("-n ", "--num ", Integer, 
    "reads per dataset (#{opt.num})") {|t| opt.num = t}
o.on("-p ", "--prefix ", String, 
      "file name prefix for simulated read files (#{opt.prefix} default)") {|t| opt.prefix = t}
o.on("-s ", "--sdev ", Integer, 
          "read length std dev (#{opt.stdDev})") {|t| opt.stdDev = t}
begin
  o.parse!
rescue
  STDERR << $!.message << "\n"
  STDERR << o
  exit(1)
end
if (ARGV.size != 2)
  STDERR << o
  exit(1)
end


# simulate data from normal distribution with given mean and sd
def simNorm(mean, sd)
  u1 = rand
  u2 = rand
  return (sd * Math.sqrt(-2*Math.log(u1))*Math.cos(2*3.1415*u2)) + mean
end

fasta, percent = ARGV

num = Hash.new
File.new(percent).each do |line|
  contig, percent = line.chomp.split(" ")
  percent = percent.to_f
  num[contig] = (opt.num*percent).to_i
end

outFiles = []
suffix = "0000001"
opt.datasets.times do |i|
  outFiles[i] = File.new(opt.prefix + suffix + ".fa", "w")
  suffix.succ!
end

suffix = "0000001"
Bio::FlatFile.new(Bio::FastaFormat, File.new(fasta)).each do |seq|
  if (num[seq.entry_id])
    opt.datasets.times do |i|
      STDERR << "Simulating #{num[seq.entry_id]} reads from #{seq.definition} for dataset #{i + 1}...\n"
      num[seq.entry_id].times do
        len = 0
        while (len < 1)
          len = simNorm(opt.avLen, opt.stdDev).to_i
        end
        pos = (rand * (seq.length - len)).to_i
        read = seq.seq[pos,len]
        if (opt.evolve > 0)
          len.times do |n|
            if (rand < opt.evolve)
              x = rand
              if (x < 0.25)
                read[n] = "a"
              elsif (x < 0.5)
                read[n] = "g"
              elsif (x < 0.75)
                read[n] = "c"
              else
                read[n] = "t"
              end
            end
          end
        end
        header = "seq" + suffix + " " + seq.definition
        outFiles[i].print ">" + header + "\n" + read.gsub(Regexp.new(".{1,60}"), "\\0\n")
        suffix.succ!
      end
    end
  else
    STDERR << "Warning: #{seq.entry_id} is not in probability file!\n"
  end
end

opt.datasets.times do |i|
  outFiles[i].close
end
back to top