https://github.com/fenderglass/Ragout
Revision 2c683c8749cb7b50f7134e5621d81a901e77005c authored by fenderglass on 17 October 2013, 13:45:12 UTC, committed by fenderglass on 17 October 2013, 13:45:12 UTC
1 parent b853b4e
Raw File
Tip revision: 2c683c8749cb7b50f7134e5621d81a901e77005c authored by fenderglass on 17 October 2013, 13:45:12 UTC
with debrujin refinement
Tip revision: 2c683c8
simulator.py
#!/usr/bin/env python

import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from collections import namedtuple
import sys
import random
SyntenyBlock = namedtuple("SyntenyBlock", ["seq", "chr_id", "strand", "id", "start", "end", "chr_num"])
SeqInfo = namedtuple("SeqInfo", ["id", "length"])
Permutation = namedtuple("Permutation", ["chr_id", "chr_num", "blocks"])


def parse_coords_file(blocks_file):
    group = [[]]
    seq_info = {}
    blocks_info = {}
    line = [l.strip() for l in open(blocks_file) if l.strip()]
    for l in line:
        if l.startswith("-"):
            group.append([])
        else:
            group[-1].append(l)
    for l in group[0][1:]:
        seq_num, seq_len, seq_id = l.split()
        seq_num = int(seq_num)
        seq_info[seq_num] = SeqInfo(seq_id, int(seq_len))
    for g in [g for g in group[1:] if g]:
        block_id = int(g[0].split()[1][1:])
        blocks_info[block_id] = []
        for l in g[2:]:
            chr_num, bl_strand, bl_start, bl_end, bl_length = l.split()
            chr_num = int(chr_num)
            chr_id = seq_info[chr_num].id
            blocks_info[block_id].append(SyntenyBlock(seq="", chr_id=chr_id,
                                        strand=bl_strand, id=block_id,
                                        start=int(bl_start), end=int(bl_end),
                                        chr_num=int(chr_num)))
    return (blocks_info, seq_info)


def parse_permutations_file(filename, seq_info):
    fin = open(filename, "r")
    contigs = []
    permutations = []
    contig_name = None
    ref_name = None
    num_by_id = {seq.id : seq_num for seq_num, seq in seq_info.iteritems()}

    for line in fin:
        if line.startswith(">"):
            name = line.strip()[1:]
            ref_name = name
            continue

        blocks = line.strip().split(" ")[0:-1]

        ref_num = num_by_id[ref_name]
        permutations.append(Permutation(chr_id=ref_name, chr_num=ref_num,
                                        blocks=map(int, blocks)))
    return permutations


def reversal(permutation, mean_length):
    length = np.random.poisson(mean_length, 1)
    position = np.random.randint(0, len(permutation) - length)
    negate = map(lambda b: -b, permutation[position : position + length])
    return permutation[0:position] + negate[::-1] + permutation[position+length:]


def translocation(permutation, mean_length):
    length = np.random.poisson(mean_length, 1)
    pos_from = np.random.randint(0, len(permutation) - length)
    pos_to = np.random.randint(0, len(permutation) - length)

    cutted = permutation[pos_from:pos_from+length]
    rest = permutation[0:pos_from] + permutation[pos_from+length:]

    return rest[0:pos_to] + cutted + rest[pos_to:]


def deletion(permutation):
    pos = np.random.randint(0, len(permutation))
    del permutation[pos]
    return permutation


def insertion(permutation, blocks_info):
    pos = np.random.randint(0, len(permutation))
    while True:
        block_num = np.random.randint(0, 1000)
        if block_num not in blocks_info:
            break
    permutation.insert(pos, block_num)
    return permutation


def indel(permutation, blocks_info):
    if np.random.randint(0, 2):
        return insertion(permutation, blocks_info)
    else:
        return deletion(permutation)


def evolve(permutation, blocks_info, n_rearr, n_indels):
    REVERSAL_LEN = 3
    TRANSLOC_LEN = 2
    for i in xrange(n_rearr):
        permutation = reversal(permutation, REVERSAL_LEN)
        permutation = translocation(permutation, TRANSLOC_LEN)

    for i in xrange(n_indels):
        permutation = indel(permutation, blocks_info)
    return permutation


def rand_seq(length):
    return "".join(random.choice(["A", "C", "G", "T"]) for x in xrange(length))


def main():
    print "Super Rearrangement Simulator 2000!"
    if len(sys.argv) < 4:
        print "Usage: coords_file permutations_file genome"
        return

    blocks_info, seq_info = parse_coords_file(sys.argv[1])
    permutations = parse_permutations_file(sys.argv[2], seq_info)
    sequence = SeqIO.parse(sys.argv[3], "fasta").next().seq
    root = permutations[0].blocks

    N_REARR = 3
    N_INDELS = 10
    ref1 = evolve(root, blocks_info, N_REARR, N_INDELS)
    root2 = evolve(root, blocks_info, N_REARR, N_INDELS)
    ref2 = evolve(root2, blocks_info, N_REARR, N_INDELS)
    root3 = evolve(root2, blocks_info, 0, N_INDELS)
    #ref3 = evolve(root3, BRANCH_LEN)
    ref3 = root3
    root4 = evolve(root3, blocks_info, 0, N_INDELS)
    ref4 = evolve(root4, blocks_info, N_REARR, N_INDELS)
    ref5 = evolve(root4, blocks_info, N_REARR, N_INDELS)

    new_blocks = {}

    for i, result in enumerate([ref1, ref2, ref3, ref4, ref5]):
        seq = Seq("")
        for block in result:
            if abs(block) not in blocks_info:
                if abs(block) not in new_blocks:
                    new_blocks[abs(block)] = Seq(rand_seq(5000))
                bseq = new_blocks[abs(block)]
            else:
                binfo = filter(lambda b: b.chr_num == 1, blocks_info[abs(block)])[0]
                bseq = (sequence[binfo.start : binfo.end + 1] if binfo.strand == "+"
                        else sequence[binfo.end : binfo.start + 1])
            if block < 0:
                bseq = bseq.reverse_complement()
            seq += bseq

        ref_name = "synteinc_reference_{0}.fasta".format(i + 1)
        SeqIO.write(SeqRecord(seq=seq, id=ref_name, description=""), open(ref_name, "w"), "fasta")
    #print perm
    #print evolve(perm, 10)


if __name__ == "__main__":
    main()
back to top