Content - 7ad0385532260df09ace8101c613e9e319c71099 - c640f16/make-frags.py

visit type:
https://github.com/rando2/foxmap

05 April 2024, 21:02:55 UTC
Tip revision: 5a6eb200d0e985891228ca8e0d08eb0e67d547b5 authored by HM Rando on 23 April 2020, 13:08:34 UTC
Initial cleanup
Tip revision: 5a6eb20
make-frags.py
import csv
import re
from essential_mods import *

"""Note: this script is designed specifically for the red fox project and requires information that may
not be available for other assembly projects (e.g., meiotic linkage map). You are welcome to try to adapt
it for your purposes, but it will not run. It is here to show the logic we used for assembly."""

# Setup parameters
scaff_dir = '/home/lab/fox/vv2/indivs' #where are the scaffold files stored?
raca_frag_dir = '/home/rando2/genome/RACA/raca_frags' #where RACA fasta should be written
rec_chrs_file = '/home/rando2/genome/RACA/rec_chrs.refined_40kbp.txt' #from RACA

def loadsc(scaff):
    # Open the scaffold specified. Assumes multifasta has been split into individual files for each scaffold.
    # Accepts: basename of scaffold file (e.g., "scaffold1" if the file is "scaffold1.fa"
    # Returns: contents of "scaffold1.fa"
    fin = open(scaff_dir + "/" + scaff+'.fa','r')
    data = fin.read()
    data = data.replace('>' + scaff ,'')
    data = data.replace('\n','')
    return data

def initfrag(folder, frag):
    # Initialize fasta file for RACA fragment
    # Accepts: name of directory that these fragments should be build in within the raca_frag_dir and
    #          name of fragment 
    # Returns: Name of file
    fname=raca_frag_dir + '/' + folder + '/' + frag + '.fa'
    outfile = open(fname,'w')
    outfile.write('>' + frag + '\n')
    return fname

def grabseq(scaff, slow, shigh, sdir):
    # Returns sequence based on start and end positions and direction
    # Accepts: scaffold name (format: "scaffold1"), start position (low) as integer, end position (high) as integer
    #          direction of sequence desired ("+" or "-")
    # Returns: specified sequence, reverse and comped if negative direction
    seq = loadsc(scaff)[slow:shigh] #raca is using zero indexed values as they run 0 to i where i the length of the scaff
    if sdir == "-":
        seq = rev_comp(seq)
    return seq

def openfrag(folder, frag):
    # Open an existing RACA fragment
    # Accepts: Name of folder containing RACA fragments within raca_frag_dir
    # Returns: Sequence of RACA fragment
    fopen = open(raca_frag_dir+ "/" +folder+'/' + frag + '.fa','r')
    seq = fopen.read()
    seq = seq.replace('>' + frag, '')
    seq = seq.replace('\n', '')
    return seq

with open(rec_chrs_file, 'rb') as csvfile:
    data = csv.reader(csvfile, delimiter='\t')
    last = "" #raca
    outfile = ""
    for line in data:
        if line[3]=="GAPS": #if it's a gap, don't try to unpack, just write and move on
            outfile.write("N"*100)
            continue
        #Here we make individual fragments corresponding to each RACA Fragment
        raca, rlow, rhigh, scaff, slow, shigh, sdir = line  #format of data used: 12_33   0       12472085        scaffold55      0       12472085        -
        rlow, rhigh, slow, shigh = [int(i) for i in rlow, rhigh, slow, shigh]
        if len(last)==0 : #if it's the first RACA frag
            outfile = initfrag('raca_orig',raca)
        elif last != raca: #if we're starting a new RACA frag
            outfile.write('\n\n')
            outfile.close()
            outfile = initfrag('raca_orig',raca)
        last = raca
        seq = grabseq(scaff, slow, shigh, sdir)
        outfile.write(seq)

"""If you don't have a meiotic linkage map, don't try to run the code below
        
#Check whether each raca fragment goes + or - in dog
racadog = {}
with open('/home/rando2/genome/RACA/rec_chrs.canFam3.segments.formatted.csv','rb') as csvfile:
    data = csv.reader(csvfile, delimiter=',')
    for line in data:
        raca, rlow, rhigh, rdir, dog, dlow, dhigh, ddir = line #1a,1949,582677,+,chr1,24994866,25534824,+
    
        if raca not in racadog.keys():
            racadog[raca] = [[int(dlow), int(dhigh), ddir]]
        else:
            racadog[raca].append([int(dlow), int(dhigh), ddir])

final_dir = {}
for rd,ls in racadog.items():
    if len(ls) == 1:
        final_dir[rd] = ls[0][2]
    else:
        maxcvg = 0
        maxdir = ""
        for block in ls:
            if block[1]-block[0] > maxcvg:
                maxcvg = block[1]-block[0]
                maxdir = block[2]
        final_dir[rd] = maxdir

with open('/home/rando2/genome/RACA/racafrag_foxfrag.csv','r') as csvfile:
    data = csv.reader(csvfile, delimiter=',')
    last = [] #raca, fox #15a,10b
    #outfile = ""
    head = 0

    for line in data:        
        if head == 0:
            head = 1
            continue
        #Here we make individual fragments corresponding to each Dog Fragment
        raca, fox = line  #26a, 10a
        print line
        if len(last)==0 : #if it's the first RACA frag
            fake = 2 #just for annotation
            #outfile = initfrag('dog_syntblocks',fox)
        elif last[1] != fox: #if we're starting a new RACA frag
            #outfile.write('\n\n')
            #outfile.close()
            #outfile = initfrag('dog_syntblocks',fox)
            fake=2 #just for writing annotation right now
        else: # if the same raca chr, write buffer
            prevdog = [int(i[0]) for i in racadog[last[0]]] #what are the dog positions of the last fragment?
            prevdog.extend([int(i[1]) for i in racadog[last[0]]])
            currentdog = [int(i[0]) for i in racadog[raca]] #what are the dog positions of the current fragment?
            currentdog.extend([int(i[1]) for i in racadog[raca]])
            if min(currentdog) > max(prevdog):
                #outfile.write("N"*(min(currentdog) - max(prevdog)))
                print "intergap", last, "to", raca, fox, ":", min(currentdog) - max(prevdog)
            elif max(currentdog) < min(prevdog):
                #outfile.write("N"*(min(prevdog) - max(currentdog)))
                print "intergap", last, "to", raca, fox, ":", min(prevdog) - max(currentdog)

        last = [raca, fox]
        seq = openfrag('raca_orig',raca)

        dogsynt = {'1a':'-','1b':'+','1c':'+','2a':'+','2b':'-','2c':'+','3a':'+','3b':'-','3c':'+','4a':'+','4b':'-','4c':'+','5a':'+','5b':'-','5c':'-','5d':'+','5e':'-','6a':'+','6b':'+','7a':'+','7b':'+','8a':'+','8b':'-','9a':'-','9b':'+','10a':'+','10b':'+','11a':'-','11b':'+','12a':'-','12b':'-','12c':'+','13a':'-','13b':'-','13c':'-','14a':'-','14b':'+','15a':'-','15b':'+','15c':'+','16a':'-','16b':'+','X':'+'}
        if (final_dir[raca]=="+" and dogsynt[fox] == "-") or (final_dir[raca]=="-" and dogsynt[fox] == "+"):
            print "reversing", raca, "to use on", fox
            seq = rev_comp(seq)
        else:
            print raca, "runs same direction on", fox
"""