https://github.com/fjruizruano/ngs-protocols
Raw File
Tip revision: 39a091d1fa569a7fc717ac73c4b3de07f0a1204d authored by fjruizruano on 03 August 2023, 11:48:27 UTC
adding gfa2fas.py and extract_gfa.py
Tip revision: 39a091d
divsum_ab.py
#!/usr/bin/python

import sys
import operator
from subprocess import call
from Bio import SeqIO

print "divsum_count.py DivsumFile FastaFile"

try:
    file = sys.argv[1]
except:
    file = raw_input("Introduce RepeatMasker's Divsum file: ")

try:
    fasta = sys.argv[2]
except:
    fasta = raw_input("Introduce FASTA file: ")

data = open(file).readlines()

header = "Coverage for each repeat class and divergence (Kimura)\n"

matrix_start = data.index(header)
matrix = data[matrix_start+1:]
li = []
li_clean = []
names_line = matrix[0]
info = names_line.split()
info = info[1:]

for fam in info:
    fam_clean = fam.split("/")
    li.append([fam])
    li_clean.append([fam_clean[1]])

info_len = len(li)

for line in matrix[1:]:
    info = line.split()
    info = info[1:]
    for i in range(0,info_len):
        li_clean[i].append(info[i])

out = open(file+".counts","w")

counts = []
for el in li_clean:
    numbers = el[1:]
    numbers = [int(x) for x in numbers]
    counts.append([el[0],sum(numbers)])
    out.write(el[0]+"\t"+str(sum(numbers))+"\n")

out.close()

fam_count = {}
fam_all = {}
fam_members = {}
seq_list = []
var_number = {}

for el in counts:
    clase = el[0]
    count = el[1]
    clase_fam = clase.split("_")
    clase_fam = clase_fam[-1]
    if len(clase_fam) == 2:
        if clase_fam in fam_count:
            fam_members[clase_fam].append(clase)
            last_count = fam_count[clase_fam][1]
            if count > last_count:
                fam_count[clase_fam] = [clase,count]
            fam_all[clase_fam][clase] = count
        else:
            fam_members[clase_fam] = [clase]
            fam_count[clase_fam] = [clase,count]
            fam_all[clase_fam] = {clase:count}
    else:
        seq_list.append(clase)
        var_number[clase] = 1

out = open("pattern.txt", "w")

for el in fam_count:
    leader = fam_count[el][0]
    seq_list.append(leader)
    members = fam_members[el]
    var_number[leader] = len(members)
    for member in members:
        if member != leader:
            out.write("%s\t%s\n" % (member, leader))

out.close()

out = open("selection.txt", "w")
out.write("\n".join(seq_list))
out.close()

call("extract_seq.py %s selection.txt" % fasta, shell=True)

out = open("table.txt", "w")

fasta_sel = SeqIO.parse(open("selection.txt.extract"), "fasta")

for secu in fasta_sel:
    secuen = str(secu.seq)
    id = str(secu.id)
    length = len(secuen)
    at = secuen.count("A")+secuen.count("T")+secuen.count("a")+secuen.count("t")
    at_perc = float(1.0*at/length)
    v_number = var_number[id]
    out.write("%s\t%s\t%s\t%s\n" % (id, str(length), str(at_perc), v_number))

out.close()

abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
abc_dict = {}
for el in fam_all:
    family = fam_all[el]
    family_sort = sorted(family.iteritems(), key=operator.itemgetter(1), reverse=True)
    i = -1
    for var in family_sort:
        i += 1
        abc_dict[var[0]] = fam_count[el][0] + abc[i]

out = open(fasta+".abc", "w")
fas = SeqIO.parse(open(fasta), "fasta")
len_dict = {}
for s in fas:
    new = s.id
    length = len(str(s.seq))
    if s.id in abc_dict:
        new = abc_dict[s.id]
    out.write(">%s-%s\n%s\n" % (new, length, s.seq))
    len_dict[new] = length
out.close()

out = open(fasta+".dim.abc", "w")
fas = SeqIO.parse(open(fasta+".dim"), "fasta")
for s in fas:
    new = s.id
    if s.id in abc_dict:
        new = abc_dict[s.id]
    length = len_dict[new]
    out.write(">%s-%s\n%s\n" % (new, length, s.seq))
out.close()

back to top