https://github.com/fjruizruano/ngs-protocols
Raw File
Tip revision: 39a091d1fa569a7fc717ac73c4b3de07f0a1204d authored by fjruizruano on 03 August 2023, 11:48:27 UTC
adding gfa2fas.py and extract_gfa.py
Tip revision: 39a091d
cd_hit_count_clusters.py
#!/usr/bin/python

import sys

# Filter the clusters of a CD-HIT ``.clstr`` file by size.
#
# Two files are written next to the input:
#   <input>.sel.<min>.<max>  representative sequence name of every cluster
#                            whose size lies within [min, max]
#   <input>.clu.<min>.<max>  header plus de-duplicated member records of the
#                            clusters that still lie within [min, max] after
#                            de-duplication

USAGE = "cd_hit_count_clusters.py CdhitClstrFile MinClusterSize MaxClusterSize"

# ``raw_input`` only exists on Python 2; fall back to ``input`` on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input


def read_size_argument(argv, position, prompt, ask=read_line):
    """Return the integer at ``argv[position]``, prompting when it is unusable.

    ``ask`` is called with ``prompt`` when the argument is missing or is not
    an integer; its answer is converted with ``int`` (a non-integer answer
    raises ``ValueError``).
    """
    try:
        return int(argv[position])
    except (IndexError, ValueError):
        return int(ask(prompt))


def parse_clusters(lines):
    """Group ``.clstr`` lines into ``(header_line, member_lines)`` pairs.

    A line starting with ``>Cluster`` opens a new cluster.  Blank lines and
    any lines preceding the first header are ignored.
    """
    clusters = []
    for line in lines:
        if line.startswith(">Cluster"):
            clusters.append((line, []))
        elif clusters and line.strip():
            clusters[-1][1].append(line)
    return clusters


def cluster_size(members):
    """Return the cluster size: index of the last member line plus one."""
    if not members:
        return 0
    return int(members[-1].split()[0]) + 1


def representative_name(members):
    """Return the name on the member line marked with ``*``, or ``None``.

    The name is the third whitespace-separated field without its leading
    ``>`` and trailing ``...``.
    """
    for line in members:
        # rstrip() so a final line lacking a newline is still recognised.
        if line.rstrip().endswith("*"):
            return line.split()[2][1:-3]
    return None


def unique_member_records(members):
    """Return de-duplicated member records, first occurrence first.

    A record is the second tab-separated field of a member line, i.e. the
    line without its per-cluster index.
    """
    # NOTE(review): the first member is skipped, exactly as the original
    # script did -- confirm that this is intended.
    seen = set()
    records = []
    for line in members[1:]:
        record = line.split("\t")[1]
        if record not in seen:
            seen.add(record)
            records.append(record)
    return records


def select_clusters(clusters, min_size, max_size):
    """Pick the clusters whose size lies within ``[min_size, max_size]``.

    Returns ``(names, blocks)``: ``names`` holds the representative name of
    every selected cluster; ``blocks`` holds, for each selected cluster whose
    de-duplicated record count is also within range, the header line followed
    by those records.
    """
    names = []
    blocks = []
    for header, members in clusters:
        if not min_size <= cluster_size(members) <= max_size:
            continue
        name = representative_name(members)
        # A cluster without a ``*`` line has no name to report.
        if name is not None:
            names.append(name)
        records = unique_member_records(members)
        if min_size <= len(records) <= max_size:
            blocks.append(header + "".join(records))
    return names, blocks


def main():
    """Read the arguments (or prompt for them) and write both output files."""
    print(USAGE)

    try:
        clstr_path = sys.argv[1]
    except IndexError:
        clstr_path = read_line("Introduce CD-HIT's *.Clstr File: ")

    min_size = read_size_argument(
        sys.argv, 2, "Introduce Minimum Cluster Size:")
    max_size = read_size_argument(
        sys.argv, 3, "Introduce Maximum Cluster Size:")

    with open(clstr_path) as handle:
        lines = handle.readlines()

    names, blocks = select_clusters(parse_clusters(lines), min_size, max_size)

    suffix = "%s.%s" % (min_size, max_size)
    with open(clstr_path + ".sel." + suffix, "w") as sel_out:
        sel_out.writelines("%s\n" % name for name in names)
    with open(clstr_path + ".clu." + suffix, "w") as clu_out:
        clu_out.writelines(blocks)


if __name__ == "__main__":
    main()


back to top