https://github.com/marvin-jens/find_circ
Revision 8416d7d04034af45aa31830b45a12d673c50ff21 authored by Marcel Schilling on 10 November 2015, 14:37:25 UTC, committed by Marcel Schilling on 10 November 2015, 14:37:25 UTC
1 parent ed560ec
Tip revision: 8416d7d04034af45aa31830b45a12d673c50ff21 authored by Marcel Schilling on 10 November 2015, 14:37:25 UTC
added underscore to prefix description
added underscore to prefix description
Tip revision: 8416d7d
unmapped2anchors.py
#!/usr/bin/env python
import pysam
import numpy
import os,sys,re
from optparse import *
from logging import error
COMPLEMENT = {
'a' : 't',
't' : 'a',
'c' : 'g',
'g' : 'c',
'k' : 'm',
'm' : 'k',
'r' : 'y',
'y' : 'r',
's' : 's',
'w' : 'w',
'b' : 'v',
'v' : 'b',
'h' : 'd',
'd' : 'h',
'n' : 'n',
'A' : 'T',
'T' : 'A',
'C' : 'G',
'G' : 'C',
'K' : 'M',
'M' : 'K',
'R' : 'Y',
'Y' : 'R',
'S' : 'S',
'W' : 'W',
'B' : 'V',
'V' : 'B',
'H' : 'D',
'D' : 'H',
'N' : 'N',
}
def complement(s):
return "".join([COMPLEMENT[x] for x in s])
def rev_comp(seq):
return complement(seq)[::-1]
usage = """
%prog <alignments.bam> > unmapped_anchors.qfa
Extract anchor sequences from unmapped reads. Optionally permute.
"""
parser = OptionParser(usage=usage)
parser.add_option("-a","--anchor",dest="asize",type=int,default=20,help="anchor size")
parser.add_option("-q","--minqual",dest="minqual",type=int,default=5,help="min avg. qual along both anchors (default=5)")
parser.add_option("-r","--rev",dest="rev",type="choice",choices=["A","B","R","N","C","P"],default="N",help="P-ermute read parts or reverse A,B,R,C,N for control")
parser.add_option("-R","--reads",dest="reads",action="store_true",default=False,help="instead of unmapped reads from BAM, input is sites.reads from find_circ.py")
parser.add_option("-F","--fasta",dest="fasta",action="store_true",default=False,help="instead of unmapped reads from BAM, input is FASTA file")
options,args = parser.parse_args()
import random
perm_A = []
perm_I = []
perm_B = []
perm_burn_in = []
N_perm = 100
def randomchoice(l):
return l.pop(random.randint(0,len(l)-1))
def passthru(x):
return x
def reverse(x):
return x[::-1]
funcs = {
'A' : (passthru,reverse,passthru),
'B' : (passthru,passthru,reverse),
'R' : (reverse,passthru,passthru),
'N' : (passthru,passthru,passthru),
'P' : (passthru,passthru,passthru),
'C' : (passthru,passthru,passthru),
}
read_f,A_f,B_f = funcs[options.rev]
def handle_read(read):
if not read.is_unmapped:
return
seq,qual = read_f(read.seq),read_f(read.qual)
# minimal quality scores
nq = numpy.fromstring(qual,dtype=numpy.uint8) - 35
if nq[:options.asize].mean() < options.minqual or nq[-options.asize:].mean() < options.minqual:
# read is junk
#print "qual.fail",nq[:options.asize].mean(),nq[-options.asize:].mean()
return
if options.rev == "P":
perm_A.append((seq[:options.asize],qual[:options.asize]))
perm_B.append((seq[-options.asize:],qual[-options.asize:]))
perm_I.append((seq[options.asize:-options.asize:],qual[options.asize:-options.asize:]))
if len(perm_burn_in) < N_perm:
# collect some reads for permutation control first.
perm_burn_in.append(read)
return
A_seq,A_qual = randomchoice(perm_A)
B_seq,B_qual = randomchoice(perm_B)
I_seq,I_qual = randomchoice(perm_I)
seq,qual = A_seq+I_seq+B_seq,A_qual+I_qual+B_qual
if options.rev == "C":
seq = rev_comp(seq)
qual = reverse(qual)
print "@%s_A__%s" % (read.qname,seq)
print A_f(seq[:options.asize])
print "+"
print A_f(qual[:options.asize])
print "@%s_B" % read.qname
print B_f(seq[-options.asize:])
print "+"
print B_f(qual[-options.asize:])
if options.reads:
N = 0
for line in file(args[0]):
name,seq = line.rstrip().split('\t').replace(" ","_")
N +=1
class Item(object):
pass
read = Item()
read.qname = "%s_%d" % (name,N)
read.is_unmapped=True
read.seq = seq
read.qual = "b"*len(seq)
handle_read(read)
elif options.fasta:
from sequence_data.io import fasta_chunks
N = 0
for name,seq in fasta_chunks(file(args[0])):
N += 1
name = name.replace(" ","_")
class Item(object):
pass
read = Item()
read.qname = "%s_%d" % (name,N)
read.is_unmapped=True
read.seq = seq
read.qual = "b"*len(seq)
handle_read(read)
else:
for read in pysam.Samfile(args[0],'rb'):
handle_read(read)
for read in perm_burn_in:
handle_read(read)
Computing file changes ...