https://github.com/xyc0813/pysim
Tip revision: 6002572bc8bc41bca40059b7cfa8bd5a7b6eec15 authored by xyc0813 on 29 June 2017, 14:05:36 UTC
Add files via upload
Add files via upload
Tip revision: 6002572
GC_v1.0.3.py
# -*- coding: cp936 -*-
import sys,os
import random
import string
from math import log,exp
from optparse import OptionParser
import getopt
import time
######生成模拟GC含量文件###########
def fun(x):
return -8.89*x**3-3.56*x**2+9.13*x-1.58
def generate_GC_file(line=100,outfilename='GC_content.txt',start=0,seq=0.01):
outfile=open(outfilename,'w')
for i in range(line):
outfile.write(str(start)+'\t'+str(fun(start))+'\n')
start=start+seq
outfile.close()
#####计算序列GC含量###############
def Compute_GC (seq):
if 'N' in seq:
G=seq.count('G')
C=seq.count('C')
N=seq.count('N')
if len(seq)-N!=0:
return float(G+C)/(len(seq)-N)
else:
return 0
else:
G=seq.count('G')
C=seq.count('C')
if len(seq)>0:
return float(G+C)/len(seq)
else:
return 0
#####读取GC文件####
def GC_file(GCfile='GC_content.txt'):
#GC_function=[]
GC_function={}
for line in open(GCfile):
if not line.startswith('#'):
newline=line.rstrip().split('\t')
#GC_function.append([float(newline[0]),float(newline[1])])
GC_function[float(newline[0])]=float(newline[1])
else:
continue
return GC_function
#return sorted(GC_function,key=lambda d:d[0])
######计算f(GC)#######
def GC_value(GC,GC_function):
tmp=0.0
tag=0
sort_GC=sorted(GC_function.keys())
if GC in GC_function.keys():
return GC_function[GC]
else:
for line in sort_GC:
if GC>line:
tmp=GC_function[line]
continue
else:
result=tmp
tag=1
break
if tag==0:
result=tmp
return result
######读取ref序列##############
def reference(ref_name):
ref_dic={}
chr_name=''
for line in open(ref_name):
newline=line.rstrip()
if newline.startswith('>'):
if chr_name!='':
ref_dic[chr_name]=tmp_str
chr_name=newline.split('>')[1]
tmp_str=''
else:
tmp_str=tmp_str+newline
ref_dic[chr_name]=tmp_str
return ref_dic
######以概率生成0,1################
def weighted_choice_sub(weights):
rnd = random.random() * sum(weights)
for i, w in enumerate(weights):
rnd -= w
if rnd < 0:
return i
def reverse(str1):
str2=''
for line in str1:
if line=='A':
str2=str2+'T'
elif line=='G':
str2=str2+'C'
elif line=='C':
str2=str2+'G'
elif line=='T':
str2=str2+'A'
else:
str2=str2+line
return str2
#######读取bam文件,确定1,0###########
def read_name_sort_bam(filename,ref_dic,GC_function,outfile_name='tmp',meta='n',keepsam='n'):
if filename.split('.')[-1]=='bam':
filename_out=filename+'.sam'
cmd='samtools view -h '+filename +' > '+filename_out
os.system(cmd)
else:
filename_out=filename
outfile1=open(outfile_name+'_1.fq','w')
outfile2=open(outfile_name+'_2.fq','w')
i=0
tmp=[]
if keepsam=='y' or keepsam=='Y':
outfile=open(outfile_name+'_sam','w')
for line in open(filename_out):
if line.startswith('@'):
outfile.write(line)
else:
i=i+1
if i%2==1:
tmp.append(line.rstrip().split('\t'))
else:
tmp.append(line.rstrip().split('\t'))
pois=[]
#print tmp
if tmp[0][0]==tmp[1][0]:
#if tmp[0][2].startswith('chr') and tmp[1][2].startswith('chr') and int(tmp[0][4])>0 and int(tmp[1][4])>0:
if tmp[0][2].startswith('chr') and tmp[1][2].startswith('chr'):
#chrome=tmp[0][2]+'_hap1'
if meta=='y' or meta=='Y':
if int(tmp[0][3])<int(tmp[1][3]):
pois=[int(tmp[0][3]),int(tmp[1][3]),len(tmp[1][9])]
chrome=tmp[0][2]
else:
a=tmp[0]
tmp[0]=tmp[1]
tmp[1]=a
chrome=tmp[0][2]
pois=[int(tmp[0][3]),int(tmp[1][3]),len(tmp[1][9])]
else:
if int(tmp[0][3])<int(tmp[1][3]):
pois=[int(tmp[0][3]),int(tmp[1][3]),len(tmp[1][9])]
tmp[1][9]=reverse(tmp[1][9][::-1].upper())
tmp[1][10]=tmp[1][10][::-1]
chrome=tmp[0][2]
else:
a=tmp[0]
tmp[0]=tmp[1]
tmp[1]=a
pois=[int(tmp[0][3]),int(tmp[1][3]),len(tmp[1][9])]
tmp[1][9]=reverse(tmp[1][9][::-1].upper())
tmp[1][10]=tmp[1][10][::-1]
chrome=tmp[0][2]
else:
#print ['wrong',tmp]
i=0
tmp=[]
continue
else:
#print ['wrong',tmp]
i=0
tmp=[]
continue
seq=ref_dic[chrome][pois[0]-1:pois[1]+pois[2]]
#print [ref_dic.keys(),seq,pois[0]-1,pois[1]+pois[2]]
GC_content=Compute_GC(seq.upper())
f_GC=GC_value(GC_content,GC_function)
prob=exp(f_GC)/(1+exp(f_GC))
index=weighted_choice_sub([1-prob,prob])
#print([GC_content,f_GC,prob,index,pois])
if index==1:
outfile.write('\t'.join(tmp[0])+'\n')
outfile.write('\t'.join(tmp[1])+'\n')
outfile1.write('@'+tmp[0][0]+'\n')
outfile1.write(tmp[0][9]+'\n')
outfile1.write('+\n')
outfile1.write(tmp[0][10]+'\n')
outfile2.write('@'+tmp[1][0]+'\n')
outfile2.write(tmp[1][9]+'\n')
outfile2.write('+\n')
outfile2.write(tmp[1][10]+'\n')
tmp=[]
i=0
else:
tmp=[]
i=0
outfile.close()
else:
for line in open(filename_out):
if line.startswith('@'):
continue
else:
i=i+1
if i%2==1:
tmp.append(line.rstrip().split('\t'))
else:
tmp.append(line.rstrip().split('\t'))
pois=[]
#print tmp
if tmp[0][0]==tmp[1][0]:
#if tmp[0][2].startswith('chr') and tmp[1][2].startswith('chr') and int(tmp[0][4])>0 and int(tmp[1][4])>0:
if tmp[0][2].startswith('chr') and tmp[1][2].startswith('chr'):
#chrome=tmp[0][2]+'_hap1'
if meta=='y' or meta=='Y':
if int(tmp[0][3])<int(tmp[1][3]):
pois=[int(tmp[0][3]),int(tmp[1][3]),len(tmp[1][9])]
chrome=tmp[0][2]
else:
a=tmp[0]
tmp[0]=tmp[1]
tmp[1]=a
chrome=tmp[0][2]
pois=[int(tmp[0][3]),int(tmp[1][3]),len(tmp[1][9])]
else:
if int(tmp[0][3])<int(tmp[1][3]):
pois=[int(tmp[0][3]),int(tmp[1][3]),len(tmp[1][9])]
tmp[1][9]=reverse(tmp[1][9][::-1].upper())
tmp[1][10]=tmp[1][10][::-1]
chrome=tmp[0][2]
else:
a=tmp[0]
tmp[0]=tmp[1]
tmp[1]=a
pois=[int(tmp[0][3]),int(tmp[1][3]),len(tmp[1][9])]
tmp[1][9]=reverse(tmp[1][9][::-1].upper())
tmp[1][10]=tmp[1][10][::-1]
chrome=tmp[0][2]
else:
#print ['wrong',tmp]
i=0
tmp=[]
continue
else:
#print ['wrong',tmp]
i=0
tmp=[]
continue
seq=ref_dic[chrome][pois[0]-1:pois[1]+pois[2]]
#print [ref_dic.keys(),seq,pois[0]-1,pois[1]+pois[2]]
GC_content=Compute_GC(seq.upper())
f_GC=GC_value(GC_content,GC_function)
prob=exp(f_GC)/(1+exp(f_GC))
index=weighted_choice_sub([1-prob,prob])
#print([GC_content,f_GC,prob,index,pois])
if index==1:
outfile1.write('@'+tmp[0][0]+'\n')
outfile1.write(tmp[0][9]+'\n')
outfile1.write('+\n')
outfile1.write(tmp[0][10]+'\n')
outfile2.write('@'+tmp[1][0]+'\n')
outfile2.write(tmp[1][9]+'\n')
outfile2.write('+\n')
outfile2.write(tmp[1][10]+'\n')
tmp=[]
i=0
else:
tmp=[]
i=0
outfile1.close()
outfile2.close()
def main():
usage = """%prog -i <file>
GC v1.0.1
Author: Yuchao Xia
Description: accord GC content to filter paired-end reads from name sorted simulate bam file.
python GC1.py -i <ref.fa> -g <GC_file> -b <name_sort_bam> -o <out_fastq> -sort_bam<y/n> -m <y/n> -k <y/n>
"""
parser = OptionParser(usage)
parser.add_option("-i", "--inFile", dest="inFile", help="A reference fasta file.",metavar="FILE")
parser.add_option("-g","--GC",dest='input_GC',help='GC_function file,splited by \t,the filst col is GC content',metavar="FILE")
parser.add_option("-b","--bamfile",dest='input_bam',help='input bam file sorted by name',metavar="FILE")
parser.add_option('-s','--sort_bam',dest='sort_bam',help='if convert sam to sorted bam, print y',metavar="y/n")
parser.add_option('-o','--output',dest='output',help='output fastq file',metavar="file")
parser.add_option('-m','--metasim',dest='meta',help='input metasim data',metavar='y/n')
parser.add_option('-k','--keepsam',dest='keepsam',help='generate sam file',metavar='y/n')
(opts, args) = parser.parse_args()
if opts.inFile is None or opts.input_bam is None:
parser.print_help()
else:
if opts.input_GC == None:
print 'done'
generate_GC_file()
ref_dic=reference(opts.inFile)
GC_function=GC_file()
read_name_sort_bam(opts.input_bam,ref_dic,GC_function,opts.output,opts.meta,opts.keepsam)
else:
print 'done1'
ref_dic=reference(opts.inFile)
GC_function=GC_file(opts.input_GC)
read_name_sort_bam(opts.input_bam,ref_dic,GC_function,opts.output,opts.meta,opts.keepsam)
#if opts.sort_bam=='y' or opts.sort_bam=='Y' :
#cmd1='samtools view -S -h -b '+ops.input_bam.split('.')[0]+'.sam -o '+ops.input_bam.split('.')[0]+'_new.bam'
#cmd2='samtools sort ' +ops.input_bam.split('.')[0]+'_new.bam' +ops.input_bam.split('.')[0]+'_sorted'
#os.system(cmd1)
#os.system(cmd2)
if __name__ == "__main__":
start = time.clock()
main()
end = time.clock()
print("The function run time is : %.03f seconds" %(end-start))