https://github.com/fjruizruano/ngs-protocols
Tip revision: 39a091d1fa569a7fc717ac73c4b3de07f0a1204d authored by fjruizruano on 03 August 2023, 11:48:27 UTC
adding gfa2fas.py and extract_gfa.py
adding gfa2fas.py and extract_gfa.py
Tip revision: 39a091d
fastq-combine-pe.py
#!/usr/bin/python
import sys
#####Authors: Team Ladner-Barshis-Tepolt
######Usage########
######This script takes a set of separate quality trimmed paired end .fastq files
######and pairs the reads that still have a mate and combined the solitary reads
######into a single file of orphans for input into denovo assembly software (e.g. CLC Genomics).
print "Command line usage: fastqcombinepairedend.py \042the delimiter\042 \042the seqheader text\042 infiledirection1.fastq infiledirection2.fastq"
print "Command line usage: fastqcombinepairedend.py \042@HWI\042 \042 \042 infiledirection1.fastq infiledirection2.fastq"
seqheader=sys.argv[1]
paireddelim=sys.argv[2]
in1=sys.argv[3]
in2=sys.argv[4]
###This line had to be added 15Oct2012 for script to work properly
setoffiles=[in1, in2]
#filecount=0
#pairedlist=[]
#for file in infilelist:
#if filecount==0:
#firstname=file
#filecount+=1
#else:
#secondname=file
#filecount=0
#pairedlist.append((firstname,secondname))
#Opens an infile specified by the user.
IN1 = open(sys.argv[3], 'r')
IN2 = open(sys.argv[4], 'r')
#Opens an output text file as specified by user
SIN = open('%s_trimmed_clipped_singles.fastq'%(sys.argv[3][:-23]), 'w') #Unpaired reads from both directions for CLC
PAIR1 = open('%s_trimmed_clipped_stillpaired.fastq'%(sys.argv[3][:-6]), 'w') #Read one of paired reads for CLC
PAIR2 = open('%s_trimmed_clipped_stillpaired.fastq'%(sys.argv[4][:-6]), 'w') #Read two of paired reads for CLC
names1=[]
names2=[]
seqs1={}
seqs2={}
quals1={}
quals2={}
linenum1=0
#Starts a for loop to read through the infile line by line
for line in IN1:
line = line.rstrip()
linenum1+=1
#sets count to 1 when it finds the header before a sequence
if line[0:4]==seqheader:
line=line.split(paireddelim)
name=line[0][1:]
names1.append(name)
count = 0
else:
if count==1:
seqs1[name]=line
if count==3:
quals1[name]=line
count=count+1
# print linenum1
print 'finished reading in: %s' %(setoffiles[0])
for line in IN2:
# print 2 #commented out 15oct2012 just produces a long list of 2's in stout
line = line.rstrip()
#sets count to 1 when it finds the header before a sequence
if line[0:4]==seqheader:
line=line.split(paireddelim)
name=line[0][1:]
names2.append(name)
count = 0
else:
if count==1:
seqs2[name]=line
if count==3:
quals2[name]=line
count=count+1
# print linenum1
# print 'here'
print 'finished reading in: %s'%(setoffiles[1])
paired = list(set(names1) & set(names2))
#print paired
# print len(paired)
print 'number of paired reads: %d'%(len(paired))
single = list(set(names1) ^ set(names2))
# print len(single)
single1 = list(set(single) & set(names1))
# print len(single1)
print 'number of single reads left from %s: %d'%(setoffiles[0],len(single1))
single2 = list(set(single) & set(names2))
# print len(single2)
print 'number of single reads left from %s: %d'%(setoffiles[1],len(single2))
# for label in paired:
# PAIR.write('@' + str(label) + '1\n' + str(seqs1[label]) + '\n+' + str(label) + '1\n' + str(quals1[label]) + '\n@' + str(label) + '2\n' + str(seqs2[label]) + '\n+' + str(label) + '2\n' + str(quals2[label]) + '\n')
for label in single1:
SIN.write('@' + str(label) + '1\n' + str(seqs1[label]) + '\n+' + str(label) + '1\n' + str(quals1[label]) + '\n')
for label in single2:
SIN.write('@' + str(label) + '2\n' + str(seqs2[label]) + '\n+' + str(label) + '2\n' + str(quals2[label]) + '\n')
for label in paired:
PAIR1.write('@' + str(label) + '1\n' + str(seqs1[label]) + '\n+' + str(label) + '1\n' + str(quals1[label]) + '\n')
PAIR2.write('@' + str(label) + '2\n' + str(seqs2[label]) + '\n+' + str(label) + '2\n' + str(quals2[label]) + '\n')
IN1.close()
IN2.close()
SIN.close()
PAIR1.close()
PAIR2.close()