https://github.com/Gregor-Mendel-Institute/RKP2021-CMT3
Tip revision: 89d7e2ea78af1969bb161640baed09296ed2485f authored by Papareddy on 23 April 2021, 11:18:44 UTC
Update README.md
Update README.md
Tip revision: 89d7e2e
bampe_rm_orphan.py
#!/usr/bin/env python
###############################################################################
###############################################################################
## Created on February 1st 2017 to remove singletons from paired-end BAM file
###############################################################################
###############################################################################
import os
import pysam
import errno
import argparse
############################################
############################################
## PARSE ARGUMENTS
############################################
############################################
Description = 'Remove singleton reads from paired-end BAM file i.e if read1 is present in BAM file without read 2 and vice versa.'
Epilog = """Example usage: bampe_rm_orphan.py <BAM_INPUT_FILE> <BAM_OUTPUT_FILE>"""
argParser = argparse.ArgumentParser(description=Description, epilog=Epilog)
## REQUIRED PARAMETERS
argParser.add_argument('BAM_INPUT_FILE', help="Input BAM file sorted by name.")
argParser.add_argument('BAM_OUTPUT_FILE', help="Output BAM file sorted by name.")
## OPTIONAL PARAMETERS
argParser.add_argument('-fr', '--only_fr_pairs', dest="ONLY_FR_PAIRS", help="Only keeps pairs that are in FR orientation on same chromosome.",action='store_true')
args = argParser.parse_args()
############################################
############################################
## HELPER FUNCTIONS
############################################
############################################
def makedir(path):
if not len(path) == 0:
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise
############################################
############################################
## MAIN FUNCTION
############################################
############################################
def bampe_rm_orphan(BAMIn,BAMOut,onlyFRPairs=False):
## SETUP DIRECTORY/FILE STRUCTURE
OutDir = os.path.dirname(BAMOut)
makedir(OutDir)
## COUNT VARIABLES
totalReads = 0; totalOutputPairs = 0; totalSingletons = 0; totalImproperPairs = 0
## ITERATE THROUGH BAM FILE
EOF = 0
SAMFin = pysam.AlignmentFile(BAMIn, "rb")
SAMFout = pysam.AlignmentFile(BAMOut, "wb", header=SAMFin.header)
iter = SAMFin.fetch(until_eof=True)
currRead = next(iter)
for read in iter:
totalReads += 1
if currRead.qname == read.qname:
pair1 = currRead; pair2 = read
## FILTER FOR READS ON SAME CHROMOSOME IN FR ORIENTATION
if onlyFRPairs:
if pair1.tid == pair2.tid:
## READ1 FORWARD AND READ2 REVERSE STRAND
if not pair1.is_reverse and pair2.is_reverse:
if pair1.reference_start <= pair2.reference_start:
totalOutputPairs += 1
SAMFout.write(pair1)
SAMFout.write(pair2)
else:
totalImproperPairs += 1
## READ1 REVERSE AND READ2 FORWARD STRAND
elif pair1.is_reverse and not pair2.is_reverse:
if pair2.reference_start <= pair1.reference_start:
totalOutputPairs += 1
SAMFout.write(pair1)
SAMFout.write(pair2)
else:
totalImproperPairs += 1
else:
totalImproperPairs += 1
else:
totalImproperPairs += 1
else:
totalOutputPairs += 1
SAMFout.write(pair1)
SAMFout.write(pair2)
## RESET COUNTER
try:
totalReads += 1
currRead = next(iter)
except:
StopIteration
EOF = 1
## READS WHERE ONLY ONE OF A PAIR IS IN FILE
else:
totalSingletons += 1
pair1 = currRead
currRead = read
if not EOF:
totalReads += 1
totalSingletons += 1
pair1 = currRead
## CLOSE ALL FILE HANDLES
SAMFin.close()
SAMFout.close()
LogFile = os.path.join(OutDir,'%s_bampe_rm_orphan.log' % (os.path.basename(BAMOut[:-4])))
SamLogFile = open(LogFile,'w')
SamLogFile.write('\n##############################\n')
SamLogFile.write('FILES/DIRECTORIES')
SamLogFile.write('\n##############################\n\n')
SamLogFile.write('Input File: ' + BAMIn + '\n')
SamLogFile.write('Output File: ' + BAMOut + '\n')
SamLogFile.write('\n##############################\n')
SamLogFile.write('OVERALL COUNTS')
SamLogFile.write('\n##############################\n\n')
SamLogFile.write('Total Input Reads = ' + str(totalReads) + '\n')
SamLogFile.write('Total Output Pairs = ' + str(totalOutputPairs) + '\n')
SamLogFile.write('Total Singletons Excluded = ' + str(totalSingletons) + '\n')
SamLogFile.write('Total Improper Pairs Excluded = ' + str(totalImproperPairs) + '\n')
SamLogFile.write('\n##############################\n')
SamLogFile.close()
############################################
############################################
## RUN FUNCTION
############################################
############################################
bampe_rm_orphan(BAMIn=args.BAM_INPUT_FILE,BAMOut=args.BAM_OUTPUT_FILE,onlyFRPairs=args.ONLY_FR_PAIRS)
############################################
############################################
############################################
############################################