https://github.com/adamewing/discord-retro
Raw File
Tip revision: 1afc09d60ffc138c0890bd02d707fe5d408a63d8 authored by Adam Ewing on 11 March 2013, 23:16:04 UTC
mergelist groupname bug
Tip revision: 1afc09d
discord-retro.py
#!/usr/bin/env python
#
# Copyright (C) 2012 by Adam Ewing (adam.ewing@gmail.com)
#
# Released under the MIT license, see LICENSE.txt
#

import sys, re, time, pp, argparse, subprocess, pysam, ConfigParser
import lib.configtest as configtest
from os import path as path

# other imports are done via pp

class PairedSample:
    def __init__(self, name, config, outdir, assembly, tcga=False, grouped=False, pinonly=False, justcall=False):
        self.name = name
        self.tcga = tcga      # if true, changes validation, naming
        self.grouped = grouped # requires field 4 in sample file be filled for all samples
        self.pinonly   = pinonly
        self.justcall  = justcall
        self.config    = config  # ConfigParser
        self.outdir    = outdir
        self.cancerBam = None
        self.normalBam = None
        self.cancerIdx = None
        self.normalIdx = None 
        self.assembly  = assembly 

        self.groupBams  = []
        self.groupIdxs  = []
        self.groupNames = []

    def __str__(self):
        return "\t".join((str(self.cancerBam),str(self.normalBam),
                          str(self.cancerIdx),str(self.normalIdx)))

    def validate(self):
        if self.tcga:
            if (self.cancerBam and path.exists(self.cancerBam) and
                self.normalBam and path.exists(self.normalBam) and
                self.cancerIdx and path.exists(self.cancerIdx) and
                self.normalIdx and path.exists(self.normalIdx)):
                return True
            else:
                return False

        elif self.grouped:
            for bam in self.groupBams:
                if not path.exists(bam):
                    return False
            for idx in self.groupIdxs:
                if not path.exists(idx):
                    return False
            return True

        else:
            if (self.normalBam and path.exists(self.normalBam) and
                self.normalIdx and path.exists(self.normalIdx)):
                return True
            else:
                return False

    def addFile(self,sampleType,extension,filePath):
        if sampleType == 'NORMAL' and extension == 'bam':
            self.normalBam = filePath
            self.normalIdx = filePath + ".bai"
        if sampleType == 'CANCER' and extension == 'bam':
            self.cancerBam = filePath
            self.cancerIdx = filePath + ".bai"

    def addGroupFile(self,name,filePath):
        assert name not in self.groupNames
        self.groupNames.append(name)
        self.groupBams.append(filePath)
        self.groupIdxs.append(filePath + ".bai")

    def runDiscordant(self):
        """Runs discordant.py, peakparser.py, summarize,py, pinpoint.py"""
        tabixLoc = self.config.get('discord', 'annotDir') + "/" + self.assembly + "/" + self.assembly + ".rmsk.bed.gz"
        lib.discordant.checkfile(tabixLoc)
        usechr = False
        allowOverlap = False # for summary, include elements overlapping similar repeatmasker annotations (e.g. Alu in Alu)
        if self.config.get('discord','usechr') == "True":
            usechr = True
        if self.config.get('discord','allowOverlap') == "True":
            allowOverlap = True
        args = argparse.Namespace(bamFileName    = self.normalBam,  # discordant.py
                                  outBaseName    = self.name,       # discordant.py, peakparser.py, summarize.py, pinpoint.py
                                  tabixFileName  = tabixLoc,        # discordant.py
                                  refGenome      = self.assembly,   # discordant.py
                                  overwrite      = True,            # discordant.py
                                  printout       = False,           # summarize.py
                                  inDir1         = None,            # mergepairs.py
                                  inDir2         = None,            # mergepairs.py
                                  sampleList     = None,            # mergelist.py
                                  outDirName     = self.outdir,     # everything
                                  usechr         = usechr,          # pinpoint.py
                                  allowOverlap   = allowOverlap,
                                  eltFileName    = self.config.get('discord','eltFileName'),
                                  eltLenFileName = self.config.get('discord','eltLenFileName'),
                                  readLength     = self.config.get('discord','readLength'),
                                  insertSize     = self.config.get('discord','insertSize'),
                                  minMapQ        = self.config.get('discord','minMapQ'),
                                  configFileName = self.config.get('discord','configFileName'),
                                  eltFile        = self.config.get('discord','eltFile'),
                                  minPeakSize    = self.config.get('discord','minPeakSize'),
                                  maxReadLen     = self.config.get('discord','maxReadLen'),
                                  zeroChar       = self.config.get('discord','zeroChar'),
                                  minClipQual    = self.config.get('discord','minClipQual'),
                                  refFastaDir    = self.config.get('discord','refFastaDir'),
                                  annotDir       = self.config.get('discord','annotDir'),
                                  refGenomeFile  = self.config.get('discord',self.assembly))
        if self.tcga:
            normalName = self.name + "-NORMAL"
            cancerName = self.name + "-CANCER"
            mergeName  = self.name + "-MERGE"

            args.inDir1 = normalName
            args.inDir2 = cancerName
            args.outBaseName = normalName
            # normal .bam
            lib.discordant.main(args)

            # cancer .bam
            args.bamFileName = self.cancerBam
            args.outBaseName = cancerName
            lib.discordant.main(args)

            # merged results
            args.outBaseName = mergeName
            lib.mergepairs.main(args)

            if not self.pinonly:
                lib.peakparser.main(args)
                lib.summarize.main(args)
            lib.pinpoint.main(args)

        elif self.grouped:
            sampleList = []
            for i in range(len(self.groupNames)):
                sampleString = "\t".join((self.groupBams[i], self.groupNames[i], self.assembly, self.name))
                sampleList.append(sampleString)
                args.outBaseName = self.groupNames[i]
                args.bamFileName = self.groupBams[i]
                lib.discordant.main(args)

            args.outBaseName = self.name
            args.sampleList = sampleList
            lib.mergelist.main(args)
            if not self.pinonly:
                lib.peakparser.main(args)
                lib.summarize.main(args)
            lib.multipoint.main(args)

        else:
            if not self.pinonly:
                if not self.justcall:
                    lib.discordant.main(args)
                lib.peakparser.main(args)
                lib.summarize.main(args)
            lib.pinpoint.main(args)

def main(args):
    configtest.check(args.configFile)
    
    pairedSamples = {}

    for line in open(args.sampleFile, 'r'):
        if not re.search("^#", line):
            filePath = label = assembly = groupName = None

            ncols = len(line.strip().split())
            if ncols == 3: 
                (filePath,label,assembly) = line.strip().split()
            elif ncols == 4:
                (filePath,label,assembly,groupName) = line.strip().split()

            assert filePath and label and assembly

            base = path.basename(filePath)

            sampleType = 'NORMAL'
            sampleName = label
            fileparts = base.split('.')
            extension = fileparts[-1]

            if args.tcga: # see https://wiki.nci.nih.gov/display/TCGA/Working+with+TCGA+Data
                baseparts = base.split('-')
                participant = baseparts[2]
                samplenum = baseparts[3]
                if re.search('01', samplenum):
                    sampleType = 'CANCER'
                sampleName = '-'.join((label,participant))

            config = ConfigParser.ConfigParser()
            config.read(args.configFile)

            if args.grouped:
                if groupName not in pairedSamples:
                    pairedSamples[groupName] = PairedSample(groupName,config, 
                                                             args.outDirName,
                                                             assembly,
                                                             tcga=args.tcga, 
                                                             grouped=args.grouped,
                                                             pinonly=args.pinpointonly,
                                                             justcall=args.justcall)
                pairedSamples[groupName].addGroupFile(sampleName,filePath)

            else:
                if sampleName not in pairedSamples:
                    pairedSamples[sampleName] = PairedSample(sampleName,config, 
                                                             args.outDirName,
                                                             assembly,
                                                             tcga=args.tcga, 
                                                             pinonly=args.pinpointonly,
                                                             justcall=args.justcall)

                pairedSamples[sampleName].addFile(sampleType,extension,filePath)
            

    # parallel python stuff
    ncpus = int(args.numCPUs) 
    jobServer = pp.Server(ncpus,ppservers=())

    sampleJobs = {}
    for sampleName in pairedSamples.keys():
        if pairedSamples[sampleName].validate():
            print sampleName + " validated"
            job = jobServer.submit(pairedSamples[sampleName].runDiscordant,(),(),
                  ("lib.discordant","lib.mergepairs","lib.mergelist","lib.peakparser","lib.multipoint","lib.pinpoint","lib.summarize","argparse"))
            sampleJobs[sampleName] = job
            print "started job: " + sampleName
            jobServer.print_stats()
        else:
            print sampleName + " did not validate, make sure the .bam exists and is indexed (has a .bam and a .bam.bai file)"

    for sampleName in sampleJobs:
        print "=== stdout of job: " + sampleName + " ==="
        print sampleJobs[sampleName]()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Pipeline that aims to identify TE insertions from .bam alignments")
    parser.add_argument('-s', '--sampleFile', dest="sampleFile", required=True,
                   help="List of filenames and sample names. Should include both .bam files (.bam) and their indexes (.bai)")
    parser.add_argument('-c', '--config', dest='configFile', required=True,
                   help="config file, see human_sample.cfg for an example")
    parser.add_argument('-p', '--processors', dest='numCPUs', default='1', 
                   help="Number of CPUs to use (1 job per CPU)")
    parser.add_argument('-o', '--outdir', dest='outDirName', required=True, 
                   help="output base directory")
    parser.add_argument('--pinpointonly', action='store_true', 
                   help="only run pinpoint.py")
    parser.add_argument('--tcga', action="store_true", 
                   help="samples are cancer/normal pairs speficied by sample names as per TCGA spec (https://wiki.nci.nih.gov/display/TCGA/Working+with+TCGA+Data)")
    parser.add_argument('--grouped', action="store_true", help="samples are grouped by name in column 4 of sample file")
    parser.add_argument('--justcall', action="store_true", help="don't pick discordant reads, just call on pre-existing sample.readpairs.txt")
    args = parser.parse_args()
    main(args)
back to top