https://github.com/grenaud/cinch
Raw File
Tip revision: cf9ed4240b74c866f62b3da2cdb4f0bbceb7f551 authored by grenaud on 16 February 2022, 09:31:13 UTC
license info
Tip revision: cf9ed42
cinch.py
#!/usr/bin/python

from __future__ import absolute_import, division, print_function, unicode_literals
from optparse import OptionParser
from optparse import OptionGroup

import pathlib

#import matplotlib.pyplot as plt

#import seaborn as sns
import sys
import os, random
import subprocess
import gzip
from tqdm import tqdm

import numpy as np
from sklearn.decomposition import NMF, non_negative_factorization
from sklearn.preprocessing import normalize
from sklearn import metrics
from sklearn.metrics import roc_auc_score

from itertools             import combinations 


#!pip install -q tensorflow==2.0.0-beta1
#import pandas as pd
#import tensorflow as tf
import re
#from tensorflow import keras
#from tensorflow.keras import layers

#def normalize_rows(x: numpy.ndarray):
#    return x/numpy.linalg.norm(x, ord=2, axis=1, keepdims=True)


def which(program):
    sys.stderr.write("Detecting program: "+str(program)+"\n");
    cjob = "type "+str(program);

    sp = subprocess.Popen(["/bin/bash", "-i", "-c", cjob],
                                stdout=subprocess.PIPE, 
                                stderr=subprocess.PIPE)
    

    out, err = sp.communicate();
    errcode  = sp.returncode;
    if(errcode != 0): #has finished but wrong code
        sys.stderr.write("Cannot find program: "+str(program)+" please make sure it is installed\n");
        sys.exit(1);
    
        #print("#"+str(str(out).find("aliased"))+"#");
        #out=str(out);
    if(out.find("aliased") != -1): #was aliased
        return out[out.find(" to ")+5:-2];
    else:
        if(out.find(" is ") != -1):
            return out[out.find(" is ")+4:];
        else:
            sys.stderr.write("Cannot seem to find program: "+str(program)+" please make sure it is installed\n");
            sys.exit(1);
            

    



    return out;

def handle_job(cjob):

    jobcreated=subprocess.Popen(cjob,shell=True,
                                executable="/bin/bash",
                                stdout=subprocess.PIPE, 
                                stderr=subprocess.PIPE); 
    jobcreated.wait()
    #print str(cjob);

    out, err = jobcreated.communicate()
    errcode  = jobcreated.returncode;

    if(err != ""): #has finished but has printed to stderr
        print("Job failed "+cjob+" failed");
        sys.exit(1);
    else:
        print("Job finished succesfully");

    return out;



        

    

#sys.exit(1);


    return out;


mininsize=1;
maxinsize=700;
wcomponents=2;

#print(pathofconfig);
parser = OptionParser(
"\n\n"+

"\n"+
" ------------------------------   \n"+
    "       __          __       \n"+
"      /  ` | |\ | /  ` |__| \n"+
"      \__, | | \| \__, |  | \n"+
"\n"+
" ------------------------------   \n"+
"\n"
"Prediction of ctDNA for low coverage data\n"+
"\n"+

"\tcynch [options] [mode] [fof]\n"+
"\n"+
"There are 2 modes:\n"+
"\n"+
"\tcynch train [fof]\n"+
    "\tcynch predict [model] [fof]\n"
"\n"+
    "\tThe fof (file of file) as the following format for \"training\"\n"+
    "\t\t[bamfile1] [ctDNA frac 1]\n"+
    "\t\t[bamfile2] [ctDNA frac 2]\n"+
    "\t\t...\n"+

    "\tFor example:\n"+
    "\t\t/data/path/in1.bam 0.59\n"+
    "\t\t/data/path/in2.bam 0.11\n"+

    "\n"+
"\tThe fof has the following format for \"predict\"\n"+
    "\t\t[bamfile1]\n"+
    "\t\t[bamfile2]\n"+
    "\t\t..."+
"\n"+
"\n"+

"\tThe [model] is the *H.dat file created by the training and has the following shape\n"+
    "\t\tcomp1_1 comp1_2 comp1_3 ... \n"+
    "\t\tcomp2_1 comp2_2 comp2_3 ...\n"+
    "\t\t..."+
"\n"+
"\n"+

"\tThe program will auto-resume if you have sent commands to the cluster or used parallel\n"+
    
"\n"
);


##parser.add_option("-t", "--theta",       dest="theta",        help="Theta, default is 20",                          default=20,      type="float");
#parser.add_option("-r", "--reference",      dest="ref",          help="Reference genome used for alignment",                 default=None,    type="string");
#parser.add_option("--tmpfolder",            dest="tmpfolder",    help="Temporary folder",                                    default="/tmp/",    type="string");
parser.add_option("-o","--outdir",         dest="resultso",     help="Output directory to store the results",                  default=None,    type="string");

#
#parser.add_option("--samtools",             dest="samtools",     help="Use this version of samtools",                        default="samtools",    type="string");
#parser.add_option("--tabix",                dest="tabix",        help="Use this version of tabix",                           default="tabix",    type="string");
parser.add_option("--bed",                   dest="bed",          help="Restrict the analysis to those regions (requires samtools and tabix to be installed)",               default=None,    type="string");
#
#parser.add_option("--map",                  dest="mappability",  help="Path to the mappability file for the reference genome",   default=None,    type="string");
#parser.add_option("--gc",                   dest="gc",           help="Path to the GC content file  for the reference genome",   default=None,    type="string");
#
#parser.add_option("--hpc",                  dest="hpc",          help="Use high-performance computing (for queueing systems ex: SGE)",          action="store_true");
#parser.add_option("--resume",               dest="resume",       help="Resume by providing the temp directory used",                              type="string");
parser.add_option("--unmapped",              dest="unmapped",     help="Also consider the length of unmapped fragments",                         action="store_true");
#parser.add_option("--nice",                 dest="nice",         help="Nice the jobs",                                                          action="store_true");
parser.add_option("-t"  , "--threads",      dest="threads",      help="Number of threads to use if the local machine is used",  default=1,   type="int");
#
#parser.add_option("--mismap",               dest="mismappingrate", help="Mismapping rate , default is "+str(mismappingrate)+"",            default=mismappingrate, type="float");
#
##parser.add_option(""  , "--branchl",      dest="branchlscale", help="Seq-gen branch scale, default is 0.00045",            default=0.00045, type="float");
##parser.add_option(""  , "--chrlen",       dest="lengthchr",    help="Chromosome length, default is 10kb",              default=10000,   type="int");
parser.add_option("--minl",                 dest="minl",         help="Minimum fragment length to use, default is "+str(mininsize),                       default=mininsize,type="int")
parser.add_option("-c",                     dest="numcomp",      help="Number of components, default is 2",                   default=2,        type="int")
parser.add_option(""  , "--chrnum",         dest="numchr",       help="Number of chromosomes",                                default=22,       type="int");
parser.add_option(""  , "--chrprefix",      dest="prechr",       help="Prefix for chromosomes",                               default="chr",    type="string");

wingroup = OptionGroup(parser, "Window Based Options")
wingroup.add_option("--winsize",              dest="winsize",      help="Size of genomic windows (ex: --winsize 1000000)",             type="int");
wingroup.add_option("--fai",                  dest="fai",          help="Fasta index (fai) of the reference used for mapping if --winsize is specified)",             type="string");
wingroup.add_option("--minobs",               dest="minobs",       help="Minimum number of observations for a given window",    default=10000,    type="int");
parser.add_option_group(wingroup);

#parser.add_option("--gc",                  dest="gcfile",       help="File containing the % of GC per window size (see above) default: "+gcContent+"",                  default=gcContent,    type="string");
#
#parser.add_option("-c", "--conf",           dest="configfile",   help="Configuration for various conditions file to use for species/build default: "+str(pathofconfig)+"", default=pathofconfig, type="string");




#print handle_job("which ls");

(options,args) = parser.parse_args()

usewindow=(options.winsize != None);

if( len(args) < 2 ):
    sys.stderr.write("\nneed a least 2 arguments, please use -h to see options.\n");
    sys.exit(1);


chrarray=[];
for chrn in range(1, options.numchr+1):
    chrarray.append( options.prechr+str(chrn));

    
sys.stderr.write("Using chromosomes: "+str( ",".join(chrarray) )+"\n" );
chrnames   = {};
chrranges  = [];

if(usewindow):
    if(options.fai == None):
        sys.stderr.write("The option --fai is required for window based computation.\n");
        sys.exit(1);

        
    referencefai = open(options.fai, "r");

    for linefai in referencefai:
        faia=linefai.split("\t");
        if( not(faia[0] in chrarray ) ):
            continue;

        chrnames[ faia[0] ] = 0;
        
        #chrlengths.append( faia[1] );

        for c in range(1, (int(faia[1])-options.winsize), options.winsize):
            rangedict ={ "chr": faia[0], "start": c,   "end": c+(options.winsize-1) };        
            chrranges.append(rangedict);
    referencefai.close();


#sys.exit(1);

sys.stderr.write("Detecting program: insize");

pathofexec      = os.path.abspath(sys.argv[0]);
pathofexecarray = pathofexec.split('/')[:-1];

pathofinsize =  ("/".join(pathofexecarray))+"/lib/insertsize/src/insize";

if(not os.path.exists(pathofinsize)):
    sys.stderr.write("\nERROR: The executable file "+pathofinsize+" does not exist, please type make in the main directory\n");
    sys.exit(1);

#pathofreadcounter =  ("/".join(pathofexecarray))+"/lib/hmmcopy_utils/bin/readCounter";

#if(not os.path.exists(pathofreadcounter)):
#    sys.stderr.write("\nERROR: The executable file "+pathofreadcounter+" does not exist, please type make in the main directory\n");
#    sys.exit(1);


    
if(options.minl<1 or options.minl>=200 ):
    sys.stderr.write("\nERROR: The minimum length of the fragments:"+str(options.minl)+" cannot be less than 1 or more than 200\n");
    sys.exit(1);

mininsize = options.minl;

if(options.numcomp<=1):
    sys.stderr.write("\nERROR: The number of components  "+str(options.numcomp)+" cannot be less than 1\n");
    sys.exit(1);

if(options.numcomp>=9):
    sys.stderr.write("\nERROR: The number of components  "+str(options.numcomp)+" cannot greater or equal to 9\n");
    sys.exit(1);

wcomponents = options.numcomp;

#rcmd        = re.sub('\s+','',which("R"));
#print(rcmd);

#rscmd        = re.sub('\s+','',which("Rscript"));
#print(rscmd);
#hmmcopycmd =  rcmd+"  CMD BATCH --vanilla --silent <(echo \"is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]);  is.installed('HMMcopy');\") /dev/stdout";
#hmmcopycmd =  rscmd+" -e \"is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]);  is.installed('HMMcopy');\" ";

#print(hmmcopycmd);
#outputrcmd = handle_job(hmmcopycmd);

#if( "[1] FALSE"  in outputrcmd ):
#    sys.stderr.write("\nERROR: cannot find package HMMcopy, please install it (see http://bioconductor.org/packages/release/bioc/html/HMMcopy.html)\n");
#    sys.exit(1);


#if ( not (  "[1] TRUE"  in outputrcmd )):
#    sys.stderr.write("\nERROR: not sure if we can find package HMMcopy, contact developers, this is an unknown case\n");
#    sys.exit(1);

if( options.resultso == None):
    sys.stderr.write("\nPlease specify the outdir using -o outputDirectory/ or --outdir=outputDirectory/\n");
    sys.exit(1);

sys.stderr.write("\n\n\n");
resultso = options.resultso;
if(not resultso.endswith("/")):
    resultso = resultso+"/";

foffile=args[1];    
if(not os.path.exists(foffile)):
    sys.stderr.write("\nERROR: The file of files "+foffile+" does not exist\n");
    sys.exit(1);
pathoffof      = os.path.abspath(foffile);
pathoffofarray = pathoffof.split('/')[:-1];
pathoffofnof   = "/".join(pathoffofarray);

samtoolscmds="";
if(options.bed or usewindow):
    samtoolscmds        = re.sub('\s+','',which("samtools"));
#print(rcmd);

tabixcmds="";
if(options.bed):
    tabixcmds        = re.sub('\s+','',which("tabix"));
#print(rcmd);


# train
def readFOFlabeled(foffile):
    foffilefd = open(foffile, "r");
    bamfiles  = [];
    label     = [];
    
    for linefd in foffilefd:
        linefd = linefd.strip();
        fields = linefd.split();

        if(len(linefd)==0):
            continue;

        if(len(fields)!=2):
            sys.stderr.write("\nThe line ->"+str(linefd)+"<- does not have 2 columns, please check the expected format using -h\n");
            sys.exit(1);
       
        bamfile = os.path.abspath(fields[0]);
        #print(bamfile);
        if(not os.path.exists(bamfile)):            
            bamfile2 = os.path.abspath(pathoffofnof+"/"+fields[0]);
            if(not os.path.exists(bamfile2)):            
                sys.stderr.write("\nERROR: The BAM file "+bamfile+" or "+bamfile2+" does not exist, make sure you have the correct relative or absolute path\n");
                sys.exit(1);
            else:
                if(not os.path.exists(bamfile2+".bai")):            
                    sys.stderr.write("\nERROR: The BAM file "+bamfile2+" is not indexed, please index the bam files and rerun\n");
                    sys.exit(1);
                    
                bamfiles.append( bamfile2 );
        else:
            if(not os.path.exists(bamfile+".bai")):            
                sys.stderr.write("\nERROR: The BAM file "+bamfile+" is not indexed, please index the bam files and rerun\n");
                sys.exit(1);

            bamfiles.append( bamfile   );
        fracbam = fields[1];
        try:
            fracbam = float(fracbam)
        except ValueError:
            sys.stderr.write("\nThe line ->"+str(len(linefd))+"<- does not have 2 columns where the second column is a floating point\n");
            sys.exit(1);

        if(fracbam>1 or fracbam<0):
            sys.stderr.write("\nThe second column should between 0 and 1, found "+str(fracbam)+" on line "+str(linefd)+"\n");
            sys.exit(1);
        label.append( fracbam );
        
    foffilesub=re.sub('/','_',foffile);
    foffilesub=re.sub("\ ",'_',foffilesub);
    foffilesub=re.sub("\.",'_',foffilesub);

    if(len(bamfiles) == 0):
        sys.stderr.write("\nNo BAM files were found in "+str(foffile)+"\n");
        sys.exit(1);

    
    return bamfiles,label,foffilesub;

def readFOFunlabeled(foffile):

    foffilefd = open(foffile, "r");
    bamfiles  = [];
    
    for linefd in foffilefd:
        linefd = linefd.strip();
        fields = linefd.split();

        if(len(linefd)==0):
            continue;

        if(len(fields)!=1):
            sys.stderr.write("\nThe line ->"+str(linefd)+"<- does not have 1 column, please check the expected format using -h\n");
            sys.exit(1);
       
        bamfile = os.path.abspath(fields[0]);
        #print(bamfile);
        if(not os.path.exists(bamfile)):            
            bamfile2 = os.path.abspath(pathoffofnof+"/"+fields[0]);
            if(not os.path.exists(bamfile2)):            
                sys.stderr.write("\nERROR: The BAM file "+bamfile+" or "+bamfile2+" does not exist, make sure you have the correct relative or absolute path\n");
                sys.exit(1);
            else:
                if(not os.path.exists(bamfile2+".bai")):            
                    sys.stderr.write("\nERROR: The BAM file "+bamfile2+" is not indexed, please index the bam files and rerun\n");
                    sys.exit(1);
                    
                bamfiles.append( bamfile2 );
        else:
            if(not os.path.exists(bamfile+".bai")):            
                sys.stderr.write("\nERROR: The BAM file "+bamfile+" is not indexed, please index the bam files and rerun\n");
                sys.exit(1);

            bamfiles.append( bamfile   );
            
        
    foffilesub=re.sub('/','_',foffile);
    foffilesub=re.sub("\ ",'_',foffilesub);
    foffilesub=re.sub("\.",'_',foffilesub);

    if(len(bamfiles) == 0):
        sys.stderr.write("\nNo BAM files were found in "+str(foffile)+"\n");
        sys.exit(1);


    return bamfiles,foffilesub;


def runStage1(resultso,threads,winsize,foffilesub,bamfiles,logfile):

    sys.stderr.write("\n");
    sys.stderr.write("    #####################################    \n");
    sys.stderr.write("    #   stage 1: feature extraction     #    \n");
    sys.stderr.write("    #####################################    \n");
    sys.stderr.write("\n");



    if not os.path.exists( ""+resultso+"/stage1/"):
        os.mkdir( ""+resultso+"/stage1/", 0755 );
    else:
        sys.stderr.write("\nThe directory "+resultso+"/stage1/"+" already exists\n");            


    #isize

    #sys.stderr.write("bamfiles"+str(bamfiles));

    if(winsize == None):#we do not use a window based analysis
        fileHandleLC = open ( ""+resultso+"/listcommands_1.txt", 'w' ) ;

        for bami in range(0,len(bamfiles)):
            cmdtowrite="";

            if(options.bed):
                cmdtowrite=samtoolscmds+" view  -u -L  "+str(options.bed)+" "+bamfiles[bami]+" ";
            else:
                cmdtowrite="cat "+bamfiles[bami]+" ";

            cmdtowrite=cmdtowrite+" | "+pathofinsize+"  -l "+str(mininsize)+" -L "+str(maxinsize)+" ";

            if(not(options.unmapped)): #
                cmdtowrite=cmdtowrite+" -m ";

            #else: (we consider unmapped too)
            #do nothing, isize considers even unmapped fragments by default
            
            cmdtowrite=cmdtowrite+" /dev/stdin |sort --temporary-directory="+resultso+"/stage1/ -n |uniq -c |gzip > "+resultso+"/stage1/"+str(bami)+".isize.gz\n";
            fileHandleLC.write(cmdtowrite);
        fileHandleLC.close();



    else:
        sys.stderr.write("\nWriting temp files\n");            

        filei=0;
        for bami in range(0,len(bamfiles)):
            if( os.path.exists(resultso+"/stage1/"+str(bami)+".isize") ):
                os.remove(resultso+"/stage1/"+str(bami)+".isize") ;
            if( os.path.exists(resultso+"/stage1/"+str(bami)+".isize.gz") ):
                os.remove(resultso+"/stage1/"+str(bami)+".isize.gz") ;


        #for bami in range(0,len(bamfiles)):
        for bami in tqdm(range(0,len(bamfiles))):
            fileHandleLC = open ( ""+resultso+"/listcommands_1_"+str(bami)+".txt", 'w' ) ;


            for cn in chrranges:
                cmdtowrite=samtoolscmds+" view  -u   "+bamfiles[bami]+" "+str(cn["chr"])+":"+str(cn["start"])+"-"+str(cn["end"])+" ";

                if(options.bed):
                    cmdtowrite=cmdtowrite+" | "+samtoolscmds+" view  -u -L <("+str(tabixcmds)+" "+str(options.bed)+" "+str(cn["chr"])+":"+str(cn["start"])+"-"+str(cn["end"])+" ) /dev/stdin ";
                else:
                    cmdtowrite=cmdtowrite+" ";

                cmdtowrite=cmdtowrite+" | "+pathofinsize+" -v ";
                cmdtowrite=cmdtowrite+"  -l "+str(mininsize)+" -L "+str(maxinsize)+" ";

                if(not(options.unmapped)): #
                    cmdtowrite=cmdtowrite+" -m ";
                    #else: (we consider unmapped too)
                    #do nothing, isize considers even unmapped fragments by default
                    
                cmdtowrite=cmdtowrite+" /dev/stdin  >> "+resultso+"/stage1/"+str(bami)+".isize\n";
                fileHandleLC.write(cmdtowrite);
                filei+=1;
            cmdtowrite="gzip "+resultso+"/stage1/"+str(bami)+".isize\n";
            fileHandleLC.close();

            fileHandleSH = open ( ""+resultso+"/listcommands_1_"+str(bami)+".bash", 'w' ) ;
            cmdtowrite="#!/bin/bash\n";
            cmdtowrite=cmdtowrite+"\n";
            cmdtowrite=cmdtowrite+"bash "+resultso+"/listcommands_1_"+str(bami)+".txt\n";
            fileHandleSH.write(cmdtowrite);
            fileHandleSH.close();

            

    #readcount, todo reenable?
    #if False:
    #    for bami in range(0,len(bamfiles)):
    #        fileHandleLC.write(pathofreadcounter+" -w "+str(winsize)+" -c "+str( ",".join(chrarray) )+" "+bamfiles[bami]+"  > "+resultso+"/stage1/"+str(bami)+".seg\n");
    #fileHandleLC.close();

    #logfile = (resultso+foffilesub+"_train.log");
    logfilefp = open(logfile, "w");
    logfilefp.write("#-o:"+resultso+"\n");
    logfilefp.write("#fof:"+foffile+"\n");
    #logfilefp.write("#opt:"+foffile+"\n");    
    logfilefp.write("#stage1:\n"); #TODO add isize


    logfilefp.close();

    print("Please run the commands manually either using:");
    if(winsize == None):#we do not use a window based analysis
        print("  cat "+resultso+"/listcommands_1.txt | parallel -j "+str(threads));
        print("on the use a batch/queueing system to launch:");
        print("  cat "+resultso+"/listcommands_1.txt | sbatch ...");
        print("");
    else:
        #todo, write a file will all of the bash listcommands and launch it.
        #print("  cat "+resultso+"/listcommands_1_*.txt | parallel -j "+str(threads));
        print("  cat "+resultso+"/listcommands_1_*.bash | parallel -j "+str(threads));
        print("on the use a batch/queueing system to launch, ex:");
        print("  for i in `ls "+resultso+"/listcommands_1_*.bash `; do echo $i;  sbatch --time=11:00:00 --mem=2G  --cpus-per-task=2 $i;  done ");
        print("");
    print("Once commands are done, rerun with:\n");
    print( " ".join(sys.argv) );
    #print("  cinch.py   -o "+resultso+"  train "+foffile);
    print("");

    cmdtolaunch="cat "+resultso+"/listcommands_1.txt | parallel  -j "+str(threads);
    
def parseIsize(resultso,foffilesub,bamfiles,mininsize,maxinsize):
    isizedat = (resultso+foffilesub+"_isize.dat");
    isizedatfp = open(isizedat, "w");

    isizedatfp.write("#fileidx");
    for i in range(mininsize,(maxinsize+1)):
        isizedatfp.write( "\t"+str( i  ) );
    isizedatfp.write( "\n" );
    sys.stderr.write("Reading bam files\n");

    #dataAllisize = np.array([])
    dataAllisize = [];

    for bami in tqdm(range(0,len(bamfiles))):
        fileisize = resultso+"/stage1/"+str(bami)+".isize.gz";
        #datatoadd = np.array([]);
        datatoadd = [];

        if(not os.path.exists(fileisize)):
            sys.stderr.write("\nThe file "+fileisize+" does not exist, please run all commands.\n");
            sys.exit(1);
        else:
            isizeCount=[];
            for i in range(0,maxinsize+1):
                isizeCount.append(0);
            #print(fileisize);
            fileisizefd = gzip.open(fileisize, "r");
            sumisize=0;
            for lineisfd in fileisizefd:
                #add filters and store
                fields=lineisfd.split( );

                try:
                    count   = int(fields[0]);
                    isize   = int(fields[1]);
                except ValueError:
                    continue;

                if( (isize<mininsize) or (isize>maxinsize) ):
                    continue;
                sumisize+=count;
                isizeCount[ isize ] = count;

            isizedatfp.write(str(bami));
            for i in range(mininsize,(maxinsize+1)):                        
                isizedatfp.write( "\t"+str( isizeCount[ i ]) );

            for i in range(mininsize,(maxinsize+1)):
                isizeCount[i]=float(isizeCount[i])/float(sumisize); #normalize
                datatoadd.append( isizeCount[ i ]  );    

            isizedatfp.write( "\n" );

            dataAllisize.append( datatoadd );

    isizedatfp.close();
    sys.stderr.write("\nWriten isize data to "+str(resultso+foffilesub+"_isize.dat")+".\n");
    handle_job("gzip "+resultso+foffilesub+"_isize.dat");    
    return dataAllisize;



def parseIsizeWindow(resultso,foffilesub,bamfiles,numbwins,mininsize,maxinsize,minobs):

    dataAllisize = [];    
    originalIdx  = [];    

    if(not os.path.exists((resultso+foffilesub+"_isize.dat.gz"))):
        isizedat = (resultso+foffilesub+"_isize.dat");
        isizedatfp = open(isizedat, "w");

        isizedatfp.write("#fileidx\twindowID");
        for i in range(mininsize,(maxinsize+1)):
            isizedatfp.write( "\t"+str( i  ) );
        isizedatfp.write( "\n" );
        sys.stderr.write("Reading bam files\n");

        #dataAllisize = np.array([])


        for bami in tqdm(range(0,len(bamfiles))):
            fileisize = resultso+"/stage1/"+str(bami)+".isize.gz";
            #datatoadd = np.array([]);
            #datatoadd = [];

            if(not os.path.exists(fileisize)):
                sys.stderr.write("\nThe file "+str(fileisize)+" does not exist, please run all commands.\n");
                sys.exit(1);
            else:
                isizeCount=[];
                #for i in range(0,maxinsize+1):
                #    isizeCount.append(0);
                #print(fileisize);

                fileisizefd = gzip.open(fileisize, "r");
                numLines=0;
                for lineisfd in fileisizefd:
                    numLines+=1;
                    isizeCountW=[];
                    #add filters and store
                    fields=lineisfd.split( );
                    sumVal=0;
                    if(len(fields) != (maxinsize-mininsize-1) ):
                        sys.stderr.write("A line in file: "+str(fileisize)+" contains "+str(len(fields))+" fields but we expect: "+str(maxinsize-mininsize-1)+", verify the arguments to the script\n");
                        sys.exit(1);

                    if(len(fields) == 1 ):
                        isizeCountW = [-1] * (maxinsize-mininsize-1);
                    else:
                        for f in fields:
                            try:
                                count   = int(f);
                                sumVal+=count;
                                isizeCountW.append(count);
                            except ValueError:
                                sys.stderr.write("cannot convert "+str(f)+" to int");
                                sys.exit(1);
                    #    continue;
                    #if( (isize<mininsize) or (isize>maxinsize) ):
                    #    continue;

                    isizeCount.append( isizeCountW );
                    #isizeCount[ isize ] = count;
                    #for i in range( 0 , numbwins):
                    isizedatfp.write(str(bami)+"\t"+str(len(isizeCount))+"\t");
                    for i in range(0,len(isizeCountW)):
                        isizedatfp.write( "\t"+str( isizeCountW[ i ]) );
                        #datatoadd.append( isizeCountW[ i ]  );

                    isizedatfp.write( "\n" );

                fileisizefd.close();

                if(numLines != numbwins):
                    sys.stderr.write("Found  "+str(numLines)+" fields but no expected "+str(numbwins));
                    sys.exit(1);

                    #dataAllisize.append( isizeCountW );
                if( (isizeCountW[0] != -1) and (sumVal>=minobs) ):
                    for k in range(0,len(isizeCountW)):
                        isizeCountW[k]=float(isizeCountW[k])/float(sumVal); #normalize
                    dataAllisize.append( isizeCountW );
                    originalIdx.append( lineCount );


        isizedatfp.close();
        sys.stderr.write("\nWriten isize data to "+str(resultso+foffilesub+"_isize.dat")+".\n");
        handle_job("gzip "+resultso+foffilesub+"_isize.dat");    
    else:
        #
        sys.stderr.write("\nFile "+resultso+foffilesub+"_isize.dat.gz exists, reading it\n");
        #sys.exit(1);
        fileisizefd = gzip.open(resultso+foffilesub+"_isize.dat.gz", "r");

        lineCount=0;
        #keep count of order in 
        for lineisfd in fileisizefd:
            
            isizeCountW=[];
            #print("#"+lineisfd+"!");
            fields=lineisfd.split( );            
            if(fields[0] == "#fileidx"):#skip header                
                continue;
            sumVal=0;
            for f in fields[2:]:
                try:
                    count   = int(f);         
                    sumVal+=count;
                    isizeCountW.append(count);
                except ValueError:
                    sys.stderr.write("cannot convert "+str(f)+" to int");
                    sys.exit(1);

            if( (isizeCountW[0] != -1) and (sumVal>=minobs) ):
                #print(sumVal);
                #print(len(isizeCountW))
                for k in range(0,len(isizeCountW)):
                    isizeCountW[k]=float(isizeCountW[k])/float(sumVal); #normalize

                dataAllisize.append( isizeCountW );
                originalIdx.append( lineCount );

            if((lineCount%10000)==0 and lineCount!=0):
                print("read "+str(lineCount)+" lines");
            lineCount+=1;
        fileisizefd.close();
    return dataAllisize,originalIdx;


if(args[0] == "train"):

    #####################################
    #   stage 1: feature extraction     #
    #####################################

    bamfiles,label,foffilesub  =  readFOFlabeled(foffile);
    

    logfile = (resultso+foffilesub+"_train.log");
    #print(logfile);


    #stage 2: training+writing model
    
    
    if(not os.path.exists(logfile)):#step 1
        stage=1;
        #read file of file

        if not os.path.exists( ""+resultso):
            os.mkdir( ""+resultso, 0755 );
        else:
            sys.stderr.write("\nThe directory "+resultso+" already exists\n");            

        #insert size
        runStage1(options.resultso,options.threads,options.winsize,foffilesub,bamfiles,logfile);

        
    else:
        stage=2;
        sys.stderr.write("\n");
        sys.stderr.write("    #####################################    \n");
        sys.stderr.write("    #   stage 2: parsing features       #    \n");
        sys.stderr.write("    #####################################    \n");
        sys.stderr.write("\n");
        sys.stderr.write("Opening logfile:"+logfile+"\n");

        logfilefp = open(logfile, "r");

        #for linelog in logfilefp:
        linelog = logfilefp.readline();
        linelog = linelog.strip();
        
        if(linelog.startswith("#-o:")):
            options.resultso = linelog[len("#-o:"):len(linelog)];                
        else:
            sys.stderr.write("\nThe line "+linelog+" in "+logfile+" should start with #-o: something went wrong, please delete the log file.\n");
            sys.exit(1);

        linelog = logfilefp.readline();
        linelog = linelog.strip();

        if(linelog.startswith("#fof:")):
            if(foffile != linelog[len("#fof:"):len(linelog)]):
                sys.stderr.write("\nThe line "+linelog+" in "+logfile+" should have #fof with the same as the ones provided as arguments: something went wrong, please delete the log file.\n#"+linelog[len("#fof:"):len(linelog)]+"#");
                sys.exit(1);
                
        else:
            sys.stderr.write("\nThe line "+linelog+" in "+logfile+" should have #fof: something went wrong, please delete the log file.\n");
            sys.exit(1);

        linelog = logfilefp.readline();
        linelog = linelog.strip();

        if(linelog.startswith("#stage1:")):
            stage=2;
        else:
            sys.stderr.write("\nThe line "+linelog+" in "+logfile+" should have with #stage1: something went wrong, please delete the log file.\n");
            sys.exit(1);

        logfilefp.close();

        if(stage==1):
            sys.stderr.write("\nCannot identify stage in "+logfile+" should have with #stage1: something went wrong, please delete the log file.\n");
            sys.exit(1);
                    
        if( stage == 2):
            #########################
            #      PARSE ISIZE      #
            #########################
            if( usewindow ):
                dataAllisize , originalIdx = parseIsizeWindow(options.resultso,foffilesub,bamfiles,len(chrranges),mininsize,maxinsize,options.minobs);
            else:
                dataAllisize               = parseIsize(      options.resultso,foffilesub,bamfiles,               mininsize,maxinsize);
            #########################
            #      run HMMcopy      #
            #########################
            #normalize
            #print("dataAllisize");
            #print(len(dataAllisize));
            #print(dataAllisize);
            
            #row_sums          = dataAllisize.sum(axis=1);          
            #dataAllisizeNorm  = dataAllisize / row_sums[:, numpy.newaxis]
           # for d in range(0,len(dataAllisize)):
                #print(str(d)+"\t"+str(type(dataAllisize[d])));
            #    print(str(d)+"\t"+str(dataAllisize[d]));
            #    for i in range(0,len(dataAllisize[d])):
            #        print(str(i)+"\t"+str(type(dataAllisize[d][i])));
                #print(dataAllisize[d]);
                #print(len(dataAllisize[d]));
                #print(type(dataAllisize[d]));
            
            #print(type(dataAllisize[0][0]));
            #print(type(dataAllisize));
            #print(len(dataAllisize));
            
            #sys.exit(1);
            
            #
            #dataAllisizeNorm = np.array(normalize(dataAllisize,norm='l1'));
            dataAllisizeNorm = np.array( dataAllisize );
            #dataAllisizeNP    = np.array(dataAllisize);
            #print(dataAllisizeNP);
            #print(dataAllisizeNP.shape);
            #print(dataAllisizeNorm);
            #sys.exit(1);
            #print(dataAllisize[0]);
            #print(dataAllisizeNorm[0]);
            
            #nmf = NMF(n_components=2, init='random', random_state=0)
            
            #W = nmf.fit_transform(dataAllisizeNorm);
            #H = nmf.components_;
            sys.stderr.write("\nRunning NMF...");
            W, H, n_iter = non_negative_factorization(dataAllisizeNorm, n_components=wcomponents,init='random', random_state=0,solver="mu",beta_loss=1,max_iter=1000)
            sys.stderr.write("done\n");

            #print("W");
            #print(W);
            
            hfile = (resultso+foffilesub+"_"+str(wcomponents)+"_H.dat");
            hfilefp = open(hfile, "w");
            
            rows = H.shape[0]
            cols = H.shape[1]
            for i in range(0, rows):
                hfilefp.write( str(H[i,0]) );
                for j in range(1, cols):
                    hfilefp.write( "\t"+str(H[i,j]) );
                hfilefp.write( "\n" );
            hfilefp.close();


            sys.stderr.write("Writen H matrix  "+hfile+"\n");

            WNorm = np.array(normalize(W,norm='l1'));
            #print(WNorm);


            wfile = (resultso+foffilesub+"_"+str(wcomponents)+"_W.dat");
            wfilefp = open(wfile, "w");
            
            rows = W.shape[0]
            cols = W.shape[1]
            for i in range(0, rows):
                wfilefp.write( str(W[i,0]) );
                for j in range(1, cols):
                    wfilefp.write( "\t"+str(W[i,j]) );
                wfilefp.write( "\n" );
            wfilefp.close();

            sys.stderr.write("Writen raw W matrix  "+wfile+"\n");

            wfile = (resultso+foffilesub+"_"+str(wcomponents)+"_Wnorm.dat");
            wfilefp = open(wfile, "w");
            
            rows = WNorm.shape[0]
            cols = WNorm.shape[1]
            for i in range(0, rows):
                wfilefp.write( str(WNorm[i,0]) );
                for j in range(1, cols):
                    wfilefp.write( "\t"+str(WNorm[i,j]) );
                wfilefp.write( "\n" );
            wfilefp.close();

            sys.stderr.write("Writen normalized W matrix  "+wfile+"\n");

            #write W on a per window basis
            if( usewindow ):
                wfile = (resultso+foffilesub+"_"+str(wcomponents)+"_window.dat");
                wfilefp = open(wfile, "w");
                idxwin=0;
                idxwinMat=0;
                #originalIdx
                for bami in range(0,len(bamfiles)):
                    for cn in chrranges:
                        strtowrite=str(idxwin)+"\t"+bamfiles[bami]+"\t"+str(cn["chr"])+":"+str(cn["start"])+"-"+str(cn["end"])+"\t";
                        #originalIdx is the original index 
                        #print(str(idxwin)+"\t"+str(idxwinMat)+"\t"+str(idxwinMat<len(originalIdx)))
                        
                        if(idxwinMat<len(originalIdx) and idxwin==originalIdx[idxwinMat] ):#match
                            strtowrite+=str(WNorm[idxwinMat,0]);
                            for j in range(1, cols):
                                strtowrite+="\t"+str(WNorm[idxwinMat,j]) ;
                            idxwinMat+=1;
                        else:
                            strtowrite+="NA";
                            for j in range(1, cols):
                                strtowrite+="\tNA";

                        wfilefp.write( strtowrite+"\n" );
                        idxwin+=1;

                sys.stderr.write("Writen normalized W per window  "+wfile+"\n");


            #check correlations
            
            # Get all combination of [1, 2, 3] 
            if( not usewindow ):
                arraycomb=[]
                for j in range(1,wcomponents):
                    comb = combinations(range(0,wcomponents),j) 
                    # Print the obtained combutations 
                    for i in list(comb): 
                        arraycomb.append(i);

                #print(arraycomb);
                correlationArray=[];
                correlationTOP= float("-inf");
                correlationTOPi= -1;

                #for i in arraycomb: #for each possible combination
                auc=False;
                if(len(np.unique(label))==2):
                    auc=True;
                    sys.stderr.write("Labels are binary, using AUC\n");

                for i in range(0,len(arraycomb)): #for each possible combination
                    #print("i="+str(i))
                    #add components
                    #arraySumW=np.zeros(rows) 

                    arraySumW=[];
                    #sum the components 
                    for j in range(0,rows):
                        #print("j="+str(j)+" "+str(WNorm[j]));
                        #print("j="+str(j)+" "+str(WNorm[j,i]));
                        arraySumW.append( sum( WNorm[ j , arraycomb[i] ] ) );

                    #print("i="+str(i))
                    #print(bamfiles);
                    #print(arraySumW)
                    #print(label);
                    #correlation

                    if(auc):
                        corr=roc_auc_score(label,arraySumW);                    
                        corrFactor = corr;
                    else:                 
                        corr=np.corrcoef(arraySumW, label);
                        corrFactor=corr[1][0];

                        #print(corrFactor);
                    correlationArray.append(corrFactor);
                    if(corrFactor>correlationTOP):
                        correlationTOP=corrFactor;
                        correlationTOPi= i;

                sys.stderr.write("Correlation factors:\n");
                sys.stderr.write("components\tcorrelation\n");
                wfilec = (resultso+foffilesub+"_"+str(wcomponents)+"_Wcomp.dat");

                for i in range(0,len(arraycomb)): #for each possible combination
                    if(correlationTOPi == i):
                        wfilecfp = open(wfilec, "w");
                        wfilecfp.write( str(arraycomb[i]) );
                        wfilecfp.close();

                        sys.stderr.write(str(arraycomb[i])+"\t"+str(correlationArray[i])+"\tbest\n");
                    else:
                        sys.stderr.write(str(arraycomb[i])+"\t"+str(correlationArray[i])+"\n");

                sys.stderr.write("Writen best components  "+wfilec+"\n");

            #########################
            #      run HMMcopy      #
            #########################

            

 

        


    
    sys.exit(0);

#predict

if(args[0] == "predict"):

    #print(args[1]);
    #stage 1: feature extraction
    #stage 2: reading model +prediction

    hmatrix=args[1];
    foffile=args[2];


    #print(foffile);
    bamfiles,foffilesub  =  readFOFunlabeled(foffile);
    

    logfile = (resultso+foffilesub+"_predict.log");
    #sys.stderr.write("Trying to detect "+str(logfile)+"\n");



    #stage 2: training+writing model
    
    
    if(not os.path.exists(logfile)):#step 1
        stage=1;
        #read file of file
        
        #insert size
        runStage1(options.resultso,options.threads,options.winsize,foffilesub,bamfiles,logfile);
    else:

        hfile = hmatrix;
        if(not hfile.endswith("_H.dat")):
            sys.stderr.write("\nThe hfile "+str(hfile)+" should end with _H.dat\n");
            sys.exit(1);

        #detecting # of components

        fieldshf=hfile.split("_");
        try:
            wcomponents=int(fieldshf[len(fieldshf)-2] )
        except ValueError:
            sys.stderr.write("\nTried to infer the # of components used during training using "+str(hfile)+", failed. Make sure the file name ends with #_H.dat where # is the number of components\n");
            sys.exit(1);s

        sys.stderr.write("Model used "+str(wcomponents)+" components\n");
        sys.stderr.write("Opening model file "+str(hfile)+"\n");
        hfilefp = open(hfile, "r");
        Hmat=[];
        for hline in hfilefp:
            fields=hline.split( );
            Hmattoadd=[];

            for i in range(0,len(fields)):
                #print(fields[i]);
                Hmattoadd.append( float(fields[i]) );
            
            Hmat.append( Hmattoadd );

        Hnp = np.array( Hmat );

        #reading correlation
        correlationTOP= -1;

        
        wcompfile   = hmatrix[0:-5]+"Wcomp.dat";
        sys.stderr.write("Opening component file "+str(wcompfile)+"\n");
        
        wcompfilefp = open(wcompfile, "r");
        for wcompline in wcompfilefp:
            correlationTOP=wcompline;
        correlationTOP=correlationTOP.replace("(","") 
        correlationTOP=correlationTOP.replace(")","") 
        correlationTOP=correlationTOP.replace(" ","") 



        correlationTOP=np.fromstring(correlationTOP,dtype=int, sep=',');
        sys.stderr.write("Using best W components: "+str(correlationTOP)+"\n");
        

        #sys.exit(1);

        if( usewindow ):
            dataAllisize , originalIdx = parseIsizeWindow(options.resultso,foffilesub,bamfiles,len(chrranges),mininsize,maxinsize,options.minobs);
        else:
            dataAllisize = parseIsize(options.resultso,foffilesub,bamfiles,mininsize,maxinsize);
        #print(dataAllisize);
        #we used to normalize, now normalize in reading
        #dataAllisizeNorm = np.array(normalize(dataAllisize,norm='l1'));
        dataAllisizeNorm = np.array(dataAllisize);

        #print(Hnp.shape);
        #print(dataAllisizeNorm.shape);


        W, H, n_iter = non_negative_factorization(dataAllisizeNorm, 
                                                  n_components=wcomponents,
                                                  #W=None, 
                                                  H=Hnp,
                                                  update_H=False,
                                                  init='custom', 
                                                  random_state=0,
                                                  solver="mu",
                                                  beta_loss=1,
                                                  max_iter=200);
        #writing out W    
        wfile = (options.resultso+foffilesub+"_"+str(wcomponents)+"_W.out");
        sys.stderr.write("\nWriten W matrix to "+wfile+"\n");

        wfilefp = open(wfile, "w");

        rows = W.shape[0]
        cols = W.shape[1]
        for i in range(0, rows):
            wfilefp.write( str(W[i,0]) );
            for j in range(1, cols):
                wfilefp.write( "\t"+str(W[i,j]) );
            wfilefp.write( "\n" );
        wfilefp.close();

        Wnorm = np.array(normalize(W,norm='l1'));

        #writing out W norm
        wfile = (options.resultso+foffilesub+"_"+str(wcomponents)+"_Wnorm.out");
        sys.stderr.write("\nWriten normalized W matrix to "+wfile+"\n");

        wfilefp = open(wfile, "w");

        rows = Wnorm.shape[0]
        cols = Wnorm.shape[1]
        for i in range(0, rows):
            wfilefp.write( str(Wnorm[i,0]) );
            for j in range(1, cols):
                wfilefp.write( "\t"+str(Wnorm[i,j]) );
            wfilefp.write( "\n" );
        wfilefp.close();

        #writing out subsampled W norm
        wfile = (options.resultso+foffilesub+"_"+str(wcomponents)+"_Wnormcomp.out");
        sys.stderr.write("\nWriten correlated score matrix to "+wfile+"\n");

        wfilefp = open(wfile, "w");

        rows = Wnorm.shape[0]
        cols = Wnorm.shape[1]
        for i in range(0, rows):
            wfilefp.write( str(sum(Wnorm[i,correlationTOP])) );
            wfilefp.write( "\n" );
        wfilefp.close();



    #end else not stage 1
    
        
        #print(W);
    #print(hmatrix);
    #print(foffile);

    sys.exit(0);



sys.stderr.write("\nUnknown mode.\n");
sys.exit(1);


back to top