https://github.com/machinegun/hi-c-scaffold
Raw File
Tip revision: 1b76bf63efb973583647a1eb95863d33ee6e09ad authored by Sergey Koren on 19 May 2021, 17:21:41 UTC
Merge pull request #130 from EdHarry/patch-1
Tip revision: 1b76bf6
run_pipeline.py
#!/usr/bin/env python2
import os
import stat
import argparse
import sys
import subprocess
from subprocess import Popen, PIPE

def check(path):
    with open(path,'r') as f:
        for line in f:
            attrs = line.split()
            if float(attrs[4]) >= 1:
                return False
            else:
                return True

def fileExists(path):
   return os.path.isfile(path) and os.path.getsize(path) > 0


ng50 = []
def NG50(lengths,sz = 0):
    genome_size = sz
    if genome_size == 0:
        genome_size = sum(lengths.values())
    contig_lengths = sorted(lengths.values(),reverse=True)
    lensum = 0
    for i in xrange(len(contig_lengths)):
        lensum += contig_lengths[i]
        if lensum >= genome_size/2:
            return contig_lengths[i]

def main():
    bin=os.path.dirname(os.path.abspath(__file__))
    #print bin
    parser = argparse.ArgumentParser(description="SALSA Iterative Pipeline")
    parser.add_argument('-a','--assembly',help='Path to initial assembly',required=True)
    parser.add_argument('-l','--length',help='Length of contigs at start',required=True)
    parser.add_argument('-b','--bed',help='Bed file of alignments sorted by read names',required=True)
    parser.add_argument('-o','--output',help='Output directory to put results',required=False,default='SALSA_output')
    parser.add_argument('-c','--cutoff',help='Minimum contig length to scaffold, default=1000',required=False,default=1000)
    parser.add_argument('-g','--gfa',help='GFA file for assembly',required=False,default='abc')
    #parser.add_argument('-u','--unitigs',help='The tiling of unitigs to contigs in bed format',required=False,default='abc')
    #parser.add_argument('-t','--tenx',help='10x links tab separated file, sorted by last columnls',required=False,default='abc')
    parser.add_argument('-e','--enzyme',help='Restriction Enzyme used for experiment',required=False,default='AAGCTT')
    parser.add_argument('-i','--iter',help='Number of iterations to run, default = 3',required=False,default=3)
    parser.add_argument("-x",'--dup',help='File containing duplicated contig information',required=False,default='abc')
    parser.add_argument("-s",'--exp',help="Expected Genome size of the assembled genome",required=False,default=0)
    parser.add_argument("-m","--clean",help="Set this option to \"yes\" if you want to find misassemblies in input assembly",required=False,default="yes")
    #parser.add_argument("-d","--dist",help="Maximum distance between pairs to consider for misassembly detection",required=False,default=2000000)
    parser.add_argument("-f","--filter",help="Filter bed file for contigs present in the assembly",required=False,default="no")
    parser.add_argument("-p","--prnt",help="Set this option to \"yes\" if you want to output the scaffolds sequence and agp file for each iteration", required=False,default="no")

    args = parser.parse_args()


    #iteration counter
    iter_num = 1
    genome_size = int(args.exp)
    if not os.path.exists(args.output):
        os.mkdir(args.output)


    '''
    Check if misassembly detection needs to be done
    '''
    log = open(args.output+'/commands.log','w',1)


    if not os.path.exists(args.output+'/scaffold_length_iteration_1'):
        cmd = 'cp '+args.length+' '+args.output+'/scaffold_length_iteration_1'

        try:
            p = subprocess.check_output(cmd, shell=True)
            os.chmod(args.output + "/scaffold_length_iteration_1", stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH)
        except subprocess.CalledProcessError as err:
            print >> sys.stderr, str(err.output)
            sys.exit(1)


    if not fileExists(args.output+'/alignment_iteration_1.bed'):
        if os.path.isfile(args.output+'/alignment_iteration_1.bed'):
            os.system('rm '+args.output+'/alignment_iteration_1.bed')

        #os.system('cp '+args.bed+' '+args.output+'/alignment_iteration_1.bed')
        if args.filter == "yes":
            os.system("cut -f 1 "+args.length+" | grep -v '>'  > "+args.output+"/contig_names.txt")
            os.system("grep -f "+args.output+"/contig_names.txt -w  "+ args.bed+ " > "+args.output+"/alignment_iteration_1.bed")
        else:
            os.symlink(os.path.abspath(args.bed),args.output+'/alignment_iteration_1.bed')
        #cmd  = 'ln -s '+os.path.abspath(args.bed)+' '+args.output+'/alignment_iteration_1.bed'
        #try:
          #  p = subprocess.check_output(cmd, shell=True)
        #except subprocess.CalledProcessError as err:
         #   print >> sys.stderr, str(err.output)
          #  sys.exit(1)

    os.system('ln -s ' + os.path.abspath(args.assembly) + ' '+args.output+'/assembly.cleaned.fasta')

    if args.clean == 'yes':
       cmd = bin+'/break_contigs_start -a ' + args.output+'/alignment_iteration_1.bed -l ' + args.output+'/scaffold_length_iteration_1 -s 100 > ' + args.output+'/input_breaks'
       log.write(cmd)
       try:
           p = subprocess.check_output(cmd,shell=True)
       except subprocess.CalledProcessError as err:
           print >> sys.stderr,str(err.output)

       cmd = 'python2 ' +bin+'/correct.py  ' + args.assembly + ' '+args.output+'/input_breaks '+args.output+'/alignment_iteration_1.bed '+args.output
       log.write(cmd)
       try:
           p = subprocess.check_output(cmd,shell=True)
       except subprocess.CalledProcessError as err:
           print >> sys.stderr, str(err.output)

       os.system("mv " + args.output+"//alignment_iteration_1.tmp.bed " + args.output+"//alignment_iteration_1.bed")
       os.system("mv " + args.output+"//asm.cleaned.fasta " + args.output+"//assembly.cleaned.fasta")






    #First get RE sites
    if not fileExists(args.output+'/re_counts_iteration_'+str(iter_num)):
        try:
            cmd = 'python2 '+bin+'/RE_sites.py -a '+args.output + '/assembly.cleaned.fasta -e '+ args.enzyme + ' > '+ args.output+'/re_counts_iteration_'+str(iter_num)
            print cmd
            log.write(cmd+'\n')
            p = subprocess.check_output(cmd,shell=True)

        except subprocess.CalledProcessError as err:
            # if os.path.isfile(args.output+'/re_counts_iteration_'+str(iter_num)):
            #   os.system(args.output+'/RE_sites_iteration_'+str(iter_num))
            print >> sys.stderr, str(err.output)
            sys.exit(1)

    #Now compute normal links with old new_links code
    print >> sys.stderr, "Starting Iteration "+ str(iter_num)
    if not fileExists(args.output+'/contig_links_iteration_'+str(iter_num)):
        try:
            cmd = 'python2 '+bin+'/make_links.py -b '+ args.output+'/alignment_iteration_1.bed' + ' -d '+ args.output +' -i '+str(iter_num) + ' -x ' + args.dup
            print cmd
            log.write(cmd+'\n')
            p = subprocess.check_output(cmd,shell=True)

        except subprocess.CalledProcessError as err:
            print >> sys.stderr, str(err.output)
            if os.path.isfile(args.output+'/contig_links_iteration_'+str(iter_num)):
                os.system('rm '+args.output+'/contig_links_iteration_'+str(iter_num))
            sys.exit(1)

    #now use Serge's code to calculate
    if not fileExists(args.output+'/contig_links_scaled_iteration_'+str(iter_num)):
        try:
            cmd =  'python2 '+bin+'/fast_scaled_scores.py -d '+args.output+' -i '+str(iter_num)
            log.write(cmd+'\n')
            print cmd
            p = subprocess.check_output(cmd,shell=True)
        except subprocess.CalledProcessError as err:
            if os.path.isfile(args.output+'/contig_links_scaled_iteration_'+str(iter_num)):
                os.system('rm '+args.output+'/contig_links_scaled_iteration_'+str(iter_num))
            print >> sys.stderr, str(err.output)
            sys.exit(1)

    #Sort the links by column 5
    if not fileExists(args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)):
        try:
            cmd = 'sort -k 5 -gr '+args.output+'/contig_links_scaled_iteration_'+str(iter_num)+ ' > '+ args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)
            log.write(cmd+'\n')
            print cmd
            p = subprocess.check_output(cmd,shell=True)
        except subprocess.CalledProcessError as err:
            if os.path.isfile(args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)):
                os.system('rm '+args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num))
            print >> sys.stderr, str(err.output)
            sys.exit(1)

    if args.gfa != 'abc' and not os.path.isfile(args.output+'/tmp.links'):
        try:
            cmd = bin+'/correct_links -g ' + args.gfa + ' -l ' + args.output+'/contig_links_scaled_sorted_iteration_1 > ' + args.output+'/tmp.links'
            log.write(cmd+'\n')
            p = subprocess.check_output(cmd,shell=True)
        except  subprocess.CalledProcessError as err:
            print >> sys.stderr, str(err.output)
            sys.exit(1)

        os.system('mv ' + args.output+'/tmp.links '+args.output+'/contig_links_scaled_sorted_iteration_1')


    if not os.path.isfile(args.output+'/scaffolds_iteration_1.p'):
        try:
            cmd = 'python2 '+bin+'/layout_unitigs.py -x '+args.gfa + ' -l '+args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num) +' -c ' +str(args.cutoff)+' -i 1 -d '+args.output
            log.write(cmd+'\n')
            print cmd
            p = subprocess.check_output(cmd,shell=True)

        except subprocess.CalledProcessError as err:
            if os.path.isfile(args.output+'scaffolds_iteration_1.p'):
                os.system('rm '+args.output+'scaffolds_iteration_1.p')
            print >> sys.stderr, str(err.output)
            sys.exit(1)
    if not fileExists(args.output+'/misasm_iteration_'+str(iter_num+1)+'.report'):
        try:
            cmd = bin+'/break_contigs -a ' + args.output+'/alignment_iteration_'+str(iter_num+1)+'.bed -b ' + args.output+'/breakpoints_iteration_'+str(iter_num+1)+'.txt -l '+ args.output+'/scaffold_length_iteration_'+str(iter_num+1) + ' -i '+str(iter_num+1)+' -s 100   > ' + args.output+'/misasm_iteration_'+str(iter_num+1)+'.report'
            p = subprocess.check_output(cmd,shell=True)
            print cmd
            log.write(cmd+'\n')
        except subprocess.CalledProcessError as err:
            print  >> sys.stderr, str(err.output)
            sys.exit(1)

    if not fileExists(args.output+'/misasm_'+str(iter_num+1)+'.DONE'):
        try:
            cmd = 'python2 '+bin+'/refactor_breaks.py -d ' + args.output + ' -i ' + str(iter_num+1)
            p = subprocess.check_output(cmd,shell=True)
            print cmd
            log.write(cmd+'\n')
            #os.system('mv scaffold_length_iteration_'+str(iter_num+1)+'_tmp scaffold_length_iteration'+str(iter_num+1))
            #os.system('mv re_counts_iteration_'+str(iter_num+1)+'_tmp re_counts_iteration_'+str(iter_num+1))
            #os.system('mv alignment_iteration_'+str(iter_num+1)+'_tmp.bed alignment_iteration_'+str(iter_num+1)+'.bed')
        except subprocess.CalledProcessError as err:
            print  >> sys.stderr, str(err.output)
            sys.exit(1)

    if args.prnt == 'yes':
        cmd = 'python2 ' + bin+'/get_seq.py -a '+ args.output +'/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.fasta -g ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.agp -p '+ args.output+'/scaffolds_iteration_'+str(iter_num)+'.p'
        log.write(cmd+'\n')
        try:
            p = subprocess.check_output(cmd,shell=True)
        except subprocess.CalledProcessError as err:
            print >> sys.stderr, str(err.output)


    iter_num += 1
    scaffold_length = {}
    with open(args.output+'/scaffold_length_iteration_'+str(iter_num),'r') as f:
        for line in f:
            attrs = line.split()
            scaffold_length[attrs[0]] = int(attrs[1])
        ng50.append(NG50(scaffold_length,genome_size))
    if iter_num - 1 == int(args.iter):
        cmd ='python2 '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(args.iter)+'.p'
        log.write(cmd+'\n')
        os.system(cmd)
        sys.exit(0)
    #now do iterative
    while True:
        print >> sys.stderr, "Starting Iteration "+ str(iter_num)
        if not fileExists(args.output+'/contig_links_iteration_'+str(iter_num)):
            try:
                cmd = 'python2 '+bin+'/make_links.py -b '+ args.output+'/alignment_iteration_'+str(iter_num)+'.bed' + ' -d '+ args.output +' -i '+str(iter_num)
                print cmd
                p = subprocess.check_output(cmd,shell=True)
                log.write(cmd+'\n')
                #os.system("rm "+args.output+'/alignment_iteration_'+str(iter_num)+'.bed')

            except subprocess.CalledProcessError as err:
                print >> sys.stderr, str(err.output)
                if os.path.isfile(args.output+'/contig_links_iteration_'+str(iter_num)):
                    os.system('rm '+args.output+'/contig_links_iteration_'+str(iter_num))
                sys.exit(1)

        print >> sys.stderr, "Starting Iteration "+ str(iter_num)
        if not os.path.isfile(args.output+'/contig_links_scaled_iteration_'+str(iter_num)):
            try:
                cmd =  'python2 '+bin+'/fast_scaled_scores.py -d '+args.output+' -i '+str(iter_num)
                log.write(cmd+'\n')
                p = subprocess.check_output(cmd,shell=True)
            except subprocess.CalledProcessError as err:
                if os.path.isfile(args.output+'/contig_links_scaled_iteration_'+str(iter_num)):
                    os.system('rm '+args.output+'/contig_links_scaled_iteration_'+str(iter_num))
                print >> sys.stderr, str(err.output)
                sys.exit(1)

        if not fileExists(args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)):
            try:
                cmd = 'sort -k 5 -gr '+args.output+'/contig_links_scaled_iteration_'+str(iter_num)+ ' > '+ args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)
                log.write(cmd+'\n')
                p = subprocess.check_output(cmd,shell=True)
            except subprocess.CalledProcessError as err:
                if os.path.isfile(args.output+'/new_links_scaled_sorted_iteration_'+str(iter_num)):
                    os.system('rm '+args.output+'/new_links_scaled_sorted_iteration_'+str(iter_num))
                print >> sys.stderr, str(err.output)
                sys.exit(1)
        #NOW check if any useful link here
        if check(args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)):
           print >> sys.stderr, "WARNING: Not enough Hi-C reads for scaffolding"

        if not fileExists(args.output+'/scaffolds_iteration_'+str(iter_num)+'.p'):
            try:
                cmd = 'python2 '+bin+'/layout_unitigs.py -x abc -l '+args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num) +' -c ' +str(args.cutoff)+' -i '+str(iter_num)+' -d '+args.output
                print cmd
                log.write(cmd+'\n')
                p = subprocess.check_output(cmd,shell=True)

            except subprocess.CalledProcessError as err:
                if os.path.isfile(args.output+'scaffolds_iteration_'+str(iter_num)+'.p'):
                    os.system('rm '+args.output+'scaffolds_iteration_'+str(iter_num)+'.p')
                print >> sys.stderr, str(err.output)
                sys.exit(1)


        if not os.path.isfile(args.output+'/misasm_iteration_'+str(iter_num+1)+'.report'):
            try:
                cmd = bin+'/break_contigs -a ' + args.output+'/alignment_iteration_'+str(iter_num+1)+'.bed -b ' + args.output+'/breakpoints_iteration_'+str(iter_num+1)+'.txt -l '+ args.output+'/scaffold_length_iteration_'+str(iter_num+1) + ' -i '+str(iter_num+1)+' -s 100  > ' + args.output+'/misasm_iteration_'+str(iter_num+1)+'.report'
                print cmd
                log.write(cmd+'\n')
                p = subprocess.check_output(cmd,shell=True)
            except subprocess.CalledProcessError as err:
                print  >> sys.stderr, str(err.output)
                sys.exit(1)

        if not os.path.isfile(args.output+'/misasm_'+str(iter_num+1)+'.DONE'):
            try:
                cmd = 'python2 '+bin+'/refactor_breaks.py -d ' + args.output + ' -i ' + str(iter_num+1) + ' > '+args.output+'/misasm_'+str(iter_num+1)+'.log'
                print cmd
                log.write(cmd+'\n')
                p = subprocess.check_output(cmd,shell=True)
                #os.system('mv scaffold_length_iteration_'+str(iter_num+1)+'_tmp scaffold_length_iteration'+str(iter_num+1))
                #os.system('mv re_counts_iteration_'+str(iter_num+1)+'_tmp re_counts_iteration_'+str(iter_num+1))
                #os.system('mv alignment_iteration_'+str(iter_num+1)+'_tmp.bed alignment_iteration_'+str(iter_num+1)+'.bed')
            except subprocess.CalledProcessError as err:
                print  >> sys.stderr, str(err.output)
                sys.exit(1)


        if args.prnt == 'yes':
            cmd = 'python2 ' + bin+'/get_seq.py -a '+ args.output +'/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.fasta -g ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.agp -p '+ args.output+'/scaffolds_iteration_'+str(iter_num)+'.p'
            log.write(cmd+'\n')
            try:
                p = subprocess.check_output(cmd,shell=True)
            except subprocess.CalledProcessError as err:
                print >> sys.stderr, str(err.output)

        scaffold_length = {}
        with open(args.output+'/scaffold_length_iteration_'+str(iter_num+1),'r') as f:
            for line in f:
                attrs = line.split()
                scaffold_length[attrs[0]] = int(attrs[1])
            ng50.append(NG50(scaffold_length,genome_size))
            curr_sz = len(ng50)
            if ng50[curr_sz - 1] == ng50[curr_sz - 2]:
                cmd ='python2 '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(iter_num-1)+'.p'
                log.write(cmd+'\n')
                os.system(cmd)
                sys.exit(0)


        if iter_num - 1 == int(args.iter):
            cmd ='python2 '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(args.iter)+'.p'
            log.write(cmd+'\n')
            os.system(cmd)
            sys.exit(0)

        iter_num += 1

    log.close()

if __name__ == '__main__':
    main()

back to top