https://github.com/nguyenngochuy91/Ancestral-Blocks-Reconstruction
Tip revision: 2534747382a8d6039c9d5456c322723671867030 authored by nguyenngochuy91 on 20 December 2019, 15:34:24 UTC
update REAME
update REAME
Tip revision: 2534747
roague.py
#!/usr/bin/env python
''' Author : Huy Nguyen
Program : ROAGUE pipeline, which include the finding of orthoblocks in set of given taxa
and reconstruct the ancestral gene blocks.
This is basically a program that calls several other program
Start : 09/08/2016
End : 09/1/2016
'''
import argparse
import os
## Traverses the genome information directory
def traverseAll(path):
res=[]
for root,dirs,files in os.walk(path):
for f in files:
res.append(root+'/'+f)
return res
### parsing the argument from user input
def parser_code():
parser = argparse.ArgumentParser()
parser.add_argument("--genomes_directory","-g", help="The directory that store all the genomes file (E_Coli/genomes)")
parser.add_argument("--gene_blocks","-b", help="The gene_block_names_and_genes.txt file, this file stores the operon name and its set of genes")
parser.add_argument("--reference","-r", help="The ncbi accession number for the reference genome (NC_000913 for E_Coli and NC_000964 for B_Sub)")
parser.add_argument("--filter","-f", help="The filter file for creating the tree (E_Coli/phylo_order.txt for E_Coli or B_Sub/phylo_order.txt for B-Sub)")
parser.add_argument("--method","-m", help="The method to reconstruc ancestral gene block, we support either global or local")
parser.add_argument("--output","-o", help="Output directory to store the result",default = "result")
parser.add_argument("--approx","-a", help="Using approx method (Y,N)",default = "F")
return parser.parse_args()
if __name__ == '__main__':
args = parser_code()
genomes_directory = args.genomes_directory
reference = args.reference
filter_file = args.filter
method = args.method
gene_block_names_and_genes = args.gene_blocks
outdir = args.output
approx = args.approx
# check if we are going to output results in the current directory
try:
dirs = genomes_directory.split('/')
except AttributeError:
print ("Genomes directory argument can't not be None, please refer to the manual of the program (typing ./roague -h)")
outdir+='/'
try:
os.mkdir(outdir+'/')
except:
print ("output directory has already been created")
if len(dirs)>=3: # means that we have to go to subdirectory
parent_dir = outdir+dirs[0]+"/"
else:
parent_dir = outdir
##########################################################################
# finding gene blocks
### format a database for faster blasting. output in db
db = parent_dir+ 'db'
cmd1 ='./format_db.py -i {} -o {}'.format(genomes_directory,db)
os.system(cmd1)
print ('cmd1:',cmd1)
### Given the gene_block_names_and_genes.txt, create a gene_block_query.fa using the reference gene bank file. output in file gene_block_query.fa
gene_block_names_and_genes = dirs[0]+"/"+'gene_block_names_and_genes.txt'
gene_block_query = parent_dir +'gene_block_query.fa'
cmd2 ='./make_operon_query.py -i {} -b {} -r {} -o {}'.format(genomes_directory,gene_block_names_and_genes,reference,gene_block_query)
os.system(cmd2)
print ('cmd2:',cmd2)
### blasting using db vs the gene_block_query.fa above. output in blast_result
blast_result = parent_dir+'blast_result/'
cmd3 ='./blast_script.py -u {} -d {} -o {}'.format(gene_block_query,db,blast_result)
os.system(cmd3)
print ('cmd3:',cmd3)
### parsing the blast result directory into files that group by operon names, output in blast_parse
blast_parse = parent_dir+'blast_parse/'
cmd4 ='./blast_parse.py -b {} -i {} -o {}'.format(gene_block_names_and_genes,blast_result,blast_parse)
os.system(cmd4)
print ('cmd4:',cmd4)
#
# ### filtering the gene blocks so that we have the most optimal gene blocks given the blast parse directory, ouput to optimized_gene_block
optimized_gene_block = parent_dir+'optimized_gene_block/'
cmd5 ='./filter_operon_blast_results.py -i {} -o {}'.format(blast_parse,optimized_gene_block)
os.system(cmd5)
print ('cmd5:',cmd5)
#
# ### create newick tree file. output in tree directory
tree_dir = parent_dir+'tree/'
cmd6 ='./create_newick_tree.py -G {} -f {} -r {} -o {}'.format(genomes_directory,filter_file,reference,tree_dir)
os.system(cmd6)
print ('cmd6:',cmd6)
#
# ### from the filter_operon_blast_results, create a result directory. output in result directory
accession = tree_dir + 'accession_to_common.csv'
result = parent_dir + 'result/'
cmd7 = './get_result.py -g {} -a {} -o {} -i {}'.format(gene_block_names_and_genes,accession,result,optimized_gene_block)
os.system(cmd7)
print ('cmd7:',cmd7)
#
# ### generate group.txt file to color taxa based on class
group = parent_dir+'group.txt'
cmd8 ='./group.py -i {} -o {} -a {}'.format(genomes_directory,group,filter_file)
os.system(cmd8)
print ('cmd8:',cmd8)
###########################################################################
## reconstruction of ancestral gene blocks
### convert the file in directory result and store it in new_result
new_result = parent_dir+'new_result/'
cmd9 = './convert.py -i {} -o {}'.format(result,new_result)
os.system(cmd9)
print ('cmd9:',cmd9)
if approx.upper() == "F":
### reconstructing the gene block using global method
reconstruction = parent_dir+'reconstruction_'+method
tree = tree_dir+'out_tree.nwk'
cmd10 ='./reconstruction.py -i {} -t {} -o {} -m {}'.format(new_result,tree,reconstruction,method)
os.system(cmd10)
print ('cmd10:',cmd10)
### create a visualization of the reconstruction and store it in directory visualization
visualization = parent_dir+'/visualization/'
try:
os.mkdir(visualization)
except:
print ("visualization directory is already created" )
res = traverseAll(reconstruction)
# go through all the reconstruction ancestral file and create a pdf visualization of them, using the group text to color group
for file in res:
operon = file.split('/')[-1]
if 'mapping' in operon:
continue
outfile = visualization+operon
cmd11 = './show_tree.py -i {} -g {} -o {}'.format(file,group,outfile)
os.system(cmd11)
else:
new_result = new_result[:-1]+"_approx/"
### reconstructing the gene block using global method
reconstruction = parent_dir+'reconstruction_'+method+"_approx/"
tree = tree_dir+'out_tree.nwk'
cmd10 ='./reconstruction.py -i {} -t {} -o {} -m {}'.format(new_result,tree,reconstruction,method)
os.system(cmd10)
print ('cmd10:',cmd10)
### create a visualization of the reconstruction and store it in directory visualization
visualization = parent_dir+'/visualization_approx/'
try:
os.mkdir(visualization)
except:
print ("visualization directory is already created" )
res = traverseAll(reconstruction)
# go through all the reconstruction ancestral file and create a pdf visualization of them, using the group text to color group
for file in res:
operon = file.split('/')[-1]
if 'mapping' in operon:
continue
outfile = visualization+operon
cmd11 = './show_tree.py -i {} -g {} -o {}'.format(file,group,outfile)
os.system(cmd11)