Revision 3a78098f11e03b0136e9f4bb3aa587d528f08d3f authored by Asher Preska Steinberg on 09 February 2021, 19:22:11 UTC, committed by Asher Preska Steinberg on 09 February 2021, 19:22:11 UTC
1 parent a3cece3
APS146_ngs_CF_MSA.py
# -*- coding: utf-8 -*-
"""
This code makes a "Master MSA" file from MSA files from individual clusters
can be useful if you want to recluster your data and you don't have an MSA file with all
sequences
Here I'm using it to make Master MSAs for the core and flexible genomes for n gonorrhoeae
"""
#Spence's thing
import sys
import os
import pandas as pd
from tqdm import tqdm
def mkdir_p(dir):
'make a directory if doesnt exist'
if not os.path.exists(dir):
os.mkdir(dir)
###
#Background
#This script assumes you've already run the 'pipeline' bash script
#After you've run the 'pipeline' script you should have a series of folders for each serotype
#This will crawl through those folders and grab the multi FASTA files and merge them to create a pooled multi FASTA file
#This script will create two files:
#-REFGEN_Master
#-REFGEN_Master_Sorted
#The sorted file has all of the samples grouped together by gene as a proper XMFA file
#The REFGEN_Master file is just the different serotype XMFA files concatinated
#Hence, the REFGEN_Master can be deleted.
#Notes
#This program is denoted 'HighRAM' because it loads the REFGEN file into memory for sorting
#A lower RAM version was created. However, it takes a prohibitively long time.
###
#INPUTS
#ref_dir is the directory that contains the all of the serotype folders generated by pipeline.sh
ref_dir = "/scratch/aps376/recombo/APS144_1008_ngs_Archive/cutoff_10pt/"
#Where the output file will be sent to
outdir = "/scratch/aps376/recombo/APS146_PCA/"
genomes = ['CORE', 'FLEX']
#A function thats input is a refgen file is then turned into a DataFrame
#The Dataframe is indexed by gene name and includes the gene start and gene end lines within the
#provided refgen file.
def get_name_loc(ref_file):
with open(ref_file, "r") as refgen_file:
refgen_lines = refgen_file.readlines()
#The indices of the lines seperating each gene of the serotype specific regen filex
gene_end = [i for i, x in enumerate(refgen_lines) if x == '=\n']
gene_start = [0]+gene_end[:-1]
gene_name = [refgen_lines[name_ind-2].replace('|',' ').split()[1] for name_ind in gene_end]
#A dataframe indexed by gene name and with a column of gene end line
out_df = pd.DataFrame({'start':gene_start,'end':gene_end},index=gene_name)
return out_df
#list of clusters or serotypes
seros = [1, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 35, 36, 37, 38,
39, 41, 42, 43, 47, 48, 49]
for genome in genomes:
refout_all = outdir+'MSA_'+str(genome)+'_Master'
#Generate a concatinated file
with open(refout_all, 'w') as master_file:
for sero_name in seros:
#Where the REFGEN sero file is
refout = ref_dir + 'MSA_' + str(genome) + '_cluster' + str(sero_name)
with open(refout) as sero_file:
for line in sero_file:
master_file.write(line)
gene_locations = get_name_loc(refout_all)
genes = set(gene_locations.index)
with open(refout_all, 'r') as master_file:
master_full = master_file.readlines()
#Create the sorted file
with open(refout_all + '_Sorted', 'w+') as sorted_master_file:
#Go across genes
for gene in tqdm(genes):
# print('Adding gene:' + gene)
#Put serotype gene regions together
master_chunk_lines = []
one_gene = gene_locations.loc[gene].reset_index(drop=True)
#print(one_gene)
for i in range(0,one_gene.shape[0]):
gene_start = one_gene.at[i,'start']
gene_end = one_gene.at[i,'end']
master_chunk_lines.extend(master_full[gene_start+1:gene_end])
# print(master_chunk_lines[0])
#Write gene to file
for line in master_chunk_lines:
sorted_master_file.write(line)
sorted_master_file.write('=\n')

Computing file changes ...