Revision 3a78098f11e03b0136e9f4bb3aa587d528f08d3f authored by Asher Preska Steinberg on 09 February 2021, 19:22:11 UTC, committed by Asher Preska Steinberg on 09 February 2021, 19:22:11 UTC
1 parent a3cece3
aps143_sentericaMSA.py
# -*- coding: utf-8 -*-
"""APS143_senterica_MSA
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1NI2XFz_rOwlXHX73jAAJYJypsr2R0_S8
"""
#Spence's thing
import sys
import os
import pandas as pd
from tqdm import tqdm
def mkdir_p(dir):
'make a directory if doesnt exist'
if not os.path.exists(dir):
os.mkdir(dir)
###
#Background
#This script assumes you've already run the 'pipeline' bash script
#After you've run the 'pipeline' script you should have a series of folders for each serotype
#This will crawl through those folders and grab the multi FASTA files and merge them to create a pooled multi FASTA file
#This script will create two files:
#-REFGEN_Master
#-REFGEN_Master_Sorted
#The sorted file has all of the samples grouped together by gene as a proper XMFA file
#The REFGEN_Master file is just the different serotype XMFA files concatinated
#Hence, the REFGEN_Master can be deleted.
#Notes
#This program is denoted 'HighRAM' because it loads the REFGEN file into memory for sorting
#A lower RAM version was created. However, it takes a prohibitively long time.
###
#INPUTS
#ref_dir is the directory that contains the all of the serotype folders generated by pipeline.sh
ref_dir = "/scratch/aps376/APS132_Archive/"
#Where the output file will be sent to
outdir = "/scratch/aps376/recombo/APS143_1008_senterica_Archive/"
mkdir_p(outdir)
refout_all = outdir+'MSA_Master'
#A function thats input is a refgen file is then turned into a DataFrame
#The Dataframe is indexed by gene name and includes the gene start and gene end lines within the
#provided refgen file.
def get_name_loc(ref_file):
with open(ref_file, "r") as refgen_file:
refgen_lines = refgen_file.readlines()
#The indices of the lines seperating each gene of the serotype specific regen filex
gene_end = [i for i, x in enumerate(refgen_lines) if x == '=\n']
gene_start = [0]+gene_end[:-1]
gene_name = [refgen_lines[name_ind-2].replace('|',' ').split()[1] for name_ind in gene_end]
#A dataframe indexed by gene name and with a column of gene end line
out_df = pd.DataFrame({'start':gene_start,'end':gene_end},index=gene_name)
return out_df
#list of serotypes
seros = ['Enteritidis',
'Typhimurium',
'Virchow',
'Infantis',
'Newport',
'Typhi',
'Kentucky',
'Stanley',
'Agona',
'Braenderup',
'Oranienburg']
#Generate a concatinated file
with open(refout_all, 'w+') as master_file:
for sero_name in seros:
#Where the REFGEN sero file is
refout = ref_dir +sero_name + '_OUT/REFGEN_' + sero_name
with open(refout) as sero_file:
for line in sero_file:
master_file.write(line)
gene_locations = get_name_loc(refout_all)
genes = set(gene_locations.index)
with open(refout_all, 'r') as master_file:
master_full = master_file.readlines()
#Create the sorted file
with open(refout_all + '_Sorted', 'w+') as sorted_master_file:
#Go across genes
for gene in tqdm(genes):
# print('Adding gene:' + gene)
#Put serotype gene regions together
master_chunk_lines = []
one_gene = gene_locations.loc[gene].reset_index(drop=True)
#print(one_gene)
for i in range(0,one_gene.shape[0]):
gene_start = one_gene.at[i,'start']
gene_end = one_gene.at[i,'end']
master_chunk_lines.extend(master_full[gene_start+1:gene_end])
# print(master_chunk_lines[0])
#Write gene to file
for line in master_chunk_lines:
sorted_master_file.write(line)
sorted_master_file.write('=\n')

Computing file changes ...