# Revision c3873d851fdfb01efd8bb1f8a18f33acb06b6fc5 authored by jdaeth274 on 13 May 2021, 12:33:05 UTC, committed by jdaeth274 on 13 May 2021, 12:33:05 UTC
# 1 parent 0fa3af5
# Raw File
# blast_to_info.py
import pandas
import os
import sys
import re
import numpy
import argparse

def get_options():
    """Parse the command-line arguments for blast_to_info.py.

    All three options are required flags; argparse exits with a usage
    message if any of them is missing.

    Returns:
        argparse.Namespace with string attributes ``list_file``,
        ``hit_locs`` and ``out_name``.
    """

    # The usage line mirrors the flags registered below (the script takes no
    # positional arguments).
    purpose = '''This is a script to take the blast species results csvs
    and then from there create a more informative outcome.
    Usage: blast_to_info.py --list_file <species_list_file> --hit_locs <python_hit_locs_csv> --out_name <out_results_csv>'''

    parser = argparse.ArgumentParser(prog='blast_to_info.py', description=purpose)

    parser.add_argument('--list_file', required=True, help='Species list out from blast local results interpreter', type=str)
    parser.add_argument('--hit_locs', required=True, help='Hits df from hit allocator csv', type=str)
    parser.add_argument('--out_name', required=True, help='out_name for csv', type=str)

    input_args = parser.parse_args()

    return input_args

###############################################################################
## Now we'll look at getting the species results from the different results ###
###############################################################################


def compile_blast_results(species_files, hit_locs):
    """Summarise the top BLAST hit of each species-result csv.

    For every entry in ``species_files`` the first data row is taken as the
    top hit; its first three columns are recorded as isolate, top species and
    bitscore. If the isolate name contains "!" the insertion point is
    recorded as "reference"; otherwise the isolate is looked up in
    ``hit_locs`` and the matching row's insert name is used.

    Args:
        species_files: iterable of csv paths (anything ``pandas.read_csv``
            accepts).
        hit_locs: pandas.DataFrame holding the isolate identifier in an
            ``id`` column (or ``isolate`` if ``id`` is absent) and the insert
            name in ``insert_name`` (or ``profile`` if ``insert_name`` is
            absent).

    Returns:
        pandas.DataFrame with columns ``isolate``, ``top_species``,
        ``bitscore`` and ``insertion_point``, one row per input csv.

    Raises:
        ValueError: a species csv has no data rows, or a non-reference
            isolate has no matching row in ``hit_locs``.
        KeyError: a non-reference isolate needs a lookup but ``hit_locs``
            lacks both candidate columns for the identifier or the name.
    """
    # Resolve the fallback columns once instead of relying on a bare except
    # around every lookup.
    id_column = next((col for col in ('id', 'isolate') if col in hit_locs.columns), None)
    name_column = next((col for col in ('insert_name', 'profile') if col in hit_locs.columns), None)

    isolates = []
    top_species = []
    bitscores = []
    insertion_point = []

    for current_file in species_files:
        current_csv = pandas.read_csv(current_file)
        if current_csv.empty:
            raise ValueError("No BLAST hits found in {!r}".format(current_file))

        top_hit = current_csv.iloc[0]
        isolate = top_hit.iloc[0]
        isolates.append(isolate)
        top_species.append(top_hit.iloc[1])
        bitscores.append(top_hit.iloc[2])

        # str() guards against isolate ids that the csv parser inferred as
        # numbers, which do not support the "in" test.
        if "!" in str(isolate):
            insertion_point.append("reference")
            continue

        # Only complain about missing columns when a lookup is really needed.
        if id_column is None:
            raise KeyError("hit_locs has neither an 'id' nor an 'isolate' column")
        if name_column is None:
            raise KeyError("hit_locs has neither an 'insert_name' nor a 'profile' column")

        matching_python_record = hit_locs[hit_locs[id_column] == isolate]
        if matching_python_record.empty:
            raise ValueError(
                "Isolate {!r} from {!r} has no matching row in hit_locs column {!r}".format(
                    isolate, current_file, id_column))

        insertion_point.append(str(matching_python_record[name_column].values[0]))

    return pandas.DataFrame({
        'isolate': isolates,
        'top_species': top_species,
        'bitscore': bitscores,
        'insertion_point': insertion_point,
    })


if __name__ == '__main__':

    input_args = get_options()
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Compiling BLAST results now ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

    # One csv path per line; blank lines are skipped so a trailing newline in
    # the list file does not turn into an empty path.
    with open(input_args.list_file, "r") as list_file:
        species_list = [line.strip() for line in list_file if line.strip()]

    python_hit_locs = pandas.read_csv(input_args.hit_locs)

    out_df = compile_blast_results(species_list, python_hit_locs)
    out_df.to_csv(input_args.out_name, index=False)

    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Finished ! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

# back to top