## hit_allocator.py
## Source: https://github.com/jdaeth274/ISA
import re
import pandas
import numpy
import sys
import dendropy
from dendropy.model.parsimony import fitch_down_pass
from dendropy.model.parsimony import fitch_up_pass
import csv
from statistics import stdev
from statistics import mean
import os
import string
from Bio import SeqIO
from Bio.Seq import Seq
import math
import argparse
import time
import pyfastx
import subprocess
import inspect

def get_options():
    purpose = '''This is a python script to intake a csv of hit locations for an MGE for a large collection of
    genomes and then create a library of hits to search against.
    Usage: python hit_allocator.py --lib_csv <lib_csv> --reference_csv <reference_csv> --blast_csv <blast_csv>
    --fasta_csv <fasta_csv> --act_loc <act_dir> --contig_loc <contig_dir> --align <align_cutoff> --output <out_csv>'''

    parser = argparse.ArgumentParser(description=purpose,
                                     prog='hit_allocator.py')

    parser.add_argument('--lib_csv', required=True, help='Library csv of MGE hits (required)', type=str)
    parser.add_argument('--reference_csv', required=True, help='Isolates gff and references gff csv (required)', type=str)
    parser.add_argument('--blast_csv', required=True, help='Merged blast csv (required)', type=str)
    parser.add_argument('--fasta_csv', required=True, help='Isolates fasta and reference fasta csv (required)', type=str)
    parser.add_argument('--act_loc', required=True, help='Directory where act comparisons stored (required)', type=str)
    parser.add_argument('--contig_loc', required=True, help='Directory where contig_numbers stored  (required)', type=str)
    parser.add_argument('--align', required=True, help='Int value to cutoff align for blast hits (required)', type=int)
    parser.add_argument('--output', required=True, help='Out Library csv name (required)', type=str)

    args = parser.parse_args()

    return args
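## Example invocation (all paths illustrative):
##   python hit_allocator.py --lib_csv library_hits.csv --reference_csv refs.csv \
##       --blast_csv merged_blast.csv --fasta_csv fastas.csv --act_loc act_comparisons/ \
##       --contig_loc contig_bounds/ --align 1000 --output out_library.csv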

def n50_calc(fasta_list, reference_id):
    ## Function to find the assembly with the largest N50, to take as the new
    ## comparison for ACT, given that the reference has the MGE within it.
    ## Input: fasta_list: The list of the locations of the fastas for a cluster
    ##        reference_id: The old reference

    n50s = []
    ids = []

    for k in fasta_list:
        current_isolate = k
        current_basename = os.path.basename(current_isolate)
        current_id = re.sub(r"\..*$", "", current_basename)
        if current_id != reference_id:
            current_fasta = pyfastx.Fasta(current_isolate)
            current_n50 = current_fasta.nl(50)[0]
            n50s.append(current_n50)
            ids.append(current_isolate)


    max_n50 = n50s.index(max(n50s))

    new_act_iso = ids[max_n50]

    return new_act_iso
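## Illustrative use (paths are assumptions): choose the highest-N50 assembly,
## excluding the current reference, as the new ACT comparison genome:
##   new_act_iso = n50_calc(["cluster1/isoA.fa", "cluster1/isoB.fa"], "isoA")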


def contig_checker(contig_csv, hit_to_check):
    ## This is a function to check what contig a BLAST hit appears on
    ## Input: contig_csv: The CSV file containing contig start in a single column and contig end in another
    ##        hit_to_check: Start and end of a BLAST hit
    ## Output: Contig number of BLAST hit.
    hit_contig = 0

    for j in range(len(contig_csv.index)):
        c_start_and_end = contig_csv.iloc[j]
        ## The first contig gets no leading overhang; later contigs allow a
        ## small overhang either side to absorb BLAST boundary wobble.
        overhang_before = 0 if j == 0 else 15
        overhang_after = 10 if j == 0 else 15
        lower_bound = c_start_and_end[0] - overhang_before
        upper_bound = c_start_and_end[1] + overhang_after

        start_hit = lower_bound <= hit_to_check[0] <= upper_bound

        if len(hit_to_check) == 1:
            if start_hit:
                hit_contig = j + 1
        else:
            end_hit = lower_bound <= hit_to_check[1] <= upper_bound
            if start_hit and end_hit:
                hit_contig = j + 1

    return hit_contig
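## Illustrative call (the column layout is an assumption from how the contig csv
## is read elsewhere: column 0 = contig start, column 1 = contig end):
##   contig_tab = pandas.read_csv("isolate_contig_bounds.csv")
##   mge_contig = contig_checker(contig_tab, [12050, 15500])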

def bounds_of_contig(contig_tab, contig_mge):
    ## Function to get bounds of a contig
    if contig_mge != 0:
        contig_bounds = contig_tab.iloc[contig_mge - 1]
    else:
        contig_bounds = [0,0]

    return contig_bounds
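## Contig numbers are 1-based; bounds_of_contig(contig_tab, 0) returns [0, 0]
## for hits whose contig could not be determined.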



def contig_end_checker(hitters, contig_tab, act_path, isolate_id):

    ## This function takes in the hitters and checks if the hit is near enough to
    ## the end of a contig to be considered a split-up hit.
    last_occy = isolate_id.rfind('_')
    isolate_id_list = list(isolate_id)
    isolate_id_list[last_occy] = "#"
    isolate_id_2 = "".join(isolate_id_list)

    compo_file = act_path + isolate_id_2 + ".crunch.gz"
    compo_names = ['query', 'subject', 'pid', 'align', 'gap', 'mismatch', 'qstart',
                   'qend', 'sstart', 'send', 'eval', 'bitscore']
    compo_table = pandas.read_csv(compo_file, sep='\t', names=compo_names)

    min_hit_val = hitters.values.min()
    max_hit_val = hitters.values.max()

    if hitters.iloc[1] > hitters.iloc[0]:
        mge_ori = "forward"
    else:
        ## equal start and end shouldn't occur for a real hit; defaulting to
        ## "reverse" here (an assumption) keeps mge_ori defined in that edge case
        mge_ori = "reverse"



    contig = contig_tab[(contig_tab['0'] <= (min_hit_val + 15)) & (contig_tab['1'] >= (max_hit_val - 15))]


    start_pos = "None"
    end_pos = "None"

    ## Now we need to check if there are any act hits across this bound


    ## pos is always rebuilt from start_pos and end_pos below, so only the
    ## single-contig case needs any further checks here
    if len(contig.index) == 1:




        if (min_hit_val - 65) <= contig.iloc[0,0] <= (min_hit_val + 15) :
            if min_hit_val == hitters.iloc[0]:
                start_pos = "start"
            else:
                end_pos = "end"

            if  (max_hit_val - 15) <= contig.iloc[0,1] <= (max_hit_val + 50):
                if start_pos == "start":
                    end_pos = "end"
                else:
                    start_pos = "start"
        elif (max_hit_val - 15) <= contig.iloc[0,1] <= (max_hit_val + 50):
            if max_hit_val == hitters.iloc[0]:
                start_pos  = "start"
            else:
                end_pos = "end"

        if (start_pos == "None") or (end_pos == "None"):
            if mge_ori == "forward":
                before_region_to_check = [contig.iloc[0,0], hitters.iloc[0]]
                after_region_to_check = [hitters.iloc[1], contig.iloc[0,1]]
            elif mge_ori == "reverse":
                before_region_to_check = [hitters.iloc[ 0], contig.iloc[0, 1]]
                after_region_to_check = [contig.iloc[0, 0] , hitters.iloc[ 1]]

            before_spans = compo_table[(compo_table['qstart'] <= before_region_to_check[0]) & (compo_table['qend'] >= before_region_to_check[1])]
            after_spans = compo_table[(compo_table['qstart'] <= after_region_to_check[0]) & (compo_table['qend'] >= after_region_to_check[1])]

            before_within = compo_table[(compo_table['qstart'] >= before_region_to_check[0]) & (compo_table['qend'] <= before_region_to_check[1])]
            after_within = compo_table[(compo_table['qstart'] >= after_region_to_check[0]) & (compo_table['qend'] <= after_region_to_check[1])]

            before_overlap_1 = compo_table[(compo_table['qstart'] >= before_region_to_check[0]) & (compo_table['qstart'] <= before_region_to_check[1]) & (compo_table['qend'] >= before_region_to_check[1])]
            before_overlap_2 = compo_table[(compo_table['qstart'] <= before_region_to_check[0]) & (compo_table['qend'] <= before_region_to_check[1]) & (compo_table['qend'] >= before_region_to_check[0])]

            after_overlap_1 = compo_table[(compo_table['qstart'] >= after_region_to_check[0]) & (compo_table['qend'] >= after_region_to_check[1]) & (compo_table['qstart'] <= after_region_to_check[1])]
            after_overlap_2 =  compo_table[(compo_table['qstart'] <= after_region_to_check[0]) & (compo_table['qend'] <= after_region_to_check[1]) & (compo_table['qend'] >= after_region_to_check[0])]





            before_df = pandas.concat([before_spans, before_within, before_overlap_1, before_overlap_2], sort=False)
            after_df = pandas.concat([after_spans, after_within, after_overlap_1, after_overlap_2], sort=False)


            if start_pos == "None":
                if before_df.empty:
                    start_pos = "start"
                else:
                    high_pid_only = before_df[before_df['pid'] >= 98.0]
                    if high_pid_only.empty:
                        start_pos = "start"
                    else:
                        acceptable_contig_gap = round((before_region_to_check[1] - before_region_to_check[0]) * 0.1)
                        aligns = []

                        before_region_to_check_set = set(range(int(before_region_to_check[0]), int(before_region_to_check[1])))
                        for k in range(len(high_pid_only.index)):
                            current_range = range(high_pid_only.iloc[k, 6], high_pid_only.iloc[k, 7])
                            aligno = len(before_region_to_check_set.intersection(current_range))
                            aligns.append(aligno)

                        gaps_above_cutoff = [num for num in aligns if num > acceptable_contig_gap]
                        if len(gaps_above_cutoff) == 0:
                            start_pos = "start"
                        else:
                            acceptable_contig_gap = round((before_region_to_check[1] - before_region_to_check[0]) * 0.1)
                            high_pid_high_align = high_pid_only[high_pid_only['align'] >= acceptable_contig_gap]

                            if high_pid_high_align.empty:
                                start_pos = "start"

            if end_pos == "None":
                if after_df.empty:
                    end_pos = "end"
                else:
                    high_pid_only = after_df[after_df['pid'] >= 98.0]
                    high_pid_only = high_pid_only.reset_index(drop=True)
                    if high_pid_only.empty:
                        end_pos = "end"
                    else:
                        acceptable_contig_gap = round((after_region_to_check[1] - after_region_to_check[0]) * 0.1)
                        aligns = []
                        after_region_to_check_set = set(range(int(after_region_to_check[0]), int(after_region_to_check[1])))
                        for k in range(len(high_pid_only.index)):
                            current_range = range(high_pid_only.iloc[k, 6], high_pid_only.iloc[k, 7])
                            aligno = len(after_region_to_check_set.intersection(current_range))
                            aligns.append(aligno)

                        gaps_above_cutoff = [num for num in aligns if num > acceptable_contig_gap]
                        if len(gaps_above_cutoff) == 0:
                            end_pos = "end"
                        else:
                            high_pid_high_align = high_pid_only[high_pid_only['align'] >= acceptable_contig_gap]
                            if high_pid_high_align.empty:
                                end_pos = "end"

    pos = start_pos + "_" + end_pos



    return  pos
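## The returned string is one of "start_end", "start_None", "None_end" or
## "None_None", marking whether the hit abuts the start and/or end of its contig.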


def row_merger(narrowed_rows):
    ## This function takes in the narrowed rows from merged_contig_checker and then
    ## works out how best to merge them

    ## First we work out which hits have both starts and ends
    narrowed_rows['end_loc'] = narrowed_rows['contig_pos'].str.find("end")
    narrowed_rows['start_loc'] = narrowed_rows['contig_pos'].str.find("start")

    ## End rows
    end_rows = narrowed_rows[narrowed_rows['end_loc'] != -1]


    if len(end_rows.index) == 0:
        ## If no end rows just take the returned row as the end

        returned_row = narrowed_rows.drop(['merged_index', 'contig_pos', 'end_loc', 'start_loc'], axis=1)
        merged_indexers = []
        merged_locs = []
        return returned_row, merged_indexers, merged_locs
    else:
        start_rows = narrowed_rows[narrowed_rows['start_loc'] != -1]
        start_rows = start_rows.reset_index(drop=True)
        first_row = end_rows.iloc[0, :].copy()
        ## while loop to incorporate rest of hits
        if len(start_rows.index) == 0:
            ## If no start rows just take the returned row as the end

            returned_row = narrowed_rows.drop(['merged_index', 'contig_pos', 'end_loc', 'start_loc'], axis = 1 )
            merged_indexers = []
            merged_locs = []
            return returned_row, merged_indexers, merged_locs
        else:

            ## Now we'll loop through the hits that have a start at the beginning of a contig and
            ## alter the merged row with this new data. The overlap test will then check if the
            ## new start row to be potentially merged is within the current merged hit.

            first_start = start_rows.iloc[0,:]
            first_start_index = start_rows.index[0]
            if first_start['subject'] == first_row['subject']:
                first_start = start_rows.iloc[1, :]
                first_start_index = start_rows.index[1]

            counter = 0
            merged_row = first_row
            merged_locs = [[first_row['sstart'], first_row['ssend']]]
            merged_row['align'] = abs(merged_locs[0][1] - merged_locs[0][0])
            merged_row['ori_start'] = first_row['ori_start']
            merged_indexers = [merged_row.loc['merged_index']]
            added_start_indexes = []
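            ## Positional indexing below assumes the narrowed-row layout built in
            ## merged_contig_checker (inferred): iloc[2] = align, iloc[[3,4]] =
            ## qstart/qend, iloc[[5,6]] = sstart/ssend, iloc[11] = ori_end,
            ## iloc[12] = merged_index.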

            while counter < len(start_rows.index):

                if counter > 1000:
                    print("\n","Could be stuck in a while loop here")
                    print(narrowed_rows.iloc[0,0])
                    print(narrowed_rows)
                    sys.exit(1)



                current_range = merged_row.iloc[[3,4]]
                current_align = merged_row.iloc[2]
                poss_align = first_start.iloc[2]
                poss_range = first_start.iloc[[3,4]]
                overlap_test = current_range[0] <= poss_range[0] and poss_range[1] <= current_range[1] and \
                               current_align > poss_align
                if overlap_test == True:
                    counter += 1
                    added_start_indexes.append(first_start_index)
                    first_start_index += 1
                    if first_start_index <= (len(start_rows.index) - 1):
                        first_start = start_rows.iloc[first_start_index]


                else:
                    if first_start['contig_pos'] == "start_None":
                        ## check for other start_nones in the narrowed_df
                        start_nones = start_rows[start_rows['contig_pos'] == "start_None"]
                        if len(start_nones.index) == 1:
                            new_align = merged_row['align'] + abs(start_nones.iloc[0,6] - start_nones.iloc[0,5])
                            new_qend = start_nones.iloc[0, 4]
                            new_send = start_nones.iloc[0, 6]
                            merged_row.loc['align'] = new_align
                            merged_row.loc['qend'] = new_qend
                            merged_row.loc['ssend'] = new_send
                            merged_row.loc['ori_end'] = start_nones.iloc[0,11]
                            merged_locs.append([first_start.iloc[5],first_start.iloc[6]])
                            merged_indexers.append(first_start.iloc[12])
                            added_start_indexes.append(first_start_index)
                            first_start_index += 1
                            if first_start_index <= (len(start_rows.index) - 1):
                                first_start = start_rows.iloc[first_start_index]

                            counter = len(start_rows.index)

                        else:
                            current_pid = merged_row['pident']
                            poss_sstart = start_nones['sstart'].values.tolist()
                            #closest_pid_val = min(poss_pids, key=lambda x: abs(x - current_pid))
                            #closest_pid_index = poss_pids.index(closest_pid_val)
                            lowest_sstart_index = poss_sstart.index(min(poss_sstart))

                            new_align = merged_row['align'] + abs(start_nones.iloc[lowest_sstart_index,6] - start_nones.iloc[lowest_sstart_index,5])
                            new_qend = start_nones.iloc[lowest_sstart_index, 4]
                            new_send = start_nones.iloc[lowest_sstart_index, 6]
                            merged_row.loc['align'] = new_align
                            merged_row.loc['qend'] = new_qend
                            merged_row.loc['ssend'] = new_send
                            merged_row.loc['ori_end'] = start_nones.loc[lowest_sstart_index,'ori_end']
                            merged_locs.append([start_nones.iloc[lowest_sstart_index,5], start_nones.iloc[lowest_sstart_index, 6]])
                            merged_indexers.append(start_nones.iloc[lowest_sstart_index, 12])
                            added_start_indexes.append(start_nones.index)
                            first_start_index += 1
                            if first_start_index <= (len(start_rows.index) - 1):
                                first_start = start_rows.iloc[first_start_index]

                            counter += len(start_nones.index)
                    else:
                        new_align = merged_row['align'] + abs(first_start.iloc[6] - first_start.iloc[5])
                        new_qend = first_start.iloc[4]
                        new_send = first_start.iloc[6]
                        merged_row.loc['align'] = new_align
                        merged_row.loc['qend'] = new_qend
                        merged_row.loc['ssend'] = new_send
                        merged_row.loc['ori_end'] = first_start['ori_end']
                        merged_locs.append([first_start.iloc[5], first_start.iloc[6]])
                        merged_indexers.append(first_start.iloc[12])
                        added_start_indexes.append(first_start_index)
                        first_start_index += 1
                        if first_start_index <= (len(start_rows.index) - 1):
                            first_start = start_rows.iloc[first_start_index]
                        counter += 1


            returned_row = merged_row.drop(['merged_index','contig_pos','end_loc','start_loc'])
            returned_row['merged'] = "Yes"

            return returned_row, merged_indexers, merged_locs
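## row_merger returns the merged hit plus the merged_index values of the rows it
## consumed and the subject-coordinate span of each merged fragment; empty lists
## signal that no merge was possible.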


def merged_contig_checker(merged_csv, contig_file_abs_path, act_path):
    multi_rows = []

    merged_csv['ori_start'] = merged_csv['orientation']
    merged_csv['ori_end'] = merged_csv['orientation']

    for k in range(len(merged_csv.index)):
        current_id = merged_csv.iloc[k, 0]  # .values.to_string()
        underscore_count = current_id.count("_")
        if underscore_count > 1:
            current_loc = merged_csv['file_loc'].iloc[k]
            loccies = merged_csv[merged_csv['file_loc'] == current_loc]
            if len(loccies.index) > 1:
                multi_rows.append(k)



    multi_row_db = merged_csv.iloc[multi_rows].copy()


    file_locs = set(multi_row_db.iloc[:,8].copy().to_list())
    file_locs = list(file_locs)



    merged_rows_to_drop = []

    merged_rows_to_add = pandas.DataFrame(columns=merged_csv.columns)
    merged_locs = []


    for k in range(len(file_locs)):
        isolate_id = file_locs[k]
        isolate_rows = multi_row_db[multi_row_db['file_loc'] == isolate_id].copy()
        isolate_rows.columns = isolate_rows.columns.str.strip()
        isolate_rows['merged_index'] = isolate_rows.index
        isolate_rows = isolate_rows.reset_index(drop=True)
        isolate_rows = isolate_rows.sort_values(by = 'qstart', ascending=True)
        contig_suffix = "#contig_bounds.csv"
        contig_isolate = re.sub("#", "_", isolate_id)
        if ".contigs_velvet.fa" in contig_isolate:
            contig_isolate = re.sub(".contigs_velvet.fa","",contig_isolate)
        contig_file_path = contig_file_abs_path + contig_isolate + contig_suffix

        contig_tab = pandas.read_csv(contig_file_path)

        positions = []

        for l in range(len(isolate_rows.index)):
            current_hitters = isolate_rows.iloc[l, [5,6]]
            position = contig_end_checker(current_hitters, contig_tab, act_path, contig_isolate)
            positions.append(position)


        isolate_rows['contig_pos'] = pandas.Series(positions, index=isolate_rows.index)



        isolate_rows_narrow = isolate_rows[isolate_rows['contig_pos'] != "None_None"].copy()




        if len(isolate_rows_narrow.index) < 10 and len(isolate_rows_narrow.index) > 1:
            isolate_rows_narrow = isolate_rows_narrow.sort_values(by = 'qstart', ascending=True)
            isolate_rows_narrow = isolate_rows_narrow.reset_index(drop=True)

            merged_row, merged_row_to_drop, merged_loc = row_merger(isolate_rows_narrow)
            if len(merged_row_to_drop) > 0:
                merged_rows_to_add = merged_rows_to_add.append(merged_row, sort=False, ignore_index=True)
                merged_rows_to_drop.extend(merged_row_to_drop)
                merged_locs.append(merged_loc)


        iter_val = "{0:0=5d}".format((k + 1))
        print("Completed %s of %s rows. Just finished: %s" % (iter_val, len(file_locs), isolate_id), end="\r",flush=True)

    ## Now we'll remove the merged rows from the df

    merged_csv = merged_csv.drop(merged_rows_to_drop)
    merged_csv['merged_index'] = numpy.NAN
    merged_csv['merged'] = pandas.Series(list(numpy.repeat("No", len(merged_csv.index))), index=merged_csv.index)
    ## Now lets add in the rows
    merged_rows_to_add['merged_index'] = range(len(merged_rows_to_add.index))
    merged_csv = merged_csv.append(merged_rows_to_add, sort=False, ignore_index=True)

    return merged_csv, merged_locs
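## Illustrative flow (paths are assumptions): fragments of one MGE hit split
## across contigs are re-joined into single rows flagged merged == "Yes":
##   merged_csv, merged_locs = merged_contig_checker(blast_df, "contig_bounds/", "act_comparisons/")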

def ref_contains_hit(compo_table, hitters, mge_bounds, isolate_id):
    ## This function checks if there are any hits within the mge position
    ## where compo_table is the act comparison, hitters the position of
    ## the mge in the query and mge bounds the contig of the mge.

    if hitters[0] < hitters[1]:
        mge_ori = "forward"
    else:
        ## equal start and end shouldn't occur for a real hit; defaulting to
        ## "reverse" here (an assumption) keeps mge_ori defined in that edge case
        mge_ori = "reverse"

    cut_off_val = abs(hitters[1] - hitters[0]) * 0.5

    if mge_ori == "forward":
        whole_overlap = compo_table[(compo_table['qstart'] < hitters[0]) & (compo_table['qend'] > hitters[1])]
        whole_overlap = whole_overlap.sort_values(by=['qstart'], ascending=False)

        if whole_overlap.empty:
            ## Check if one hit is within the mge boundary above the cut off val established above

            middle_hits = compo_table[(compo_table['qstart'] > hitters[0]) &\
                                      (compo_table['qend'] < hitters[1])]
            if middle_hits.empty:
                middle_length = 0
            else:
                middle_length = middle_hits['align'].sum()

            ## Check if there are hits that overlap with the MGE before the MGE insertion.

            hits_bef = compo_table[(compo_table['qend'] > hitters[0])  & (compo_table['qstart'] < hitters[0])]
            hits_bef = hits_bef.sort_values(by=['qstart'], ascending=False)

            if hits_bef.empty:
                hit_bef_length = 0
                ## pass the full "forward"/"reverse" strings expected by before_and_after_hits
                hit_bef = before_and_after_hits(hitters, compo_table, mge_bounds, "before", "forward", "forward")
                if isinstance(hit_bef.iloc[0], int):
                    hit_bef = "No"
            else:
                hit_bef = hits_bef.iloc[0]
                hit_bef_length = hit_bef['qend'] - hitters[0]
                hit_bef = whole_match_splitter(hit_bef, hitters, "before")

            ## Check if there are hits that overlap with the MGE and the after flank region.

            hits_aft = compo_table[(compo_table['qstart'] < hitters[1] ) & (compo_table['qend'] > hitters[1])]
            hits_aft = hits_aft.sort_values(by=['qend'], ascending=True)
            if hits_aft.empty:
                hit_aft_length = 0
                hit_aft = before_and_after_hits(hitters, compo_table, mge_bounds, "after", "forward", "forward")
                if isinstance(hit_aft.iloc[0], int):
                    hit_aft = "No"
            else:
                hit_aft = hits_aft.iloc[0]
                hit_aft_length = hitters[1] - hit_aft['qstart']
                hit_aft = whole_match_splitter(hit_aft, hitters, "after")


            tot_overlap_length = hit_bef_length + middle_length + hit_aft_length
            bef_aft_length = hit_bef_length + hit_aft_length

            ## If the overall overlap is below the threshold and the hit_afts and befs don't
            ## have a big enough overlap we can use the methods used for other hits.
            ## If however there is a big overlap (could come from a big middle hit and smaller afters & before
            ## or big afters and before) then lets use these new hits.
            ## if however there is a below overlap, but one (or both) of the after and before hits have an overlap
            ## greater than 50 (which the normal methods would miss) then use these new hits.
            ## Finally if there is a big overlap but neither of the before or after hits are overlapping (so there is a
            ## big middle overlap) just use the estimated hits.

            if tot_overlap_length < cut_off_val and hit_bef_length < 50 and hit_aft_length < 50:
                hit_bef = "No"
                hit_aft = "No"
            elif tot_overlap_length > cut_off_val and hit_bef_length > 0 and hit_aft_length > 0:
                hit_bef = hit_bef
                hit_aft = hit_aft
            elif tot_overlap_length < cut_off_val and (hit_bef_length > 50 or hit_aft_length > 50):
                hit_bef = hit_bef
                hit_aft = hit_aft
            elif tot_overlap_length > cut_off_val and hit_bef_length == 0 and hit_aft_length == 0:
                hit_bef = hit_bef
                hit_aft = hit_aft



        else:
            whole_match = whole_overlap.iloc[0]
            hit_bef, hit_aft = whole_match_splitter(whole_match, hitters, "both")

    elif mge_ori == "reverse":
        whole_overlap = compo_table[(compo_table['qstart'] < hitters[1]) & (compo_table['qend'] > hitters[0])]
        whole_overlap = whole_overlap.sort_values(by=['qstart'], ascending=False)
        if whole_overlap.empty:
                ## Check if one hit is within the mge boundary above the cut off val established above

                middle_hits = compo_table[(compo_table['qstart'] >= hitters[1]) & \
                                          (compo_table['qend'] <= hitters[0])]
                if middle_hits.empty:
                    middle_length = 0
                else:
                    middle_length = middle_hits['align'].sum()

                hits_bef = compo_table[(compo_table['qstart'] < hitters[0]) & (compo_table['qend'] > hitters[0])]
                hits_bef = hits_bef.sort_values(by=['qstart'], ascending=True)

                if hits_bef.empty:
                    hit_bef_length = 0
                    hit_bef = before_and_after_hits(hitters, compo_table, mge_bounds, "before", "reverse", "reverse")
                    if isinstance(hit_bef.iloc[0], int):
                        hit_bef = "No"
                else:
                    hit_bef = hits_bef.iloc[0]
                    hit_bef_length = hitters[0] - hit_bef['qstart']
                    hit_bef = whole_match_splitter(hit_bef, hitters, "before")

                hits_aft = compo_table[(compo_table['qend'] > hitters[1]) & (compo_table['qstart'] < hitters[1])]
                hits_aft = hits_aft.sort_values(by=['qend'], ascending=False)
                if hits_aft.empty:
                    hit_aft_length = 0
                    hit_aft = before_and_after_hits(hitters, compo_table, mge_bounds, "after", "reverse", "reverse")
                    if isinstance(hit_aft.iloc[0], int):
                        hit_aft = "No"
                else:
                    hit_aft = hits_aft.iloc[0]
                    hit_aft_length = hit_aft['qend'] - hitters[1]
                    hit_aft = whole_match_splitter(hit_aft, hitters, "after")

                tot_overlap_length = hit_bef_length + middle_length + hit_aft_length
                bef_aft_length = hit_bef_length + hit_aft_length

                if tot_overlap_length < cut_off_val and hit_bef_length < 50 and hit_aft_length < 50:
                    hit_bef = "No"
                    hit_aft = "No"
                elif tot_overlap_length > cut_off_val and hit_bef_length > 0 and hit_aft_length > 0:
                    hit_bef = hit_bef
                    hit_aft = hit_aft
                elif tot_overlap_length < cut_off_val and (hit_bef_length > 50 or hit_aft_length > 50):
                    hit_bef = hit_bef
                    hit_aft = hit_aft

        else:
            whole_match = whole_overlap.iloc[0]
            hit_bef, hit_aft = whole_match_splitter(whole_match, hitters, "both")

    if type(hit_bef) == str or type(hit_aft) == str:
        overlap = "No"
    else:
        overlap = "Yes"


    return hit_bef, hit_aft, overlap
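## hit_bef / hit_aft are pandas Series describing the flanking matches, or the
## string "No" when no usable flank was found; overlap is "Yes" only when both
## flanking hits exist as Series.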

def whole_match_splitter(match, mge_locs, hit_to_split):
    ## This is a function to split a whole match, if found, into a before hit that
    ## lines up with the start of the mge and an after hit that lines up with the end

    if mge_locs[0] < mge_locs[1]:
        mge_ori = "forward"
    elif mge_locs[0] > mge_locs[1]:
        mge_ori = "reverse"

    if match['sstart'] < match['send']:
        match_ori = "forward"
    elif match['sstart'] > match['send']:
        match_ori = "reverse"


    if match_ori == "forward":
        if mge_ori == "forward":
            if hit_to_split == "both" or hit_to_split == "before":

                length_bef = mge_locs[0] - match['qstart']
                hit_bef = pandas.Series({'query': match['query'],
                                         'subject': match['subject'],
                                         'pid': match['pid'],
                                         'align': length_bef,
                                         'gap': match['gap'],
                                         'mismatch': match['mismatch'],
                                         'qstart': match['qstart'],
                                         'qend': mge_locs[0],
                                         'sstart': match['sstart'],
                                         'send': (match['sstart'] + length_bef),
                                         'eval': match['eval'],
                                         'bitscore': match['bitscore']})
            if hit_to_split == "both" or hit_to_split == "after":
                length_aft = match['qend'] - mge_locs[1]
                hit_aft = pandas.Series({'query': match['query'],
                                         'subject': match['subject'],
                                         'pid': match['pid'],
                                         'align': length_aft,
                                         'gap': match['gap'],
                                         'mismatch': match['mismatch'],
                                         'qstart': mge_locs[1],
                                         'qend': match['qend'],
                                         'sstart': (match['send'] - length_aft),
                                         'send': match['send'],
                                         'eval': match['eval'],
                                         'bitscore': match['bitscore']})
        elif mge_ori == "reverse":
            if hit_to_split == "both" or hit_to_split == "before":
                length_bef = match['qend'] - mge_locs[0]
                hit_bef = pandas.Series({'query': match['query'],
                                         'subject': match['subject'],
                                         'pid': match['pid'],
                                         'align': length_bef,
                                         'gap': match['gap'],
                                         'mismatch': match['mismatch'],
                                         'qstart': mge_locs[0],
                                         'qend': match['qend'],
                                         'sstart': (match['send'] - length_bef),
                                         'send': match['send'],
                                         'eval': match['eval'],
                                         'bitscore': match['bitscore']})
            if hit_to_split == "both" or hit_to_split == "after":
                length_aft = mge_locs[1] - match['qstart']
                hit_aft = pandas.Series({'query': match['query'],
                                         'subject': match['subject'],
                                         'pid': match['pid'],
                                         'align': length_aft,
                                         'gap': match['gap'],
                                         'mismatch': match['mismatch'],
                                         'qstart': match['qstart'],
                                         'qend': mge_locs[1],
                                         'sstart': match['sstart'],
                                         'send': (match['sstart'] + length_aft),
                                         'eval': match['eval'],
                                         'bitscore': match['bitscore']})
    elif match_ori == "reverse":
        if mge_ori == "forward":
            if hit_to_split == "both" or hit_to_split == "before":
                length_bef = mge_locs[0] - match['qstart']
                hit_bef = pandas.Series({'query': match['query'],
                                         'subject': match['subject'],
                                         'pid': match['pid'],
                                         'align': length_bef,
                                         'gap': match['gap'],
                                         'mismatch': match['mismatch'],
                                         'qstart': match['qstart'],
                                         'qend': mge_locs[0],
                                         'sstart': match['sstart'],
                                         'send': (match['sstart'] - length_bef),
                                         'eval': match['eval'],
                                         'bitscore': match['bitscore']})
            if hit_to_split == "both" or hit_to_split == "after":
                length_aft = match['qend'] - mge_locs[1]
                hit_aft = pandas.Series({'query': match['query'],
                                         'subject': match['subject'],
                                         'pid': match['pid'],
                                         'align': length_aft,
                                         'gap': match['gap'],
                                         'mismatch': match['mismatch'],
                                         'qstart': mge_locs[1],
                                         'qend': match['qend'],
                                         'sstart': (match['send'] + length_aft),
                                         'send': match['send'],
                                         'eval': match['eval'],
                                         'bitscore': match['bitscore']})
        elif mge_ori == "reverse":
            if hit_to_split == "both" or hit_to_split == "before":
                length_bef = match['qend'] - mge_locs[0]
                hit_bef = pandas.Series({'query': match['query'],
                                         'subject': match['subject'],
                                         'pid': match['pid'],
                                         'align': length_bef,
                                         'gap': match['gap'],
                                         'mismatch': match['mismatch'],
                                         'qstart': mge_locs[0],
                                         'qend': match['qend'],
                                         'sstart': (match['send'] + length_bef),
                                         'send': match['send'],
                                         'eval': match['eval'],
                                         'bitscore': match['bitscore']})
            if hit_to_split == "both" or hit_to_split == "after":
                length_aft = mge_locs[1] - match['qstart']
                hit_aft = pandas.Series({'query': match['query'],
                                         'subject': match['subject'],
                                         'pid': match['pid'],
                                         'align': length_aft,
                                         'gap': match['gap'],
                                         'mismatch': match['mismatch'],
                                         'qstart': match['qstart'],
                                         'qend': mge_locs[1],
                                         'sstart': match['sstart'],
                                         'send': (match['sstart'] - length_aft),
                                         'eval': match['eval'],
                                         'bitscore': match['bitscore']})

    if hit_to_split == "both":
        return hit_bef, hit_aft
    elif hit_to_split == "after":
        return hit_aft
    elif hit_to_split == "before":
        return  hit_bef
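## Worked example (numbers illustrative): a forward match spanning qstart=100 to
## qend=5000 around a forward MGE at locs (1000, 4000) is split into a before hit
## covering q 100-1000 (align 900) and an after hit covering q 4000-5000
## (align 1000), with subject coordinates offset by the same lengths.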

def before_and_after_hits(hit_info, compo_table, contig_bounds, hits_to_search, mge_bef_ori, mge_aft_ori):
    ## This function looks for matches around the MGE loc based on their
    ## position in the query sequence. It first looks for hits with align
    ## greater than 2000; if the best such hit is not on the same contig it
    ## falls back to hits simply on the contig, no matter their size.
    overhang = 50
    compo_names = ['query', 'subject', 'pid', 'align', 'gap', 'mismatch', 'qstart',
                   'qend', 'sstart', 'send', 'eval', 'bitscore']

    if isinstance(contig_bounds, list):


        if hits_to_search == "before" or hits_to_search == "both":
            if mge_bef_ori == "forward":

                hits_before = compo_table.loc[compo_table[compo_table.columns[7]] < (hit_info[0] + overhang)]
                hits_before = hits_before.sort_values(by=['qend'], ascending=False)
                hits_before_1k = hits_before.loc[hits_before['align'] > 2000]
                ## Just check if this align > 2000 hit is on the same contig, if not we
                ## use the original hits_before ordered df

                if hits_before_1k.empty:
                    hits_before_1k = hits_before

                else:
                    if hits_before_1k.iloc[0, 6] < (contig_bounds[0][0] - 10):
                        hits_before_1k = hits_before

                ## Check now if this align > 2000 df is empty, if so again we use the
                ## original hits before df

                if hits_before_1k.empty:
                    hit_before = pandas.DataFrame(data=numpy.zeros(shape=(1, 12)),
                                                  columns=compo_names)
                else:
                    hit_before = hits_before_1k.iloc[0]
            elif mge_bef_ori == "reverse":
                hits_before = compo_table.loc[compo_table[compo_table.columns[6]] > (hit_info[0] - overhang)]
                hits_before = hits_before.sort_values(by=['qstart'], ascending=True)
                hits_before_1k = hits_before.loc[hits_before['align'] > 2000]

                if hits_before_1k.empty:
                    hits_before_1k = hits_before

                else:
                    if hits_before_1k.iloc[0, 7] > (contig_bounds[0][1] + 10):
                        hits_before_1k = hits_before

                if hits_before_1k.empty:
                    hit_before = pandas.DataFrame(data=numpy.zeros(shape=(1, 12)),
                                                  columns=compo_names)
                else:
                    hit_before = hits_before_1k.iloc[0]
        if hits_to_search == "both" or hits_to_search == "after":
            if mge_aft_ori == "forward":
                hits_after = compo_table.loc[compo_table[compo_table.columns[6]] > (hit_info[1] - overhang)]
                hits_after = hits_after.sort_values(by=['qstart'], ascending=True)
                hits_after_1k = hits_after.loc[hits_after['align'] > 2000]

                if hits_after_1k.empty:
                    hits_after_1k = hits_after

                else:
                    if hits_after_1k.iloc[0, 7] > (contig_bounds[1][1] + 10):
                        hits_after_1k = hits_after

                if hits_after_1k.empty:
                    hit_after = pandas.DataFrame(data=numpy.zeros(shape=(1, 12)),
                                                 columns=compo_names)
                else:
                    hit_after = hits_after_1k.iloc[0]
            elif mge_aft_ori == "reverse":
                hits_after = compo_table.loc[compo_table[compo_table.columns[7]] < (hit_info[1] + overhang)]
                hits_after = hits_after.sort_values(by=['qend'], ascending=False)
                hits_after_1k = hits_after.loc[hits_after['align'] > 2000]

                if hits_after_1k.empty:
                    hits_after_1k = hits_after
                else:
                    if hits_after_1k.iloc[0, 6] < (contig_bounds[1][0] - 10):
                        hits_after_1k = hits_after

                if hits_after_1k.empty:
                    hit_after = pandas.DataFrame(data=numpy.zeros(shape=(1, 12)),
                                                 columns=compo_names)
                else:
                    hit_after = hits_after_1k.iloc[0]




    else:

        if hit_info[0] < hit_info[1]:
            if hits_to_search == "both" or hits_to_search == "before":
                hits_before = compo_table.loc[compo_table[compo_table.columns[7]] < (hit_info[0] + overhang)]
                hits_before = hits_before.sort_values(by=['qend'], ascending=False)
                hits_before_1k = hits_before.loc[hits_before['align'] > 2000]
                ## Just check if this align > 2000 hit is on the same contig, if not we
                ## use the original hits_before ordered df

                if hits_before_1k.empty:
                    hits_before_1k = hits_before

                else:
                    if hits_before_1k.iloc[0, 6] < (contig_bounds[0] - 10):
                        hits_before_1k = hits_before

                ## Check now if this align > 2000 df is empty, if so again we use the
                ## original hits before df

                if hits_before_1k.empty:
                    hit_before = pandas.DataFrame(data=numpy.zeros(shape=(1, 12)),
                                                  columns=compo_names)
                else:
                    hit_before = hits_before_1k.iloc[0]

            ## Now we get the major hit before the insertion
            if hits_to_search == "both" or hits_to_search == "after":
                hits_after = compo_table.loc[compo_table[compo_table.columns[6]] > (hit_info[1] - overhang)]
                hits_after = hits_after.sort_values(by=['qstart'], ascending=True)
                hits_after_1k = hits_after.loc[hits_after['align'] > 2000]

                if hits_after_1k.empty:
                    hits_after_1k = hits_after

                else:
                    if hits_after_1k.iloc[0, 7] > (contig_bounds[1] + 10):
                        hits_after_1k = hits_after


                if hits_after_1k.empty:
                    hit_after = pandas.DataFrame(data=numpy.zeros(shape=(1, 12)),
                                                 columns=compo_names)
                else:
                    hit_after = hits_after_1k.iloc[0]


        elif hit_info[0] > hit_info[1]:
            if hits_to_search == "both" or hits_to_search == "before":
                hits_before = compo_table.loc[compo_table[compo_table.columns[6]] > (hit_info[0] - overhang)]
                hits_before = hits_before.sort_values(by=['qstart'], ascending=True)
                hits_before_1k = hits_before.loc[hits_before['align'] > 2000]

                if hits_before_1k.empty:
                    hits_before_1k = hits_before

                else:
                    if hits_before_1k.iloc[0, 7] > (contig_bounds[1] + 10):
                        hits_before_1k = hits_before


                if hits_before_1k.empty:
                    hit_before = pandas.DataFrame(data=numpy.zeros(shape=(1, 12)),
                                                  columns=compo_names)
                else:
                    hit_before = hits_before_1k.iloc[0]
            if hits_to_search == "both" or hits_to_search == "after":
                hits_after = compo_table.loc[compo_table[compo_table.columns[7]] < (hit_info[1] + overhang)]
                hits_after = hits_after.sort_values(by=['qend'], ascending=False)
                hits_after_1k = hits_after.loc[hits_after['align'] > 2000]

                if hits_after_1k.empty:
                    hits_after_1k = hits_after
                else:
                    if hits_after_1k.iloc[0, 6] < (contig_bounds[0] - 10):
                        hits_after_1k = hits_after


                if hits_after_1k.empty:
                    hit_after = pandas.DataFrame(data=numpy.zeros(shape=(1, 12)),
                                                 columns=compo_names)
                else:
                    hit_after = hits_after_1k.iloc[0]

    hits_to_use = "both"
    if hits_to_search == "both":
        is_aft_within_bef = False
        is_bef_within_aft = False

        if isinstance(hit_before, pandas.DataFrame):
            hit_before = hit_before.iloc[0]
        if isinstance(hit_after, pandas.DataFrame):
            hit_after = hit_after.iloc[0]

        if hit_before.iloc[3] > hit_after.iloc[3]:
            is_aft_within_bef = within_a_hit(hit_before, hit_after)
        elif hit_before.iloc[3] < hit_after.iloc[3]:
            is_bef_within_aft = within_a_hit(hit_after, hit_before)

        if is_aft_within_bef == True:
            hits_to_use = "before"
        if is_bef_within_aft == True:
            hits_to_use = "after"

    if hits_to_search == "both":
        return hit_before, hit_after, hits_to_use
    elif hits_to_search == "before":
        return  hit_before
    elif hits_to_search == "after":
        return  hit_after
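## When no candidate flanking match is found, a placeholder of zeros in the
## crunch-file column layout is returned; callers such as ref_contains_hit
## inspect the first field to spot this placeholder.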

def within_a_hit(big_hit, little_hit):
    ## This only works with two hits of the same orientation
    if big_hit['sstart'] < big_hit['send']:
        orientation = "forward"
    elif big_hit['sstart'] > big_hit['send']:
        orientation = "reverse"

    if orientation == "forward":
        output = little_hit['sstart'] > big_hit['sstart'] and little_hit['send'] < big_hit['send']
    elif orientation == "reverse":
        output = little_hit['send'] > big_hit['send'] and little_hit['sstart'] < big_hit['sstart']

    return output
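## Illustrative check: a little hit at subject 200-300 inside a big forward hit
## spanning subject 100-500 returns True; any overhang past either end gives False.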

def gff_finder(gff_csv, isolate_id, clus_name):
    ## Function to get the location of an isolates gff file
    isolate_check = isolate_id + r"\."
    isolate_rows = gff_csv['isolate'].str.contains(isolate_check)
    isolate_row_indy = isolate_rows.where(isolate_rows == True)
    isolate_row_indy = isolate_row_indy.index[isolate_row_indy == True].tolist()
    if len(isolate_row_indy) != 1:
        print(isolate_id)
        print(isolate_row_indy)
    isolate_loc = gff_csv.iloc[isolate_row_indy,0]
    isolate_ref = gff_csv.iloc[isolate_row_indy,1]
    if clus_name:
        cluster_name = gff_csv.iloc[isolate_row_indy,2]
    else:
        cluster_name = "BOO!"

    return isolate_loc, isolate_ref, cluster_name
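## gff_csv is assumed to hold the gff location in column 0, the reference in
## column 1 and, when clus_name is True, the cluster name in column 2.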

def gff_to_dna(gff_csv, contig_csv, isolate_id, input_k):
    ## Function to turn the contig-based gff gene locations into continuous,
    ## whole-genome DNA coordinates (1 .. seq length). Needs at least one gene
    ## on a contig.
    ## Input gff_csv: gff for a contiged assembly
    ##       contig_csv: contig_bound csv

    finding_izzy = re.sub("#","_",isolate_id)
    contig_lengths = contig_csv.iloc[:,1] - contig_csv.iloc[:,0]
    index_to_keep = contig_lengths[contig_lengths > 10000].index
    ## remove DNA sequences ##
    starting_seq = gff_csv['seqid'].str.startswith("##FASTA", na = False)
    starting_seq_indy = starting_seq.where(starting_seq == True)
    starting_seq_indy = starting_seq_indy.index[starting_seq_indy == True].tolist()
    if len(starting_seq_indy) < 1:
        print("No FASTA in this isolate's GFF, skipping: %s" % isolate_id)
        return "No"


    starting_seq_indy = min(starting_seq_indy)

    narrowed_gff = gff_csv.drop(range(starting_seq_indy,(len(gff_csv.index) - 1) ))
    narrowed_gff = narrowed_gff.reset_index(drop=True)


    if "contig" in narrowed_gff.iloc[1,0]:
        ## go through the contig_csv and add on differences
        for k in index_to_keep :
            if k == 0:
                current_contig = k + 1
                num_zeros = 6 - len(str(current_contig))
                empty_str = ""
                contig_finder = "contig" + (empty_str.join((["0"] * num_zeros))) + str(current_contig) + "$"
                contig_rows = narrowed_gff['seqid'].str.contains(contig_finder)
                contig_rows_indy = contig_rows.where(contig_rows == True)
                contig_rows_indy = contig_rows_indy.index[contig_rows_indy == True].tolist()
                last_indy = max(contig_rows_indy)

            else:
                current_contig = k + 1
                num_zeros = 6 - len(str(current_contig))
                empty_str = ""
                contig_finder = "contig" + (empty_str.join((["0"] * num_zeros))) + str(current_contig) + "$"
                num_to_add = contig_csv.iloc[k,0]
                contig_rows = narrowed_gff['seqid'].str.contains(contig_finder)
                contig_rows_indy = contig_rows.where(contig_rows == True)
                contig_rows_indy = contig_rows_indy.index[contig_rows_indy == True].tolist()
                if len(contig_rows_indy) == 0:
                    continue

                narrowed_gff.iloc[contig_rows_indy,3] = narrowed_gff.iloc[contig_rows_indy, 3] + num_to_add - 1
                narrowed_gff.iloc[contig_rows_indy, 4] =  narrowed_gff.iloc[contig_rows_indy, 4] + num_to_add - 1

                last_indy = max(contig_rows_indy)

        gff_index_to_drop = range((last_indy + 1), (len(narrowed_gff.index) - 1))
        out_gff_csv = narrowed_gff.drop(gff_index_to_drop)



    elif finding_izzy in narrowed_gff.iloc[1,0]:
        for k in  index_to_keep:
            if k == 0:
                current_contig = k + 1
                contig_finder = finding_izzy + "\." + str(current_contig) + "$"
                contig_rows = narrowed_gff['seqid'].str.contains(contig_finder)
                contig_rows_indy = contig_rows.where(contig_rows == True)
                contig_rows_indy = contig_rows_indy.index[contig_rows_indy == True].tolist()
                last_indy = max(contig_rows_indy)

            else:
                current_contig = k + 1
                contig_finder = finding_izzy + "\." + str(current_contig) + "$"
                num_to_add = contig_csv.iloc[k,0]


                contig_rows = narrowed_gff['seqid'].str.contains(contig_finder)
                contig_rows_indy = contig_rows.where(contig_rows == True)
                contig_rows_indy = contig_rows_indy.index[contig_rows_indy == True].tolist()
                if len(contig_rows_indy) == 0:
                    continue


                narrowed_gff.iloc[contig_rows_indy,3] = narrowed_gff.iloc[contig_rows_indy, 3] + num_to_add - 1
                narrowed_gff.iloc[contig_rows_indy, 4] =  narrowed_gff.iloc[contig_rows_indy, 4] + num_to_add - 1

                last_indy = max(contig_rows_indy)

        gff_index_to_drop = range((last_indy + 1), (len(narrowed_gff.index) ))
        out_gff_csv = narrowed_gff.drop(gff_index_to_drop)

    ## test out finder ##


    out_gff_csv = out_gff_csv.reset_index(drop=True)

    return out_gff_csv
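## Worked example (numbers illustrative): a gene at 500-900 on contig 3, where
## contig 3 starts at genome position 20001, is shifted to 20500-20900
## (new = old + contig_start - 1).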




def gene_name_tryer(prospective_csv, library_csv, out_hit, missing_isolate, mergio, isolate_id, gene_rows,
                    fasta_csv, reference_name,contig_tab, ref_contig_tab, fasta_out_dir):
    ## Function to take in hits that have missed the first few cutoffs in the hit_integrator function
    ## and try to merge them instead using the flank gene names and then the MGE characteristics
    ## Input: prospective_csv: The single line isolate from the hit_integrator function
    ##        library_csv: The library csv
    ##        out_hit: The total hit df
    ##        missing_isolate: The df for the isolates with no library hit
    ##        mergio: Whether the hit was merged or not (BOOLEAN)
    ##        isolate_id: The current isolate id (for debugging)
    ##        gene_rows: The before and after gene rows from the gff (list of series)
    ##        fasta_csv: The csv of the fastas and the references
    ##        reference_name: The current reference for this isolate, after correcting for MGE presence.
    ##        contig_tab: The isolate's contig table file.
    ##        ref_contig_tab: The reference contig table file.
    ##        fasta_out_dir: Location to store the fasta searches.


    if mergio:
        merged = "Yes"
    else:
        merged = "No"

    missing_copy = missing_isolate.copy()
    out_hit_copy = out_hit.copy()
    library_append = library_csv.copy()

    if not mergio:
        library_use = library_csv[library_csv['mergio'] == "No"]
    else:
        library_use = library_csv.copy()

    bef_and_aft = library_use[(library_use['before_gene_name'] == prospective_csv['before_gene_name'][0]) & \
                              (library_use['after_gene_name'] == prospective_csv['after_gene_name'][0])]


    if not bef_and_aft.empty:

        current_hit_length = bef_and_aft[(bef_and_aft['mge_length'] >= (prospective_csv['mge_length'][0] - 100)) & \
                                         (bef_and_aft['mge_length'] <= (prospective_csv['mge_length'][0] + 100))]

        if mergio:
            tot_hit = current_hit_length
        else:
            current_hit_gene = bef_and_aft[(bef_and_aft['mge_genes'] >= (prospective_csv['mge_genes'][0] - 1)) & \
                                       (bef_and_aft['mge_genes'] <= (prospective_csv['mge_genes'][0] + 1))]

            tot_hit = pandas.concat([current_hit_length, current_hit_gene], sort=False, ignore_index=True)
            tot_hit = tot_hit.drop_duplicates()


        if len(tot_hit.index) == 1:
            prospective_csv.loc[0,'insert_name'] = tot_hit['insert_name'].values
            if mergio:
                prospective_csv['mergio'] = "Yes"
            else:
                prospective_csv['mergio'] = "No"


            out_hit_copy = out_hit_copy.append(prospective_csv, sort = False)

        elif len(tot_hit.index) > 1:
            tot_hit = tot_hit.reset_index(drop=True)
            length_target = prospective_csv['mge_length'][0]
            mge_dist = abs(tot_hit['mge_length'] - length_target)
            min_val = mge_dist.idxmin()
            single_hit = tot_hit.iloc[min_val]#.to_frame().transpose()
            prospective_csv['insert_name'] = pandas.Series(single_hit['insert_name'], index=prospective_csv.index)
            if mergio:
                prospective_csv['mergio'] = "Yes"
            else:
                prospective_csv['mergio'] = "No"

            out_hit_copy = out_hit_copy.append(prospective_csv, sort = False)

        else:
            ## So these hits have the same approved gene names as before. Lets add them to the library instead
            ## of punishing the MGE length criteria

            library_append = merged_adder(prospective_csv, library_csv, merged)


    else:
        # Initially this only ran the BLAST search if no library hit was present; that condition has been dropped for now


        cluster_name = fasta_extractor(isolate_id, gene_rows, fasta_csv, contig_tab, fasta_out_dir)
        before_results, after_results = blast_search(isolate_id, reference_name, cluster_name, fasta_out_dir)
        library_append, missing_copy =  blast_results_adder([before_results,after_results],prospective_csv, library_csv,missing_copy,ref_contig_tab, merged)





    return out_hit_copy, missing_copy, library_append

def fasta_extractor(isolate_id, gene_rows, fasta_csv, contig_bounds, fasta_dir):
    ## Function to extract the gene sequence for the before and after hits in the
    ## MGE identification pipeline
    ## Input: isolate_id: The current isolate's id
    ##        gene_rows: List of the two series for the named genes before and
    ##                   after the MGE insert
    ##        fasta_csv: The csv with the location of the fasta files
    ##        contig_bounds: The list of start and end of a contig to transfer back
    ##        fasta_dir: The directory where the searched for fasta files should be stored

    fasta_file, reference, cluster = gff_finder(fasta_csv, isolate_id, True)

    current_contig_before = contig_number_getter(gene_rows[0].iloc[0])
    current_contig_after = contig_number_getter(gene_rows[1].iloc[0])


    before_contig_start = contig_bounds.iloc[current_contig_before-1,0]
    after_contig_start = contig_bounds.iloc[current_contig_after-1, 0]

    before_hit_locs = gene_rows[0].loc[['start','end']].tolist()
    before_hit_locs = [x - before_contig_start + 1 for x in before_hit_locs]
    before_hit_locs = [int(x) for x in before_hit_locs]


    after_hit_locs = gene_rows[1].loc[['start', 'end']].tolist()
    after_hit_locs = [x - after_contig_start + 1 for x in after_hit_locs]
    after_hit_locs = [int(x) for x in after_hit_locs]
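    ## Worked example (values hypothetical): a gene at absolute positions 12,345-13,500
    ## on a contig starting at 10,000 becomes contig-relative 2,346-3,501 (1-based)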

    bef_strand = str(gene_rows[0].loc['strand'])
    aft_strand = str(gene_rows[1].loc['strand'])

    fasta_file = str(fasta_file.iloc[0])
    bef = False
    aft = False


    for record in SeqIO.parse(fasta_file, "fasta"):
        if "INV200" in isolate_id:
            bef_contig = record.seq
            aft_contig = record.seq
            bef = True
            aft = True
        else:
            current_name = contig_number_getter(record.id)
            if current_name == current_contig_before:
                bef_contig = record.seq
                bef = True
            if current_name == current_contig_after:
                aft_contig = record.seq
                aft = True

        if aft and bef:
            break

    gene_string_bef = str(bef_contig[(before_hit_locs[0] - 1):before_hit_locs[1]])
    gene_string_aft = str(aft_contig[(after_hit_locs[0] - 1):after_hit_locs[1]])
    if bef_strand == "-":
        reverso = Seq(gene_string_bef)  # , generic_dna)

        gene_string_bef = str(reverso.reverse_complement())
    if aft_strand == "-":
        reverso = Seq(gene_string_aft)  # , generic_dna)

        gene_string_aft = str(reverso.reverse_complement())


    before_gene = SeqIO.SeqRecord(Seq(gene_string_bef), id = isolate_id + "_before_gene")

    after_gene = SeqIO.SeqRecord(Seq(gene_string_aft), id=isolate_id + "_after_gene")

    ## OK, it's getting the right positions, let's export these fastas

    with open((fasta_dir + isolate_id + "_before_gene.fasta"), "w+") as output_handle:
        SeqIO.write(before_gene, output_handle, "fasta")
    with open((fasta_dir + isolate_id + "_after_gene.fasta"), "w+") as output_handle:
        SeqIO.write(after_gene, output_handle, "fasta")

    return str(cluster.iloc[0])


def blast_search(isolate_id, reference_loc, cluster_id, fasta_dir):
    ## Function to search the newly created genes against the reference isolate for the
    ## cluster. Will check if this already exists in the tmp_dna_dir folder
    ## input: isolate_id: The current isolate's id, gets us the gene fasta locs
    ##        reference_loc: The current reference name
    ##        cluster_id: The current cluster id, we'll name the reference db after this.
    ##        fasta_dir: The directory where the gene fastas and blast outputs are stored
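    ## Illustrative commands generated below (paths hypothetical):
    ##   makeblastdb -in ./tmp_dna_dir/ref_1.dna -dbtype nucl -out cluster_1_db
    ##   blastn -query <fasta_dir>/iso1_before_gene.fasta -db cluster_1_db \
    ##          -out <fasta_dir>/iso1_before_blast.csv -outfmt 10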

    reference_db = cluster_id + "_db.nhr"
    before_gene = fasta_dir + isolate_id + "_before_gene.fasta"
    after_gene = fasta_dir + isolate_id + "_after_gene.fasta"
    if not os.path.isfile(reference_db):
        if reference_loc == "pmen3_reference":
            data_dir = re.sub("python$", "data", os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))))  # script directory
            reference_name = data_dir + "/pmen3_reference.fasta"
        else:
            reference_name = "./tmp_dna_dir/" + re.sub("#","_", reference_loc) + ".dna"
        db_command = "makeblastdb -in " + reference_name + " -dbtype nucl -out " + cluster_id + "_db"
        subprocess.call(db_command, shell=True)

    before_blast = "blastn -query " + before_gene + " -db " + cluster_id + "_db" + " -out " + fasta_dir + isolate_id + "_before_blast.csv" + " -outfmt 10"
    after_blast = "blastn -query " + after_gene + " -db " + cluster_id + "_db" + " -out " + fasta_dir + isolate_id + "_after_blast.csv" + " -outfmt 10"

    subprocess.call(before_blast, shell=True)
    subprocess.call(after_blast, shell=True)

    before_blast_res = fasta_dir + isolate_id + "_before_blast.csv"
    after_blast_res = fasta_dir + isolate_id + "_after_blast.csv"

    return before_blast_res, after_blast_res

def blast_results_adder(blast_results, prospective_csv, library_csv, missing_csv, ref_contig_tab, merged):
    ## Function to take in the blast results for a hit from blast_search
    ## then to add in the results if the hits are on the same contig as
    ## each other
    ## input: blast_results: List of the blast result locations
    ##        prospective_csv: The current prospective csv
    ##        library_csv: The current library csv
    ##        missing_csv: The current missing csv
    ##        ref_contig_tab: The current reference contig table
    ##        merged: Whether or not the hit is merged to add into merged column

    missing_copy = missing_csv.copy()
    library_csv_copy = library_csv.copy()

    ## BLAST's default -outfmt 10 column order puts mismatch before gapopen
    blast_cols = ['query','subject','pid','align','mismatch','gap','qstart',
                  'qend','sstart','send','eval','bitscore']
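    ## Each -outfmt 10 row is a plain csv line in the column order above, e.g. a
    ## hypothetical row: iso1_before_gene,contig_3,99.20,1450,12,0,1,1450,20450,21899,0.0,2630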

    blast_results_before = pandas.read_csv(blast_results[0], names = blast_cols)
    blast_results_after = pandas.read_csv(blast_results[1], names=blast_cols)


    if blast_results_before.empty or blast_results_after.empty:
        missing_current = pandas.DataFrame()
        missing_current['id'] = pandas.Series(prospective_csv['id'])

        missing_current['mge_start'] = pandas.Series(prospective_csv['mge_start'], index=missing_current.index)
        missing_current['mge_end'] = pandas.Series(prospective_csv['mge_end'], index=missing_current.index)
        missing_current['insert_start'] = pandas.Series(prospective_csv['insert_start'], index=missing_current.index)
        missing_current['insert_end'] = pandas.Series(prospective_csv['insert_end'], index=missing_current.index)
        missing_current['ref_name'] = pandas.Series(prospective_csv['ref_name'], index=missing_current.index)
        missing_current['cluster_name'] = pandas.Series(prospective_csv['cluster_name'], index=missing_current.index)
        missing_current['mge_length'] = pandas.Series(prospective_csv['mge_length'], index=missing_current.index)
        missing_current['before_gene_name'] = pandas.Series(prospective_csv['before_gene_name'],
                                                            index=missing_current.index)
        missing_current['after_gene_name'] = pandas.Series(prospective_csv['after_gene_name'],
                                                           index=missing_current.index)
        missing_current['merged'] = pandas.Series(merged, index=missing_current.index)
        missing_current['reason'] = pandas.Series(["No Fasta results returned"], index=missing_current.index)
        missing_copy = missing_copy.append(missing_current, sort=False)
    else:

        ## Just use the top hit
        before_ref_hits = blast_results_before.iloc[0,[8,9]].tolist()
        after_ref_hits = blast_results_after.iloc[0, [8, 9]].tolist()

        before_contig = contig_checker(ref_contig_tab, before_ref_hits)
        after_contig = contig_checker(ref_contig_tab, after_ref_hits)

        ## OK I'm going to remove the need for the two reference hits to be on the same contig!




        library_csv_copy = merged_adder(prospective_csv, library_csv_copy, merged)

    return library_csv_copy, missing_copy






def merged_adder(prospective_csv, library_csv, merged):
    ## This is a function to add a merged isolate's hit into the library csv.
    ## This will add the hit and label the hit as for merged use only
    ## Input: prospective_csv: The merged hit row
    ##        library_csv: The library csv, with a merged column already added in.
    ##        merged: The mergio column value
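    ## New library entries are numbered sequentially: insert_name = current library size + 1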

    prospective_csv['mergio'] = merged
    prospective_csv['insert_name'] = len(library_csv.index) + 1
    library_csv_merged = library_csv.append(prospective_csv, ignore_index=True, sort=False)

    return library_csv_merged

def contig_number_getter(contig_name):
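    ## Pull the trailing contig number from a contig name.
    ## Illustrative parses (names hypothetical):
    ##   "gnl|ref|0003" -> 3   (pipe-style names: last four characters)
    ##   "contig_1.12"  -> 12  (otherwise everything up to the final "<digit>." is stripped)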
    if "|" in contig_name:
        contig_num = contig_name[-4:]
        contig_num = int(contig_num)
    else:
        contig_num = re.sub("^.*[0-9]\.","",contig_name)
        contig_num =int(contig_num)

    return contig_num


def hit_detector(library_csv, prospective_csv, isolate_id, hit_csv, missing_isolates, mergio, gene_rows,
                 fasta_csv, reference_name, contig_tab, ref_contig_tab, fasta_dir):
    ## Function to allocate hit location
    ## Input: library_csv: The current library csv to search for matches against
    ##        prospective_csv: The prospective set of results to be sorted
    ##        isolate_id: The current isolate id (mainly for debugging)
    ##        hit_csv: The total hit csv the isolate to be integrated into
    ##        missing_isolates: The df of isolates not assigned to a hit value
    ##        mergio: Whether or not the isolate was merged together.
    ##        gene_rows: List of the gff rows for the before and after genes
    ##        fasta_csv: Csv of the fasta and reference fasta locations
    ##        reference_name: Name of the current reference, after rechecking whether the MGE is present
    ##        contig_tab: The contig table file of the isolate in question, for use in the fasta collection
    ##        ref_contig_tab: The reference contig table file
    ##        fasta_dir: Location to store the fasta searches
    ## This will first work on the basis of hits sharing almost identical blast matches
    ## Then we'll look into flank region composition and finally the total insert composition
    ## to see if this is a novel hit or not.
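    ## Matching windows used below (taken from the filters in this function):
    ##   MGE genes +- 1, MGE length +- 5 bp, flank gene counts +- 1,
    ##   flank averages +- 25 bp, insert genes +- 2, insert length +- 500 bp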

    lib_new = library_csv.copy()
    out_hit = hit_csv.copy()
    missing_df = missing_isolates.copy()

    if not mergio:
        lib_use = lib_new[lib_new['mergio'] == "No"]
    else:
        lib_use = lib_new.copy()

    ## Let's narrow down first by the number of genes in the element; if this is not a
    ## close match it's fair to assume the hit is novel.
    #'before_flank_gene', 'after_flank_gene', 'before_flank_avg',
    #'after_flank_avg'



    ## So there is a hit with the same number of MGE genes (+- 1); now check the element
    ## length (+- 5 bp) against the library

    if mergio:
        out_hit, missing_df, lib_new = gene_name_tryer(prospective_csv, library_csv, out_hit, missing_df, mergio, isolate_id, gene_rows,
                                                       fasta_csv, reference_name, contig_tab, ref_contig_tab, fasta_dir)

    else:


        mge_hits = lib_use[(lib_use['mge_genes'] >= (prospective_csv['mge_genes'][0] - 1)) &\
                           (lib_use['mge_genes'] <= (prospective_csv['mge_genes'][0] + 1))]
        mge_length_hits = mge_hits[(mge_hits['mge_length'] >= (prospective_csv['mge_length'][0] - 5))\
                                     & (mge_hits['mge_length'] <= (prospective_csv['mge_length'][0] + 5))]


        if not mge_length_hits.empty:

                ## Let's check if the before or after flanks match closely on gene number (+- 1)
                ## Before
                before_gene_num = prospective_csv['before_flank_gene'][0]
                after_gene_num = prospective_csv['after_flank_gene'][0]


                before_gene_hits = mge_length_hits[(mge_length_hits['before_flank_gene'] >= (before_gene_num - 1)) & \
                                                   (mge_length_hits['before_flank_gene'] <= (before_gene_num + 1))]

                after_gene_hits = mge_length_hits[(mge_length_hits['after_flank_gene'] >= (after_gene_num - 1)) & \
                                                  (mge_length_hits['after_flank_gene'] <= (after_gene_num + 1))]

                before_empty = before_gene_hits.empty
                after_empty = after_gene_hits.empty



                if not before_empty or not after_empty:
                    ## So either the before or the after hit or both match to ones already in the database
                    ## check matches +- 25 bp
                    before_flank_empty = True
                    after_flank_empty = True


                    if not before_empty and after_empty:
                        before_mean_flank = prospective_csv['before_flank_avg'][0]

                        before_flank_means = before_gene_hits[(before_gene_hits['before_flank_avg'] >= (before_mean_flank - 25)) &\
                                                              (before_gene_hits['before_flank_avg'] <= (before_mean_flank + 25))]

                        before_flank_empty = before_flank_means.empty

                    elif before_empty and not after_empty:
                        after_mean_flank = prospective_csv['after_flank_avg'][0]

                        after_flank_means = after_gene_hits[(after_gene_hits['after_flank_avg'] >= (after_mean_flank - 25)) & \
                                                            (after_gene_hits['after_flank_avg'] <= (after_mean_flank + 25))]

                        after_flank_empty = after_flank_means.empty
                    elif not before_empty and not after_empty:
                        before_mean_flank = prospective_csv['before_flank_avg'][0]

                        before_flank_means = before_gene_hits[
                            (before_gene_hits['before_flank_avg'] >= (before_mean_flank - 25)) & \
                            (before_gene_hits['before_flank_avg'] <= (before_mean_flank + 25))]

                        before_flank_empty = before_flank_means.empty

                        after_mean_flank = prospective_csv['after_flank_avg'][0]

                        after_flank_means = after_gene_hits[
                            (after_gene_hits['after_flank_avg'] >= (after_mean_flank - 25)) & \
                            (after_gene_hits['after_flank_avg'] <= (after_mean_flank + 25))]

                        after_flank_empty = after_flank_means.empty



                    if not before_flank_empty or not after_flank_empty:

                        ## So the before or after (or both) flanks seem to match in composition
                        ## Now lets check if the insert length is similar, if not likely a novel insertion in the
                        ## same insert site as before.


                        insert_genes = prospective_csv['insert_genes'][0]
                        insert_length = prospective_csv['insert_length'][0]




                        if "before_flank_means" in locals() and "after_flank_means" in locals():

                            remaining_hits = pandas.concat([before_flank_means, after_flank_means], ignore_index=True, sort=False)
                            remaining_hits = remaining_hits.drop_duplicates()
                        elif "before_flank_means" in locals() and "after_flank_means" not in locals():
                            remaining_hits = before_flank_means
                        else:
                            remaining_hits = after_flank_means

                        gene_hits = remaining_hits[(remaining_hits['insert_genes'] >= (insert_genes - 2)) & \
                                                   (remaining_hits['insert_genes'] <= (insert_genes + 2))]

                        length_hits = remaining_hits[(remaining_hits['insert_length'] >= (insert_length - 500)) & \
                                                     (remaining_hits['insert_length'] <= (insert_length + 500))]



                        if gene_hits.empty and length_hits.empty:
                            out_hit, missing_df, lib_new = gene_name_tryer(prospective_csv, lib_use, out_hit, missing_df, mergio, isolate_id,
                                                                           gene_rows, fasta_csv, reference_name, contig_tab, ref_contig_tab, fasta_dir)

                        else:
                            remain_48 = pandas.concat([gene_hits, length_hits], ignore_index=True, sort = False)
                            remain_48 = remain_48.drop_duplicates()
                            before_gene_name = prospective_csv['before_gene_name'][0]
                            after_gene_name = prospective_csv['after_gene_name'][0]

                            if len(remain_48.index) == 1:
                                if remain_48['before_gene_name'][0] != before_gene_name and remain_48['after_gene_name'][0] != after_gene_name:
                                    out_hit, missing_df, lib_new = gene_name_tryer(prospective_csv, lib_use, out_hit, missing_df, mergio, isolate_id,
                                                                                   gene_rows, fasta_csv, reference_name, contig_tab, ref_contig_tab, fasta_dir)

                                else:
                                    prospective_csv['insert_name'] = pandas.Series(remain_48['insert_name'], index=prospective_csv.index)
                                    if mergio:
                                        prospective_csv['mergio'] = "Yes"
                                    else:
                                        prospective_csv['mergio'] = "No"
                                    out_hit = out_hit.append(prospective_csv, sort = False)

                            elif len(remain_48.index) > 1:
                                ## If there are two or more candidate hits, take the one closest in insert length
                                remain_48 = remain_48.reset_index(drop=True)
                                target_insert = prospective_csv['insert_length'][0]
                                lengers = abs(remain_48['insert_length'] - target_insert)
                                closest_index = lengers.idxmin()

                                if remain_48['before_gene_name'].iloc[closest_index] != before_gene_name and remain_48['after_gene_name'].iloc[closest_index] != after_gene_name:
                                    out_hit, missing_df, lib_new = gene_name_tryer(prospective_csv, lib_use, out_hit, missing_df, mergio, isolate_id, gene_rows,
                                                       fasta_csv, reference_name, contig_tab, ref_contig_tab, fasta_dir)

                                else:
                                    prospective_csv['insert_name'] = pandas.Series(remain_48['insert_name'].iloc[closest_index], index=prospective_csv.index)
                                    if mergio:
                                        prospective_csv['mergio'] = "Yes"
                                    else:
                                        prospective_csv['mergio'] = "No"
                                    out_hit = out_hit.append(prospective_csv, sort=False)



                            else:
                                print("Odd behaviour here")

                    else:
                        out_hit, missing_df, lib_new = gene_name_tryer(prospective_csv, lib_use, out_hit, missing_df, mergio, isolate_id, gene_rows,
                                                       fasta_csv, reference_name, contig_tab, ref_contig_tab, fasta_dir)

                else:
                    out_hit, missing_df, lib_new = gene_name_tryer(prospective_csv, lib_use, out_hit, missing_df, mergio, isolate_id, gene_rows,
                                                       fasta_csv, reference_name, contig_tab, ref_contig_tab, fasta_dir)

        else:
            ## Try to see if there is a hit with the same start and end gene names and mge_length +- 100 bp or genes +- 1
            out_hit, missing_df, lib_new = gene_name_tryer(prospective_csv, lib_use, out_hit, missing_df, mergio, isolate_id, gene_rows,
                                                       fasta_csv, reference_name, contig_tab, ref_contig_tab, fasta_dir)
    return(out_hit, missing_df, lib_new)

def gene_name_finder(flanks_csv, back_it_up):
    ## Function to find the first row in a flanks region that has a gene name and
    ## return that gene name. Removes the paralog suffix attached.
    ## Input: flanks_csv = Either the before or after 5k flank region from the library prospective hit
    ##        back_it_up = Whether to work backwards through the csv to get the closest
    ##                  hits
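    ## Illustrative parse of a gff attribute field (column 9, values hypothetical):
    ##   "ID=x_00123;gene=recA_2;product=..." -> "recA_2" -> "recA" (paralog suffix stripped)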

    new_flanks = flanks_csv.reset_index(drop=True)
    search_res = ""
    gene_row = None


    if back_it_up:
        for k in reversed(range(len(new_flanks.index))):
            search_res = re.findall('gene=.*?;', new_flanks.iloc[k, 8])
            if search_res == []:
                continue
            else:
                search_res = search_res[0]
                search_res = re.findall('=.*?;', search_res)
                search_res = search_res[0]

                search_res = re.sub("=","",search_res)
                search_res = re.sub(";","",search_res)
                gene_row = new_flanks.iloc[k].copy()
                #search_res = search_res[3:-3]

                break
    else:
        for k in range(len(new_flanks.index)):
            search_res = re.findall('gene=.*?;', new_flanks.iloc[k,8])
            if search_res == []:
                continue
            else:
                search_res = search_res[0]
                search_res = re.findall('=.*?;', search_res)
                search_res = search_res[0]

                search_res = re.sub("=", "", search_res)
                search_res = re.sub(";", "", search_res)
                gene_row = new_flanks.iloc[k].copy()

                break

    if search_res == []:
        search_res = "NONE"
        if back_it_up:
            gene_row = new_flanks.iloc[len(new_flanks.index) - 1].copy()
        else:
            gene_row = new_flanks.iloc[0].copy()


    if re.search("_[0-9]$",search_res):
        search_res = re.sub("_[0-9]$", "", search_res)


    return  search_res, gene_row

def library_trim(library_csv):
    ## Function to run through the library csv and remove any hits that look like duplicates
    ## This will be based solely on the before and after genes
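    ## Hits are keyed on their flanking gene pair joined with "£", e.g. a hypothetical
    ## "dnaA£gyrB"; only hits sharing the same pair are compared for duplication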

    before_after_combo = library_csv['before_gene_name'].str.cat(library_csv['after_gene_name'], sep = "£")
    before_after_unique = before_after_combo.unique()
    indies_to_remove = []
    for k in range(len(before_after_unique)):
        combo = before_after_unique[k]
        before = re.split("£", combo, 2)[0]
        after = re.split("£", combo, 2)[1]

        bef_and_aft = library_csv[(library_csv['before_gene_name'] == before) &\
                                  (library_csv['after_gene_name'] == after)]

        if len(bef_and_aft.index) == 1:
            continue
        else:
            ## Work through each hit to see if they match closely to the others and then
            ## remove them.
            ids_to_drop = []
            for hit in bef_and_aft.index:

                current_hit = bef_and_aft.loc[hit]

                if current_hit['id'] in ids_to_drop:
                    continue
                ## hits based on insert length
                current_hit_length = bef_and_aft[(bef_and_aft['insert_length'] >= (current_hit['insert_length'] - 100)) &\
                                                 (bef_and_aft['insert_length'] <= (current_hit['insert_length'] + 100))]
                current_hit_gene = bef_and_aft[(bef_and_aft['insert_genes'] >= (current_hit['insert_genes'] - 1)) &\
                                               (bef_and_aft['insert_genes'] <= (current_hit['insert_genes'] + 1))]

                tot_hit = pandas.concat([current_hit_length, current_hit_gene], sort=False, ignore_index=True)
                tot_hit = tot_hit.drop_duplicates()



                if tot_hit.empty:
                    continue
                else:

                    ## choose max flank lengths as hit

                    max_row = pandas.to_numeric(tot_hit['flanks_length']).idxmax()
                    remove_rows = tot_hit.drop(max_row)
                    ids_to_lose = remove_rows['id'].tolist()
                    ids_to_drop.extend(ids_to_lose)

            ids = list(library_csv['id'][library_csv['id'].isin(ids_to_drop)].index)
            indies_to_remove.extend(ids)


    out_lib = library_csv.drop(indies_to_remove)
    print("Removing this many duplicates from library: %s, now %s hits" % (len(indies_to_remove), len(out_lib.index)))
    return  out_lib

def subject_checker(hit, ori, compo_csv, mge_bounds, mge_ori):
    ## Function to take in hits that don't have close hits in the query strand
    ## from the hit_mover function and then to check if these hits are reasonably
    ## close in the subject strand to allow for merging.
    ## Input: hit: the initial hit input into hit_mover
    ##        ori: The location of the hit (before or after)
    ##        compo_csv: The poss_hits csv, narrowed to hits on the same contig and proximate to the input hit
    ##        mge_bounds: The contig bounds of the contig the mge is found on
    ##        mge_ori: The orientation of the MGE in question.
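    ## Orientation is read off the subject coordinates, e.g. (hypothetical values)
    ## sstart=100, send=900 -> "forward"; sstart=900, send=100 -> "reverse"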

    if hit['sstart'] < hit['send']:
        hit_subject_ori = "forward"
    else:
        ## sstart == send cannot occur for a genuine BLAST alignment
        hit_subject_ori = "reverse"

    hit_new = hit.copy()

    if ori == "before":
        if mge_ori == "forward":
            if hit_subject_ori == "forward":
                narrowed_poss_hits = compo_csv[(compo_csv['send'] >= (hit['sstart'] - 1000)) & \
                                               (compo_csv['send'] <= (hit['sstart'] + 50))]
                narrowed_poss_hits = narrowed_poss_hits.sort_values(by = ['send'], ascending = False)
                if not narrowed_poss_hits.empty:
                    new_align = hit['qend'] - narrowed_poss_hits['qstart'].iloc[0]
                    if new_align >= 5000:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        altered = "merge"
                    else:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        narrowed_poss_hits_2 = compo_csv[(compo_csv['send'] >= (narrowed_poss_hits['sstart'].iloc[0] - 1000)) & \
                                                         (compo_csv['send'] <= (narrowed_poss_hits['sstart'].iloc[0] + 50))]
                        narrowed_poss_hits_2 = narrowed_poss_hits_2.sort_values(by='align', ascending=False)

                        while not narrowed_poss_hits_2.empty and new_align < 5000:
                            new_align = abs(hit['qend'] - narrowed_poss_hits_2['qstart'].iloc[0])
                            extra_hit = pandas.DataFrame(narrowed_poss_hits_2.iloc[0]).transpose()

                            hit_new = pandas.concat([hit_new, extra_hit.iloc[[0]]], sort=False, ignore_index=False)
                            hit_new = hit_new.reset_index(drop=True)
                            if new_align < 5000:
                                ## Drop the merged hit so it cannot be re-selected on the next pass
                                narrowed_poss_hits_2 = narrowed_poss_hits_2.drop(extra_hit.index, errors="ignore")

                        if new_align >= 5000:
                            altered = "merge"
                else:
                    new_align = 0
            elif hit_subject_ori == "reverse":
                narrowed_poss_hits = compo_csv[(compo_csv['send'] >= (hit['sstart'] - 50)) & \
                                               (compo_csv['send'] <= (hit['sstart'] + 1000))]
                narrowed_poss_hits = narrowed_poss_hits.sort_values(by=['send'], ascending=True)
                if not narrowed_poss_hits.empty:
                    new_align = hit['qend'] - narrowed_poss_hits['qstart'].iloc[0]
                    if new_align >= 5000:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        altered = "merge"
                    else:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        narrowed_poss_hits_2 = compo_csv[(compo_csv['send'] >= (narrowed_poss_hits['sstart'].iloc[0] - 50)) & \
                                                         (compo_csv['send'] <= (narrowed_poss_hits['sstart'].iloc[0] + 1000))]
                        narrowed_poss_hits_2 = narrowed_poss_hits_2.sort_values(by='align', ascending=False)

                        while not narrowed_poss_hits_2.empty and new_align < 5000:
                            new_align = abs(hit['qend'] - narrowed_poss_hits_2['qstart'].iloc[0])
                            extra_hit = pandas.DataFrame(narrowed_poss_hits_2.iloc[0]).transpose()

                            hit_new = pandas.concat([hit_new, extra_hit.iloc[[0]]], sort=False, ignore_index=False)
                            hit_new = hit_new.reset_index(drop=True)
                            if new_align < 5000:
                                ## Drop the merged hit so it cannot be re-selected on the next pass
                                narrowed_poss_hits_2 = narrowed_poss_hits_2.drop(extra_hit.index, errors="ignore")

                        if new_align >= 5000:
                            altered = "merge"
                else:
                    new_align = 0
        elif mge_ori == "reverse":
            if hit_subject_ori == "forward":

                narrowed_poss_hits = compo_csv[(compo_csv['sstart'] <= (hit['send'] + 1000))  & \
                                               (compo_csv['sstart'] >= (hit['send'] - 50))]
                narrowed_poss_hits = narrowed_poss_hits.sort_values(by=['sstart'], ascending=True)
                if not narrowed_poss_hits.empty:
                    new_align = abs(hit['qstart'] - narrowed_poss_hits['qend'].iloc[0])
                    if new_align >= 5000:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        altered = "merge"
                    else:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        narrowed_poss_hits_2 = compo_csv[
                            (compo_csv['sstart'] >= (narrowed_poss_hits['send'].iloc[0] - 50)) & \
                            (compo_csv['sstart'] <= (narrowed_poss_hits['send'].iloc[0] + 1000))]
                        narrowed_poss_hits_2 = narrowed_poss_hits_2.sort_values(by='align', ascending=False)

                        while not narrowed_poss_hits_2.empty and new_align < 5000:
                            new_align = abs(hit['qstart'] - narrowed_poss_hits_2['qend'].iloc[0])
                            extra_hit = pandas.DataFrame(narrowed_poss_hits_2.iloc[0]).transpose()

                            hit_new = pandas.concat([hit_new, extra_hit.iloc[[0]]], sort=False, ignore_index=False)
                            hit_new = hit_new.reset_index(drop=True)
                            if new_align < 5000:
                                ## Drop the merged hit so it cannot be re-selected on the next pass
                                narrowed_poss_hits_2 = narrowed_poss_hits_2.drop(extra_hit.index, errors="ignore")

                        if new_align >= 5000:
                            altered = "merge"
                else:
                    new_align = 0
            elif hit_subject_ori == "reverse":
                narrowed_poss_hits = compo_csv[(compo_csv['sstart'] <= (hit['send'] + 50)) & \
                                               (compo_csv['sstart'] >= (hit['send'] - 1000))]
                narrowed_poss_hits = narrowed_poss_hits.sort_values(by=['sstart'], ascending=False)
                if not narrowed_poss_hits.empty:
                    new_align = abs(hit['qstart'] - narrowed_poss_hits['qend'].iloc[0])
                    if new_align >= 5000:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        altered = "merge"
                    else:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        narrowed_poss_hits_2 = compo_csv[(compo_csv['sstart'] >= (narrowed_poss_hits['send'].iloc[0] - 1000)) & \
                                                         (compo_csv['sstart'] <= (narrowed_poss_hits['send'].iloc[0] + 50))]
                        narrowed_poss_hits_2 = narrowed_poss_hits_2.sort_values(by='align', ascending=False)

                        while not narrowed_poss_hits_2.empty and new_align < 5000:
                            new_align = abs(hit['qstart'] - narrowed_poss_hits_2['qend'].iloc[0])
                            extra_hit = pandas.DataFrame(narrowed_poss_hits_2.iloc[0]).transpose()

                            hit_new = pandas.concat([hit_new, extra_hit.iloc[[0]]], sort=False, ignore_index=False)
                            hit_new = hit_new.reset_index(drop=True)
                            if new_align < 5000:
                                ## Drop the merged hit so it cannot be re-selected on the next pass
                                narrowed_poss_hits_2 = narrowed_poss_hits_2.drop(extra_hit.index, errors="ignore")

                        if new_align >= 5000:
                            altered = "merge"

                else:
                    new_align = 0
    elif ori == "after":
        if mge_ori == "forward":
            if hit_subject_ori == "forward":
                narrowed_poss_hits = compo_csv[(compo_csv['sstart'] <= (hit['send'] + 1000)) &\
                                               (compo_csv['sstart'] >= (hit['send'] - 50))]
                narrowed_poss_hits = narrowed_poss_hits.sort_values(by=['sstart'], ascending=True)
                if not narrowed_poss_hits.empty:
                    new_align = abs(hit['qstart'] - narrowed_poss_hits['qend'].iloc[0])
                    if new_align >= 5000:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        altered = "merge"
                    else:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        narrowed_poss_hits_2 = compo_csv[
                            (compo_csv['sstart'] >= (narrowed_poss_hits['send'].iloc[0] - 50)) & \
                            (compo_csv['sstart'] <= (narrowed_poss_hits['send'].iloc[0] + 1000))]
                        narrowed_poss_hits_2 = narrowed_poss_hits_2.sort_values(by='align', ascending=False)

                        while not narrowed_poss_hits_2.empty and new_align < 5000:
                            new_align = abs(hit['qstart'] - narrowed_poss_hits_2['qend'].iloc[0])
                            extra_hit = pandas.DataFrame(narrowed_poss_hits_2.iloc[0]).transpose()

                            hit_new = pandas.concat([hit_new, extra_hit.iloc[[0]]], sort=False, ignore_index=False)
                            hit_new = hit_new.reset_index(drop=True)
                            if new_align < 5000:
                                ## Drop the merged hit so it cannot be re-selected on the next pass
                                narrowed_poss_hits_2 = narrowed_poss_hits_2.drop(extra_hit.index, errors="ignore")

                        if new_align >= 5000:
                            altered = "merge"
                else:
                    new_align = 0
            elif hit_subject_ori == "reverse":
                narrowed_poss_hits = compo_csv[(compo_csv['sstart'] <= (hit['send'] + 50)) & \
                                               (compo_csv['sstart'] >= (hit['send'] - 1000))]
                narrowed_poss_hits = narrowed_poss_hits.sort_values(by=['sstart'], ascending=False)
                if not narrowed_poss_hits.empty:
                    new_align = abs(hit['qstart'] - narrowed_poss_hits['qend'].iloc[0])
                    if new_align >= 5000:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        altered = "merge"
                    else:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        narrowed_poss_hits_2 = compo_csv[
                            (compo_csv['sstart'] >= (narrowed_poss_hits['send'].iloc[0] - 1000)) & \
                            (compo_csv['sstart'] <= (narrowed_poss_hits['send'].iloc[0] + 50))]
                        narrowed_poss_hits_2 = narrowed_poss_hits_2.sort_values(by='align', ascending=False)

                        while not narrowed_poss_hits_2.empty and new_align < 5000:
                            new_align = abs(hit['qstart'] - narrowed_poss_hits_2['qend'].iloc[0])
                            extra_hit = pandas.DataFrame(narrowed_poss_hits_2.iloc[0]).transpose()

                            hit_new = pandas.concat([hit_new, extra_hit.iloc[[0]]], sort=False, ignore_index=False)
                            hit_new = hit_new.reset_index(drop=True)
                            if new_align < 5000:
                                ## Drop the merged hit so it cannot be re-selected on the next pass
                                narrowed_poss_hits_2 = narrowed_poss_hits_2.drop(extra_hit.index, errors="ignore")

                        if new_align >= 5000:
                            altered = "merge"
                else:
                    new_align = 0
        elif mge_ori == "reverse":
            if hit_subject_ori == "forward":
                narrowed_poss_hits = compo_csv[(compo_csv['send'] >= (hit['sstart'] - 1000)) & \
                                               (compo_csv['send'] <= (hit['sstart'] + 50))]
                narrowed_poss_hits = narrowed_poss_hits.sort_values(by = ['send'], ascending = False)

                if not narrowed_poss_hits.empty:
                    new_align = abs(hit['qend'] - narrowed_poss_hits['qstart'].iloc[0])

                    if new_align >= 5000:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        altered = "merge"
                    else:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        narrowed_poss_hits_2 = compo_csv[(compo_csv['send'] >= (narrowed_poss_hits['sstart'].iloc[0] - 1000)) & \
                                                       (compo_csv['send'] <= (narrowed_poss_hits['sstart'].iloc[0] + 50))]
                        narrowed_poss_hits_2 = narrowed_poss_hits_2.sort_values(by='align', ascending=False)

                        while not narrowed_poss_hits_2.empty and new_align < 5000:
                            new_align = abs(hit['qend'] - narrowed_poss_hits_2['qstart'].iloc[0])
                            extra_hit = pandas.DataFrame(narrowed_poss_hits_2.iloc[0]).transpose()

                            hit_new = pandas.concat([hit_new, extra_hit.iloc[[0]]], sort=False, ignore_index=False)
                            hit_new = hit_new.reset_index(drop=True)
                            if new_align < 5000:
                                ## Drop the merged hit so it cannot be re-selected on the next pass
                                narrowed_poss_hits_2 = narrowed_poss_hits_2.drop(extra_hit.index, errors="ignore")

                        if new_align >= 5000:
                            altered = "merge"


                else:
                    new_align = 0
            elif hit_subject_ori == "reverse":
                narrowed_poss_hits = compo_csv[(compo_csv['sstart'] >= (hit['send'] - 50) )& \
                                               (compo_csv['sstart'] <= (hit['send'] + 1000))]
                narrowed_poss_hits = narrowed_poss_hits.sort_values(by=['sstart'], ascending=True)

                if not narrowed_poss_hits.empty:
                    new_align = abs(hit['qend'] - narrowed_poss_hits['qstart'].iloc[0])


                    if new_align >= 5000:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        altered = "merge"
                    else:
                        hit_new = pandas.DataFrame(hit_new).transpose()
                        current_hit = pandas.DataFrame(narrowed_poss_hits.iloc[0]).transpose()
                        hit_new = pandas.concat([hit_new, current_hit], sort=False, ignore_index=False)
                        hit_new = hit_new.reset_index(drop=True)
                        narrowed_poss_hits_2 = compo_csv[(compo_csv['sstart'] >= (narrowed_poss_hits['send'].iloc[0] - 50)) & \
                                                         (compo_csv['sstart'] <= (narrowed_poss_hits['send'].iloc[0] + 1000))]
                        narrowed_poss_hits_2 = narrowed_poss_hits_2.sort_values(by='align', ascending=False)

                        while not narrowed_poss_hits_2.empty and new_align < 5000:
                            new_align = abs(hit['qend'] - narrowed_poss_hits_2['qstart'].iloc[0])
                            extra_hit = pandas.DataFrame(narrowed_poss_hits_2.iloc[0]).transpose()

                            hit_new = pandas.concat([hit_new, extra_hit.iloc[[0]]], sort=False, ignore_index=False)
                            hit_new = hit_new.reset_index(drop=True)
                            if new_align < 5000:
                                ## Drop the merged hit so it cannot be re-selected on the next pass
                                narrowed_poss_hits_2 = narrowed_poss_hits_2.drop(extra_hit.index, errors="ignore")

                        if new_align >= 5000:
                            altered = "merge"
                else:
                    new_align = 0

    return hit_new, new_align

def multi_hit_merger(hit_new_bef):
    ## Function to merge the many ACT comparison hits found close to a hit into one
    ## merged hit on the comparison scale.
    ## Input: hit_new_bef: Pandas DF of multiple act comparisons to merge together
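    ## Illustrative merge (values hypothetical): hits spanning qstart-qend 100-2100 and
    ## 2050-6200 collapse into one hit spanning 100-6200 with align = 6100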
    max_row = pandas.to_numeric(hit_new_bef['qend']).idxmax()
    min_row = pandas.to_numeric(hit_new_bef['qstart']).idxmin()
    length_bef = hit_new_bef.iloc[max_row, 7] - hit_new_bef.iloc[min_row, 6]
    hit_bef_out = pandas.Series({'query': hit_new_bef['query'].iloc[0],
                                 'subject': hit_new_bef['subject'].iloc[0],
                                 'pid': hit_new_bef['pid'].iloc[0],
                                 'align': length_bef,
                                 'gap': hit_new_bef['gap'].iloc[0],
                                 'mismatch': hit_new_bef['mismatch'].iloc[0],
                                 'qstart': hit_new_bef.iloc[min_row, 6],
                                 'qend': hit_new_bef.iloc[max_row, 7],
                                 'sstart': hit_new_bef.iloc[min_row, 8],
                                 'send': hit_new_bef.iloc[max_row, 9],
                                 'eval': hit_new_bef['eval'].iloc[0],
                                 'bitscore': hit_new_bef['bitscore'].iloc[0]})

    return hit_bef_out

def multi_contig_dropper(hits, ref_contig, ref_contig_bounds):
    ## Function to take in a df from the comparison and check whether each hit lies on a given contig,
    ## returning a df of only the hits on that contig
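    ## e.g. with ref_contig = 2, any hit whose subject coordinates fall on contig 1 or 3
    ## (as judged by contig_checker) is dropped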
    returned_hits = hits.copy()

    if not hits.empty:
        drop_indie = []
        if isinstance(hits, pandas.Series):
            hits = hits.to_frame().transpose()
        #returned_hits = returned_hits.reset_index(drop=True)
        for k in range(len(hits.index)):
            current_locs = hits.iloc[k,[8,9]].tolist()
            locs_contig = contig_checker(ref_contig_bounds, current_locs)
            if locs_contig != ref_contig:
                drop_indie.append(hits.index[k])



        returned_hits = hits.drop(drop_indie)
        #returned_hits = returned_hits.reset_index(drop=True)

    return returned_hits


def hit_mover(hit_before, hit_after, compo_csv, isolate_id, mge_bounds, mge_ori_bef, mge_ori_aft, ref_contig_bounds, ref_contigs):
    ## Function to merge in neighbouring hits when the current hit is shorter than 5 kb
    ## and a new hit is very close. Finds the closest hits, if they are within 50 bp of the next nearest.
    ## Input:
    ## hit_before - The act comparison hit identified upstream of the insertion
    ## hit_after - The act comparison hit identified downstream of the insertion
    ## compo_csv - The act comparison csv
    ## isolate_id - The isolate_id
    ## mge_bounds - List of length two, with each item the bounds of the contig the before and after hits are on
    ## mge_ori_bef - The orientation of the start of the MGE
    ## mge_ori_aft - The orientation of the end of the MGE
    ## ref_contig_bounds - The reference contig table
    ## ref_contigs - The contigs of the reference where the MGE is present.
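    ## Illustrative criterion (thresholds from the code below): a flanking hit shorter
    ## than 5,000 bp is extended with neighbours within ~1 kb until the merged span
    ## reaches 5,000 bp; failing that, the subject strand is checked via subject_checker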



    hit_before_length = abs(hit_before.iloc[7] - hit_before.iloc[6])
    hit_after_length = abs(hit_after.iloc[7] - hit_after.iloc[6])

    hit_new_bef = hit_before.copy()
    hit_new_aft = hit_after.copy()



    before_pass = False
    after_pass = False



    if hit_before_length < 5000:
        if mge_ori_bef == "forward":
            ## looking for hits previous to the current before
            poss_hit_before = compo_csv[(compo_csv['qend'] <= (hit_before['qstart'] + 50)) &\
                                         (compo_csv['qstart'] >= (mge_bounds[0][0] - 10)) ]
            poss_hit_before = poss_hit_before.sort_values(by = 'qend', ascending=False)
            new_align = hit_before_length
            if not poss_hit_before.empty:
                poss_hit_before['indexio'] = pandas.Series(list(poss_hit_before.index.values),index=poss_hit_before.index)
                current_hit = poss_hit_before[poss_hit_before['qend'] >= (hit_new_bef.iloc[6] - 1000)]
                current_hit = multi_contig_dropper(current_hit, ref_contigs[0], ref_contig_bounds)
                if not current_hit.empty:
                    new_align = hit_new_bef.iloc[7] - current_hit['qstart'].iloc[0]
                    if new_align >= 5000:
                        hit_new_bef = pandas.DataFrame(hit_new_bef).transpose()
                        current_hit = pandas.DataFrame(current_hit.iloc[0]).transpose()
                        hit_new_bef = pandas.concat([hit_new_bef, current_hit], sort=False, ignore_index=False)
                        hit_new_bef = hit_new_bef.reset_index(drop=True)
                    else:

                        hit_new_bef = pandas.DataFrame(hit_new_bef).transpose()
                        current_hit = pandas.DataFrame(current_hit.iloc[0]).transpose()
                        hit_new_bef = pandas.concat([hit_new_bef, current_hit], sort=False, ignore_index=False)
                        hit_new_bef = hit_new_bef.reset_index(drop=True)
                        poss_hit_before = poss_hit_before[poss_hit_before['indexio'] != current_hit['indexio'].iloc[0]]
                        while_n = 0
                        while not current_hit.empty and new_align < 5000:
                            while_n += 1
                            if while_n != 1:
                                hit_new_bef = pandas.concat([hit_new_bef, current_hit.iloc[[0]]], sort=False,ignore_index=False)
                                hit_new_bef = hit_new_bef.reset_index(drop=True)
                                poss_hit_before = poss_hit_before[poss_hit_before['indexio'] != current_hit['indexio'].iloc[0]]
                            end_indy = max(hit_new_bef.index.values) - 1
                            current_hit = poss_hit_before[(poss_hit_before['qend'] >= (hit_new_bef.iloc[end_indy, 6] - 1000))]
                            current_hit = multi_contig_dropper(current_hit, ref_contigs[0], ref_contig_bounds)
                            if not current_hit.empty:
                                new_align = abs(hit_new_bef.iloc[0, 7] - current_hit.iloc[0, 6])
                                poss_hit_before = poss_hit_before.drop(current_hit.index[0])
                        if new_align < 5000:
                            hit_new_bef = multi_hit_merger(hit_new_bef)
                            hit_new_bef, new_align = subject_checker(hit_new_bef, "before",poss_hit_before, mge_bounds, "forward")


                else:

                    hit_new_bef, new_align = subject_checker(hit_before, "before", poss_hit_before, mge_bounds, "forward")

            if isinstance(hit_new_bef, pandas.DataFrame):
                before_process = "merge"
            else:
                before_process = "no"

        elif mge_ori_bef == "reverse":

            poss_hit_before = compo_csv[(compo_csv['qstart'] >= (hit_before['qend'] - 50)) & \
                                        (compo_csv['qend'] <= (mge_bounds[0][1] + 10)) ]
            poss_hit_before = poss_hit_before.sort_values(by='qstart', ascending=True)

            if not poss_hit_before.empty:
                ## add an index column to remove hits merged in
                poss_hit_before['indexio'] = pandas.Series(list(poss_hit_before.index.values), index=poss_hit_before.index)
                current_hit = poss_hit_before[poss_hit_before['qstart'] <= (hit_new_bef.iloc[7] + 1000)]

                ## Going to assume that the match in the reference is not split over two contigs, that would
                ## be fatal!
                current_hit = multi_contig_dropper(current_hit, ref_contigs[0], ref_contig_bounds)

                if not current_hit.empty:
                    new_align = abs(hit_new_bef.iloc[6] - current_hit['qend'].iloc[0])

                    if new_align >= 5000:
                        hit_new_bef = pandas.DataFrame(hit_new_bef).transpose()
                        current_hit = pandas.DataFrame(current_hit.iloc[0]).transpose()
                        hit_new_bef = pandas.concat([hit_new_bef, current_hit], sort=False, ignore_index=False)
                        hit_new_bef = hit_new_bef.reset_index(drop=True)

                    else:
                        hit_new_bef = pandas.DataFrame(hit_new_bef).transpose()
                        current_hit = pandas.DataFrame(current_hit.iloc[0]).transpose()
                        hit_new_bef = pandas.concat([hit_new_bef, current_hit], sort=False, ignore_index=False)
                        hit_new_bef = hit_new_bef.reset_index(drop=True)
                        poss_hit_before = poss_hit_before[poss_hit_before['indexio'] != current_hit['indexio'].iloc[0]]

                        while_n = 0
                        while not current_hit.empty and new_align < 5000:
                            while_n += 1
                            if while_n != 1:

                                hit_new_bef = pandas.concat([hit_new_bef, current_hit.iloc[[0]]], sort=False, ignore_index=False)
                                hit_new_bef = hit_new_bef.reset_index(drop=True)
                                poss_hit_before = poss_hit_before[poss_hit_before['indexio'] != current_hit['indexio'].iloc[0]]
                            end_indy = max(hit_new_bef.index.values) - 1
                            current_hit = poss_hit_before[poss_hit_before['qstart'] <= (hit_new_bef.iloc[end_indy, 7] + 1000)]

                            current_hit = multi_contig_dropper(current_hit, ref_contigs[0], ref_contig_bounds)


                            if not current_hit.empty:
                                new_align = abs(hit_new_bef.iloc[0, 6] - current_hit.iloc[0, 7])
                                poss_hit_before = poss_hit_before.drop(current_hit.index[0])

                        if new_align < 5000:
                            hit_new_bef = multi_hit_merger(hit_new_bef)
                            hit_new_bef, new_align = subject_checker(hit_new_bef, "before",poss_hit_before, mge_bounds, "reverse")
                else:

                    hit_new_bef, new_align = subject_checker(hit_before, "before", poss_hit_before, mge_bounds, "reverse")

            if isinstance(hit_new_bef, pandas.DataFrame):
                before_process = "merge"
            else:
                before_process = "no"


    else:
        before_pass = True
        before_process = "No"

    if hit_after_length < 5000:
        if mge_ori_aft == "forward":
            poss_hit_after = compo_csv[(compo_csv['qstart'] >= (hit_after['qend'] - 50)) & \
                                        (compo_csv['qend'] <= (mge_bounds[1][1] + 10)) ]
            poss_hit_after = poss_hit_after.sort_values(by='qstart', ascending=True)

            if not poss_hit_after.empty:
                poss_hit_after['indexio'] = pandas.Series(list(poss_hit_after.index.values),
                                                           index=poss_hit_after.index)
                current_hit = poss_hit_after[poss_hit_after['qstart'] <= (hit_new_aft.iloc[7] + 1000)]
                current_hit = multi_contig_dropper(current_hit, ref_contigs[1], ref_contig_bounds)
                if not current_hit.empty:
                    new_align = abs(hit_new_aft.iloc[6] - current_hit['qend'].iloc[0])
                    if new_align >= 5000:
                        hit_new_aft = pandas.DataFrame(hit_new_aft).transpose()
                        current_hit = pandas.DataFrame(current_hit.iloc[0]).transpose()
                        hit_new_aft = pandas.concat([hit_new_aft, current_hit], sort=False, ignore_index=False)
                        hit_new_aft = hit_new_aft.reset_index(drop=True)

                    else:
                        hit_new_aft = pandas.DataFrame(hit_new_aft).transpose()
                        current_hit = pandas.DataFrame(current_hit.iloc[0]).transpose()
                        hit_new_aft = pandas.concat([hit_new_aft, current_hit], sort=False, ignore_index=False)
                        hit_new_aft = hit_new_aft.reset_index(drop=True)
                        poss_hit_after = poss_hit_after[poss_hit_after['indexio'] != current_hit['indexio'].iloc[0]]
                        while_n = 0
                        while not current_hit.empty and new_align < 5000:
                            while_n += 1
                            if while_n != 1:
                                hit_new_aft = pandas.concat([hit_new_aft, current_hit.iloc[[0]]], sort=False, ignore_index=False)
                                hit_new_aft = hit_new_aft.reset_index(drop=True)
                                poss_hit_after = poss_hit_after[
                                    poss_hit_after['indexio'] != current_hit['indexio'].iloc[0]]
                            end_indy = max(hit_new_aft.index.values) - 1
                            current_hit = poss_hit_after[poss_hit_after['qstart'] <= (hit_new_aft.iloc[end_indy, 7] + 1000)]
                            current_hit = multi_contig_dropper(current_hit, ref_contigs[1], ref_contig_bounds)
                            if not current_hit.empty:
                                new_align = abs(hit_new_aft.iloc[0, 6] - current_hit.iloc[0, 7])
                                poss_hit_after = poss_hit_after.drop(current_hit.index[0])
                        if new_align < 5000:
                            hit_new_aft = multi_hit_merger(hit_new_aft)
                            hit_new_aft, new_align = subject_checker(hit_new_aft, "after",poss_hit_after, mge_bounds, "forward")
            else:
                hit_new_aft, new_align = subject_checker(hit_after, "after", poss_hit_after, mge_bounds, "forward")

            if isinstance(hit_new_aft, pandas.DataFrame):
                after_process = "merge"
            else:
                after_process = "no"
        elif mge_ori_aft == "reverse":
            ## looking for hits upstream (in query coordinates) of the current after hit
            poss_hit_after = compo_csv[(compo_csv['qend'] <= (hit_after['qstart'] + 50)) & \
                                        (compo_csv['qstart'] >= (mge_bounds[1][0] - 10)) ]
            poss_hit_after = poss_hit_after.sort_values(by='qend', ascending=False)
            new_align = hit_after_length
            if not poss_hit_after.empty:
                poss_hit_after['indexio'] = pandas.Series(list(poss_hit_after.index.values),
                                                          index=poss_hit_after.index)
                current_hit = poss_hit_after[poss_hit_after['qend'] >= (hit_new_aft.iloc[6] - 1000)]
                current_hit = multi_contig_dropper(current_hit, ref_contigs[1], ref_contig_bounds)
                if not current_hit.empty:

                    new_align = hit_new_aft.iloc[7] - current_hit['qstart'].iloc[0]
                    if new_align >= 5000:


                        hit_new_aft = pandas.DataFrame(hit_new_aft).transpose()
                        current_hit = pandas.DataFrame(current_hit.iloc[0]).transpose()
                        hit_new_aft = pandas.concat([hit_new_aft, current_hit], sort=False, ignore_index=False)
                        hit_new_aft = hit_new_aft.reset_index(drop=True)

                    else:

                        hit_new_aft = pandas.DataFrame(hit_new_aft).transpose()
                        current_hit = pandas.DataFrame(current_hit.iloc[0]).transpose()
                        hit_new_aft = pandas.concat([hit_new_aft, current_hit], sort=False, ignore_index=False)
                        hit_new_aft = hit_new_aft.reset_index(drop=True)
                        poss_hit_after = poss_hit_after[poss_hit_after['indexio'] != current_hit['indexio'].iloc[0]]
                        while_n = 0
                        while not current_hit.empty and new_align < 5000:
                            while_n += 1
                            if while_n != 1:
                                hit_new_aft = pandas.concat([hit_new_aft, current_hit.iloc[[0]]], sort=False, ignore_index=False)
                                hit_new_aft = hit_new_aft.reset_index(drop=True)
                                poss_hit_after = poss_hit_after[
                                    poss_hit_after['indexio'] != current_hit['indexio'].iloc[0]]
                            end_indy = max(hit_new_aft.index.values) - 1
                            current_hit = poss_hit_after[poss_hit_after['qend'] >= (hit_new_aft.iloc[end_indy, 6] - 1000)]
                            current_hit = multi_contig_dropper(current_hit, ref_contigs[1], ref_contig_bounds)
                            if not current_hit.empty:
                                new_align = hit_new_aft.iloc[0, 7] - current_hit.iloc[0, 6]
                                poss_hit_after = poss_hit_after.drop(current_hit.index[0])
                        if new_align < 5000:
                            hit_new_aft = multi_hit_merger(hit_new_aft)
                            hit_new_aft, new_align = subject_checker(hit_new_aft, "after", poss_hit_after, mge_bounds, "reverse")


                else:
                    hit_new_aft, new_align = subject_checker(hit_after, "after", poss_hit_after, mge_bounds, "reverse")

            if isinstance(hit_new_aft, pandas.DataFrame):

                after_process = "merge"
            else:
                after_process = "no"
    else:
        after_pass = True
        after_process = "No"


    if before_process == "merge":

        max_row = pandas.to_numeric(hit_new_bef['qend']).idxmax()
        min_row = pandas.to_numeric(hit_new_bef['qstart']).idxmin()
        length_bef = hit_new_bef.iloc[max_row, 7] - hit_new_bef.iloc[min_row, 6]
        hit_bef_out = pandas.Series({'query': hit_before['query'],
                                 'subject': hit_before['subject'],
                                 'pid': hit_before['pid'],
                                 'align': length_bef,
                                 'gap': hit_before['gap'],
                                 'mismatch': hit_before['mismatch'],
                                 'qstart': hit_new_bef.iloc[min_row, 6],
                                 'qend': hit_new_bef.iloc[max_row, 7],
                                 'sstart': hit_new_bef.iloc[min_row,8],
                                 'send': hit_new_bef.iloc[max_row, 9],
                                 'eval': hit_before['eval'],
                                 'bitscore': hit_before['bitscore']})

        before_pass = length_bef >= 5000
    else:
        hit_bef_out = hit_before.copy()

    if after_process == "merge":
        max_row = pandas.to_numeric(hit_new_aft['qend']).idxmax()
        min_row = pandas.to_numeric(hit_new_aft['qstart']).idxmin()
        length_aft = hit_new_aft.iloc[max_row, 7] - hit_new_aft.iloc[min_row, 6]
        hit_aft_out = pandas.Series({'query': hit_after['query'],
                                     'subject': hit_after['subject'],
                                     'pid': hit_after['pid'],
                                     'align': length_aft,
                                     'gap': hit_after['gap'],
                                     'mismatch': hit_after['mismatch'],
                                     'qstart': hit_new_aft.iloc[min_row, 6],
                                     'qend': hit_new_aft.iloc[max_row, 7],
                                     'sstart': hit_new_aft.iloc[min_row, 8],
                                     'send': hit_new_aft.iloc[max_row, 9],
                                     'eval': hit_after['eval'],
                                     'bitscore': hit_after['bitscore']})


        after_pass = length_aft >= 5000
    else:
        hit_aft_out = hit_after.copy()


    tot_pass = before_pass and after_pass


    return hit_bef_out, hit_aft_out, tot_pass


def final_acceptor(hit_before, hit_after, isolate_id, mge_bounds, mge_ori_bef, mge_ori_after):
    ## Function to check whether the merged hits reach 5000 bp and, if not,
    ## to take a flat 5000 bp window from the hit edge, provided it stays
    ## within the contig bounds.
    ## input: hit_before: Merged hit before from hit_mover
    ##        hit_after: Merged hit after from hit_mover
    ##        isolate_id: The current isolate_id
    ##        mge_bounds: The contig bounds for the mge, list of two lists
    ##        mge_ori_bef: The orientation of the mge at the start of the MGE
    ##        mge_ori_after: The orientation of the mge at the end of the MGE.
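
    ## Sketch of the idea (hypothetical values): a forward-orientation before
    ## hit with qend = 20000 but align < 5000 is replaced by a flat 5 kb window
    ## (qstart = 15000, qend = 20000), provided 15000 still lies inside the
    ## contig; the subject coordinates are shifted by the same 5 kb so the
    ## flank stays anchored in the reference.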

    hit_before_length = abs(hit_before.iloc[7] - hit_before.iloc[6])
    hit_after_length = abs(hit_after.iloc[7] - hit_after.iloc[6])

    hit_new_bef = hit_before.copy()
    hit_new_aft = hit_after.copy()

    if hit_before_length < 5000:
        if mge_ori_bef == "forward":
            new_end = hit_before.iloc[7] - 5000
            if new_end > mge_bounds[0][0]:
                max_send = max([hit_before['sstart'], hit_before['send']])

                hit_bef_out = pandas.Series({'query': hit_before['query'],
                                             'subject': hit_before['subject'],
                                             'pid': hit_before['pid'],
                                             'align': 5000,
                                             'gap': hit_before['gap'],
                                             'mismatch': hit_before['mismatch'],
                                             'qstart': new_end,
                                             'qend': hit_new_bef['qend'],
                                             'sstart': max_send - 5000,
                                             'send': max_send,
                                             'eval': hit_before['eval'],
                                             'bitscore': hit_before['bitscore']})
                hit_before_length = 5000
            else:
                hit_bef_out = hit_new_bef
        elif mge_ori_bef == "reverse":
            new_end = hit_before.iloc[6] + 5000
            if new_end < mge_bounds[0][1]:
                min_send = min([hit_before['sstart'], hit_before['send']])
                hit_bef_out = pandas.Series({'query': hit_before['query'],
                                             'subject': hit_before['subject'],
                                             'pid': hit_before['pid'],
                                             'align': 5000,
                                             'gap': hit_before['gap'],
                                             'mismatch': hit_before['mismatch'],
                                             'qstart': hit_before['qstart'],
                                             'qend': new_end,
                                             'sstart': min_send,
                                             'send': min_send + 5000,
                                             'eval': hit_before['eval'],
                                             'bitscore': hit_before['bitscore']})
                hit_before_length = 5000
            else:
                hit_bef_out = hit_new_bef
    else:
        hit_bef_out = hit_new_bef

    ## Now for the after hits
    if hit_after_length < 5000:
        if mge_ori_after == "reverse":
            new_end = hit_after.iloc[7] - 5000
            if new_end > mge_bounds[1][0]:
                max_send = max([hit_after['sstart'], hit_after['send']])
                hit_aft_out = pandas.Series({'query': hit_after['query'],
                                             'subject': hit_after['subject'],
                                             'pid': hit_after['pid'],
                                             'align': 5000,
                                             'gap': hit_after['gap'],
                                             'mismatch': hit_after['mismatch'],
                                             'qstart': new_end,
                                             'qend': hit_after['qend'],
                                             'sstart': max_send,
                                             'send': max_send - 5000,
                                             'eval': hit_after['eval'],
                                             'bitscore': hit_after['bitscore']})
                hit_after_length = 5000
            else:
                hit_aft_out = hit_new_aft
        elif mge_ori_after == "forward":
            new_end = hit_after.iloc[6] + 5000
            if new_end < mge_bounds[1][1]:
                min_send = min([hit_after['sstart'], hit_after['send']])
                hit_aft_out = pandas.Series({'query': hit_after['query'],
                                             'subject': hit_after['subject'],
                                             'pid': hit_after['pid'],
                                             'align': 5000,
                                             'gap': hit_after['gap'],
                                             'mismatch': hit_after['mismatch'],
                                             'qstart': hit_after['qstart'],
                                             'qend': new_end,
                                             'sstart': min_send,
                                             'send': min_send + 5000,
                                             'eval': hit_after['eval'],
                                             'bitscore': hit_after['bitscore']})
                hit_after_length = 5000
            else:
                hit_aft_out = hit_new_aft
    else:
        hit_aft_out = hit_new_aft

    if hit_before_length >= 5000 and hit_after_length >= 5000:
        both_tigs = True
    else:
        both_tigs = False

    return  hit_bef_out, hit_aft_out, both_tigs

def before_and_after_check(hit_to_check, mge_locs, compo_table, hit_loc, other_hit, isolate_id, mge_bef_ori,
                           mge_aft_ori):
    ## This function takes input from the hit distance checker. We're looking
    ## for any hits that might have been missed by our initial search in the
    ## before_and_after_hits function, i.e. they have an align < 2000. They
    ## have to be adjacent to the mge and adjacent to the hit_to_check in the
    ## reference genome.
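
    ## Example of the case handled here (hypothetical): a 900 bp before hit was
    ## accepted, but a neighbouring hit that also abuts the MGE start (within
    ## 50 bp of it on the query and within 500 bp of the other flank on the
    ## subject) was missed by the first pass; the branches below gather such
    ## hits so they can be merged by multi_hit_merger downstream.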

    hit_check_query = hit_to_check[[6, 7]]
    hit_check_subject = hit_to_check[[8, 9]]
    other_hit_subject = other_hit[[8,9]]

    if hit_loc == "before" :
        mge_ori = mge_bef_ori
    elif hit_loc == "after":
        mge_ori = mge_aft_ori

    if hit_check_subject[0] < hit_check_subject[1]:
        check_ori = "forward"
    elif hit_check_subject[0] > hit_check_subject[1]:
        check_ori = "reverse"

    added_hits = pandas.DataFrame()

    if hit_loc == "before":
        if mge_ori == "forward":
            if check_ori == "forward":
                poss_hits = compo_table[(compo_table['qend'] < (mge_locs[0] + 50)) &\
                                        (compo_table['qend'] > hit_check_query[0]) &\
                                        (compo_table['send'] > hit_check_subject[1]) &\
                                        (compo_table['sstart'] < (other_hit_subject[0] + 500))]
                poss_hits = poss_hits.sort_values(by=['qstart'], ascending=True)
                if not poss_hits.empty:
                    hit_orig = pandas.DataFrame(hit_to_check).transpose()
                    poss_hits = poss_hits.reset_index(drop=True)
                    current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                    added_hits = pandas.concat([added_hits,hit_orig, current_hit], ignore_index=True, sort=False)
                    poss_hits = poss_hits.drop(0)
                    new_row = 0
                    while not poss_hits.empty:
                        new_row += 1
                        current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                        added_hits = pandas.concat([added_hits, current_hit], ignore_index=True, sort=False)
                        poss_hits = poss_hits.drop(new_row)



            elif check_ori == "reverse":
                poss_hits = compo_table[(compo_table['qend'] < (mge_locs[0] + 50)) & \
                                        (compo_table['qend'] > hit_check_query[0]) & \
                                        (compo_table['send'] < hit_check_subject[1]) &\
                                        (compo_table['sstart'] > (other_hit_subject[0] - 500))]
                poss_hits = poss_hits.sort_values(by=['qend'], ascending=True)
                if not poss_hits.empty:
                    hit_orig = pandas.DataFrame(hit_to_check).transpose()
                    poss_hits = poss_hits.reset_index(drop=True)
                    current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                    added_hits = pandas.concat([added_hits,hit_orig, current_hit], ignore_index=True, sort=False)
                    poss_hits = poss_hits.drop(0)
                    new_row = 0
                    while not poss_hits.empty:
                        new_row += 1
                        current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                        added_hits = pandas.concat([added_hits, current_hit], ignore_index=True, sort=False)
                        poss_hits = poss_hits.drop(new_row)

        elif mge_ori == "reverse":
            if check_ori == "forward":

                poss_hits = compo_table[(compo_table['qstart'] > (mge_locs[0] - 50)) & \
                                        (compo_table['qstart'] < hit_check_query[1]) & \
                                        (compo_table['sstart'] < hit_check_subject[0]) & \
                                        (compo_table['send'] > (other_hit_subject[1] - 500))]
                poss_hits = poss_hits.sort_values(by=['qstart'], ascending=False)



                if not poss_hits.empty:
                    hit_orig = pandas.DataFrame(hit_to_check).transpose()
                    poss_hits = poss_hits.reset_index(drop=True)
                    current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                    added_hits = pandas.concat([added_hits, hit_orig, current_hit], ignore_index=True, sort=False)
                    poss_hits = poss_hits.drop(0)
                    new_row = 0
                    while not poss_hits.empty:
                        new_row += 1
                        current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                        added_hits = pandas.concat([added_hits, current_hit], ignore_index=True, sort=False)
                        poss_hits = poss_hits.drop(new_row)

            elif check_ori == "reverse":
                poss_hits = compo_table[(compo_table['qstart'] > (mge_locs[0] - 50)) & \
                                        (compo_table['qstart'] < hit_check_query[1]) & \
                                        (compo_table['sstart'] > hit_check_subject[0]) &\
                                        (compo_table['send'] < (other_hit_subject[1] + 500))]
                poss_hits = poss_hits.sort_values(by=['qstart'], ascending=False)
                if not poss_hits.empty:
                    hit_orig = pandas.DataFrame(hit_to_check).transpose()
                    poss_hits = poss_hits.reset_index(drop=True)
                    current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                    added_hits = pandas.concat([added_hits, hit_orig, current_hit], ignore_index=True, sort=False)
                    poss_hits = poss_hits.drop(0)
                    new_row = 0
                    while not poss_hits.empty:
                        new_row += 1
                        current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                        added_hits = pandas.concat([added_hits, current_hit], ignore_index=True, sort=False)
                        poss_hits = poss_hits.drop(new_row)

    elif hit_loc == "after":
        if mge_ori == "forward":
            if check_ori == "forward":
                poss_hits = compo_table[(compo_table['qstart'] > (mge_locs[1] - 50)) & \
                                        (compo_table['qstart'] < hit_check_query[1]) & \
                                        (compo_table['sstart'] < hit_check_subject[0])&\
                                        (compo_table['send'] > (other_hit_subject[1] - 500))]
                poss_hits = poss_hits.sort_values(by=['qstart'], ascending=False)
                if not poss_hits.empty:
                    hit_orig = pandas.DataFrame(hit_to_check).transpose()
                    poss_hits = poss_hits.reset_index(drop=True)
                    current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                    added_hits = pandas.concat([added_hits, hit_orig, current_hit], ignore_index=True, sort=False)
                    poss_hits = poss_hits.drop(0)
                    new_row = 0
                    while not poss_hits.empty:
                        new_row += 1
                        current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                        added_hits = pandas.concat([added_hits, current_hit], ignore_index=True, sort=False)
                        poss_hits = poss_hits.drop(new_row)


            elif check_ori == "reverse":
                poss_hits = compo_table[(compo_table['qstart'] > (mge_locs[1] - 50)) & \
                                        (compo_table['qstart'] < hit_check_query[1]) & \
                                        (compo_table['sstart'] > hit_check_subject[0]) &\
                                        (compo_table['send'] < (other_hit_subject[1] + 500))]
                poss_hits = poss_hits.sort_values(by=['qstart'], ascending=False)
                if not poss_hits.empty:
                    hit_orig = pandas.DataFrame(hit_to_check).transpose()
                    poss_hits = poss_hits.reset_index(drop=True)
                    current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                    added_hits = pandas.concat([added_hits, hit_orig, current_hit], ignore_index=True, sort=False)
                    poss_hits = poss_hits.drop(0)
                    new_row = 0
                    while not poss_hits.empty:
                        new_row += 1
                        current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                        added_hits = pandas.concat([added_hits, current_hit], ignore_index=True, sort=False)
                        poss_hits = poss_hits.drop(new_row)

        elif mge_ori == "reverse":
            if check_ori == "forward":
                poss_hits = compo_table[(compo_table['qend'] < (mge_locs[1] + 50)) & \
                                        (compo_table['qend'] > hit_check_query[0]) & \
                                        (compo_table['send'] > hit_check_subject[1]) &\
                                        (compo_table['sstart'] < (other_hit_subject[0] + 500))]
                poss_hits = poss_hits.sort_values(by=['qend'], ascending=True)
                if not poss_hits.empty:
                    hit_orig = pandas.DataFrame(hit_to_check).transpose()
                    poss_hits = poss_hits.reset_index(drop=True)
                    current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                    added_hits = pandas.concat([added_hits, hit_orig, current_hit], ignore_index=True, sort=False)
                    poss_hits = poss_hits.drop(0)
                    new_row = 0
                    while not poss_hits.empty:
                        new_row += 1
                        current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                        added_hits = pandas.concat([added_hits, current_hit], ignore_index=True, sort=False)
                        poss_hits = poss_hits.drop(new_row)


            elif check_ori == "reverse":
                poss_hits = compo_table[(compo_table['qend'] < (mge_locs[1] + 50)) & \
                                        (compo_table['qend'] > hit_check_query[0]) & \
                                        (compo_table['send'] < hit_check_subject[1]) &\
                                        (compo_table['sstart'] > (other_hit_subject[0] - 500))]

                poss_hits = poss_hits.sort_values(by=['qend'], ascending=True)
                if not poss_hits.empty:
                    hit_orig = pandas.DataFrame(hit_to_check).transpose()
                    poss_hits = poss_hits.reset_index(drop=True)
                    current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                    added_hits = pandas.concat([added_hits, hit_orig, current_hit], ignore_index=True, sort=False)
                    poss_hits = poss_hits.drop(0)
                    new_row = 0
                    while not poss_hits.empty:
                        new_row += 1
                        current_hit = pandas.DataFrame(poss_hits.iloc[0]).transpose()
                        added_hits = pandas.concat([added_hits, current_hit], ignore_index=True, sort=False)
                        poss_hits = poss_hits.drop(new_row)


    return added_hits

def isolate_name_producer(names_series):
    ## Function to turn a series of isolate gff/fasta locations into a list of isolate ids.
    ## Input: names_series: A series object of fasta locations to work through
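
    ## e.g. a (hypothetical) path "/data/fastas/sample_1.contigs_velvet.fa" has
    ## basename "sample_1.contigs_velvet.fa"; the regex below trims from the
    ## first dot that is followed by letters, leaving the isolate id "sample_1".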

    ids = []

    for k in range(len(names_series.index)):
        current_id = names_series.iloc[k]
        current_base = os.path.basename(current_id)
        current_isolate = re.sub("\..*[a-z,A-Z].*$","", current_base)
        ids.append(current_isolate)

    return  ids

def all_presence_checker(id_list, series_ids):
    ## Function to check whether the total list of ids is within the blast csv
    ## Input: id_list: List of ids to check for presence; all must be within the series ids
    ##        series_ids: the series of ids to check through
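    ## e.g. all_presence_checker(["a", "b"], ["a", "b", "c"]) returns True,
    ## while all_presence_checker(["a", "d"], ["a", "b", "c"]) returns False.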
    skip = False
    num_in = 0
    for k in range(len(id_list)):
        if id_list[k] in series_ids:
            num_in += 1

    if num_in == len(id_list):
        skip = True

    return skip

def nice_proper_hits_ids(ids_list):
    ## Function to turn all the double underscore ids into standard isolate names for the act compos:
    ## Input: ids_list: list format of the first column of the proper hits
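    ## e.g. a (hypothetical) id "6999_3_1" contains exactly two underscores and
    ## is trimmed at the last one to "6999_3"; ids with any other underscore
    ## count pass through unchanged.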
    nicer = []
    for id in ids_list:
        if id.count('_') == 2:
            last_occy = id.rfind('_')
            isolate_id = id[0:last_occy]
            nicer.append(isolate_id)
        else:
            nicer.append(id)

    return nicer

def act_mapper(hit_before, hit_after, act_loc, current_insert_locs):
    ## Function to take in the before and after hits generated from a new reference and convert
    ## the before and after hits sstart and send back into hits within the gubbins reference.
    ## Input: hit_before: The calculated hit before in the new reference
    ##        hit_after: The calculated hit after in the new reference
    ##        act_loc: The location of the act_comparison between the old reference and the new reference
    ##        current_insert_locs: The locs in the new reference with the insert in between.
    ## So I think we'll just calculate the location of the two closest points in both (representative of either
    ## side of the insertion), then set the outer bound of each flank to +/- 5000 bp from there.
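
    ## The crunch file below uses the 12-column BLAST-style tabular layout, so
    ## positional columns 6-9 are qstart/qend/sstart/send. Worked example
    ## (hypothetical numbers): a single hit spans qstart = 1000 to qend = 9000
    ## with sstart = 51000 < send = 59000 and the insert sits at (4000, 6000);
    ## then sstart_gap = 3000 and send_gap = 3000, so the before flank is
    ## re-anchored at send = 54000 (sstart = 49000) and the after flank at
    ## sstart = 56000 (send = 61000) in the old reference coordinates.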

    ## Load up the act compo
    compo_names = ['query', 'subject', 'pid', 'align', 'gap', 'mismatch', 'qstart',
                   'qend', 'sstart', 'send', 'eval', 'bitscore']
    compo_table = pandas.read_csv(act_loc, sep='\t', names=compo_names)

    nuevo_before = hit_before.copy()
    nuevo_after = hit_after.copy()
    ## first, test if both flanks fall within a single hit, which likely means the insertion sits away from where the mge lies in the reference

    single_hit = compo_table[(compo_table['qstart'] <= current_insert_locs[0]) & (compo_table['qend'] >= current_insert_locs[1])]

    if single_hit.empty:
        ## Maybe this represents the insertion of the mge in the element too, so let's look for hits either side
        hit_1 = compo_table[(compo_table['qend'] >= (current_insert_locs[0] - 50)) & (compo_table['qstart'] <= (current_insert_locs[0] + 50))]
        hit_2 = compo_table[(compo_table['qstart'] <= (current_insert_locs[1] + 50)) & (compo_table['qend'] >= (current_insert_locs[1] - 50))]


        hit_1 = hit_1.sort_values(by='align', ascending=False)
        hit_2 = hit_2.sort_values(by='align', ascending=False)

        if hit_1.empty or hit_2.empty:
            return  hit_before, hit_after, False
        else:
            if isinstance(hit_1, pandas.Series):
                hit_1 = hit_1.to_frame().transpose()
            if isinstance(hit_2, pandas.Series):
                hit_2 = hit_2.to_frame().transpose()
            send_gap = hit_1['qend'].iloc[0] - current_insert_locs[0]
            sstart_gap = current_insert_locs[1] - hit_2['qstart'].iloc[0]

            if hit_1['sstart'].iloc[0] < hit_1['send'].iloc[0]:
                new_sstart = hit_1['send'].iloc[0] - send_gap
                bef_adder = -5000
            else:
                new_sstart = hit_1['send'].iloc[0] + send_gap
                bef_adder = 5000
            if hit_2['sstart'].iloc[0] < hit_2['send'].iloc[0]:
                new_send = hit_2['sstart'].iloc[0] + sstart_gap
                aft_adder = 5000
            else:
                new_send = hit_2['sstart'].iloc[0] - sstart_gap
                aft_adder = -5000

            nuevo_before['send'] = new_sstart
            nuevo_before['sstart'] = new_sstart + bef_adder
            nuevo_after['sstart'] = new_send
            nuevo_after['send'] = new_send + aft_adder


            mapped = True
    else:
        single_hit = single_hit.sort_values(by='align', ascending=False)
        if isinstance(single_hit, pandas.Series):
            single_hit = single_hit.to_frame().transpose()

        send_gap = single_hit['qend'].iloc[0] - current_insert_locs[1]
        sstart_gap = current_insert_locs[0] - single_hit['qstart'].iloc[0]

        if single_hit['sstart'].iloc[0] < single_hit['send'].iloc[0]:
            new_sstart = single_hit['sstart'].iloc[0] + sstart_gap
            new_send = single_hit['send'].iloc[0] - send_gap
            bef_adder = -5000
            aft_adder = 5000
        else:
            new_sstart = single_hit['sstart'].iloc[0] - sstart_gap
            new_send = single_hit['send'].iloc[0] + send_gap
            bef_adder = 5000
            aft_adder = -5000

        nuevo_before['send'] = new_sstart
        nuevo_before['sstart'] = new_sstart + bef_adder
        nuevo_after['sstart'] = new_send
        nuevo_after['send'] = new_send + aft_adder
        mapped = True

    return  nuevo_before, nuevo_after, mapped

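## Wrapper around a single row of the merged BLAST hits table. Positional
## layout assumed by the constructor: column 0 = hit id, columns 5/6 = MGE
## start/end, columns 10/11 = the flank orientations, plus the named 'merged'
## and 'align' columns.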
class mge_hits_row():
    def __init__(self, db_row):
        self.mge_start = db_row.iloc[0, 5]
        self.mge_end = db_row.iloc[0, 6]
        self.mge_bef_ori = db_row.iloc[0,10]
        self.mge_aft_ori = db_row.iloc[0, 11]
        self.id = db_row.iloc[0,0]
        self.merged = db_row['merged'].values[0]
        self.length = db_row['align']


    @property
    def id_z(self):
        isolate_id_z = self.id
        if isolate_id_z.count('_') == 2:
            last_occy = isolate_id_z.rfind('_')
            isolate_id = isolate_id_z[0:last_occy]
        else:
            isolate_id = isolate_id_z
        return isolate_id

    @property
    def mge_length(self):
        return abs(self.mge_start - self.mge_end)

    @property
    def mge_ori(self):
        ## if the two flank orientations agree, use that; otherwise treat the element as reverse
        if self.mge_bef_ori == self.mge_aft_ori:
            mge_ori = self.mge_bef_ori
        else:
            mge_ori = "reverse"

        return mge_ori

    @property
    def align(self):
        return int(self.length)






if __name__ == '__main__':
    tab = str.maketrans("ACTG", "TGAC")
    tic = time.perf_counter()
    files_for_input = get_options()

    python_dir = os.path.dirname(os.path.abspath(__file__))
    perl_dir = re.sub("python$", "perl", python_dir)

    pandas.set_option('display.max_columns', 500)

    contig_file_abs_path = files_for_input.contig_loc
    absolute_act_path = files_for_input.act_loc
    act_dir = re.sub("referoo\.fasta\.","",absolute_act_path)
    fasta_csv = files_for_input.fasta_csv
    fasta_pandas = pandas.read_csv(fasta_csv)
    fasta_pandas.columns = ['isolate', 'reference', 'cluster_name']
    clusters_present = fasta_pandas['cluster_name'].unique()

    data_path = re.sub("python$", "data", os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))))  # script directory
    global_reference = data_path + "/pmen3_reference.fasta"
    

    proper_hits = pandas.read_csv(files_for_input.blast_csv)
    nice_ids = nice_proper_hits_ids(proper_hits.iloc[:, 0].tolist())
    merged_csv, merged_locs = merged_contig_checker(proper_hits, contig_file_abs_path, absolute_act_path)
    is_2k = merged_csv['align'] >= int(files_for_input.align)

    proper_hits = merged_csv[is_2k]
    proper_hits = proper_hits.reset_index(drop=True)

    nice_ids2 = nice_proper_hits_ids(proper_hits.iloc[:, 0].tolist())
    proper_hits['nicest_ids'] = pandas.Series(nice_ids2, index=proper_hits.index)
    ## Now let's load up the csv with the isolate names and their reference location

    isolate_ref_gff = pandas.read_csv(files_for_input.reference_csv)

    library_dat = pandas.read_csv(files_for_input.lib_csv)
    library_dat['mergio'] = "No"
    initial_lib_size = len(library_dat.index)
    ## Now let's remove the library ids from the proper hits index

    narrowed_prop = proper_hits[~proper_hits['nicest_ids'].isin(library_dat['id'])]

    ## Ok so narrowed prop now contains all the hits we have to loop through and assign as hits in the
    ## library or not


    library_dat['insert_name'] = pandas.Series(range(1,(len(library_dat.index) + 1)))
    with open(("./" + files_for_input.output + "_realtered_act_refs.txt")) as file:
        refs_to_alter = file.read().splitlines()

    clusters_to_skip = []
    with open(("./" + files_for_input.output + "_new_act_refs.txt")) as file:
        new_refs = file.read().splitlines()
    ## Set up the csv for isolates that were missed due to incomplete hits
    missing_colnames = ["id","mge_start","mge_end","mge_length","insert_start","insert_end","before_gene_name", "after_gene_name",
                        "ref_name","cluster_name","reason"]
    missing_isolate = pandas.DataFrame(columns=missing_colnames)

    ## Set up the csv for the isolates with matching hits

    hit_col_names = ["id", "mge_start", "mge_end", "insert_start", "insert_end","before_sstart","before_send","after_sstart",
                     "after_send", "mge_length",
                     "insert_length", "insert_genes", "mge_genes", "flank_genes",
                     "mean_flank_gene_length", 'flanks_length', 'before_flank_gene', 'after_flank_gene',
                     'before_flank_avg',
                     'after_flank_avg', 'before_gene_name', 'after_gene_name', 'ref_name', 'cluster_name',
                     'insert_name']

    hit_df = pandas.DataFrame(columns=hit_col_names)
    tic_1 = time.perf_counter()
    print("")
    print("This many hits to get through: %s" % len(proper_hits.index))
    print("")

    fasta_directory = files_for_input.output + "_fasta_searches/"

    ## Now loop through the blast results ##

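    ## Per-hit flow (summarising the steps below): wrap the row, look up its
    ## gff/reference/cluster, re-run the ACT comparisons against a new
    ## reference if the old one itself carries the element, load the crunch
    ## comparison and contig bounds, locate the flanking hits, then merge and
    ## extend them towards 5 kb before recording the isolate as a hit or as
    ## missing.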
    for k in range(len(narrowed_prop.index)):
        ## First we'll get the hit locs in place for each of the hits
        start_len = len(missing_isolate.index) + len(hit_df.index) + len(library_dat.index)
        print("On isolate: ", k, end='\r', flush=True)

        current_row = mge_hits_row(narrowed_prop.iloc[[k]])
        hitters = [current_row.mge_start, current_row.mge_end]
        mge_oris = [current_row.mge_bef_ori, current_row.mge_aft_ori]
        mge_ori = current_row.mge_ori
        isolate_id_z = current_row.id
        isolate_id = current_row.id_z
        current_mge_length = current_row.mge_length



        ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The single merged hit I want to add in atm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



        current_gff_loc, ref_loc, cluster_name = gff_finder(isolate_ref_gff, isolate_id, True)
        ref_name = os.path.basename(ref_loc.iloc[0])
        ref_name = re.sub("\..*[a-zA-Z]*$", "", ref_name)
        cluster_name = cluster_name.iloc[0]





        ref_contig_name = ref_name
        act_map = False

        if ref_name not in refs_to_alter:

            if ref_name in nice_ids:
                ## Need to re-run act for this cluster with a different reference. First check whether all of the
                ## cluster's isolates are already in the isolate ids. If so, we have to skip the whole cluster
                current_cluster_vals = isolate_ref_gff[isolate_ref_gff['reference'] == ref_loc.iloc[0]]

                current_ids = isolate_name_producer(current_cluster_vals['isolate'])
                skip = all_presence_checker(current_ids, nice_ids)
                if skip:
                    clusters_to_skip.append(cluster_name)
                    all_ids = nice_ids
                    no_element_ids = numpy.setdiff1d(current_ids, all_ids).tolist()

                    element_ids = list(set(current_ids) & set(all_ids))
                    ## no element ids contains the isolates without the element in this cluster
                    ## Now we need to find their fastas and then run through the act comparison
                    fastas_to_run = []
                    for fasta_fn in no_element_ids:
                        current_loc, ref_loc, ignore_me = gff_finder(fasta_pandas, fasta_fn, False)
                        fastas_to_run.append(current_loc.iloc[0])

                    fastas_to_act = []
                    for fasta_fn in element_ids:
                        current_loc, ref_loc, ignore_me = gff_finder(fasta_pandas, fasta_fn, False)
                        fastas_to_act.append(current_loc.iloc[0])

                    new_ref_list = numpy.repeat(global_reference, len(fastas_to_act)).tolist()
                    new_ref_list.append(ref_loc)
                    fastas_to_act.append(global_reference)

                    new_act_df = pandas.DataFrame()
                    new_act_df['isolate'] = pandas.Series(fastas_to_act)
                    new_act_df['reference'] = pandas.Series(numpy.repeat(global_reference, len(fastas_to_act)).tolist(), \
                                                            index=new_act_df.index)
                    new_act_df['cluster_name'] = cluster_name

                    df_loc = "./" + cluster_name + "_altered_fasta_for_ACT.csv"
                    new_act_df.to_csv(path_or_buf=df_loc, index=False)
                    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                    print("Rerunning act comparisons for %s isolates in cluster %s with new ref %s" % (
                        len(element_ids), cluster_name, "pmen3 reference"))
                    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                    ## Now let's run these new acts to replace the current ones with the reference
                    act_command = "python " + python_dir + "/running_act_comparisons.py" + \
                                  " --csv " + df_loc + " --perl_dir " + perl_dir + "/" + " --act_dir " + act_dir

                    subprocess.call(act_command, shell=True)  # , stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

                    refs_to_alter.append(ref_name)
                    new_refs.append("global")
                    with open(("./" + files_for_input.output + "_realtered_act_refs.txt"), mode='wt',
                              encoding='utf-8') as myfile:
                        myfile.write('\n'.join(refs_to_alter) + '\n')
                    with open(("./" + files_for_input.output + "_new_act_refs.txt"), mode='wt',
                              encoding='utf-8') as myfile:
                        myfile.write('\n'.join(new_refs) + '\n')
                    act_map = True

                    # missing_current = pandas.DataFrame()
                    # missing_current['id'] = pandas.Series(isolate_id)
                    # missing_current['mge_start'] = pandas.Series(hitters[0], index=missing_current.index)
                    # missing_current['mge_end'] = pandas.Series(hitters[1], index=missing_current.index)
                    # missing_current['insert_start'] = pandas.Series(hitters[0], index=missing_current.index)
                    # missing_current['insert_end'] = pandas.Series(hitters[1], index=missing_current.index)
                    # missing_current['before_gene_name'] = pandas.Series("No_hit", index=missing_current.index)
                    # missing_current['after_gene_name'] = pandas.Series("No_hit", index=missing_current.index)
                    # missing_current['ref_name'] = pandas.Series(ref_name, index=missing_current.index)
                    # missing_current['cluster_name'] = pandas.Series(cluster_name, index=missing_current.index)
                    # missing_current['mge_length'] = pandas.Series(current_mge_length, index=missing_current.index)
                    # missing_current['reason'] = pandas.Series(["In clusters to skip"], index=missing_current.index)
                    # missing_isolate = missing_isolate.append(missing_current, sort=False)
                    # clusters_to_skip.append(cluster_name)
                    # continue
                else:
                    ## Find the best n50 among those not in the blast file, then re-run the act compos
                    all_ids = nice_ids
                    no_element_ids = numpy.setdiff1d(current_ids, all_ids).tolist()

                    element_ids = list(set(current_ids) & set(all_ids))
                    ## no element ids contains the isolates without the element in this cluster
                    ## Now we need to find their fastas and then run through the act comparison
                    fastas_to_run = []
                    for fasta_fn in no_element_ids:
                        current_loc, ref_loc, ignore_me = gff_finder(fasta_pandas, fasta_fn, False)
                        fastas_to_run.append(current_loc.iloc[0])

                    fastas_to_act = []
                    for fasta_fn in element_ids:
                        current_loc, ref_loc, ignore_me = gff_finder(fasta_pandas, fasta_fn, False)
                        fastas_to_act.append(current_loc.iloc[0])

                    ## Now we've got the fasta list, let's get the best n50 loc

                    new_ref = n50_calc(fastas_to_run, ref_name)
                    new_ref_name = os.path.basename(re.sub("\..*[a-z,A-Z].*$", "", new_ref))
                    ref_contig_name = new_ref_name
                    ## Now create the new fastas df to input to the act compo script
                    new_act_df = pandas.DataFrame()
                    new_act_df['isolate'] = pandas.Series(fastas_to_act)
                    new_act_df['reference'] = pandas.Series(numpy.repeat(new_ref, len(fastas_to_act)).tolist(),index=new_act_df.index)
                    new_act_df['cluster_name'] = cluster_name

                    df_loc = "./" + cluster_name + "_altered_fasta_for_ACT.csv"
                    new_act_df.to_csv(path_or_buf=df_loc, index=False)
                    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                    print("Rerunning act comparisons for %s isolates in cluster %s with new ref %s" % (
                    len(element_ids), cluster_name, new_ref_name))
                    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                    ## Now let's run these new acts to replace the current ones with the reference
                    act_command = "python " + python_dir + "/running_act_comparisons.py" + \
                                  " --csv " + df_loc + " --perl_dir " + perl_dir + "/" + " --act_dir " + act_dir

                    subprocess.call(act_command, shell=True)  # , stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

                    refs_to_alter.append(ref_name)
                    new_refs.append(new_ref_name)
                    act_map = True
        else:
            act_map = True
            altered_index = [i for i, x in enumerate(refs_to_alter) if x == ref_name]
            ref_contig_name = new_refs[altered_index[0]]
            if ref_contig_name == "global":
                ref_contig_name = "pmen3_reference"

        compo_file = absolute_act_path + isolate_id + ".crunch.gz"
        compo_names = ['query', 'subject', 'pid', 'align', 'gap', 'mismatch', 'qstart',
                       'qend', 'sstart', 'send', 'eval', 'bitscore']
        compo_table = pandas.read_csv(compo_file, sep='\t', names=compo_names)

        ## Now we've got and opened the tab blast file, we should look for hits
        ## before the start of the MGE in the host genome.

        ## So now we've got the hits before and after the insertion site, we need to check if they're on the same
        ## contig as each other.

        contig_suffix = "#contig_bounds.csv"
        contig_isolate = re.sub("#", "_", isolate_id)
        contig_file_path = contig_file_abs_path + contig_isolate + contig_suffix
        ref_contig = re.sub("#", "_", ref_contig_name)
        ref_contig_file = contig_file_abs_path + ref_contig + contig_suffix
        contig_tab = pandas.read_csv(contig_file_path)
        try:
            ref_contig_tab = pandas.read_csv(ref_contig_file)
        except:
            ## reference contig table doesn't exist; assume this is the global ref and generate its contig bounds now
            grep_command = "grep -n \">\" " + global_reference + " > grep_output_temp"
            py_command = "python " + re.sub("data", "python",
                                           data_path) + "/contig_bound_check.py --grep_file grep_output_temp --fasta_file " + global_reference + " && rm grep_output_temp && mv *contig_bounds.csv " + contig_file_abs_path
            subprocess.call(grep_command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            subprocess.call(py_command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            ref_contig_tab = pandas.read_csv(ref_contig_file)

        contig_mge = contig_checker(contig_tab, hitters)
        contig_mge_bef = contig_checker(contig_tab, [hitters[0]])
        contig_mge_aft = contig_checker(contig_tab, [hitters[1]])


        mge_bounds = bounds_of_contig(contig_tab, contig_mge)
        mge_bounds_bef = bounds_of_contig(contig_tab, contig_mge_bef)
        mge_bounds_aft = bounds_of_contig(contig_tab, contig_mge_aft)
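
        ## e.g. (hypothetical) if the MGE spans 120000-135000 and the contig
        ## table places both coordinates on contig 3, then contig_mge,
        ## contig_mge_bef and contig_mge_aft all equal 3 and the three bounds
        ## coincide; on a split assembly the bef/aft contigs differ and
        ## mge_bounds is rebuilt below as a two-item list.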

        ###########################################################################
        ## Now we'll do a check on the reference to see if it has the same ########
        ## identical hits as the current isolate in question ######################
        ###########################################################################





        if contig_mge_bef == contig_mge_aft:
            hit_before, hit_after, overlap = ref_contains_hit(compo_table, hitters, mge_bounds, isolate_id)
        else:
            overlap = "No"
            mge_bounds = [mge_bounds_bef, mge_bounds_aft]


        if overlap == "No":

            hit_before, hit_after, which_hit = before_and_after_hits(hitters, compo_table, mge_bounds,
                                                                     hits_to_search="both",  mge_bef_ori= current_row.mge_bef_ori,
                                                                     mge_aft_ori= current_row.mge_aft_ori)

        else:
            which_hit = "both"

        if hit_before[0] == 0:
            contig_before = None
            hit_before_length = 0
        else:
            hit_before_loc = hit_before.iloc[[6, 7]]
            hit_before_sub_loc = hit_before.iloc[[8, 9]]
            hit_before_length = abs(hit_before_loc[1] - hit_before_loc[0])
            contig_before = contig_checker(contig_tab, hit_before_loc)
            hit_before_sub_loc = hit_before_sub_loc.sort_values(ascending=True)
            contig_bef_ref = contig_checker(ref_contig_tab, hit_before_sub_loc)

        if hit_after[0] == 0:
            contig_after = None
            hit_after_length = 0
        else:
            hit_after_loc = hit_after.iloc[[6, 7]]
            hit_after_sub_loc = hit_after.iloc[[8, 9]]
            hit_after_length = abs(hit_after_loc[1] - hit_after_loc[0])
            contig_after = contig_checker(contig_tab, hit_after_loc)
            hit_after_sub_loc = hit_after_sub_loc.sort_values(ascending=True)
            contig_aft_ref = contig_checker(ref_contig_tab, hit_after_sub_loc)

        all_one_tig = contig_before == contig_mge and contig_mge == contig_after and which_hit == "both"

        ## Sometimes the overlap will be minimal but ref_contains_hit will still
        ## return "Yes"; in that case check whether the hits actually abut the insert.

        if overlap == "Yes":
            if mge_ori == "forward":
                if hit_before_loc[1] == hitters[0] and hit_after_loc[0] == hitters[1]:
                    overlap = "Yes"
                else:
                    overlap = "No"
            elif mge_ori == "reverse":
                if hit_before_loc[0] == hitters[0] and hit_after_loc[1] == hitters[1]:
                    overlap = "Yes"
                else:
                    overlap = "No"

        if all_one_tig and contig_bef_ref == contig_aft_ref and overlap == "No":

            hit_before_check = before_and_after_check(hit_before, hitters, compo_table, "before", hit_after, isolate_id,
                                                      current_row.mge_bef_ori, current_row.mge_aft_ori)
            hit_after_check = before_and_after_check(hit_after, hitters, compo_table, "after", hit_before, isolate_id,
                                                      current_row.mge_bef_ori, current_row.mge_aft_ori)

            if not hit_before_check.empty:
                hit_before = multi_hit_merger(hit_before_check)
                hit_before_loc = hit_before.iloc[[6, 7]]
                hit_before_length = abs(hit_before_loc[1] - hit_before_loc[0])

            if not hit_after_check.empty:
                hit_after = multi_hit_merger(hit_after_check)
                hit_after_loc = hit_after.iloc[[6, 7]]
                hit_after_length = abs(hit_after_loc[1] - hit_after_loc[0])

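        ## A hit only counts as fully resolved if both flanking hits sit on the
        ## MGE's contig and each covers at least 5000 bp.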
        all_one_tig_5k = hit_before_length >= 5000 and hit_after_length >= 5000 and all_one_tig

        ref_contigs = [contig_bef_ref, contig_aft_ref]

        if all_one_tig and not all_one_tig_5k:
            mge_bounds = [mge_bounds_bef, mge_bounds_aft]
            hit_before, hit_after, all_one_tig_5k = hit_mover(hit_before, hit_after, compo_table, isolate_id,
                                                              mge_bounds,current_row.mge_bef_ori, current_row.mge_aft_ori,
                                                              ref_contig_tab, ref_contigs)
            hit_before_loc = hit_before.iloc[[6, 7]]
            hit_after_loc = hit_after.iloc[[6, 7]]

            if not all_one_tig_5k:
                mge_bounds = [mge_bounds_bef, mge_bounds_aft]
                hit_before, hit_after, all_one_tig_5k = final_acceptor(hit_before, hit_after, isolate_id, mge_bounds,
                                                                       current_row.mge_bef_ori, current_row.mge_aft_ori)
                hit_before_loc = hit_before.iloc[[6, 7]]
                hit_after_loc = hit_after.iloc[[6, 7]]



        ## Now for the merged isolates, we'll try and get good hits either side
        if not all_one_tig_5k and contig_before == contig_mge_bef and contig_after == contig_mge_aft:

            hit_before_check = before_and_after_check(hit_before, hitters, compo_table, "before", hit_after, isolate_id,
                                                      current_row.mge_bef_ori, current_row.mge_aft_ori)
            hit_after_check = before_and_after_check(hit_after, hitters, compo_table, "after", hit_before, isolate_id,
                                                      current_row.mge_bef_ori, current_row.mge_aft_ori)

            if not hit_before_check.empty:
                hit_before = multi_hit_merger(hit_before_check)
                hit_before_loc = hit_before.iloc[[6, 7]]
                hit_before_length = abs(hit_before_loc[1] - hit_before_loc[0])

            if not hit_after_check.empty:
                hit_after = multi_hit_merger(hit_after_check)
                hit_after_loc = hit_after.iloc[[6, 7]]
                hit_after_length = abs(hit_after_loc[1] - hit_after_loc[0])

            if hit_after_length >= 5000 and hit_before_length >= 5000:
                all_one_tig_5k = True
            else:
                mge_bounds = [mge_bounds_bef, mge_bounds_aft]
                hit_before, hit_after, all_one_tig_5k = hit_mover(hit_before, hit_after, compo_table, isolate_id,
                                                                  mge_bounds, current_row.mge_bef_ori,
                                                                  current_row.mge_aft_ori,
                                                                  ref_contig_tab, ref_contigs)
                hit_before_loc = hit_before.iloc[[6, 7]]
                hit_after_loc = hit_after.iloc[[6, 7]]
                hit_before_length = abs(hit_before_loc[1] - hit_before_loc[0])
                hit_after_length = abs(hit_after_loc[1] - hit_after_loc[0])

                if not all_one_tig_5k:
                    hit_before, hit_after, all_one_tig_5k = final_acceptor(hit_before, hit_after, isolate_id,
                                                                            mge_bounds, current_row.mge_bef_ori,
                                                                           current_row.mge_aft_ori)
                    hit_before_loc = hit_before.iloc[[6, 7]]
                    hit_after_loc = hit_after.iloc[[6, 7]]
                    hit_before_length = abs(hit_before_loc[1] - hit_before_loc[0])
                    hit_after_length = abs(hit_after_loc[1] - hit_after_loc[0])


        if all_one_tig_5k:

            ## If the before and after hits to the isolate all line up on one contig,
            ## with at least 5 kb of flanking sequence either side, we assume this is
            ## a good enough hit to be used in library creation.

            ## Need to split this up into a before hit getter and an after hit getter.

            current_gff = pandas.read_csv(current_gff_loc.iloc[0], sep='\t',
                                          names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
                                                 'attributes'],
                                          header=None)
            non_altered_gff = current_gff.copy()

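            ## For multi-contig assemblies, convert per-contig GFF coordinates into
            ## whole-sequence coordinates. gff_to_dna returns a string on failure
            ## (no FASTA section in the GFF), in which case record the isolate as missing.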
            if len(contig_tab.index) > 1:

                current_gff = gff_to_dna(current_gff, contig_tab, isolate_id, input_k=k)
                if isinstance(current_gff, str):
                    missing_current = pandas.DataFrame()
                    missing_current['id'] = pandas.Series(isolate_id)
                    missing_current['mge_start'] = pandas.Series(hitters[0], index=missing_current.index)
                    missing_current['mge_end'] = pandas.Series(hitters[1], index=missing_current.index)
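                    ## NB: the insert and gene fields below are only computed further
                    ## down this loop body, so at this point they hold whatever was
                    ## last assigned.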
                    missing_current['insert_start'] = pandas.Series(current_insert_locs[0], index=missing_current.index)
                    missing_current['insert_end'] = pandas.Series(current_insert_locs[1], index=missing_current.index)
                    missing_current['before_gene_name'] = pandas.Series(before_gene, index=missing_current.index)
                    missing_current['after_gene_name'] = pandas.Series(after_gene, index=missing_current.index)
                    missing_current['ref_name'] = pandas.Series(ref_name, index=missing_current.index)
                    missing_current['cluster_name'] = pandas.Series(cluster_name, index=missing_current.index)
                    missing_current['merged'] = pandas.Series(mergio, missing_current.index)
                    missing_current['mge_length'] = pandas.Series(current_mge_length, index=missing_current.index)
                    missing_current['reason'] = pandas.Series(["No Fasta sequence in GFF"], missing_current.index)
                    missing_isolate = missing_isolate.append(missing_current, sort=False)
                    continue

            if current_row.mge_bef_ori == "forward":
                ## Let's get the element length,
                ## the insertion length,
                ## the number of genes in the insertion,
                ## and the number and average length of genes 5000 bp either side.

                if current_row.merged == "Yes":
                    current_mge_length = current_row.align

                    mergio = True
                else:
                    current_mge_length = hitters[1] - hitters[0]
                    mergio = False


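                ## Forward orientation: the insert runs from the end of the before-hit
                ## to the start of the after-hit, with the MGE contained within it.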
                current_insert_locs = [hit_before_loc[1], hit_after_loc[0]]
                current_insert_length = int(hit_after_loc[0]) - int(hit_before_loc[1])
                current_insert_s_locs = [hit_before.iloc[9], hit_after.iloc[8]]
                genes_insert = current_gff[
                    (current_gff['start'] >= current_insert_locs[0]) & (current_gff['end'] <= current_insert_locs[1])]
                genes_mge = current_gff[(current_gff['start'] >= hitters[0]) & (current_gff['end'] <= hitters[1])]
                genes_mge_num = len(genes_mge.index)
                gene_insert_num = len(genes_insert.index)

                ## 5000 bp flanking region before the MGE.
                ## Include any gene overlapping the region, so filter on gene end only.

                before_loc_gens = current_gff[(current_gff['end'] > (hit_before_loc[1] - 5000)) & (
                            current_gff['end'] <= (hit_before_loc[1] + 100))]
                num_genes_before = len(before_loc_gens.index)
                mean_length_before = [0]
                if not before_loc_gens.empty:
                    before_gene_lengths = []
                    for l in range(len(before_loc_gens.index)):
                        current_length = before_loc_gens.iloc[l, 4] - before_loc_gens.iloc[l, 3]
                        before_gene_lengths.append(current_length)
                    mean_length_before = numpy.mean(before_gene_lengths)
                before_gene, before_gene_gff_row = gene_name_finder(before_loc_gens, back_it_up=True)


            elif current_row.mge_bef_ori == "reverse":
                if current_row.merged == "Yes":
                    current_mge_length = current_row.align
                    mergio = True

                else:
                    current_mge_length = hitters[0] - hitters[1]
                    mergio = False


                current_insert_locs = [hit_after_loc[1], hit_before_loc[0]]
                current_insert_length = int(hit_before_loc[0]) - int(hit_after_loc[1])
                current_insert_s_locs = [hit_after.iloc[9], hit_before.iloc[8]]
                genes_insert = current_gff[(current_gff['start'] >= current_insert_locs[0]) \
                                           & (current_gff['end'] <= current_insert_locs[1])]
                genes_mge = current_gff[(current_gff['start'] >= hitters[1]) & (current_gff['end'] <= hitters[0])]

                genes_mge_num = len(genes_mge.index)
                gene_insert_num = len(genes_insert.index)

                ## 5000 bp flanking region before the MGE (reverse orientation).
                ## Include any gene overlapping the region, so filter on gene start only.
                before_loc_gens = current_gff[(current_gff['start'] > (hit_before_loc[0] - 100)) & \
                                              (current_gff['start'] <= (hit_before_loc[0] + 5000))]
                num_genes_before = len(before_loc_gens.index)
                mean_length_before = [0]
                if not before_loc_gens.empty:
                    before_gene_lengths = []
                    for l in range(len(before_loc_gens.index)):
                        current_length = before_loc_gens.iloc[l, 4] - before_loc_gens.iloc[l, 3]
                        before_gene_lengths.append(current_length)
                    mean_length_before = numpy.mean(before_gene_lengths)
                before_gene, before_gene_gff_row = gene_name_finder(before_loc_gens, back_it_up=False)

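            ## Repeat the flank-gene summary for the after-hit, with the MGE
            ## orientation deciding which side of the hit to scan.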
            if current_row.mge_aft_ori == "forward":
                after_loc_gens = current_gff[(current_gff['start'] >= (hit_after_loc[0] - 100)) & (
                            current_gff['start'] < (hit_after_loc[0] + 5000))]
                num_genes_after = len(after_loc_gens.index)
                mean_length_after = [0]
                if not after_loc_gens.empty:
                    after_loc_lengths = []
                    for l in range(len(after_loc_gens.index)):
                        current_length = after_loc_gens.iloc[l, 4] - after_loc_gens.iloc[l, 3]
                        after_loc_lengths.append(current_length)
                    mean_length_after = numpy.mean(after_loc_lengths)


                after_gene, after_gene_gff_row = gene_name_finder(after_loc_gens, back_it_up=False)

            elif current_row.mge_aft_ori == "reverse":
                after_loc_gens = current_gff[(current_gff['end'] > (hit_after_loc[1] - 5000)) & \
                                             (current_gff['end'] <= (hit_after_loc[1] + 100))]
                num_genes_after = len(after_loc_gens.index)
                mean_length_after = [0]
                if not after_loc_gens.empty:
                    after_loc_lengths = []
                    for l in range(len(after_loc_gens.index)):
                        current_length = after_loc_gens.iloc[l, 4] - after_loc_gens.iloc[l, 3]
                        after_loc_lengths.append(current_length)
                    mean_length_after = numpy.mean(after_loc_lengths)

                after_gene, after_gene_gff_row = gene_name_finder(after_loc_gens, back_it_up=True)


            flanks_genes = num_genes_before + num_genes_after

            library_flank_gene_length = numpy.mean(before_gene_lengths + after_loc_lengths)
            # library_flank_gene_length = numpy.mean([mean_length_before, mean_length_after])
            tot_flanks_length = hit_before_length + hit_after_length
            gene_gff_rows = [before_gene_gff_row, after_gene_gff_row]

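            ## When act_map is set, flank coordinates are remapped onto the new ACT
            ## comparison reference so that all library entries share one subject
            ## coordinate system.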
            if act_map:
                ## Get the altered reference to use, then check whether act_mapper can
                ## actually map this hit back to the reference; if not, skip the hit.
                altered_index = [i for i, x in enumerate(refs_to_alter) if x == ref_name]
                current_new = new_refs[altered_index[0]]
                if current_new == "global":
                    current_new = "pmen3_reference"
                new_act_loc = absolute_act_path + current_new + ".crunch.gz"
                if isolate_id != ref_name:
                    hit_before, hit_after, mapped = act_mapper(hit_before, hit_after, new_act_loc,
                                                               current_insert_s_locs)
                    if not mapped:
                        missing_current = pandas.DataFrame()
                        missing_current['id'] = pandas.Series(isolate_id)
                        missing_current['mge_start'] = pandas.Series(hitters[0], index=missing_current.index)
                        missing_current['mge_end'] = pandas.Series(hitters[1], index=missing_current.index)
                        missing_current['insert_start'] = pandas.Series(current_insert_locs[0], index=missing_current.index)
                        missing_current['insert_end'] = pandas.Series(current_insert_locs[1], index=missing_current.index)
                        missing_current['before_gene_name'] = pandas.Series(before_gene, index=missing_current.index)
                        missing_current['after_gene_name'] = pandas.Series(after_gene, index=missing_current.index)
                        missing_current['ref_name'] = pandas.Series(ref_name, index=missing_current.index)
                        missing_current['cluster_name'] = pandas.Series(cluster_name, index=missing_current.index)
                        missing_current['merged'] = pandas.Series(mergio, missing_current.index)
                        missing_current['mge_length'] = pandas.Series(current_mge_length, index=missing_current.index)
                        missing_current['reason'] = pandas.Series(["No map back to gub ref"], missing_current.index)
                        missing_isolate = missing_isolate.append(missing_current, sort=False)

                        continue
                else:
                    hit_before['sstart'] = hit_before['qstart']
                    hit_before['send'] = hit_before['qend']
                    hit_after['sstart'] = hit_after['qstart']
                    hit_after['send'] = hit_after['qend']
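            ## Assemble the candidate library row for this isolate.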
            library_pros = pandas.DataFrame()
            library_pros['id'] = pandas.Series(isolate_id)
            library_pros['mge_start'] = pandas.Series(hitters[0], index=library_pros.index)
            library_pros['mge_end'] = pandas.Series(hitters[1], index=library_pros.index)
            library_pros['insert_start'] = pandas.Series(current_insert_locs[0], index=library_pros.index)
            library_pros['insert_end'] = pandas.Series(current_insert_locs[1], index=library_pros.index)
            library_pros['before_sstart'] = pandas.Series(hit_before['sstart'], index=library_pros.index)
            library_pros['before_send'] = pandas.Series(hit_before['send'], index=library_pros.index)
            library_pros['after_sstart'] = pandas.Series(hit_after['sstart'], index=library_pros.index)
            library_pros['after_send'] = pandas.Series(hit_after['send'], index=library_pros.index)
            library_pros['mge_length'] = pandas.Series(current_mge_length, index=library_pros.index)
            library_pros['insert_length'] = pandas.Series(current_insert_length, index=library_pros.index)
            library_pros['insert_genes'] = pandas.Series(gene_insert_num, index=library_pros.index)
            library_pros['mge_genes'] = pandas.Series(genes_mge_num, index=library_pros.index)
            library_pros['flank_genes'] = pandas.Series(flanks_genes, index=library_pros.index)
            library_pros['mean_flank_gene_length'] = pandas.Series(library_flank_gene_length,
                                                                   index=library_pros.index)
            library_pros['flanks_length'] = pandas.Series(tot_flanks_length, index=library_pros.index)
            library_pros['before_flank_gene'] = pandas.Series(num_genes_before, index=library_pros.index)
            library_pros['after_flank_gene'] = pandas.Series(num_genes_after, index=library_pros.index)
            library_pros['before_flank_avg'] = pandas.Series(numpy.mean(before_gene_lengths),
                                                             index=library_pros.index)
            library_pros['after_flank_avg'] = pandas.Series(numpy.mean(after_loc_lengths), index=library_pros.index)

            library_pros['before_gene_name'] = pandas.Series(before_gene, index=library_pros.index)
            library_pros['after_gene_name'] = pandas.Series(after_gene, index=library_pros.index)

            library_pros['ref_name'] = pandas.Series(ref_name, index=library_pros.index)
            library_pros['cluster_name'] = pandas.Series(cluster_name, index=library_pros.index)
            ## Only add this hit to the library if the insert contains at least as
            ## many genes as the MGE itself, or the hit comes from merged BLAST hits.

            if genes_mge_num <= gene_insert_num or mergio:
                hit_df, missing_isolate, library_dat = hit_detector(library_dat, library_pros, isolate_id, hit_df, missing_isolate, mergio, gene_gff_rows,
                                                                    fasta_pandas, ref_contig_name, contig_tab, ref_contig_tab, fasta_directory)
            else:

                missing_current = pandas.DataFrame()
                missing_current['id'] = pandas.Series(isolate_id)
                missing_current['mge_start'] = pandas.Series(hitters[0], index=missing_current.index)
                missing_current['mge_end'] = pandas.Series(hitters[1], index=missing_current.index)
                missing_current['insert_start'] = pandas.Series(current_insert_locs[0], index=missing_current.index)
                missing_current['insert_end'] = pandas.Series(current_insert_locs[1], index=missing_current.index)
                missing_current['before_gene_name'] = pandas.Series(before_gene, index=missing_current.index)
                missing_current['after_gene_name'] = pandas.Series(after_gene, index=missing_current.index)
                missing_current['ref_name'] = pandas.Series(ref_name, index=missing_current.index)
                missing_current['cluster_name'] = pandas.Series(cluster_name, index=missing_current.index)
                missing_current['merged'] = pandas.Series(mergio, missing_current.index)
                missing_current['mge_length'] = pandas.Series(current_mge_length, index=missing_current.index)
                missing_current['reason'] = pandas.Series(["More MGE than insert"], missing_current.index)

                missing_isolate = missing_isolate.append(missing_current, sort=False)




        else:
            missing_current = pandas.DataFrame()
            missing_current['id'] = pandas.Series(isolate_id)
            missing_current['mge_start'] = pandas.Series(hitters[0], index=missing_current.index)
            missing_current['mge_end'] = pandas.Series(hitters[1], index=missing_current.index)
            missing_current['ref_name'] = pandas.Series(ref_name, index=missing_current.index)
            missing_current['cluster_name'] = pandas.Series(cluster_name, index=missing_current.index)
            missing_current['insert_start'] = pandas.Series(hitters[0], index=missing_current.index)
            missing_current['insert_end'] = pandas.Series(hitters[1], index=missing_current.index)
            missing_current['before_gene_name'] = pandas.Series("No hit", index=missing_current.index)
            missing_current['after_gene_name'] = pandas.Series("No hit", index=missing_current.index)
            ## mergio and current_mge_length may never have been set for this
            ## isolate on this path, so derive them from the current row instead.
            mergio = current_row.merged == "Yes"
            missing_current['merged'] = pandas.Series(mergio, missing_current.index)
            if current_row.merged == "Yes":
                missing_current['mge_length'] = pandas.Series(current_row.align, index=missing_current.index)
                missing_current['reason'] = pandas.Series(["MERGED No good hits before and after"], missing_current.index)
            else:
                missing_current['mge_length'] = pandas.Series(abs(hitters[1] - hitters[0]), index=missing_current.index)
                if hit_before is None or hit_after is None:
                    missing_current['reason'] = pandas.Series(["No good hits before and after- before and after none"],
                                                              missing_current.index)
                elif not all_one_tig:
                    missing_current['reason'] = pandas.Series(["No good hits before and after- no two hits on contig"],
                                                              missing_current.index)
                else:
                    missing_current['reason'] = pandas.Series(["No good hits before and after- hits too small"],
                                                              missing_current.index)

            missing_isolate = missing_isolate.append(missing_current, sort=False)
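        ## Sanity check: each isolate should add exactly one row across the
        ## missing, hit and library tables.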
        end_len = len(missing_isolate.index) + len(hit_df.index) + len(library_dat.index)
        if end_len != start_len + 1:
            print("")
            print("Isolate not allocated to exactly one table: %s (all_one_tig_5k = %s)" % (isolate_id, all_one_tig_5k))




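    ## Write out the allocated hits, the missing isolates and the final library.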
    hit_out_name = files_for_input.output + "_hits_df.csv"
    missing_out_name = files_for_input.output + "_missing_df.csv"
    library_name = files_for_input.output + "_library.csv"

    library_add = library_dat[library_dat['cluster_name'].isin(clusters_present)]
    hit_df = hit_df.append(library_add, sort=False)

    hit_df.to_csv(path_or_buf=hit_out_name, index=False)
    missing_isolate.to_csv(path_or_buf=missing_out_name, index=False)
    library_dat.to_csv(library_name, index=False)
    toc1 = time.perf_counter()
    toc = time.perf_counter()
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Library size is now %s from initial %s" % (len(library_dat.index), initial_lib_size))
    print("This many hits allocated: %s of %s" % (len(hit_df.index), len(narrowed_prop.index) + len(library_add.index)))
    print("This many missing hits: %s (%s %s)" % (len(missing_isolate.index), round((len(missing_isolate.index) / (len(narrowed_prop.index)+ len(library_add.index))) * 100), "%"))
    print("Took this long for hit allocation: %s" % (toc1 - tic_1))
    print("Took this long overall: %s" % (toc - tic))
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hit allocator finished ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

