Content - fcdbb2bf6039a728d256c1c2b8e89fed0e8b282b - 2f5c73b/src/branch_sequences.c

visit type:
Tip revision: a4b6c1d5566ee4616d60bd76f02d97e7c57e72c0 authored by andrewjpage on 15 April 2016, 12:27:52 UTC
Merge pull request #166 from andrewjpage/duplicate_sequences
Tip revision: a4b6c1d
branch_sequences.c
/*
 *  Wellcome Trust Sanger Institute
 *  Copyright (C) 2011  Wellcome Trust Sanger Institute
 *  
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *  
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "seqUtil.h"
#include "Newickform.h"
#include "branch_sequences.h"
#include "gubbins.h"
#include "parse_vcf.h"
#include "parse_phylip.h"
#include "snp_searching.h"
#include "block_tab_file.h"
#include "gff_file.h"
#include "string_cat.h"

int node_counter = 0;


// Order is not preserved.
int copy_and_concat_integer_arrays(int * array_1, int array_1_size, int * array_2, int array_2_size, int * output_array)
{
	int array_1_counter=0;
	int array_2_counter=0;
	
	for(array_1_counter = 0; array_1_counter< array_1_size; array_1_counter++)
	{
		output_array[array_1_counter] = array_1[array_1_counter];
	}
	
	for(array_2_counter = 0; array_2_counter < array_2_size; array_2_counter++)
	{
		output_array[array_2_counter+array_1_size] = array_2[array_2_counter];
	}
	return array_1_size+array_2_size;
}

int copy_and_concat_2d_integer_arrays(int ** array_1, int array_1_size, int ** array_2, int array_2_size, int ** output_array)
{
	int array_1_counter=0;
	int array_2_counter=0;
	
	for(array_1_counter = 0; array_1_counter< array_1_size; array_1_counter++)
	{
		output_array[0][array_1_counter] = array_1[0][array_1_counter];
		output_array[1][array_1_counter] = array_1[1][array_1_counter];
	}
	
	for(array_2_counter = 0; array_2_counter < array_2_size; array_2_counter++)
	{
		output_array[0][array_2_counter+array_1_size] = array_2[0][array_2_counter];
		output_array[1][array_2_counter+array_1_size] = array_2[1][array_2_counter];
	}
	return array_1_size+array_2_size;
}


// Loop over all recombination blocks and return a list of all snp indices that fall within those blocks.
int get_list_of_snp_indices_which_fall_in_downstream_recombinations(int ** current_block_coordinates,int num_blocks, int * snp_locations,int number_of_snps, int * snps_in_recombinations)
{
	int num_snps_in_recombinations =0;
	int i = 0;
	for(i = 0; i<num_blocks; i++ )
	{
		int current_index = 0;
		current_index = find_starting_index(current_block_coordinates[0][i],snp_locations,0, number_of_snps);
		
		int j;
		for(j = current_index; (j < number_of_snps && snp_locations[j] <= current_block_coordinates[1][i]); j++)
		{
			if(snp_locations[j] >= current_block_coordinates[0][i] && snp_locations[j] <= current_block_coordinates[1][i])
			{
				int k = 0;
				int seen_before = 0;
				// has this snp index been flagged before?
				for(k =0; k < num_snps_in_recombinations; k++)
				{
					if(snps_in_recombinations[k] == j)
					{
						seen_before = 1;
						break;
					}
				}
				if(seen_before == 0)
				{
					snps_in_recombinations[num_snps_in_recombinations] = j;
					num_snps_in_recombinations++;
    			}
			}
		}
	}
	return num_snps_in_recombinations;
		
}


// Go through the tree and build up the recombinations list from root to branch. Print out each sample name and a list of recombinations
void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinations, int parent_num_recombinations, int current_total_snps,int num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps )
{
	newick_child *child;
	int * current_recombinations;
	int num_current_recombinations = 0 ;
	char * child_sequence = (char *) calloc((length_of_original_genome +1),sizeof(char));
	
	current_recombinations = (int *) calloc((root->num_recombinations+1+parent_num_recombinations),sizeof(int));
	num_current_recombinations = copy_and_concat_integer_arrays(root->recombinations, root->num_recombinations,parent_recombinations, parent_num_recombinations, current_recombinations);
	
 	// overwrite the bases of snps with N's
 	int i;
 	int sequence_index;
 	sequence_index = find_sequence_index_from_sample_name(root->taxon);
 	
 	set_number_of_recombinations_for_sample(root->taxon,root->num_recombinations);
 	set_number_of_snps_for_sample(root->taxon,root->number_of_snps);
	
	get_sequence_for_sample_name(child_sequence, root->taxon);
	int genome_length_excluding_blocks_and_gaps = calculate_genome_length_excluding_blocks_and_gaps(child_sequence, length_of_original_genome, current_block_coordinates, num_blocks);
	
	set_genome_length_excluding_blocks_and_gaps_for_sample(root->taxon,genome_length_excluding_blocks_and_gaps);
	
	int ** merged_block_coordinates;
	merged_block_coordinates = (int **) calloc(3,sizeof(int *));
	merged_block_coordinates[0] = (int*) calloc((num_blocks + root->number_of_blocks+1),sizeof(int ));
	merged_block_coordinates[1] = (int*) calloc((num_blocks + root->number_of_blocks+1),sizeof(int ));
	copy_and_concat_2d_integer_arrays(current_block_coordinates,num_blocks,root->block_coordinates, root->number_of_blocks,merged_block_coordinates );
	
	set_number_of_blocks_for_sample(root->taxon, root->number_of_blocks	);
 	set_number_of_bases_in_recombinations(root->taxon, calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates, (num_blocks + root->number_of_blocks), child_sequence, snp_locations,current_total_snps));
	free(child_sequence); 	

 	for(i = 0; i < num_current_recombinations; i++)
 	{
 		update_sequence_base('N', sequence_index, current_recombinations[i]);
 	}


    // TODO: The stats for the number of snps in recombinations will need to be updated.
	int * snps_in_recombinations = (int *) calloc((number_of_snps +1),sizeof(int));
	int num_snps_in_recombinations = get_list_of_snp_indices_which_fall_in_downstream_recombinations(merged_block_coordinates, (num_blocks + root->number_of_blocks),snp_locations, number_of_snps, snps_in_recombinations);
    int num_snps_updated_from_downstream_recombintations=0;
 	for(i = 0; i < num_snps_in_recombinations; i++)
 	{
 		update_sequence_base('N', sequence_index, snps_in_recombinations[i]);
 	}
	free(snps_in_recombinations); 	

	if (root->childNum > 0)
	{
		child = root->child;
		set_internal_node(1,sequence_index);

		while (child != NULL)
		{
			fill_in_recombinations_with_gaps(child->node, current_recombinations, num_current_recombinations,(current_total_snps + root->number_of_snps),(num_blocks + root->number_of_blocks),merged_block_coordinates,length_of_original_genome, snp_locations, number_of_snps );
			child = child->next;

		}
	}
	else
	{
	set_internal_node(0,sequence_index);	
	}
	free(current_recombinations);
	free(merged_block_coordinates[0]);
	free(merged_block_coordinates[1]);
	free(merged_block_coordinates);
}

int calculate_number_of_bases_in_recombations_excluding_gaps(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations,int length_of_original_genome)
{
	int total_bases = 0;
	int current_block = 1;
	int start_block = 0;

	for(start_block = 0; start_block < num_blocks; start_block++)
	{
		if(block_coordinates[0][start_block] == -1 || block_coordinates[1][start_block] == -1)
		{
			continue;
		}
		
		for(current_block = 0 ; current_block < num_blocks; current_block++)
		{ 
			if(current_block == start_block)
			{
				continue;	
			}
			
			if(block_coordinates[0][current_block] == -1 || block_coordinates[1][current_block] == -1)
			{
				continue;
			}
			
			
			int found_overlap = 0;
		  if(block_coordinates[0][start_block] >=  block_coordinates[0][current_block] && block_coordinates[0][start_block] <= block_coordinates[1][current_block] )
		  {
				block_coordinates[0][start_block] = block_coordinates[0][current_block];
				found_overlap = 1;
			}
			
			if(block_coordinates[1][start_block] >=  block_coordinates[0][current_block]  && block_coordinates[1][start_block] <= block_coordinates[1][current_block])
		  {
				block_coordinates[1][start_block] = block_coordinates[1][current_block];
				found_overlap = 1;
			}
			
			if(found_overlap == 1)
			{
				block_coordinates[0][current_block] = -1;
				block_coordinates[1][current_block] = -1;
			}
		}	
		
	}
	for(start_block = 0; start_block < num_blocks; start_block++)
	{
		if(block_coordinates[0][start_block] == -1 || block_coordinates[1][start_block] == -1)
		{
			continue;
		}
		
		
	  total_bases += calculate_block_size_without_gaps(child_sequence,  snp_locations, block_coordinates[0][start_block], block_coordinates[1][start_block], length_of_original_genome);
  }
	
	return total_bases;
}

void carry_unambiguous_gaps_up_tree(newick_node *root)
{
	if(root->childNum > 0)
	{
		newick_child *child;
		int parent_sequence_index =  find_sequence_index_from_sample_name(root->taxon);
		
		child = root->child;
		int child_sequence_indices[root->childNum];
		int child_counter = 0;
		while (child != NULL)
		{
			child_sequence_indices[child_counter] = find_sequence_index_from_sample_name(child->node->taxon);
			carry_unambiguous_gaps_up_tree(child->node);
			child = child->next;
			child_counter++;
		}
		
		// compare the parent sequence to the each child sequence and update the gaps
		fill_in_unambiguous_gaps_in_parent_from_children(parent_sequence_index, child_sequence_indices,child_counter);
		//fill_in_unambiguous_bases_in_parent_from_children_where_parent_has_a_gap(parent_sequence_index, child_sequence_indices,child_counter);
	}
}

char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, char * leaf_sequence, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps,FILE * branch_snps_file_pointer, int window_min, int window_max)
{
	newick_child *child;
	int child_counter = 0;
	int current_branch =0;
	int branch_genome_size = 0;
	int number_of_branch_snps=0;
  root->current_node_id = ++node_counter;
	
	if (root->childNum == 0)
	{
		leaf_sequence = (char *) calloc((number_of_snps +1),sizeof(char));
		get_sequence_for_sample_name(leaf_sequence, root->taxon);
		
		root->taxon_names = (char *) calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char));
		memcpy(root->taxon_names, root->taxon, size_of_string(root->taxon)+1);

    // Save some statistics about the sequence
		branch_genome_size = calculate_size_of_genome_without_gaps(leaf_sequence, 0,number_of_snps, length_of_original_genome);
		set_genome_length_without_gaps_for_sample(root->taxon,branch_genome_size);
		int number_of_gaps = length_of_original_genome-branch_genome_size;
		
		return leaf_sequence;
	}
	else
	{
		child = root->child;
		char * child_sequences[root->childNum];
		newick_node * child_nodes[root->childNum];
		root->taxon_names = (char *) calloc(MAX_SAMPLE_NAME_SIZE*number_of_columns,sizeof(char));

		// generate pointers for each child seuqn

		while (child != NULL)
		{
			// recursion
			child_sequences[child_counter] = generate_branch_sequences(child->node, vcf_file_pointer, snp_locations, number_of_snps, column_names, number_of_columns,  child_sequences[child_counter],length_of_original_genome, block_file_pointer,gff_file_pointer,min_snps,branch_snps_file_pointer, window_min, window_max);
			child_nodes[child_counter] = child->node;
			
			char delimiter_string[3] = {" "};
			concat_strings_created_with_malloc(root->taxon_names, delimiter_string);
			concat_strings_created_with_malloc(root->taxon_names, child_nodes[child_counter]->taxon_names);
			
			child_counter++;
			child = child->next;
		}
		
		// For all bases update the parent sequence with N if all child sequences.
		
		leaf_sequence = (char *) calloc((number_of_snps +1),sizeof(char));
		// All child sequneces should be available use them to find the ancestor sequence
		get_sequence_for_sample_name(leaf_sequence, root->taxon);
		
		branch_genome_size = calculate_size_of_genome_without_gaps(leaf_sequence, 0,number_of_snps, length_of_original_genome);
		set_genome_length_without_gaps_for_sample(root->taxon,branch_genome_size);
		
		
		
		for(current_branch = 0 ; current_branch< (root->childNum); current_branch++)
		{
			int * branches_snp_sites;
			branches_snp_sites = (int *) calloc((number_of_snps +1),sizeof(int));
			char * branch_snp_sequence;
			char * branch_snp_ancestor_sequence;
			branch_snp_sequence = (char *) calloc((number_of_snps +1),sizeof(char));
			branch_snp_ancestor_sequence = (char *) calloc((number_of_snps +1),sizeof(char));
			
			branch_genome_size = calculate_size_of_genome_without_gaps(child_sequences[current_branch], 0,number_of_snps, length_of_original_genome);
			number_of_branch_snps = calculate_number_of_snps_excluding_gaps(leaf_sequence, child_sequences[current_branch], number_of_snps, branches_snp_sites, snp_locations,branch_snp_sequence,branch_snp_ancestor_sequence);
			
			
			child_nodes[current_branch]->number_of_snps = number_of_branch_snps;
			print_branch_snp_details(branch_snps_file_pointer, child_nodes[current_branch]->taxon,root->taxon, branches_snp_sites, number_of_branch_snps, branch_snp_sequence, branch_snp_ancestor_sequence,child_nodes[current_branch]->taxon_names);
			
			get_likelihood_for_windows(child_sequences[current_branch], number_of_snps, branches_snp_sites, branch_genome_size, number_of_branch_snps,snp_locations, child_nodes[current_branch], block_file_pointer, root, branch_snp_sequence,gff_file_pointer,min_snps,length_of_original_genome,leaf_sequence, window_min, window_max);
			free(branch_snp_sequence);
			free(branch_snp_ancestor_sequence);
			free(child_sequences[current_branch]);
			free(branches_snp_sites);
			
		}
		
		return leaf_sequence;
	}
}


// Windows need to be of a fixed size
// calculate window size
// starting at coord of first snp, count number of snps which fall into window
// if region is blank, move on
int calculate_window_size(int branch_genome_size, int number_of_branch_snps,int window_min, int window_max)
{
	int window_size = 0;
	if(number_of_branch_snps == 0)
	{
		return window_min;
	}
	
	window_size = (int) ((branch_genome_size*1.0)/(number_of_branch_snps*1.0/WINDOW_SNP_MODE_TARGET));
	
	if(window_size < window_min)
	{
		return window_min;
	}
	else if(window_size > window_max)
	{
		return 	window_max;
	}

	return window_size;
}


void get_likelihood_for_windows(char * child_sequence, int length_of_sequence, int * snp_site_coords, int branch_genome_size, int number_of_branch_snps, int * snp_locations, newick_node * current_node, FILE * block_file_pointer, newick_node *root, char * branch_snp_sequence, FILE * gff_file_pointer, int min_snps, int length_of_original_genome, char * original_sequence,int window_min, int window_max)
{
	int i = 0;
	int window_size = 0;
	int window_start_coordinate = 0;
	int window_end_coordinate = 0;
	int number_of_snps_in_block = 0;
	int block_genome_size_without_gaps = 0;
	double branch_snp_density = 0.0;
	double block_snp_density = 0.0;
	int number_of_blocks = 0 ;
	int original_branch_genome_size = branch_genome_size;

	// place to store coordinates of recombinations snps
	current_node->recombinations = (int *) calloc((number_of_branch_snps+1),sizeof(int));
	
	int number_of_windows = (branch_genome_size/window_min) + 1;
	int * block_coordinates[4];
	block_coordinates[0] = (int *) calloc((number_of_windows+1),sizeof(int));
	block_coordinates[1] = (int *) calloc((number_of_windows+1),sizeof(int));
	block_coordinates[2] = (int *) calloc((number_of_windows+1),sizeof(int));
	block_coordinates[3] = (int *) calloc((number_of_windows+1),sizeof(int));
	
	double * block_likelihoods;	
	block_likelihoods = (double *) calloc((number_of_windows+1),sizeof(double));

	while(number_of_branch_snps > min_snps)
	{
		if(number_of_branch_snps <= min_snps)
		{
			free(block_coordinates[0]) ;
			free(block_coordinates[1]) ;
			free(block_coordinates[2]) ;
			free(block_coordinates[3]) ;
			free(block_likelihoods);
			return;
		}
		branch_snp_density = snp_density(branch_genome_size, number_of_branch_snps);

		window_size = calculate_window_size(branch_genome_size, number_of_branch_snps,window_min,window_max);

    int cutoff = calculate_cutoff(branch_genome_size, window_size, number_of_branch_snps);

		number_of_blocks = get_blocks(block_coordinates, length_of_original_genome, snp_site_coords, number_of_branch_snps,  window_size, cutoff, child_sequence,snp_locations,length_of_sequence);


		for(i = 0; i < number_of_blocks; i++)
		{
			number_of_snps_in_block = find_number_of_snps_in_block(block_coordinates[0][i], block_coordinates[1][i], snp_site_coords, branch_snp_sequence, number_of_branch_snps);
			block_genome_size_without_gaps = calculate_block_size_without_gaps(child_sequence, snp_locations, block_coordinates[0][i], block_coordinates[1][i], length_of_sequence);

			// minimum number of snps to be statistically significant in block
			if(number_of_snps_in_block <= min_snps)
			{
				block_coordinates[0][i] = -1;
				block_coordinates[1][i] = -1;
				continue;
			}
			
			block_snp_density = snp_density(block_genome_size_without_gaps, number_of_snps_in_block);
			// region with low number of snps so skip over
			if(block_snp_density <= branch_snp_density)
			{
				block_coordinates[0][i] = -1;
				block_coordinates[1][i] = -1;
				continue;	
			}
			
			block_likelihoods[i] = get_block_likelihood(branch_genome_size, number_of_branch_snps, block_genome_size_without_gaps, number_of_snps_in_block);
			block_coordinates[2][i] = (int) block_likelihoods[i];
			block_coordinates[3][i] = block_genome_size_without_gaps;
		}

		move_blocks_inwards_while_likelihood_improves(number_of_blocks,block_coordinates, min_snps, snp_site_coords, number_of_branch_snps, branch_snp_sequence, snp_locations, branch_genome_size, child_sequence, length_of_sequence,block_likelihoods,cutoff);

		int * candidate_blocks[4];
		candidate_blocks[0] = (int *) calloc((number_of_blocks+1),sizeof(int));
		candidate_blocks[1] = (int *) calloc((number_of_blocks+1),sizeof(int));
		candidate_blocks[2] = (int *) calloc((number_of_blocks+1),sizeof(int));
		candidate_blocks[3] = (int *) calloc((number_of_blocks+1),sizeof(int));
		
		double * candidate_block_likelihoods;
		candidate_block_likelihoods = (double *) calloc((number_of_blocks+1),sizeof(double));
		
		int number_of_candidate_blocks = 0;

		for(i = 0 ; i < number_of_blocks; i++)
		{
			if(block_coordinates[0][i] == -1 || block_coordinates[1][i] == -1)
			{
				continue;
			}
			int current_start = block_coordinates[0][i];
			int current_end = block_coordinates[1][i];
		  int block_snp_count = find_number_of_snps_in_block(current_start, current_end, snp_site_coords, branch_snp_sequence, number_of_branch_snps);
		  int block_genome_size_without_gaps = block_coordinates[3][i];

			if(p_value_test(branch_genome_size, block_genome_size_without_gaps, number_of_branch_snps, block_snp_count, min_snps) == 1)
			{
				
				candidate_blocks[0][number_of_candidate_blocks] = block_coordinates[0][i];
				candidate_blocks[1][number_of_candidate_blocks] = block_coordinates[1][i];
				// TODO use a float in a struct here, should be okay for the moment but assumes that there will be a clear integer difference between best and second best
				candidate_blocks[2][number_of_candidate_blocks] = block_coordinates[2][i];
				candidate_blocks[3][number_of_candidate_blocks] = block_genome_size_without_gaps;
				
				candidate_block_likelihoods[number_of_candidate_blocks] = block_likelihoods[i];
				number_of_candidate_blocks++;
			}
		}
		if(number_of_candidate_blocks == 0 )
		{
			free(block_coordinates[0]) ;
			free(block_coordinates[1]) ;
			free(block_coordinates[2]) ;
			free(block_coordinates[3]) ;
			free(block_likelihoods);
			free(candidate_blocks[0]);
		  free(candidate_blocks[1]);
		  free(candidate_blocks[2]);
		  free(candidate_blocks[3]);
			free(candidate_block_likelihoods);
			
			int new_recombination_size = (current_node->num_recombinations+1)*sizeof(int);
			if(new_recombination_size > 1024)
			{
			  current_node->recombinations = (int *) realloc(current_node->recombinations, new_recombination_size);
		  }
			return;	
		}
		number_of_branch_snps = flag_smallest_log_likelihood_recombinations(candidate_blocks, number_of_candidate_blocks, number_of_branch_snps, snp_site_coords, current_node->recombinations, current_node->num_recombinations,current_node, block_file_pointer, root, snp_locations, length_of_sequence,gff_file_pointer,candidate_block_likelihoods );
		branch_genome_size = original_branch_genome_size  - current_node->total_bases_removed_excluding_gaps;
	  free(candidate_blocks[0]);
	  free(candidate_blocks[1]);
	  free(candidate_blocks[2]);
	  free(candidate_blocks[3]);
		free(candidate_block_likelihoods);
	
	}
	free(block_coordinates[0]) ;
	free(block_coordinates[1]) ;
	free(block_coordinates[2]) ;
	free(block_coordinates[3]) ;
	free(block_likelihoods);
	int new_recombination_size = (current_node->num_recombinations+1)*sizeof(int);
	if(new_recombination_size > 1024)
	{
	  current_node->recombinations = (int *) realloc(current_node->recombinations, new_recombination_size);
  }
}

int extend_upper_part_of_window(int starting_coord, int initial_max_coord, int genome_size, int * gaps_in_original_genome_space)
{
		int max_snp_sliding_window_counter = initial_max_coord;
		int upper_offset = 0;
		int j = 0; 
		for(j = starting_coord; (j < initial_max_coord + upper_offset) && (j < genome_size); j++)
		{
		  if(gaps_in_original_genome_space[j]  == 1)
			{
				upper_offset++;
			}
		}

		max_snp_sliding_window_counter = initial_max_coord + upper_offset;
		return max_snp_sliding_window_counter;
}

int extend_lower_part_of_window(int starting_coord, int initial_min_coord, int genome_size, int * gaps_in_original_genome_space)
{
		int lower_offset = 0;
		int snp_sliding_window_counter = initial_min_coord;
		int j = 0; 
		for(j = starting_coord; (j >= 0) && j > (initial_min_coord - lower_offset) && (initial_min_coord - lower_offset >= 0); j--)
		{
			if(gaps_in_original_genome_space[j]  == 1)
			{
				lower_offset++;
			}
		}
		snp_sliding_window_counter = snp_sliding_window_counter - lower_offset;
		return snp_sliding_window_counter;
}


int get_blocks(int ** block_coordinates, int genome_size,int * snp_site_coords,int number_of_branch_snps, int window_size, int cutoff, char * original_sequence, int * snp_locations, int number_of_snps)
{
	// Set up the window counter with 1 value per base in the branch
 	int * window_count;
	window_count = (int *) calloc((genome_size+1),sizeof(int));
	int i;
	for(i =0; i< genome_size; i++)
	{
		window_count[i] = 0;
	}
	
	// Integer array with location of gaps
	int * gaps_in_original_genome_space;
	gaps_in_original_genome_space = (int *) calloc((genome_size+1),sizeof(int));
	int x =0;
	for(x=0; x< number_of_snps; x++)
	{
		if(original_sequence[x]  == 'N' || original_sequence[x]  == '-' )
		{
		  gaps_in_original_genome_space[snp_locations[x]-1] = 1;
	  }
	}
	
	
	// create the pileup of the snps and their sphere of influence
	int snp_counter = 0;
	for(snp_counter = 0; snp_counter < number_of_branch_snps; snp_counter++)
	{
		int j = 0;
		// Lower bound of the window around a snp
		int snp_sliding_window_counter = snp_site_coords[snp_counter]-(window_size/2);
		
		snp_sliding_window_counter = extend_lower_part_of_window(snp_site_coords[snp_counter] - 1 , snp_sliding_window_counter, genome_size,  gaps_in_original_genome_space);
	
		if(snp_sliding_window_counter < 0)
		{
			snp_sliding_window_counter = 0;
		}
		
		// Upper bound of the window around a snp
		int max_snp_sliding_window_counter = snp_site_coords[snp_counter]+(window_size/2);
		max_snp_sliding_window_counter = extend_upper_part_of_window(snp_site_coords[snp_counter] + 1, max_snp_sliding_window_counter, genome_size, gaps_in_original_genome_space);
		
		if(max_snp_sliding_window_counter>genome_size)
		{
			max_snp_sliding_window_counter = genome_size;
		}
		
		for(j = snp_sliding_window_counter; j < max_snp_sliding_window_counter; j++)
		{
			window_count[j] += 1;	
		}
	}
	
	int number_of_blocks = 0;
	int in_block = 0;
	int block_lower_bound = 0;
	// Scan across the pileup and record where blocks are above the cutoff
	
	for(i = 0; i < genome_size; i++)
	{
		// Just entered the start of a block
		if(window_count[i] > cutoff && in_block == 0)
		{
			block_lower_bound = i;
			in_block = 1;
    }

		// Just left a block
		if((window_count[i] <= cutoff || i+1 == genome_size ) && in_block == 1)
		{
			block_coordinates[0][number_of_blocks] = block_lower_bound;
			block_coordinates[1][number_of_blocks] = i-1;
			number_of_blocks++;
			in_block = 0;
		}
		
	}

  // Move blocks inwards to next SNP
	for(i = 0; i < number_of_blocks; i++)
	{		
		for(snp_counter = 0; snp_counter < number_of_branch_snps; snp_counter++)
		{
			if(snp_site_coords[snp_counter] >= block_coordinates[0][i] )
			{
				block_coordinates[0][i] = snp_site_coords[snp_counter];
				break;
			}
		}
		
		for(snp_counter = number_of_branch_snps-1; snp_counter >= 0 ; snp_counter--)
		{
			if(snp_site_coords[snp_counter] <= block_coordinates[1][i] )
			{
				block_coordinates[1][i] = snp_site_coords[snp_counter];
				break;
			}
		}
	}


	free(gaps_in_original_genome_space);
	free(window_count);
	return number_of_blocks;

}



void move_blocks_inwards_while_likelihood_improves(int number_of_blocks,int ** block_coordinates, int min_snps, int * snp_site_coords,  int number_of_branch_snps,char * branch_snp_sequence, int * snp_locations, int branch_genome_size,char * child_sequence, int length_of_sequence, double * block_likelihoods, int cutoff_value)
{
	int i;
	
	int previous_start;
	int previous_end;
	
	for(i = 0 ; i < number_of_blocks; i++)
	{
		int current_start = block_coordinates[0][i];
		int current_end = block_coordinates[1][i];
		int start_index = find_starting_index( current_start, snp_site_coords,0, number_of_branch_snps);
    int end_index   = find_starting_index( current_end, snp_site_coords, start_index, number_of_branch_snps);
		if( i == 0)
		{
			previous_start = block_coordinates[0][i];
			previous_end = block_coordinates[1][i];
		}
		else if(previous_start == block_coordinates[0][i] && previous_end == block_coordinates[1][i] && i > 0)
		{
			block_coordinates[0][i] = -1;
			block_coordinates[1][i] = -1;
		}
		else
		{
			previous_start = block_coordinates[0][i];
			previous_end = block_coordinates[1][i];
		}
	}
	
	
	for(i = 0 ; i < number_of_blocks; i++)
	{
		if(block_coordinates[0][i] == -1 || block_coordinates[1][i] == -1)
		{
			continue;
		}
		
		int current_start = block_coordinates[0][i];
		int current_end = block_coordinates[1][i];
		double current_block_likelihood = 0;
		int block_genome_size_without_gaps = block_coordinates[3][i];
		int block_snp_count;
		
	  int next_start_position = current_start;
		int start_index = find_starting_index( current_start, snp_site_coords,0, number_of_branch_snps);
    int end_index =  find_starting_index( current_end, snp_site_coords, start_index, number_of_branch_snps);

		block_snp_count = find_number_of_snps_in_block_with_start_end_index(current_start, current_end, snp_site_coords, branch_snp_sequence, number_of_branch_snps,start_index,end_index);
    
		if(block_genome_size_without_gaps == -1)
		{
			
			block_genome_size_without_gaps = calculate_block_size_without_gaps(child_sequence, snp_locations, current_start, current_end, length_of_sequence);
			block_coordinates[2][i] = current_block_likelihood;
			block_coordinates[3][i] = block_genome_size_without_gaps;
		}
		current_block_likelihood = get_block_likelihood(branch_genome_size, number_of_branch_snps, block_genome_size_without_gaps, block_snp_count);


		// Move left inwards while the likelihood gets better
		while(current_start < current_end && block_snp_count >= min_snps)
		{
			  next_start_position++;
			  next_start_position = advance_window_start_to_next_snp(next_start_position, snp_site_coords, branch_snp_sequence, number_of_branch_snps);
			
				if(next_start_position >= current_end)
				{
					break;
				}
				if(next_start_position <= current_start)
				{
					break;
				}
			
				int previous_block_snp_count = block_snp_count;
				int previous_block_genome_size_without_gaps = block_genome_size_without_gaps;
			  block_snp_count = find_number_of_snps_in_block_with_start_end_index(next_start_position, current_end, snp_site_coords, branch_snp_sequence, number_of_branch_snps,start_index,end_index);
			  block_genome_size_without_gaps = calculate_block_size_without_gaps(child_sequence, snp_locations, next_start_position, current_end, length_of_sequence);
			  
			  double next_block_likelihood = get_block_likelihood(branch_genome_size, number_of_branch_snps, block_genome_size_without_gaps, block_snp_count);
				
			if(next_block_likelihood <= current_block_likelihood)
			  {
			  	current_block_likelihood = next_block_likelihood;
					current_start = next_start_position;
					start_index++;
			  }
			  else
			  {
					block_snp_count = previous_block_snp_count;
					block_genome_size_without_gaps = previous_block_genome_size_without_gaps;
					break;
			  }
		}
		
		int next_end_position = current_end;
		
		// Move Right inwards while the likelihood gets better
		while(current_start < current_end && block_snp_count >= min_snps)
		{
			  next_end_position--;
				next_end_position = rewind_window_end_to_last_snp(next_end_position, snp_site_coords, branch_snp_sequence, number_of_branch_snps);
				
				if(next_end_position <= current_start )
				{
					break;
				}
				if(next_end_position >= current_end)
				{
					break;
				}
				
				int previous_block_snp_count = block_snp_count;
				int previous_block_genome_size_without_gaps = block_genome_size_without_gaps;
			  block_snp_count = find_number_of_snps_in_block(current_start, next_end_position, snp_site_coords, branch_snp_sequence, number_of_branch_snps);
			  block_genome_size_without_gaps = calculate_block_size_without_gaps(child_sequence, snp_locations, current_start, next_end_position, length_of_sequence);
			  
			  double next_block_likelihood = get_block_likelihood(branch_genome_size, number_of_branch_snps, block_genome_size_without_gaps, block_snp_count);
			  if(next_block_likelihood <= current_block_likelihood)
			  {
			  	current_block_likelihood = next_block_likelihood;
					current_end = next_end_position;
					end_index--;
			  }
			  else
			  {
					block_snp_count = previous_block_snp_count;
					block_genome_size_without_gaps = previous_block_genome_size_without_gaps;
					break;
			  }
			
		}
		
		  block_coordinates[0][i] = current_start;
		  block_coordinates[1][i] = current_end;
	  	block_coordinates[2][i] = (int) current_block_likelihood;
		  block_coordinates[3][i] = block_genome_size_without_gaps;
		
		  block_likelihoods[i] = current_block_likelihood;
	}	
}



int exclude_snp_sites_in_block(int window_start_coordinate, int window_end_coordinate, int * snp_site_coords, int number_of_branch_snps)
{
	int i;
	// inefficient
	int updated_snp_site_coords[number_of_branch_snps];
	int number_of_branch_snps_excluding_block = 0 ;
	
	for(i = 0 ; i< number_of_branch_snps; i++)
	{
		if(snp_site_coords[i]>= window_start_coordinate && snp_site_coords[i] <= window_end_coordinate)
		{
		}
		else
		{
			updated_snp_site_coords[number_of_branch_snps_excluding_block] = snp_site_coords[i];
			number_of_branch_snps_excluding_block++;
		}
	}
	
	for(i = 0; i < number_of_branch_snps_excluding_block; i++)
	{
	  snp_site_coords[i] = updated_snp_site_coords[i];
	}
	for(i=number_of_branch_snps_excluding_block; i< number_of_branch_snps; i++)
	{
		snp_site_coords[i] = 0;
	}
	
	return number_of_branch_snps_excluding_block;
}

int flag_smallest_log_likelihood_recombinations(int ** candidate_blocks, int number_of_candidate_blocks, int number_of_branch_snps, int * snp_site_coords, int * recombinations, int number_of_recombinations,newick_node * current_node, FILE * block_file_pointer, newick_node *root,int * snp_locations, int total_num_snps, FILE * gff_file_pointer, double * block_likelihooods)
{
	int number_of_branch_snps_excluding_block = number_of_branch_snps;
	if(number_of_candidate_blocks > 0)
	{
		int smallest_index = 0;
    int number_of_recombinations_in_window = 0;
		smallest_index = get_smallest_log_likelihood(block_likelihooods, number_of_candidate_blocks);
		number_of_recombinations_in_window = flag_recombinations_in_window(candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index],number_of_branch_snps, snp_site_coords, recombinations, number_of_recombinations,snp_locations,total_num_snps);	
    number_of_recombinations += number_of_recombinations_in_window;
		number_of_branch_snps_excluding_block = exclude_snp_sites_in_block(candidate_blocks[0][smallest_index],candidate_blocks[1][smallest_index], snp_site_coords,number_of_branch_snps);
		
		current_node->num_recombinations = number_of_recombinations;

		print_block_details(block_file_pointer, candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index],  number_of_recombinations_in_window, current_node->taxon,  root->taxon, current_node->taxon_names, current_node->childNum, block_likelihooods[smallest_index]);
		print_gff_line(gff_file_pointer, candidate_blocks[0][smallest_index], candidate_blocks[1][smallest_index],  number_of_recombinations_in_window, current_node->taxon,  root->taxon, current_node->taxon_names, block_likelihooods[smallest_index]);
		current_node->number_of_blocks = current_node->number_of_blocks + 1;
		
		current_node->total_bases_removed_excluding_gaps = current_node->total_bases_removed_excluding_gaps  + candidate_blocks[3][smallest_index];

		current_node->block_coordinates[0] = realloc((int *)current_node->block_coordinates[0], ((int)current_node->number_of_blocks +1)*sizeof(int));
		current_node->block_coordinates[1] = realloc((int *)current_node->block_coordinates[1], ((int)current_node->number_of_blocks +1)*sizeof(int));
		
		current_node->block_coordinates[0][current_node->number_of_blocks -1] = candidate_blocks[0][smallest_index];
		current_node->block_coordinates[1][current_node->number_of_blocks -1] = candidate_blocks[1][smallest_index];
	}
	current_node->number_of_snps = number_of_branch_snps_excluding_block;
	
	return number_of_branch_snps_excluding_block;
}

// candidate blocks contains, start coordinate, end_coordinate and log likelihood
int get_smallest_log_likelihood(double * candidate_blocks, int number_of_candidate_blocks)
{
	int i;
	int smallest_index = 0 ; 
	
	for(i=0; i< number_of_candidate_blocks; i++)
	{
		if(candidate_blocks[i] < candidate_blocks[smallest_index] && candidate_blocks[i] > 0)
		{
		   smallest_index = i;
		}
	}
	return smallest_index;
}


double snp_density(int length_of_sequence, int number_of_snps)
{
	return number_of_snps*1.0/length_of_sequence;
}




double calculate_threshold(int branch_genome_size, int window_size)
{
	return 1-(RANDOMNESS_DAMPNER/((branch_genome_size*1.0)/((window_size*1.0)/WINDOW_SNP_MODE_TARGET)));
}

int calculate_cutoff(int branch_genome_size, int window_size, int num_branch_snps)
{
	double threshold = 0.0;
	int cutoff = 0;
	double pvalue = 0.0;
	double part1, part2, part3 = 0.0;
	
	threshold = calculate_threshold(branch_genome_size, window_size);
	
	while( pvalue <= threshold)
	{
		part1 = reduce_factorial(window_size,cutoff)-reduce_factorial(cutoff,cutoff);
		part2 = log10((num_branch_snps*1.0)/branch_genome_size)*cutoff;
		part3 = log10(1.0-((num_branch_snps*1.0)/branch_genome_size))*(window_size-cutoff);
		pvalue = pvalue + pow(10,(part1 + part2 + part3));
		cutoff++;
	}
	cutoff--;

	
	return cutoff;
}

int p_value_test(int branch_genome_size, int window_size, int num_branch_snps, int block_snp_count, int min_snps)
{
	double threshold = 0.0;
	int cutoff = 0;
	double pvalue = 0.0;
	double part1, part2, part3 = 0.0;
	
	if( block_snp_count < min_snps)
	{
		return 0;	
	}
	
	threshold = 0.05/branch_genome_size;
	
	while( cutoff < block_snp_count)
	{
		part1 = reduce_factorial(window_size,cutoff)-reduce_factorial(cutoff,cutoff);
		part2 = log10((num_branch_snps*1.0)/branch_genome_size)*cutoff;
		part3 = log10(1.0-((num_branch_snps*1.0)/branch_genome_size))*(window_size-cutoff);
		pvalue = pvalue + pow(10,(part1 + part2 + part3));
		cutoff++;
	}
	
	// There should be rounding here to 10 decimal places
	pvalue = 1.0-pvalue;
	
	if(pvalue < threshold)
	{
		// the block could contain a recombination
		return 1;
	}
	
	return 0;
}


double reduce_factorial(int l, int i)
{
	double factorial;
	int x;
	
	factorial = log10(1.0);
	
	for(x = l-(i-1); x < l+1 ; x++)
	{
		factorial = factorial + log10(x);
	}
	return factorial;
}


// N = branch_genome_size
// C = number_of_branch_snps
// n = block_genome_size_without_gaps
// c = number_of_block_snps
double get_block_likelihood(int branch_genome_size, int number_of_branch_snps, int block_genome_size_without_gaps, int number_of_block_snps)
{
	double part1, part2, part3, part4;
	
	if(block_genome_size_without_gaps == 0)
	{
		return 0.0;
	}
	if(number_of_block_snps == 0)
	{
		return 0.0;
	}
	
	part1 = log10(number_of_block_snps*1.0/block_genome_size_without_gaps*1.0)*number_of_block_snps;
	
	if((block_genome_size_without_gaps-number_of_block_snps) == 0)
	{
		part2 = 0.0;	
	}
	else
	{
		part2 = log10( ((block_genome_size_without_gaps-number_of_block_snps)*1.0)/block_genome_size_without_gaps*1.0 )*(block_genome_size_without_gaps-number_of_block_snps);
	}
	
	if((number_of_branch_snps-number_of_block_snps) == 0)
	{
		part3 = 0.0;
	}
	else
	{
		part3 = log10(((number_of_branch_snps-number_of_block_snps)*1.0)/((branch_genome_size-block_genome_size_without_gaps)*1.0))*(number_of_branch_snps-number_of_block_snps);
	}
			
	if(((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps))==0)
	{
	  part4 = 0.0;
	}
	else
	{
		part4=log10(((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps)*1.0 )/((branch_genome_size-block_genome_size_without_gaps)*1.0)) * ((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps));
	}
	
	return (part1+part2+part3+part4)*-1;
}

int calculate_genome_length_excluding_blocks_and_gaps(char * sequence, int length_of_sequence, int ** block_coordinates, int num_blocks)
{
	int * bases_to_be_excluded;  
	bases_to_be_excluded = (int*) calloc((length_of_sequence + 1),sizeof(int));
	
	int i = 0;
	for(i = 0; i<length_of_sequence; i++)
	{
		if(sequence[i] == 'N' || sequence[i] == '-' )
		{
			bases_to_be_excluded[i] = 1;
		}
	}
	
	int j = 0;
	for(j = 0; j<num_blocks; j++)
	{
		if(block_coordinates[0][j] == -1)
		{
			continue;
		}
		
		// Coordinates of blocks start at 1 and the index of the array starts at 0
		int block_index = 0;
		for(block_index = block_coordinates[0][j]; block_index <= block_coordinates[1][j]; block_index++ )
		{
			bases_to_be_excluded[block_index-1] = 1;
		}
	}
	
    int genome_length = 0;
	for(i = 0; i<length_of_sequence; i++)
	{
		if(bases_to_be_excluded[i] == 0 )
		{
			genome_length++;
		}
	}
	return genome_length;
}
Browse the archive

https://github.com/sanger-pathogens/gubbins