Raw File
parse_phylip.c
/*
 *  Wellcome Trust Sanger Institute
 *  Copyright (C) 2011  Wellcome Trust Sanger Institute
 *  
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *  
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "parse_phylip.h"
#include "alignment_file.h"

#include <zlib.h>
#include <sys/types.h>
#include "kseq.h"
#include "string_cat.h"

KSEQ_INIT(gzFile, gzread)

int num_samples;
int num_snps;
char ** sequences;
char ** phylip_sample_names;
int * internal_node;
sample_statistics ** statistics_for_samples;


int update_sequence_base(char new_sequence_base, int sequence_index, int base_index)
{
	if(sequences[sequence_index][base_index] != new_sequence_base)
	{
	   sequences[sequence_index][base_index] = new_sequence_base;	
	   return 1;
    }
	return 0;
}

void set_internal_node(int internal_node_value,int sequence_index)
{
	internal_node[sequence_index] = internal_node_value;
}

int get_internal_node(int sequence_index)
{
	return internal_node[sequence_index];
}

void get_sequence_for_sample_name(char * sequence_bases, char * sample_name)
{
	int sequence_index;
	sequence_index = find_sequence_index_from_sample_name( sample_name);
	if(sequence_index < 0)
	{
    printf("Couldnt find sequence name %s with index %d\n", sample_name,sequence_index);
	  exit(1);
  }

	memcpy(sequence_bases, sequences[sequence_index], size_of_string(sequences[sequence_index]) +1);
}


void fill_in_unambiguous_gaps_in_parent_from_children(int parent_sequence_index, int * child_sequence_indices, int num_children)
{
	int snp_counter = 0; 
	for(snp_counter = 0; snp_counter < num_snps ; snp_counter++)
	{
		int real_base_found = 0;
		int child_counter = 0;
		for(child_counter = 0 ; child_counter < num_children ; child_counter++)
		{
			int child_index = child_sequence_indices[child_counter];
			if(!( toupper(sequences[child_index][snp_counter]) == 'N' || sequences[child_index][snp_counter] == '-' ))
			{
				real_base_found = 1;
				break;
			}
		}
		
		if(real_base_found == 0 &&  toupper(sequences[parent_sequence_index][snp_counter]) != 'N' && sequences[parent_sequence_index][snp_counter] != '-')
		{
			sequences[parent_sequence_index][snp_counter] = 'N';	
		}
	}
}

void fill_in_unambiguous_bases_in_parent_from_children_where_parent_has_a_gap(int parent_sequence_index, int * child_sequence_indices, int num_children)
{
	int snp_counter = 0; 

	for(snp_counter = 0; snp_counter < num_snps ; snp_counter++)
	{
		if(toupper(sequences[parent_sequence_index][snp_counter]) != 'N' && sequences[parent_sequence_index][snp_counter] != '-')
		{
			break;
		}
		
		int real_base_found = 0;
		int child_counter = 0;
		char comparison_base ;
		for(child_counter = 0 ; child_counter < num_children ; child_counter++)
		{
			int child_index = child_sequence_indices[child_counter];
		  if(child_counter == 0)
			{
				comparison_base = toupper(sequences[child_index][snp_counter]);
			}
		
			if(comparison_base !=  toupper(sequences[child_index][snp_counter])  )
			{
				break;
			}
		}
		
		if(toupper(sequences[parent_sequence_index][snp_counter]) != comparison_base)
		{
			sequences[parent_sequence_index][snp_counter] = comparison_base;
		}
	}
}

int does_column_contain_snps(int snp_column, char reference_base)
{
	int i;
	reference_base = convert_reference_to_real_base_in_column( snp_column,  reference_base);
	for(i = 0; i < num_samples; i++)
	{
		if(internal_node[i]==1)
		{
			continue;	
		}
		
		if(sequences[i][snp_column] == '\0' || sequences[i][snp_column] == '\n')
		{
			return 0;	
		}
		
		if(sequences[i][snp_column]  != '-' && toupper(sequences[i][snp_column])  != 'N' && sequences[i][snp_column] != reference_base)
		{
			return 1;
		}
	}   
	return 0;
}


char convert_reference_to_real_base_in_column(int snp_column, char reference_base)
{
	int i;
	if(!(reference_base == '-' || toupper(reference_base) == 'N'))
	{
		return reference_base;
	}
	
	for(i = 0; i < num_samples; i++)
	{
		if(sequences[i][snp_column] == '\0' || sequences[i][snp_column] == '\n')
		{
			return reference_base;	
		}
		
		if(sequences[i][snp_column]  != '-' && toupper(sequences[i][snp_column])  != 'N')
		{
			return sequences[i][snp_column];
		}
	}
	return reference_base;
}

int number_of_samples_from_parse_phylip()
{
	return num_samples;
}

sample_statistics ** get_sample_statistics()
{
	return statistics_for_samples;
}


void get_sample_names_from_parse_phylip(char ** sample_names)
{
	int i;
	for(i = 0; i< num_samples; i++)
	{
		sample_names[i] =  (char *) calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char));
		memcpy(sample_names[i], phylip_sample_names[i], size_of_string(phylip_sample_names[i]) +1);
	}
}
	

void filter_sequence_bases_and_rotate(char * reference_bases, char ** filtered_bases_for_snps, int number_of_filtered_snps)
{
	int i,j,reference_index;
	
	for(j = 0; j < number_of_filtered_snps; j++)
	{
		filtered_bases_for_snps[j] = (char *) calloc((num_samples+1),sizeof(char));
	}
		
	for(i = 0; i < num_samples; i++)
	{
		int filtered_base_counter = 0;
		
		for(reference_index = 0; reference_index < num_snps; reference_index++)
		{

			if(reference_bases[reference_index] == '\0')
			{
				break;	
			}
			
			if(reference_bases[reference_index] != '*' && sequences[i][reference_index] != '\0' && sequences[i][reference_index] != '\n')
			{
				filtered_bases_for_snps[filtered_base_counter][i] = sequences[i][reference_index];
				filtered_base_counter++;
			}
		}
	}
	for(j = 0; j < number_of_filtered_snps; j++)
	{
		 filtered_bases_for_snps[j][num_samples] = '\0';	
	}

}


void set_number_of_recombinations_for_sample(char * sample_name, int number_of_recombinations)
{
	int sample_index ;
	sample_index = find_sequence_index_from_sample_name( sample_name);
  if( sample_index == -1)
  {
		return;
	}
	((sample_statistics *) statistics_for_samples[sample_index])->number_of_recombinations = number_of_recombinations;
}


void set_number_of_snps_for_sample(char * sample_name, int number_of_snps)
{
	int sample_index ;
	sample_index = find_sequence_index_from_sample_name( sample_name);
  if( sample_index == -1)
  {
		return;
	}
	
	((sample_statistics *) statistics_for_samples[sample_index])->number_of_snps = number_of_snps;
}

void set_number_of_blocks_for_sample(char * sample_name,int num_blocks)
{
	int sample_index ;
	sample_index = find_sequence_index_from_sample_name( sample_name);
  if( sample_index == -1)
  {
		return;
	}
	
	((sample_statistics *) statistics_for_samples[sample_index])->number_of_blocks = num_blocks;
}



void set_genome_length_without_gaps_for_sample(char * sample_name, int genome_length_without_gaps)
{
	int sample_index ;
	sample_index = find_sequence_index_from_sample_name( sample_name);
  if( sample_index == -1)
  {
		return;
	}
	
	((sample_statistics *) statistics_for_samples[sample_index])->genome_length_without_gaps = genome_length_without_gaps;
}

void set_genome_length_excluding_blocks_and_gaps_for_sample(char * sample_name, int genome_length_excluding_blocks_and_gaps)
{
	int sample_index ;
	sample_index = find_sequence_index_from_sample_name( sample_name);
    if( sample_index == -1)
    {
		return;
	}
	
	((sample_statistics *) statistics_for_samples[sample_index])->genome_length_excluding_blocks_and_gaps = genome_length_excluding_blocks_and_gaps;
}

void set_number_of_bases_in_recombinations(char * sample_name, int bases_in_recombinations)
{
	int sample_index ;
	sample_index = find_sequence_index_from_sample_name( sample_name);
  if( sample_index == -1)
  {
		return;
	}
	((sample_statistics *) statistics_for_samples[sample_index])->bases_in_recombinations = bases_in_recombinations;
}




int find_sequence_index_from_sample_name( char * sample_name)
{
	int i;
	
	for(i =0; i< num_samples; i++)
	{
		if(strcmp(sample_name,phylip_sample_names[i]) == 0)	
		{
			return i;
		}
	}
	return -1;
}

void initialise_internal_node()
{
	int i=0;
	internal_node = (int *) calloc((num_samples+1),sizeof(int));
	for(i=0; i< num_samples; i++)
	{
		internal_node[i] = 0;
	}
}

void initialise_statistics()
{
	int i=0;
	statistics_for_samples = (sample_statistics **) calloc((num_samples+1),sizeof(sample_statistics *));
	for(i=0; i< num_samples; i++)
	{
		sample_statistics * sample_statistics_placeholder;
		sample_statistics_placeholder = (sample_statistics *) calloc(1, sizeof(sample_statistics));
		
		sample_statistics_placeholder->sample_name = (char *) calloc( MAX_SAMPLE_NAME_SIZE,sizeof(char));
		memcpy(sample_statistics_placeholder->sample_name, phylip_sample_names[i], size_of_string(phylip_sample_names[i]) +1);
		
		statistics_for_samples[i] = sample_statistics_placeholder;
	}
}

int number_of_snps_in_phylip()
{
	return num_snps;
}

void load_sequences_from_multifasta_file(char filename[])
{
	int i;

	num_snps    = genome_length(filename);
	num_samples = number_of_sequences_in_file(filename);
	
	sequences = (char **) calloc((num_samples+1),sizeof(char *));
	phylip_sample_names = (char **) calloc((num_samples+1),sizeof(char *));
	
	for(i = 0; i < num_samples; i++)
	{
		sequences[i] = (char *) calloc((num_snps+1),sizeof(char));
		phylip_sample_names[i] = (char *) calloc((MAX_SAMPLE_NAME_SIZE+1),sizeof(char));
	}
	get_sample_names_for_header(filename, phylip_sample_names, num_samples);
	
  int l;
  i = 0;
  int sequence_number = 0;

 	gzFile fp;
 	kseq_t *seq;

 	fp = gzopen(filename, "r");
 	seq = kseq_init(fp);

 	while ((l = kseq_read(seq)) >= 0) 
 	{
     for(i = 0; i< num_snps; i++)
 		{
 			sequences[sequence_number][i] = toupper(((char *) seq->seq.s)[i]);
 			if(sequences[sequence_number][i] == 'N')
 			{
 				sequences[sequence_number][i]  = '-';
 			}
 		}
     sequence_number++;
   }

 	kseq_destroy(seq);
 	gzclose(fp);

	initialise_statistics();
	initialise_internal_node();
}

void freeup_memory()
{
	int i;
	for(i = 0; i < num_samples; i++)
	{
	  free(sequences[i]);
		free(phylip_sample_names[i]);
  }
  free(sequences);
	free(phylip_sample_names);
	free(internal_node);
}

back to top