https://github.com/sanger-pathogens/gubbins
Tip revision: 0dfe1234ef48c2808e9b80809ea962e87ec2054a authored by Nick Croucher on 31 October 2022, 05:28:42 UTC
Merge and improve clade analysis scripts
Merge and improve clade analysis scripts
Tip revision: 0dfe123
Newickform.c
/*
* Yu-Wei Wu http://yuweibioinfo.blogspot.com/2008/10/newick-tree-parser-in-c-make-use-of.html
* Copyright (C) 2011 Yu-Wei Wu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#define __NEWICKFORM_C__
#include "seqUtil.h"
#include "Newickform.h"
#include "branch_sequences.h"
#include "gff_file.h"
#include "string_cat.h"
#define STR_OUT "out"
newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns,int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag)
{
int iLen, iMaxLen;
char *pcTreeStr;
char *pcInputFile;
char *pcOutputFile;
char acStrArray[256];
newick_node *root;
char *returnchar;
FILE *f;
// Initialize memory management procedure
seqMemInit();
pcInputFile = NULL;
pcOutputFile = NULL;
pcInputFile = filename;
// Open tree file
f = fopen(pcInputFile, "r+");
// Read in tree string
pcTreeStr = NULL;
iLen = 0;
iMaxLen = 0;
while (1)
{
memset(acStrArray, '\0', 256);
returnchar = fgets(acStrArray, 255, f);
if (acStrArray[0] == '\0' && feof(f))
{
break;
}
inputString(acStrArray, &pcTreeStr, &iLen, &iMaxLen);
}
fclose(f);
// Parse tree string
root = parseTree(pcTreeStr);
// output tab file
FILE * block_file_pointer;
char block_file_name[MAX_FILENAME_SIZE] = {""};
char block_file_extension[5]= {".tab"};
memcpy(block_file_name, filename, size_of_string(filename) +1);
concat_strings_created_with_malloc(block_file_name,block_file_extension);
block_file_pointer = fopen(block_file_name, "w");
// output tab file
FILE * branch_snps_file_pointer;
char branch_snps_file_name[MAX_FILENAME_SIZE]= {""};
char branchtab_extension[18]= {".branch_snps.tab"};
memcpy(branch_snps_file_name, filename, size_of_string(filename) +1);
concat_strings_created_with_malloc(branch_snps_file_name,branchtab_extension);
branch_snps_file_pointer = fopen(branch_snps_file_name, "w");
// output gff file
FILE * gff_file_pointer;
char gff_file_name[MAX_FILENAME_SIZE]= {""};
memcpy(gff_file_name, filename, size_of_string(filename) +1);
char gff_extension[5]= {".gff"};
concat_strings_created_with_malloc(gff_file_name,gff_extension);
gff_file_pointer = fopen(gff_file_name, "w");
print_gff_header(gff_file_pointer,length_of_original_genome);
char * root_sequence = NULL;
carry_unambiguous_gaps_up_tree(root);
root_sequence = generate_branch_sequences(root,
vcf_file_pointer,
snp_locations,
number_of_snps,
column_names,
number_of_columns,
root_sequence,
length_of_original_genome,
block_file_pointer,
gff_file_pointer,
min_snps,
branch_snps_file_pointer,
window_min,
window_max,
uncorrected_p_value,
trimming_ratio,
extensive_search_flag);
free(root_sequence);
int * parent_recombinations = NULL;
fill_in_recombinations_with_gaps(root, parent_recombinations, 0, 0,0,root->block_coordinates,length_of_original_genome,snp_locations,number_of_snps);
fclose(block_file_pointer);
fclose(gff_file_pointer);
fclose(branch_snps_file_pointer);
return root;
}
char * strip_quotes(char *taxon)
{
int i = 0;
int target_i =0;
char cleaned_taxon[MAX_FILENAME_SIZE] = {""};
while(taxon[i] != '\0')
{
if(taxon[i] != '\'')
{
cleaned_taxon[target_i] = taxon[i];
target_i++;
}
i++;
}
cleaned_taxon[target_i] = '\0';
memcpy(taxon, cleaned_taxon, size_of_string(cleaned_taxon) +1);
return taxon;
}
newick_node* parseTree(char *str)
{
newick_node *node;
newick_child *child;
char *pcCurrent;
char *pcStart;
char *pcColon = NULL;
char cTemp;
int iCount;
pcStart = str;
if (*pcStart != '(')
{
// Leaf node. Separate taxon name from distance. If distance not exist then take care of taxon name only
pcCurrent = str;
while (*pcCurrent != '\0')
{
if (*pcCurrent == ':')
{
pcColon = pcCurrent;
}
pcCurrent++;
}
node = (newick_node*)seqMalloc(sizeof(newick_node));
if (pcColon == NULL)
{
// Taxon only
node->taxon = (char*)seqMalloc((int)strlen(pcStart) + 1);
memcpy(node->taxon, pcStart, strlen(pcStart));
}
else
{
// Taxon
*pcColon = '\0';
node->taxon = (char*)seqMalloc((int)strlen(pcStart) + 1);
memcpy(node->taxon, pcStart, strlen(pcStart));
*pcColon = ':';
// Distance
pcColon++;
node->dist = (float)atof(pcColon);
}
node->taxon = strip_quotes(node->taxon);
node->number_of_blocks = 0;
node->childNum = 0;
}
else
{
// Create node
node = (newick_node*)seqMalloc(sizeof(newick_node));
child = NULL;
// Search for all child nodes
// Find all ',' until corresponding ')' is encountered
iCount = 0;
pcStart++;
pcCurrent = pcStart;
while (iCount >= 0)
{
switch (*pcCurrent)
{
case '(':
// Find corresponding ')' by counting
pcStart = pcCurrent;
pcCurrent++;
iCount++;
while (iCount > 0)
{
if (*pcCurrent == '(')
{
iCount++;
}
else if (*pcCurrent == ')')
{
iCount--;
}
pcCurrent++;
}
while (*pcCurrent != ',' && *pcCurrent != ')')
{
pcCurrent++;
}
cTemp = *pcCurrent;
*pcCurrent = '\0';
// Create a child node
if (child == NULL)
{
node->child = (newick_child*)seqMalloc(sizeof(newick_child));
node->childNum = 1;
child = node->child;
}
else
{
child->next = (newick_child*)seqMalloc(sizeof(newick_child));
node->childNum++;
child = child->next;
}
child->node = parseTree(pcStart);
*pcCurrent = cTemp;
if (*pcCurrent != ')')
{
pcCurrent++;
}
break;
case ')':
// End of tihs tree. Go to next part to retrieve distance
iCount--;
break;
case ',':
// Impossible separation since according to the algorithm, this symbol will never encountered.
// Currently don't handle this and don't create any node
break;
default:
// leaf node encountered
pcStart = pcCurrent;
while (*pcCurrent != ',' && *pcCurrent != ')')
{
pcCurrent++;
}
cTemp = *pcCurrent;
*pcCurrent = '\0';
// Create a child node
if (child == NULL)
{
node->child = (newick_child*)seqMalloc(sizeof(newick_child));
node->childNum = 1;
child = node->child;
}
else
{
child->next = (newick_child*)seqMalloc(sizeof(newick_child));
node->childNum++;
child = child->next;
}
child->node = parseTree(pcStart);
*pcCurrent = cTemp;
if (*pcCurrent != ')')
{
pcCurrent++;
}
break;
}
}
// If start at ':', then the internal node has no name.
pcCurrent++;
if (*pcCurrent == ':')
{
pcStart = pcCurrent + 1;
while (*pcCurrent != '\0' && *pcCurrent != ';')
{
pcCurrent++;
}
cTemp = *pcCurrent;
*pcCurrent = '\0';
node->dist = (float)atof(pcStart);
*pcCurrent = cTemp;
}
else if (*pcCurrent != ';' && *pcCurrent != '\0')
{
// Find ':' to retrieve distance, if any.
// At this time *pcCurrent should equal to ')'
pcStart = pcCurrent;
while (*pcCurrent != ':' && *pcCurrent != '\0' && *pcCurrent != ';')
{
pcCurrent++;
}
cTemp = *pcCurrent;
*pcCurrent = '\0';
node->taxon = seqMalloc((int)strlen(pcStart) + 1);
memcpy(node->taxon, pcStart, strlen(pcStart));
node->taxon = strip_quotes(node->taxon);
*pcCurrent = cTemp;
pcCurrent++;
pcStart = pcCurrent;
while (*pcCurrent != '\0' && *pcCurrent != ';')
{
pcCurrent++;
}
cTemp = *pcCurrent;
*pcCurrent = '\0';
node->dist = (float)atof(pcStart);
*pcCurrent = cTemp;
}
}
node->number_of_blocks = 0;
node->total_bases_removed_excluding_gaps = 0;
node->block_coordinates = (int **) calloc((3),sizeof(int *));
node->block_coordinates[0] = (int*) calloc((3),sizeof(int ));
node->block_coordinates[1] = (int*) calloc((3),sizeof(int ));
return node;
}
void print_tree(newick_node *root, FILE * outputfile)
{
newick_child *child;
if (root->childNum == 0)
{
fprintf(outputfile,"%s:%0.6f", root->taxon, root->dist);
}
else
{
child = root->child;
fprintf(outputfile,"(");
while (child != NULL)
{
print_tree(child->node,outputfile);
if (child->next != NULL)
{
fprintf(outputfile,",");
}
child = child->next;
}
if (root->taxon != NULL)
{
fprintf(outputfile,")%s:%0.6f", root->taxon, root->dist);
}
else
{
fprintf(outputfile,"):%0.6f", root->dist);
}
}
fflush(outputfile);
}