https://github.com/stamatak/standard-RAxML
Tip revision: 69a7edcba961f95f732e07ce64e7e5b1c7ae548b authored by Alexis Stamatakis on 04 October 2023, 07:33:55 UTC
Merge pull request #71 from martin-g/linux-arm64-support-2
Merge pull request #71 from martin-g/linux-arm64-support-2
Tip revision: 69a7edc
parsePartitions.c
/* RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees
* Copyright August 2006 by Alexandros Stamatakis
*
* Partially derived from
* fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
*
* and
*
* Programs of the PHYLIP package by Joe Felsenstein.
*
* This program is free software; you may redistribute it and/or modify its
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
*
* For any other enquiries send an Email to Alexandros Stamatakis
* Alexandros.Stamatakis@epfl.ch
*
* When publishing work that is based on the results from RAxML-VI-HPC please cite:
*
* Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models".
* Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
*/
#ifndef WIN32
#include <sys/times.h>
#include <sys/types.h>
#include <sys/time.h>
#include <unistd.h>
#endif
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <strings.h>
#include "axml.h"
/*****************************FUNCTIONS FOR READING MULTIPLE MODEL SPECIFICATIONS************************************************/
extern char modelFileName[1024];
extern char excludeFileName[1024];
extern char secondaryStructureFileName[1024];
extern char seq_file[1024];
extern char *protModels[NUM_PROT_MODELS];
static boolean lineContainsOnlyWhiteChars(char *line)
{
int i, n = strlen(line);
if(n == 0)
return TRUE;
for(i = 0; i < n; i++)
{
if(!whitechar(line[i]))
return FALSE;
}
return TRUE;
}
static int isNum(char c)
{
return (c == '0' || c == '1' || c == '2' || c == '3' || c == '4' ||
c == '5' || c == '6' || c == '7' || c == '8' || c == '9');
}
static void skipWhites(char **ch)
{
while(**ch == ' ' || **ch == '\t')
*ch = *ch + 1;
}
static void analyzeIdentifier(char **ch, int modelNumber, tree *tr)
{
char
*start = *ch,
ident[2048] = "",
model[2048] = "",
thisModel[2048] = "";
int
i = 0,
n,
j,
containsComma = 0;
while(**ch != '=')
{
if(**ch == '\n' || **ch == '\r')
{
printf("\nPartition file parsing error!\n");
printf("Each line must contain a \"=\" character\n");
printf("Offending line: %s\n", start);
printf("RAxML will exit now.\n\n");
errorExit(-1);
}
if(**ch != ' ' && **ch != '\t')
{
ident[i] = **ch;
i++;
}
*ch = *ch + 1;
}
ident[i] = '\0';
n = i;
i = 0;
for(i = 0; i < n; i++)
if(ident[i] == ',')
containsComma = 1;
if(!containsComma)
{
printf("Error, model file must have format: Substitution model, then a comma, and then the partition name\n");
exit(-1);
}
else
{
boolean
analyzeRest = TRUE,
useExternalFile = FALSE,
found = FALSE;
int
openBracket = 0,
closeBracket = 0,
openPos = 0,
closePos = 0;
i = 0;
while(ident[i] != ',')
{
if(ident[i] == '[')
{
openPos = i;
openBracket++;
}
if(ident[i] == ']')
{
closePos = i;
closeBracket++;
}
model[i] = ident[i];
i++;
}
if(closeBracket > 0 || openBracket > 0)
{
if((closeBracket == 1) && (openBracket == 1) && (openPos < closePos))
useExternalFile = TRUE;
else
{
printf("\nError: Apparently you want to specify a user-defined protein substitution model\n");
printf("or ascertainment bias correction model that shall be read from file\n");
printf("It must be enclosed in opening and closing bracktes like this: [prot=fileName] or [asc=fileName]\n\n");
printf("you specified: %s\n\n", model);
exit(-1);
}
}
if(useExternalFile)
{
char
designator[2048] = "",
fileName[2048] = "";
int
pos,
index,
lower = 0,
upper = i - 1;
boolean
isProteinFile = TRUE;
while(model[lower] == '[')
lower++;
while(model[upper] == ']')
upper--;
assert(lower < upper);
index = lower;
pos = 0;
while(model[index] != '~')
{
assert(index <= upper);
designator[pos] = model[index];
pos++;
index++;
}
designator[pos] = '\0';
if(strcmp(designator, "asc") == 0)
isProteinFile = FALSE;
else
{
if(strcmp(designator, "prot") == 0)
isProteinFile = TRUE;
else
{
printf("Error external partition file type %s does not exist\n", designator);
printf("Available file types: asc and prot\n");
exit(-1);
}
}
while(model[index] == '~')
index++;
pos = 0;
while(model[index] != ']')
{
fileName[pos] = model[index];
index++;
pos++;
}
fileName[pos] = '\0';
if(!filexists(fileName))
{
printf("\n\ncustom protein substitution or ascertainment bias file [%s] you want to use does not exist!\n", fileName);
printf("you need to specify the full path\n");
printf("the file name shall not contain blanks!\n\n");
exit(-1);
}
if(isProteinFile)
{
strcpy(tr->initialPartitionData[modelNumber].proteinSubstitutionFileName, fileName);
/*printf("%s \n", tr->initialPartitionData[modelNumber].proteinSubstitutionFileName);*/
tr->initialPartitionData[modelNumber].protModels = PROT_FILE;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE;
tr->initialPartitionData[modelNumber].dataType = AA_DATA;
analyzeRest = FALSE;
}
else
{
int
newIndex = 0;
strcpy(tr->initialPartitionData[modelNumber].ascFileName, fileName);
i = 0;
while(ident[i] != ',')
{
if(ident[i] == '\0')
{
printf("Expecting two commas in string %s\n", ident);
exit(-1);
}
i++;
}
i++;
while(ident[i] != ',')
{
if(ident[i] == '\0')
{
printf("Expecting two commas in string %s\n", ident);
exit(-1);
}
model[newIndex] = ident[i];
i++;
newIndex++;
}
model[newIndex] = '\0';
}
}
if(analyzeRest)
{
/* AA */
tr->initialPartitionData[modelNumber].ascBias = FALSE;
for(i = 0; i < NUM_PROT_MODELS && !found; i++)
{
strcpy(thisModel, protModels[i]);
if(strcasecmp(model, thisModel) == 0)
{
tr->initialPartitionData[modelNumber].protModels = i;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE;
tr->initialPartitionData[modelNumber].dataType = AA_DATA;
found = TRUE;
}
if(!found)
{
if(i != GTR && i != GTR_UNLINKED)
{
strcpy(thisModel, protModels[i]);
strcat(thisModel, "F");
if(strcasecmp(model, thisModel) == 0)
{
tr->initialPartitionData[modelNumber].protModels = i;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
tr->initialPartitionData[modelNumber].dataType = AA_DATA;
found = TRUE;
if(tr->initialPartitionData[modelNumber].protModels == AUTO)
{
printf("\nError: Option AUTOF has been deprecated, exiting\n\n");
errorExit(-1);
}
}
}
}
if(!found)
{
strcpy(thisModel, protModels[i]);
strcat(thisModel, "X");
if(strcasecmp(model, thisModel) == 0)
{
tr->initialPartitionData[modelNumber].protModels = i;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
tr->initialPartitionData[modelNumber].dataType = AA_DATA;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
found = TRUE;
if(tr->initialPartitionData[modelNumber].protModels == AUTO)
{
printf("\nError: Option AUTOX has been deprecated, exiting\n\n");
errorExit(-1);
}
}
}
if(found && (tr->initialPartitionData[modelNumber].protModels == GTR || tr->initialPartitionData[modelNumber].protModels == GTR_UNLINKED))
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
}
/* AA with Asc bias*/
if(!found)
{
for(i = 0; i < NUM_PROT_MODELS && !found; i++)
{
strcpy(thisModel, "ASC_");
strcat(thisModel, protModels[i]);
if(strcasecmp(model, thisModel) == 0)
{
tr->initialPartitionData[modelNumber].protModels = i;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE;
tr->initialPartitionData[modelNumber].dataType = AA_DATA;
found = TRUE;
}
if(!found)
{
if(i != GTR && i != GTR_UNLINKED)
{
strcpy(thisModel, protModels[i]);
strcat(thisModel, "F");
if(strcasecmp(model, thisModel) == 0)
{
tr->initialPartitionData[modelNumber].protModels = i;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
tr->initialPartitionData[modelNumber].dataType = AA_DATA;
found = TRUE;
}
}
}
if(!found)
{
strcpy(thisModel, protModels[i]);
strcat(thisModel, "X");
if(strcasecmp(model, thisModel) == 0)
{
tr->initialPartitionData[modelNumber].protModels = i;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
tr->initialPartitionData[modelNumber].dataType = AA_DATA;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
found = TRUE;
}
}
if(found)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
if(found && (tr->initialPartitionData[modelNumber].protModels == GTR || tr->initialPartitionData[modelNumber].protModels == GTR_UNLINKED))
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
}
}
if(!found)
{
if(strcasecmp(model, "DNA") == 0 || strcasecmp(model, "DNAX") == 0 || strcasecmp(model, "ASC_DNA") == 0 || strcasecmp(model, "ASC_DNAX") == 0)
{
tr->initialPartitionData[modelNumber].protModels = -1;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
tr->initialPartitionData[modelNumber].dataType = DNA_DATA;
if(strcasecmp(model, "DNAX") == 0 || strcasecmp(model, "ASC_DNAX") == 0)
{
if(strcasecmp(model, "ASC_DNAX") == 0)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
else
tr->initialPartitionData[modelNumber].ascBias = FALSE;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
}
else
{
if(strcasecmp(model, "ASC_DNA") == 0)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
else
tr->initialPartitionData[modelNumber].ascBias = FALSE;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = FALSE;
}
found = TRUE;
}
else
{
if(strcasecmp(model, "BIN") == 0 || strcasecmp(model, "BINX") == 0 || strcasecmp(model, "ASC_BIN") == 0 || strcasecmp(model, "ASC_BINX") == 0)
{
tr->initialPartitionData[modelNumber].protModels = -1;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
tr->initialPartitionData[modelNumber].dataType = BINARY_DATA;
if(strcasecmp(model, "BINX") == 0 || strcasecmp(model, "ASC_BINX") == 0)
{
if(strcasecmp(model, "ASC_BINX") == 0)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
else
tr->initialPartitionData[modelNumber].ascBias = FALSE;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
}
else
{
if(strcasecmp(model, "ASC_BIN") == 0)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
else
tr->initialPartitionData[modelNumber].ascBias = FALSE;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = FALSE;
}
found = TRUE;
}
else
{
if(strcasecmp(model, "MULTI") == 0 || strcasecmp(model, "MULTIX") == 0 || strcasecmp(model, "ASC_MULTI") == 0 || strcasecmp(model, "ASC_MULTIX") == 0)
{
tr->initialPartitionData[modelNumber].protModels = -1;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
tr->initialPartitionData[modelNumber].dataType = GENERIC_32;
if(strcasecmp(model, "MULTIX") == 0 || strcasecmp(model, "ASC_MULTIX") == 0)
{
if(strcasecmp(model, "ASC_MULTIX") == 0)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
else
tr->initialPartitionData[modelNumber].ascBias = FALSE;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
}
else
{
if(strcasecmp(model, "ASC_MULTI") == 0)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
else
tr->initialPartitionData[modelNumber].ascBias = FALSE;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = FALSE;
}
found = TRUE;
}
else
{
if(strcasecmp(model, "CODON") == 0 || strcasecmp(model, "CODONX") == 0 || strcasecmp(model, "ASC_CODON") == 0 || strcasecmp(model, "ASC_CODONX") == 0)
{
tr->initialPartitionData[modelNumber].protModels = -1;
tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
tr->initialPartitionData[modelNumber].dataType = GENERIC_64;
if(strcasecmp(model, "CODONX") == 0 || strcasecmp(model, "ASC_CODONX") == 0)
{
if(strcasecmp(model, "ASC_CODONX") == 0)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
else
tr->initialPartitionData[modelNumber].ascBias = FALSE;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
}
else
{
if(strcasecmp(model, "ASC_CODON") == 0)
tr->initialPartitionData[modelNumber].ascBias = TRUE;
else
tr->initialPartitionData[modelNumber].ascBias = FALSE;
tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = FALSE;
}
found = TRUE;
}
}
}
}
}
if(!found)
{
printf("ERROR: you specified the unknown model %s for partition %d\n", model, modelNumber);
exit(-1);
}
}
i = 0;
while(ident[i++] != ',');
tr->initialPartitionData[modelNumber].partitionName = (char*)rax_malloc((n - i + 1) * sizeof(char));
j = 0;
while(i < n)
tr->initialPartitionData[modelNumber].partitionName[j++] = ident[i++];
tr->initialPartitionData[modelNumber].partitionName[j] = '\0';
}
}
static void setModel(int model, int position, int *a)
{
if(a[position] == -1)
a[position] = model;
else
{
printf("ERROR trying to assign model %d to position %d \n", model, position);
printf("while already model %d has been assigned to this position\n", a[position]);
exit(-1);
}
}
static int myGetline(char **lineptr, int *n, FILE *stream)
{
char *line, *p;
int size, copy, len;
int chunkSize = 256 * sizeof(char);
if (*lineptr == NULL || *n < 2)
{
line = (char *)rax_realloc(*lineptr, chunkSize, FALSE);
if (line == NULL)
return -1;
*lineptr = line;
*n = chunkSize;
}
line = *lineptr;
size = *n;
copy = size;
p = line;
while(1)
{
while (--copy > 0)
{
register int c = getc(stream);
if (c == EOF)
goto lose;
else
{
*p++ = c;
if(c == '\n' || c == '\r')
goto win;
}
}
/* Need to enlarge the line buffer. */
len = p - line;
size *= 2;
line = rax_realloc (line, size, FALSE);
if (line == NULL)
goto lose;
*lineptr = line;
*n = size;
p = line + len;
copy = size - len;
}
lose:
if (p == *lineptr)
return -1;
/* Return a partial line since we got an error in the middle. */
win:
*p = '\0';
return p - *lineptr;
}
static void nonContiguousError(analdef *adef)
{
if(adef->compressPatterns == FALSE)
{
printf("\nError: You are not allowed to use interleaved partitions, that is, assign non-contiguous sites\n");
printf("to the same partition model, when pattern compression is disabled via the -H flag,\n");
printf("or when pattern compression is disabled implicitely by some other option that requires it!\n\n");
exit(-1);
}
}
void parsePartitions(analdef *adef, rawdata *rdta, tree *tr)
{
FILE *f;
int numberOfModels = 0;
int nbytes = 0;
char *ch;
char *cc = (char *)NULL;
char **p_names;
int n, i, l;
int lower, upper, modulo;
char buf[256];
int **partitions;
int pairsCount;
int as, j;
int k;
f = myfopen(modelFileName, "rb");
while(myGetline(&cc, &nbytes, f) > -1)
{
if(!lineContainsOnlyWhiteChars(cc))
{
numberOfModels++;
}
if(cc)
rax_free(cc);
cc = (char *)NULL;
}
rewind(f);
p_names = (char **)rax_malloc(sizeof(char *) * numberOfModels);
partitions = (int **)rax_malloc(sizeof(int *) * numberOfModels);
tr->initialPartitionData = (pInfo*)rax_malloc(sizeof(pInfo) * numberOfModels);
for(i = 0; i < numberOfModels; i++)
{
tr->initialPartitionData[i].protModels = adef->proteinMatrix;
tr->initialPartitionData[i].usePredefinedProtFreqs = adef->protEmpiricalFreqs;
tr->initialPartitionData[i].optimizeBaseFrequencies = FALSE;
tr->initialPartitionData[i].dataType = -1;
}
for(i = 0; i < numberOfModels; i++)
partitions[i] = (int *)NULL;
i = 0;
while(myGetline(&cc, &nbytes, f) > -1)
{
if(!lineContainsOnlyWhiteChars(cc))
{
n = strlen(cc);
p_names[i] = (char *)rax_malloc(sizeof(char) * (n + 1));
strcpy(&(p_names[i][0]), cc);
i++;
}
if(cc)
rax_free(cc);
cc = (char *)NULL;
}
for(i = 0; i < numberOfModels; i++)
{
ch = p_names[i];
pairsCount = 0;
skipWhites(&ch);
if(*ch == '=')
{
printf("Identifier missing prior to '=' in %s\n", p_names[i]);
exit(-1);
}
analyzeIdentifier(&ch, i, tr);
ch++;
numberPairs:
pairsCount++;
partitions[i] = (int *)rax_realloc((void *)partitions[i], (1 + 3 * pairsCount) * sizeof(int), FALSE);
partitions[i][0] = pairsCount;
partitions[i][3 + 3 * (pairsCount - 1)] = -1;
skipWhites(&ch);
if(!isNum(*ch))
{
printf("%c Number expected in %s\n", *ch, p_names[i]);
exit(-1);
}
l = 0;
while(isNum(*ch))
{
/*printf("%c", *ch);*/
buf[l] = *ch;
ch++;
l++;
}
buf[l] = '\0';
lower = atoi(buf);
partitions[i][1 + 3 * (pairsCount - 1)] = lower;
skipWhites(&ch);
/* NEW */
if((*ch != '-') && (*ch != ','))
{
if(*ch == '\0' || *ch == '\n' || *ch == '\r')
{
upper = lower;
goto SINGLE_NUMBER;
}
else
{
printf("'-' or ',' expected in %s\n", p_names[i]);
exit(-1);
}
}
if(*ch == ',')
{
upper = lower;
nonContiguousError(adef);
goto SINGLE_NUMBER;
}
/* END NEW */
ch++;
skipWhites(&ch);
if(!isNum(*ch))
{
printf("%c Number expected in %s\n", *ch, p_names[i]);
exit(-1);
}
l = 0;
while(isNum(*ch))
{
buf[l] = *ch;
ch++;
l++;
}
buf[l] = '\0';
upper = atoi(buf);
SINGLE_NUMBER:
partitions[i][2 + 3 * (pairsCount - 1)] = upper;
if(upper < lower)
{
printf("Upper bound %d smaller than lower bound %d for this partition: %s\n", upper, lower, p_names[i]);
exit(-1);
}
skipWhites(&ch);
if(*ch == '\0' || *ch == '\n' || *ch == '\r') /* PC-LINEBREAK*/
{
goto parsed;
}
if(*ch == ',')
{
ch++;
nonContiguousError(adef);
goto numberPairs;
}
if(*ch == '\\')
{
ch++;
skipWhites(&ch);
if(!isNum(*ch))
{
printf("%c Number expected in %s\n", *ch, p_names[i]);
exit(-1);
}
nonContiguousError(adef);
l = 0;
while(isNum(*ch))
{
buf[l] = *ch;
ch++;
l++;
}
buf[l] = '\0';
modulo = atoi(buf);
partitions[i][3 + 3 * (pairsCount - 1)] = modulo;
skipWhites(&ch);
if(*ch == '\0' || *ch == '\n' || *ch == '\r')
{
goto parsed;
}
if(*ch == ',')
{
ch++;
nonContiguousError(adef);
goto numberPairs;
}
}
if(*ch == '/')
{
printf("\nRAxML detected the character \"/\" in your partition file.\n");
printf("Did you mean to write something similar to this: \"DNA, p1=1-100\\3\" ?\n");
printf("It's actually a backslash, not a slash, the program will exit now with an error!\n\n");
}
else
{
printf("\nRAxML detected the character \"%c\" in your partition file,\n", *ch);
printf("while it does not belong there!\n");
printf("\nAre you sure that your partition file complies with the RAxML partition file format?\n");
printf("\nActually reading the manual, does indeed do help a lot\n\n");
printf("The program will exit now with an error!\n\n");
}
printf("The problematic line in your partition file is this one here:\n\n");
printf("%s\n\n", p_names[i]);
assert(0);
parsed:
;
}
fclose(f);
/*********************************************************************************************************************/
for(i = 0; i <= rdta->sites; i++)
tr->model[i] = -1;
for(i = 0; i < numberOfModels; i++)
{
as = partitions[i][0];
for(j = 0; j < as; j++)
{
lower = partitions[i][1 + j * 3];
upper = partitions[i][2 + j * 3];
modulo = partitions[i][3 + j * 3];
if(modulo == -1)
{
for(k = lower; k <= upper; k++)
setModel(i, k, tr->model);
}
else
{
for(k = lower; k <= upper; k += modulo)
{
if(k <= rdta->sites)
setModel(i, k, tr->model);
}
}
}
}
for(i = 1; i < rdta->sites + 1; i++)
{
if(tr->model[i] == -1)
{
printf("ERROR: Alignment Position %d has not been assigned any model\n", i);
exit(-1);
}
}
for(i = 0; i < numberOfModels; i++)
{
rax_free(partitions[i]);
rax_free(p_names[i]);
}
rax_free(partitions);
rax_free(p_names);
tr->NumberOfModels = numberOfModels;
if(adef->perGeneBranchLengths)
{
if(tr->NumberOfModels > NUM_BRANCHES)
{
printf("You are trying to use %d partitioned models for an individual per-gene branch length estimate.\n", tr->NumberOfModels);
printf("Currently only %d are allowed to improve efficiency.\n", NUM_BRANCHES);
printf("\n");
printf("In order to change this please replace the line \"#define NUM_BRANCHES %d\" in file \"axml.h\" \n", NUM_BRANCHES);
printf("by \"#define NUM_BRANCHES %d\" and then re-compile RAxML.\n", tr->NumberOfModels);
exit(-1);
}
else
{
tr->multiBranch = 1;
tr->numBranches = tr->NumberOfModels;
}
}
}
/*******************************************************************************************************************************/
void handleExcludeFile(tree *tr, analdef *adef, rawdata *rdta)
{
FILE *f;
char buf[256];
int
ch,
j, value, i,
state = 0,
numberOfModels = 0,
l = -1,
excludeRegion = 0,
excludedColumns = 0,
modelCounter = 1;
int
*excludeArray, *countArray, *modelList;
int
**partitions;
printf("\n\n");
f = myfopen(excludeFileName, "rb");
while((ch = getc(f)) != EOF)
{
if(ch == '-')
numberOfModels++;
}
excludeArray = (int*)rax_malloc(sizeof(int) * (rdta->sites + 1));
countArray = (int*)rax_malloc(sizeof(int) * (rdta->sites + 1));
modelList = (int *)rax_malloc((rdta->sites + 1)* sizeof(int));
partitions = (int **)rax_malloc(sizeof(int *) * numberOfModels);
for(i = 0; i < numberOfModels; i++)
partitions[i] = (int *)rax_malloc(sizeof(int) * 2);
rewind(f);
while((ch = getc(f)) != EOF)
{
switch(state)
{
case 0: /* get first number */
if(!whitechar(ch))
{
if(!isNum(ch))
{
printf("exclude file must have format: number-number [number-number]*\n");
exit(-1);
}
l = 0;
buf[l++] = ch;
state = 1;
}
break;
case 1: /*get the number or detect - */
if(!isNum(ch) && ch != '-')
{
printf("exclude file must have format: number-number [number-number]*\n");
exit(-1);
}
if(isNum(ch))
{
buf[l++] = ch;
}
else
{
buf[l++] = '\0';
value = atoi(buf);
partitions[excludeRegion][0] = value;
state = 2;
}
break;
case 2: /*get second number */
if(!isNum(ch))
{
printf("exclude file must have format: number-number [number-number]*\n");
exit(-1);
}
l = 0;
buf[l++] = ch;
state = 3;
break;
case 3: /* continue second number or find end */
if(!isNum(ch) && !whitechar(ch))
{
printf("exclude file must have format: number-number [number-number]*\n");
exit(-1);
}
if(isNum(ch))
{
buf[l++] = ch;
}
else
{
buf[l++] = '\0';
value = atoi(buf);
partitions[excludeRegion][1] = value;
excludeRegion++;
state = 0;
}
break;
default:
assert(0);
}
}
if(state == 3)
{
buf[l++] = '\0';
value = atoi(buf);
partitions[excludeRegion][1] = value;
excludeRegion++;
}
assert(excludeRegion == numberOfModels);
for(i = 0; i <= rdta->sites; i++)
{
excludeArray[i] = -1;
countArray[i] = 0;
modelList[i] = -1;
}
for(i = 0; i < numberOfModels; i++)
{
int lower = partitions[i][0];
int upper = partitions[i][1];
if(lower > upper)
{
printf("Misspecified exclude region %d\n", i);
printf("lower bound %d is greater than upper bound %d\n", lower, upper);
exit(-1);
}
if(lower == 0)
{
printf("Misspecified exclude region %d\n", i);
printf("lower bound must be greater than 0\n");
exit(-1);
}
if(upper > rdta->sites)
{
printf("Misspecified exclude region %d\n", i);
printf("upper bound %d must be smaller than %d\n", upper, (rdta->sites + 1));
exit(-1);
}
for(j = lower; j <= upper; j++)
{
if(excludeArray[j] != -1)
{
printf("WARNING: Exclude regions %d and %d overlap at position %d (already excluded %d times)\n",
excludeArray[j], i, j, countArray[j]);
}
excludeArray[j] = i;
countArray[j] = countArray[j] + 1;
}
}
for(i = 1; i <= rdta->sites; i++)
{
if(excludeArray[i] != -1)
excludedColumns++;
else
{
modelList[modelCounter] = tr->model[i];
modelCounter++;
}
}
printf("You have excluded %d out of %d columns\n", excludedColumns, rdta->sites);
if(excludedColumns == rdta->sites)
{
printf("Error: You have excluded all sites\n");
exit(-1);
}
if(adef->useSecondaryStructure && (excludedColumns > 0))
{
char mfn[2048];
int countColumns;
FILE *newFile;
assert(adef->useMultipleModel);
strcpy(mfn, secondaryStructureFileName);
strcat(mfn, ".");
strcat(mfn, excludeFileName);
newFile = myfopen(mfn, "wb");
printBothOpen("\nA secondary structure file with analogous structure assignments for non-excluded columns is printed to file %s\n", mfn);
for(i = 1, countColumns = 0; i <= rdta->sites; i++)
{
if(excludeArray[i] == -1)
fprintf(newFile, "%c", tr->secondaryStructureInput[i - 1]);
else
countColumns++;
}
assert(countColumns == excludedColumns);
fprintf(newFile,"\n");
fclose(newFile);
}
if(adef->useMultipleModel && (excludedColumns > 0))
{
char mfn[2048];
FILE *newFile;
strcpy(mfn, modelFileName);
strcat(mfn, ".");
strcat(mfn, excludeFileName);
newFile = myfopen(mfn, "wb");
printf("\nA partition file with analogous model assignments for non-excluded columns is printed to file %s\n", mfn);
for(i = 0; i < tr->NumberOfModels; i++)
{
boolean modelStillExists = FALSE;
for(j = 1; (j <= rdta->sites) && (!modelStillExists); j++)
{
if(modelList[j] == i)
modelStillExists = TRUE;
}
if(modelStillExists)
{
int k = 1;
int lower, upper;
int parts = 0;
switch(tr->partitionData[i].dataType)
{
case AA_DATA:
{
char AAmodel[1024];
if(tr->partitionData[i].ascBias)
{
strcpy(AAmodel, "ASC_");
strcat(AAmodel, protModels[tr->partitionData[i].protModels]);
}
else
strcpy(AAmodel, protModels[tr->partitionData[i].protModels]);
if(tr->partitionData[i].usePredefinedProtFreqs == FALSE)
strcat(AAmodel, "F");
if(tr->partitionData[i].optimizeBaseFrequencies)
strcat(AAmodel, "X");
assert(!(tr->partitionData[i].optimizeBaseFrequencies && tr->partitionData[i].usePredefinedProtFreqs));
fprintf(newFile, "%s, ", AAmodel);
}
break;
case DNA_DATA:
if(tr->partitionData[i].optimizeBaseFrequencies)
{
if(tr->partitionData[i].ascBias)
fprintf(newFile, "ASC_DNAX, ");
else
fprintf(newFile, "DNAX, ");
}
else
{
if(tr->partitionData[i].ascBias)
fprintf(newFile, "ASC_DNA, ");
else
fprintf(newFile, "DNA, ");
}
break;
case BINARY_DATA:
if(tr->partitionData[i].optimizeBaseFrequencies)
{
if(tr->partitionData[i].ascBias)
fprintf(newFile, "ASC_BINX, ");
else
fprintf(newFile, "BINX, ");
}
else
{
if(tr->partitionData[i].ascBias)
fprintf(newFile, "ASC_BIN, ");
else
fprintf(newFile, "BIN, ");
}
break;
case GENERIC_32:
if(tr->partitionData[i].optimizeBaseFrequencies)
{
if(tr->partitionData[i].ascBias)
fprintf(newFile, "ASC_MULTIX, ");
else
fprintf(newFile, "MULTIX, ");
}
else
{
if(tr->partitionData[i].ascBias)
fprintf(newFile, "ASC_MULTI, ");
else
fprintf(newFile, "MULTI, ");
}
break;
case GENERIC_64:
if(tr->partitionData[i].optimizeBaseFrequencies)
{
if(tr->partitionData[i].ascBias)
fprintf(newFile, "ASC_CODONX, ");
else
fprintf(newFile, "CODONX, ");
}
else
{
if(tr->partitionData[i].ascBias)
fprintf(newFile, "ASC_CODON, ");
else
fprintf(newFile, "CODON, ");
}
break;
default:
assert(0);
}
fprintf(newFile, "%s = ", tr->partitionData[i].partitionName);
while(k <= rdta->sites)
{
if(modelList[k] == i)
{
lower = k;
while((modelList[k + 1] == i) && (k <= rdta->sites))
k++;
upper = k;
if(lower == upper)
{
if(parts == 0)
fprintf(newFile, "%d", lower);
else
fprintf(newFile, ",%d", lower);
}
else
{
if(parts == 0)
fprintf(newFile, "%d-%d", lower, upper);
else
fprintf(newFile, ",%d-%d", lower, upper);
}
parts++;
}
k++;
}
fprintf(newFile, "\n");
}
}
fclose(newFile);
}
{
FILE *newFile;
char mfn[2048];
strcpy(mfn, seq_file);
strcat(mfn, ".");
strcat(mfn, excludeFileName);
newFile = myfopen(mfn, "wb");
printf("\nAn alignment file with excluded columns is printed to file %s\n\n\n", mfn);
fprintf(newFile, "%d %d\n", tr->mxtips, rdta->sites - excludedColumns);
for(i = 1; i <= tr->mxtips; i++)
{
unsigned char *tipI = &(rdta->y[i][1]);
fprintf(newFile, "%s ", tr->nameList[i]);
for(j = 0; j < rdta->sites; j++)
{
if(excludeArray[j + 1] == -1)
fprintf(newFile, "%c", getInverseMeaning(tr->dataVector[j + 1], tipI[j]));
}
fprintf(newFile, "\n");
}
fclose(newFile);
}
fclose(f);
for(i = 0; i < numberOfModels; i++)
rax_free(partitions[i]);
rax_free(partitions);
rax_free(excludeArray);
rax_free(countArray);
rax_free(modelList);
}
void parseProteinModel(double *externalAAMatrix, char *fileName)
{
FILE
*f = myfopen(fileName, "rb");
int
doublesRead = 0,
result = 0,
i,
j;
double
acc = 0.0;
printf("\nParsing user-defined protein model from file %s\n", fileName);
while(doublesRead < 420)
{
result = fscanf(f, "%lf", &(externalAAMatrix[doublesRead++]));
if(result == EOF)
{
printf("Error protein model file must consist of exactly 420 entries \n");
printf("The first 400 entries are for the rates of the AA matrix, while the\n");
printf("last 20 should contain the empirical base frequencies\n");
printf("Reached End of File after %d entries\n", (doublesRead - 1));
exit(-1);
}
}
fclose(f);
/* CHECKS */
for(i = 0; i < 20; i++)
for(j = 0; j < 20; j++)
{
if(i != j)
{
if(externalAAMatrix[i * 20 + j] != externalAAMatrix[j * 20 + i])
{
printf("Error user-defined Protein model matrix must be symmetric\n");
printf("Entry P[%d][%d]=%f at position %d is not equal to P[%d][%d]=%f at position %d\n",
i, j, externalAAMatrix[i * 20 + j], (i * 20 + j),
j, i, externalAAMatrix[j * 20 + i], (j * 20 + i));
exit(-1);
}
}
}
acc = 0.0;
for(i = 400; i < 420; i++)
acc += externalAAMatrix[i];
if((acc > 1.0 + 1.0E-6) || (acc < 1.0 - 1.0E-6))
{
printf("Base frequencies in user-defined AA substitution matrix do not sum to 1.0\n");
printf("the sum is %1.80f\n", acc);
exit(-1);
}
}
void parseSecondaryStructure(tree *tr, analdef *adef, int sites)
{
if(adef->useSecondaryStructure)
{
FILE *f = myfopen(secondaryStructureFileName, "rb");
int
i,
k,
countCharacters = 0,
ch,
*characters,
**brackets,
opening,
closing,
depth,
numberOfSymbols,
numSecondaryColumns;
unsigned char bracketTypes[4][2] = {{'(', ')'}, {'<', '>'}, {'[', ']'}, {'{', '}'}};
numberOfSymbols = 4;
tr->secondaryStructureInput = (char*)rax_malloc(sizeof(char) * sites);
while((ch = fgetc(f)) != EOF)
{
if(ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '{' || ch == '}' || ch == '.')
countCharacters++;
else
{
if(!whitechar(ch))
{
printf("Secondary Structure file %s contains character %c at position %d\n", secondaryStructureFileName, ch, countCharacters + 1);
printf("Allowed Characters are \"( ) < > [ ] { } \" and \".\" \n");
errorExit(-1);
}
}
}
if(countCharacters != sites)
{
printf("Error: Alignment length is: %d, secondary structure file has length %d\n", sites, countCharacters);
errorExit(-1);
}
characters = (int*)rax_malloc(sizeof(int) * countCharacters);
brackets = (int **)rax_malloc(sizeof(int*) * numberOfSymbols);
for(k = 0; k < numberOfSymbols; k++)
brackets[k] = (int*)rax_calloc(countCharacters, sizeof(int));
rewind(f);
countCharacters = 0;
while((ch = fgetc(f)) != EOF)
{
if(!whitechar(ch))
{
tr->secondaryStructureInput[countCharacters] = ch;
characters[countCharacters++] = ch;
}
}
assert(countCharacters == sites);
for(k = 0; k < numberOfSymbols; k++)
{
for(i = 0, opening = 0, closing = 0, depth = 0; i < countCharacters; i++)
{
if((characters[i] == bracketTypes[k][0] || characters[i] == bracketTypes[k][1]) &&
(tr->extendedDataVector[i+1] == AA_DATA || tr->extendedDataVector[i+1] == BINARY_DATA ||
tr->extendedDataVector[i+1] == GENERIC_32 || tr->extendedDataVector[i+1] == GENERIC_64))
{
printf("Secondary Structure only for DNA character positions \n");
printf("I am at position %d of the secondary structure file and this is not part of a DNA partition\n", i+1);
errorExit(-1);
}
if(characters[i] == bracketTypes[k][0])
{
depth++;
/*printf("%d %d\n", depth, i);*/
brackets[k][i] = depth;
opening++;
}
if(characters[i] == bracketTypes[k][1])
{
brackets[k][i] = depth;
/*printf("%d %d\n", depth, i); */
depth--;
closing++;
}
if(closing > opening)
{
printf("at position %d there is a closing bracket too much\n", i+1);
errorExit(-1);
}
}
if(depth != 0)
{
printf("Problem: Depth: %d\n", depth);
printf("Your secondary structure file may be missing a closing or opening paraenthesis!\n");
}
assert(depth == 0);
if(countCharacters != sites)
{
printf("Problem: sec chars: %d sites: %d\n",countCharacters, sites);
printf("The number of sites in the alignment does not match the length of the secondary structure file\n");
}
assert(countCharacters == sites);
if(closing != opening)
{
printf("Number of opening brackets %d should be equal to number of closing brackets %d\n", opening, closing);
errorExit(-1);
}
}
for(i = 0, numSecondaryColumns = 0; i < countCharacters; i++)
{
int checkSum = 0;
for(k = 0; k < numberOfSymbols; k++)
{
if(brackets[k][i] > 0)
{
checkSum++;
switch(tr->secondaryStructureModel)
{
case SEC_16:
case SEC_16_A:
case SEC_16_B:
case SEC_16_C:
case SEC_16_D:
case SEC_16_E:
case SEC_16_F:
case SEC_16_I:
case SEC_16_J:
case SEC_16_K:
tr->extendedDataVector[i+1] = SECONDARY_DATA;
break;
case SEC_6_A:
case SEC_6_B:
case SEC_6_C:
case SEC_6_D:
case SEC_6_E:
tr->extendedDataVector[i+1] = SECONDARY_DATA_6;
break;
case SEC_7_A:
case SEC_7_B:
case SEC_7_C:
case SEC_7_D:
case SEC_7_E:
case SEC_7_F:
tr->extendedDataVector[i+1] = SECONDARY_DATA_7;
break;
default:
assert(0);
}
numSecondaryColumns++;
}
}
assert(checkSum <= 1);
}
assert(numSecondaryColumns % 2 == 0);
/*printf("Number of secondary columns: %d merged columns: %d\n", numSecondaryColumns, numSecondaryColumns / 2);*/
tr->numberOfSecondaryColumns = numSecondaryColumns;
if(numSecondaryColumns > 0)
{
int model = tr->NumberOfModels;
int countPairs;
pInfo *partBuffer = (pInfo*)rax_malloc(sizeof(pInfo) * tr->NumberOfModels);
for(i = 1; i <= sites; i++)
{
for(k = 0; k < numberOfSymbols; k++)
{
if(brackets[k][i-1] > 0)
tr->model[i] = model;
}
}
/* now make a copy of partition data */
for(i = 0; i < tr->NumberOfModels; i++)
{
partBuffer[i].partitionName = (char*)rax_malloc((strlen(tr->extendedPartitionData[i].partitionName) + 1) * sizeof(char));
strcpy(partBuffer[i].partitionName, tr->extendedPartitionData[i].partitionName);
strcpy(partBuffer[i].proteinSubstitutionFileName, tr->extendedPartitionData[i].proteinSubstitutionFileName);
strcpy(partBuffer[i].ascFileName, tr->extendedPartitionData[i].ascFileName);
partBuffer[i].dataType = tr->extendedPartitionData[i].dataType;
partBuffer[i].protModels = tr->extendedPartitionData[i].protModels;
partBuffer[i].usePredefinedProtFreqs = tr->extendedPartitionData[i].usePredefinedProtFreqs;
partBuffer[i].optimizeBaseFrequencies = tr->extendedPartitionData[i].optimizeBaseFrequencies;
}
for(i = 0; i < tr->NumberOfModels; i++)
rax_free(tr->extendedPartitionData[i].partitionName);
rax_free(tr->extendedPartitionData);
tr->extendedPartitionData = (pInfo*)rax_malloc(sizeof(pInfo) * (tr->NumberOfModels + 1));
for(i = 0; i < tr->NumberOfModels; i++)
{
tr->extendedPartitionData[i].partitionName = (char*)rax_malloc((strlen(partBuffer[i].partitionName) + 1) * sizeof(char));
strcpy(tr->extendedPartitionData[i].partitionName, partBuffer[i].partitionName);
strcpy(tr->extendedPartitionData[i].proteinSubstitutionFileName, partBuffer[i].proteinSubstitutionFileName);
strcpy(tr->extendedPartitionData[i].ascFileName, partBuffer[i].ascFileName);
tr->extendedPartitionData[i].dataType = partBuffer[i].dataType;
tr->extendedPartitionData[i].protModels= partBuffer[i].protModels;
tr->extendedPartitionData[i].usePredefinedProtFreqs= partBuffer[i].usePredefinedProtFreqs;
tr->extendedPartitionData[i].optimizeBaseFrequencies = partBuffer[i].optimizeBaseFrequencies;
rax_free(partBuffer[i].partitionName);
}
rax_free(partBuffer);
tr->extendedPartitionData[i].partitionName = (char*)rax_malloc(64 * sizeof(char));
switch(tr->secondaryStructureModel)
{
case SEC_16:
case SEC_16_A:
case SEC_16_B:
case SEC_16_C:
case SEC_16_D:
case SEC_16_E:
case SEC_16_F:
case SEC_16_I:
case SEC_16_J:
case SEC_16_K:
strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 16 STATE MODEL");
tr->extendedPartitionData[i].dataType = SECONDARY_DATA;
break;
case SEC_6_A:
case SEC_6_B:
case SEC_6_C:
case SEC_6_D:
case SEC_6_E:
strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 6 STATE MODEL");
tr->extendedPartitionData[i].dataType = SECONDARY_DATA_6;
break;
case SEC_7_A:
case SEC_7_B:
case SEC_7_C:
case SEC_7_D:
case SEC_7_E:
case SEC_7_F:
strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 7 STATE MODEL");
tr->extendedPartitionData[i].dataType = SECONDARY_DATA_7;
break;
default:
assert(0);
}
tr->extendedPartitionData[i].protModels= -1;
tr->extendedPartitionData[i].usePredefinedProtFreqs = FALSE;
tr->NumberOfModels++;
if(adef->perGeneBranchLengths)
{
if(tr->NumberOfModels > NUM_BRANCHES)
{
printf("You are trying to use %d partitioned models for an individual per-gene branch length estimate.\n", tr->NumberOfModels);
printf("Currently only %d are allowed to improve efficiency.\n", NUM_BRANCHES);
printf("Note that the number of partitions has automatically been incremented by one to accommodate secondary structure models\n");
printf("\n");
printf("In order to change this please replace the line \"#define NUM_BRANCHES %d\" in file \"axml.h\" \n", NUM_BRANCHES);
printf("by \"#define NUM_BRANCHES %d\" and then re-compile RAxML.\n", tr->NumberOfModels);
exit(-1);
}
else
{
tr->multiBranch = 1;
tr->numBranches = tr->NumberOfModels;
}
}
assert(countCharacters == sites);
tr->secondaryStructurePairs = (int*)rax_malloc(sizeof(int) * countCharacters);
for(i = 0; i < countCharacters; i++)
tr->secondaryStructurePairs[i] = -1;
/*
for(i = 0; i < countCharacters; i++)
printf("%d", brackets[i]);
printf("\n");
*/
countPairs = 0;
for(k = 0; k < numberOfSymbols; k++)
{
i = 0;
while(i < countCharacters)
{
int
j = i,
bracket = -1,
openBracket,
closeBracket;
while(j < countCharacters && ((bracket = brackets[k][j]) == 0))
{
i++;
j++;
}
assert(bracket >= 0);
if(j == countCharacters)
{
assert(bracket == 0);
break;
}
openBracket = j;
j++;
while(bracket != brackets[k][j] && j < countCharacters)
j++;
assert(j < countCharacters);
closeBracket = j;
assert(closeBracket < countCharacters && openBracket < countCharacters);
assert(brackets[k][closeBracket] > 0 && brackets[k][openBracket] > 0);
/*printf("%d %d %d\n", openBracket, closeBracket, bracket);*/
brackets[k][closeBracket] = 0;
brackets[k][openBracket] = 0;
countPairs++;
tr->secondaryStructurePairs[closeBracket] = openBracket;
tr->secondaryStructurePairs[openBracket] = closeBracket;
}
assert(i == countCharacters);
}
assert(countPairs == numSecondaryColumns / 2);
/*for(i = 0; i < countCharacters; i++)
printf("%d ", tr->secondaryStructurePairs[i]);
printf("\n");*/
adef->useMultipleModel = TRUE;
}
for(k = 0; k < numberOfSymbols; k++)
rax_free(brackets[k]);
rax_free(brackets);
rax_free(characters);
fclose(f);
}
}