https://github.com/stamatak/standard-RAxML
Tip revision: 69a7edcba961f95f732e07ce64e7e5b1c7ae548b authored by Alexis Stamatakis on 04 October 2023, 07:33:55 UTC
Merge pull request #71 from martin-g/linux-arm64-support-2
Merge pull request #71 from martin-g/linux-arm64-support-2
Tip revision: 69a7edc
newviewGenericSpecial.c
/* RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees
* Copyright August 2006 by Alexandros Stamatakis
*
* Partially derived from
* fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
*
* and
*
* Programs of the PHYLIP package by Joe Felsenstein.
* This program is free software; you may redistribute it and/or modify its
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
*
* For any other enquiries send an Email to Alexandros Stamatakis
* Alexandros.Stamatakis@epfl.ch
*
* When publishing work that is based on the results from RAxML-VI-HPC please cite:
*
* Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models".
* Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
*/
#ifndef WIN32
#include <unistd.h>
#endif
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdint.h>
#include <limits.h>
#include "axml.h"
#ifdef __SIM_SSE3
#include <stdint.h>
#ifdef __x86_64__
#include <xmmintrin.h>
#include <pmmintrin.h>
#elif __aarch64__
#include "sse2neon.h"
#endif
const union __attribute__ ((aligned (BYTE_ALIGNMENT)))
{
uint64_t i[2];
__m128d m;
} absMask = {{0x7fffffffffffffffULL , 0x7fffffffffffffffULL }};
#endif
#ifdef _USE_PTHREADS
#include <pthread.h>
extern volatile int NumberOfThreads;
extern pthread_mutex_t mutex;
#endif
extern const unsigned int mask32[32];
boolean isGap(unsigned int *x, int pos)
{
return (x[pos / 32] & mask32[pos % 32]);
}
boolean noGap(unsigned int *x, int pos)
{
return (!(x[pos / 32] & mask32[pos % 32]));
}
void makeP_Flex(double z1, double z2, double *rptr, double *EI, double *EIGN, int numberOfCategories, double *left, double *right, const int numStates)
{
int
i,
j,
k;
const int
rates = numStates - 1,
statesSquare = numStates * numStates;
double
lz1[64],
lz2[64],
d1[64],
d2[64];
assert(numStates <= 64);
for(i = 0; i < rates; i++)
{
lz1[i] = EIGN[i] * z1;
lz2[i] = EIGN[i] * z2;
}
for(i = 0; i < numberOfCategories; i++)
{
for(j = 0; j < rates; j++)
{
d1[j] = EXP (rptr[i] * lz1[j]);
d2[j] = EXP (rptr[i] * lz2[j]);
}
for(j = 0; j < numStates; j++)
{
left[statesSquare * i + numStates * j] = 1.0;
right[statesSquare * i + numStates * j] = 1.0;
for(k = 1; k < numStates; k++)
{
left[statesSquare * i + numStates * j + k] = d1[k-1] * EI[rates * j + (k-1)];
right[statesSquare * i + numStates * j + k] = d2[k-1] * EI[rates * j + (k-1)];
}
}
}
}
#ifdef _HET
static void makeP_FlexHet(double z1, double z2, double *rptr, double *EI_1, double *EIGN_1,
double *EI_2, double *EIGN_2, int numberOfCategories, double *left, double *right, const int numStates)
{
int
i,
j,
k;
const int
rates = numStates - 1,
statesSquare = numStates * numStates;
double
lz1[64],
lz2[64],
d1[64],
d2[64];
assert(numStates <= 64);
for(i = 0; i < rates; i++)
{
lz1[i] = EIGN_1[i] * z1;
lz2[i] = EIGN_2[i] * z2;
}
for(i = 0; i < numberOfCategories; i++)
{
for(j = 0; j < rates; j++)
{
d1[j] = EXP (rptr[i] * lz1[j]);
d2[j] = EXP (rptr[i] * lz2[j]);
}
for(j = 0; j < numStates; j++)
{
left[statesSquare * i + numStates * j] = 1.0;
right[statesSquare * i + numStates * j] = 1.0;
for(k = 1; k < numStates; k++)
{
left[statesSquare * i + numStates * j + k] = d1[k-1] * EI_1[rates * j + (k-1)];
right[statesSquare * i + numStates * j + k] = d2[k-1] * EI_2[rates * j + (k-1)];
}
}
}
}
#endif
void makeP_FlexLG4(double z1, double z2, double *rptr, double *EI[4], double *EIGN[4], int numberOfCategories, double *left, double *right, const int numStates)
{
int
i,
j,
k;
const int
rates = numStates - 1,
statesSquare = numStates * numStates;
double
d1[64],
d2[64];
assert(numStates <= 64);
for(i = 0; i < numberOfCategories; i++)
{
for(j = 0; j < rates; j++)
{
d1[j] = EXP (rptr[i] * EIGN[i][j] * z1);
d2[j] = EXP (rptr[i] * EIGN[i][j] * z2);
}
for(j = 0; j < numStates; j++)
{
left[statesSquare * i + numStates * j] = 1.0;
right[statesSquare * i + numStates * j] = 1.0;
for(k = 1; k < numStates; k++)
{
left[statesSquare * i + numStates * j + k] = d1[k-1] * EI[i][rates * j + (k-1)];
right[statesSquare * i + numStates * j + k] = d2[k-1] * EI[i][rates * j + (k-1)];
}
}
}
}
static void newviewFlexCat(int tipCase, double *extEV,
int *cptr,
double *x1, double *x2, double *x3, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, const int numStates)
{
double
*le, *ri, *v, *vl, *vr,
ump_x1, ump_x2, x1px2;
int i, l, j, scale, addScale = 0;
const int
statesSquare = numStates * numStates;
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * statesSquare];
ri = &right[cptr[i] * statesSquare];
vl = &(tipVector[numStates * tipX1[i]]);
vr = &(tipVector[numStates * tipX2[i]]);
v = &x3[numStates * i];
for(l = 0; l < numStates; l++)
v[l] = 0.0;
for(l = 0; l < numStates; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < numStates; j++)
{
ump_x1 += vl[j] * le[l * numStates + j];
ump_x2 += vr[j] * ri[l * numStates + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[l * numStates + j];
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * statesSquare];
ri = &right[cptr[i] * statesSquare];
vl = &(tipVector[numStates * tipX1[i]]);
vr = &x2[numStates * i];
v = &x3[numStates * i];
for(l = 0; l < numStates; l++)
v[l] = 0.0;
for(l = 0; l < numStates; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < numStates; j++)
{
ump_x1 += vl[j] * le[l * numStates + j];
ump_x2 += vr[j] * ri[l * numStates + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[l * numStates + j];
}
scale = 1;
for(l = 0; scale && (l < numStates); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < numStates; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for(i = 0; i < n; i++)
{
le = &left[cptr[i] * statesSquare];
ri = &right[cptr[i] * statesSquare];
vl = &x1[numStates * i];
vr = &x2[numStates * i];
v = &x3[numStates * i];
for(l = 0; l < numStates; l++)
v[l] = 0.0;
for(l = 0; l < numStates; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < numStates; j++)
{
ump_x1 += vl[j] * le[l * numStates + j];
ump_x2 += vr[j] * ri[l * numStates + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[l * numStates + j];
}
scale = 1;
for(l = 0; scale && (l < numStates); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < numStates; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewFlexGamma(int tipCase,
double *x1, double *x2, double *x3, double *extEV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
const int numStates)
{
double *v;
double x1px2;
int i, j, l, k, scale, addScale = 0;
double *vl, *vr, al, ar;
const int
statesSquare = numStates * numStates,
gammaStates = 4 * numStates;
switch(tipCase)
{
case TIP_TIP:
{
for(i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[numStates * tipX1[i]]);
vr = &(tipVector[numStates * tipX2[i]]);
v = &(x3[gammaStates * i + numStates * k]);
for(l = 0; l < numStates; l++)
v[l] = 0;
for(l = 0; l < numStates; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < numStates; j++)
{
al += vl[j] * left[k * statesSquare + l * numStates + j];
ar += vr[j] * right[k * statesSquare + l * numStates + j];
}
x1px2 = al * ar;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[numStates * l + j];
}
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[numStates * tipX1[i]]);
vr = &(x2[gammaStates * i + numStates * k]);
v = &(x3[gammaStates * i + numStates * k]);
for(l = 0; l < numStates; l++)
v[l] = 0;
for(l = 0; l < numStates; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < numStates; j++)
{
al += vl[j] * left[k * statesSquare + l * numStates + j];
ar += vr[j] * right[k * statesSquare + l * numStates + j];
}
x1px2 = al * ar;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[numStates * l + j];
}
}
v = &x3[gammaStates * i];
scale = 1;
for(l = 0; scale && (l < gammaStates); l++)
scale = (ABS(v[l]) < minlikelihood);
if(scale)
{
for(l = 0; l < gammaStates; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(x1[gammaStates * i + numStates * k]);
vr = &(x2[gammaStates * i + numStates * k]);
v = &(x3[gammaStates * i + numStates * k]);
for(l = 0; l < numStates; l++)
v[l] = 0;
for(l = 0; l < numStates; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < numStates; j++)
{
al += vl[j] * left[k * statesSquare + l * numStates + j];
ar += vr[j] * right[k * statesSquare + l * numStates + j];
}
x1px2 = al * ar;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[numStates * l + j];
}
}
v = &(x3[gammaStates * i]);
scale = 1;
for(l = 0; scale && (l < gammaStates); l++)
scale = ((ABS(v[l]) < minlikelihood));
if (scale)
{
for(l = 0; l < gammaStates; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewAscCat(int tipCase,
double *x1, double *x2, double *x3, double *extEV, double *tipVector,
int *ex3,
const int n, double *left, double *right,
const int numStates, int dataType, const int qNumber, const int rNumber, int maxtips)
{
double
*le, *ri, *v, *vl, *vr,
ump_x1, ump_x2, x1px2;
int
i, l, j, scale;
switch(tipCase)
{
case TIP_TIP:
{
unsigned char
tip1[32],
tip2[32];
ascertainmentBiasSequence(tip1, numStates, dataType, qNumber);
ascertainmentBiasSequence(tip2, numStates, dataType, rNumber);
assert(qNumber <= maxtips && rNumber <= maxtips);
for (i = 0; i < n; i++)
{
le = &left[0];
ri = &right[0];
vl = &(tipVector[numStates * tip1[i]]);
vr = &(tipVector[numStates * tip2[i]]);
v = &x3[numStates * i];
for(l = 0; l < numStates; l++)
v[l] = 0.0;
for(l = 0; l < numStates; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < numStates; j++)
{
ump_x1 += vl[j] * le[l * numStates + j];
ump_x2 += vr[j] * ri[l * numStates + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[l * numStates + j];
}
}
}
break;
case TIP_INNER:
{
unsigned char
tip[32];
if(rNumber <= maxtips)
ascertainmentBiasSequence(tip, numStates, dataType, rNumber);
else
ascertainmentBiasSequence(tip, numStates, dataType, qNumber);
for (i = 0; i < n; i++)
{
le = &left[0];
ri = &right[0];
vl = &(tipVector[numStates * tip[i]]);
vr = &x2[numStates * i];
v = &x3[numStates * i];
for(l = 0; l < numStates; l++)
v[l] = 0.0;
for(l = 0; l < numStates; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < numStates; j++)
{
ump_x1 += vl[j] * le[l * numStates + j];
ump_x2 += vr[j] * ri[l * numStates + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[l * numStates + j];
}
scale = 1;
for(l = 0; scale && (l < numStates); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < numStates; l++)
v[l] *= twotothe256;
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for(i = 0; i < n; i++)
{
le = &left[0];
ri = &right[0];
vl = &x1[numStates * i];
vr = &x2[numStates * i];
v = &x3[numStates * i];
for(l = 0; l < numStates; l++)
v[l] = 0.0;
for(l = 0; l < numStates; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < numStates; j++)
{
ump_x1 += vl[j] * le[l * numStates + j];
ump_x2 += vr[j] * ri[l * numStates + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[l * numStates + j];
}
scale = 1;
for(l = 0; scale && (l < numStates); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < numStates; l++)
v[l] *= twotothe256;
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
}
static void newviewAscGamma(int tipCase,
double *x1, double *x2, double *x3, double *extEV, double *tipVector,
int *ex3,
const int n, double *left, double *right,
const int numStates, int dataType, const int qNumber, const int rNumber, int maxtips)
{
int
i, j, l, k, scale;
const int
statesSquare = numStates * numStates,
gammaStates = 4 * numStates;
double
*vl, *vr, al, ar, *v, x1px2;
switch(tipCase)
{
case TIP_TIP:
{
unsigned char
tip1[32],
tip2[32];
ascertainmentBiasSequence(tip1, numStates, dataType, qNumber);
ascertainmentBiasSequence(tip2, numStates, dataType, rNumber);
assert(qNumber <= maxtips && rNumber <= maxtips);
for(i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[numStates * tip1[i]]);
vr = &(tipVector[numStates * tip2[i]]);
v = &(x3[gammaStates * i + numStates * k]);
for(l = 0; l < numStates; l++)
v[l] = 0;
for(l = 0; l < numStates; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < numStates; j++)
{
al += vl[j] * left[k * statesSquare + l * numStates + j];
ar += vr[j] * right[k * statesSquare + l * numStates + j];
}
x1px2 = al * ar;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[numStates * l + j];
}
}
}
}
break;
case TIP_INNER:
{
unsigned char
tip[32];
if(rNumber <= maxtips)
ascertainmentBiasSequence(tip, numStates, dataType, rNumber);
else
ascertainmentBiasSequence(tip, numStates, dataType, qNumber);
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[numStates * tip[i]]);
vr = &(x2[gammaStates * i + numStates * k]);
v = &(x3[gammaStates * i + numStates * k]);
for(l = 0; l < numStates; l++)
v[l] = 0;
for(l = 0; l < numStates; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < numStates; j++)
{
al += vl[j] * left[k * statesSquare + l * numStates + j];
ar += vr[j] * right[k * statesSquare + l * numStates + j];
}
x1px2 = al * ar;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[numStates * l + j];
}
}
v = &x3[gammaStates * i];
scale = 1;
for(l = 0; scale && (l < gammaStates); l++)
scale = (ABS(v[l]) < minlikelihood);
if(scale)
{
for(l = 0; l < gammaStates; l++)
v[l] *= twotothe256;
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(x1[gammaStates * i + numStates * k]);
vr = &(x2[gammaStates * i + numStates * k]);
v = &(x3[gammaStates * i + numStates * k]);
for(l = 0; l < numStates; l++)
v[l] = 0;
for(l = 0; l < numStates; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < numStates; j++)
{
al += vl[j] * left[k * statesSquare + l * numStates + j];
ar += vr[j] * right[k * statesSquare + l * numStates + j];
}
x1px2 = al * ar;
for(j = 0; j < numStates; j++)
v[j] += x1px2 * extEV[numStates * l + j];
}
}
v = &(x3[gammaStates * i]);
scale = 1;
for(l = 0; scale && (l < gammaStates); l++)
scale = ((ABS(v[l]) < minlikelihood));
if(scale)
{
for(l = 0; l < gammaStates; l++)
v[l] *= twotothe256;
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
}
static void makeP(double z1, double z2, double *rptr, double *EI, double *EIGN, int numberOfCategories, double *left, double *right, int data, boolean saveMem, int maxCat)
{
int i, j, k;
switch(data)
{
case BINARY_DATA:
{
double d1, d2;
for(i = 0; i < numberOfCategories; i++)
{
d1 = EXP(rptr[i] * EIGN[0] * z1);
d2 = EXP(rptr[i] * EIGN[0] * z2);
for(j = 0; j < 2; j++)
{
left[i * 4 + j * 2] = 1.0;
right[i * 4 + j * 2] = 1.0;
left[i * 4 + j * 2 + 1] = d1 * EI[j];
right[i * 4 + j * 2 + 1] = d2 * EI[j];
}
}
}
break;
case DNA_DATA:
{
#ifdef __SIM_SSE3
double
d1[4] __attribute__ ((aligned (BYTE_ALIGNMENT))),
d2[4] __attribute__ ((aligned (BYTE_ALIGNMENT))),
ez1[3],
ez2[3],
EI_16[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
for(j = 0; j < 4; j++)
{
EI_16[j * 4] = 1.0;
for(k = 0; k < 3; k++)
EI_16[j * 4 + k + 1] = EI[3 * j + k];
}
for(j = 0; j < 3; j++)
{
ez1[j] = EIGN[j] * z1;
ez2[j] = EIGN[j] * z2;
}
for(i = 0; i < numberOfCategories; i++)
{
__m128d
d1_0, d1_1,
d2_0, d2_1;
d1[0] = 1.0;
d2[0] = 1.0;
for(j = 0; j < 3; j++)
{
d1[j+1] = EXP(rptr[i] * ez1[j]);
d2[j+1] = EXP(rptr[i] * ez2[j]);
}
d1_0 = _mm_load_pd(&d1[0]);
d1_1 = _mm_load_pd(&d1[2]);
d2_0 = _mm_load_pd(&d2[0]);
d2_1 = _mm_load_pd(&d2[2]);
for(j = 0; j < 4; j++)
{
double *ll = &left[i * 16 + j * 4];
double *rr = &right[i * 16 + j * 4];
__m128d eev = _mm_load_pd(&EI_16[4 * j]);
_mm_store_pd(&ll[0], _mm_mul_pd(d1_0, eev));
_mm_store_pd(&rr[0], _mm_mul_pd(d2_0, eev));
eev = _mm_load_pd(&EI_16[4 * j + 2]);
_mm_store_pd(&ll[2], _mm_mul_pd(d1_1, eev));
_mm_store_pd(&rr[2], _mm_mul_pd(d2_1, eev));
}
}
if(saveMem)
{
i = maxCat;
{
__m128d
d1_0, d1_1,
d2_0, d2_1;
d1[0] = 1.0;
d2[0] = 1.0;
for(j = 0; j < 3; j++)
{
d1[j+1] = EXP(ez1[j]);
d2[j+1] = EXP(ez2[j]);
}
d1_0 = _mm_load_pd(&d1[0]);
d1_1 = _mm_load_pd(&d1[2]);
d2_0 = _mm_load_pd(&d2[0]);
d2_1 = _mm_load_pd(&d2[2]);
for(j = 0; j < 4; j++)
{
double *ll = &left[i * 16 + j * 4];
double *rr = &right[i * 16 + j * 4];
__m128d eev = _mm_load_pd(&EI_16[4 * j]);
_mm_store_pd(&ll[0], _mm_mul_pd(d1_0, eev));
_mm_store_pd(&rr[0], _mm_mul_pd(d2_0, eev));
eev = _mm_load_pd(&EI_16[4 * j + 2]);
_mm_store_pd(&ll[2], _mm_mul_pd(d1_1, eev));
_mm_store_pd(&rr[2], _mm_mul_pd(d2_1, eev));
}
}
}
#else
double d1[3], d2[3];
for(i = 0; i < numberOfCategories; i++)
{
for(j = 0; j < 3; j++)
{
d1[j] = EXP(rptr[i] * EIGN[j] * z1);
d2[j] = EXP(rptr[i] * EIGN[j] * z2);
}
for(j = 0; j < 4; j++)
{
left[i * 16 + j * 4] = 1.0;
right[i * 16 + j * 4] = 1.0;
for(k = 0; k < 3; k++)
{
left[i * 16 + j * 4 + k + 1] = d1[k] * EI[3 * j + k];
right[i * 16 + j * 4 + k + 1] = d2[k] * EI[3 * j + k];
}
}
}
#endif
}
break;
case SECONDARY_DATA:
{
double lz1[15], lz2[15], d1[15], d2[15];
for(i = 0; i < 15; i++)
{
lz1[i] = EIGN[i] * z1;
lz2[i] = EIGN[i] * z2;
}
for(i = 0; i < numberOfCategories; i++)
{
for(j = 0; j < 15; j++)
{
d1[j] = EXP (rptr[i] * lz1[j]);
d2[j] = EXP (rptr[i] * lz2[j]);
}
for(j = 0; j < 16; j++)
{
left[256 * i + 16 * j] = 1.0;
right[256 * i + 16 * j] = 1.0;
for(k = 1; k < 16; k++)
{
left[256 * i + 16 * j + k] = d1[k-1] * EI[15 * j + (k-1)];
right[256 * i + 16 * j + k] = d2[k-1] * EI[15 * j + (k-1)];
}
}
}
}
break;
case SECONDARY_DATA_6:
{
double lz1[5], lz2[5], d1[5], d2[5];
for(i = 0; i < 5; i++)
{
lz1[i] = EIGN[i] * z1;
lz2[i] = EIGN[i] * z2;
}
for(i = 0; i < numberOfCategories; i++)
{
for(j = 0; j < 5; j++)
{
d1[j] = EXP (rptr[i] * lz1[j]);
d2[j] = EXP (rptr[i] * lz2[j]);
}
for(j = 0; j < 6; j++)
{
left[36 * i + 6 * j] = 1.0;
right[36 * i + 6 * j] = 1.0;
for(k = 1; k < 6; k++)
{
left[36 * i + 6 * j + k] = d1[k-1] * EI[5 * j + (k-1)];
right[36 * i + 6 * j + k] = d2[k-1] * EI[5 * j + (k-1)];
}
}
}
}
break;
case SECONDARY_DATA_7:
{
double lz1[6], lz2[6], d1[6], d2[6];
for(i = 0; i < 6; i++)
{
lz1[i] = EIGN[i] * z1;
lz2[i] = EIGN[i] * z2;
}
for(i = 0; i < numberOfCategories; i++)
{
for(j = 0; j < 6; j++)
{
d1[j] = EXP (rptr[i] * lz1[j]);
d2[j] = EXP (rptr[i] * lz2[j]);
}
for(j = 0; j < 7; j++)
{
left[49 * i + 7 * j] = 1.0;
right[49 * i + 7 * j] = 1.0;
for(k = 1; k < 7; k++)
{
left[49 * i + 7 * j + k] = d1[k-1] * EI[6 * j + (k-1)];
right[49 * i + 7 * j + k] = d2[k-1] * EI[6 * j + (k-1)];
}
}
}
}
break;
case AA_DATA:
{
double lz1[19], lz2[19], d1[19], d2[19];
for(i = 0; i < 19; i++)
{
lz1[i] = EIGN[i] * z1;
lz2[i] = EIGN[i] * z2;
}
for(i = 0; i < numberOfCategories; i++)
{
for(j = 0; j < 19; j++)
{
d1[j] = EXP (rptr[i] * lz1[j]);
d2[j] = EXP (rptr[i] * lz2[j]);
}
for(j = 0; j < 20; j++)
{
left[400 * i + 20 * j] = 1.0;
right[400 * i + 20 * j] = 1.0;
for(k = 1; k < 20; k++)
{
left[400 * i + 20 * j + k] = d1[k-1] * EI[19 * j + (k-1)];
right[400 * i + 20 * j + k] = d2[k-1] * EI[19 * j + (k-1)];
}
}
}
if(saveMem)
{
i = maxCat;
for(j = 0; j < 19; j++)
{
d1[j] = EXP (lz1[j]);
d2[j] = EXP (lz2[j]);
}
for(j = 0; j < 20; j++)
{
left[400 * i + 20 * j] = 1.0;
right[400 * i + 20 * j] = 1.0;
for(k = 1; k < 20; k++)
{
left[400 * i + 20 * j + k] = d1[k-1] * EI[19 * j + (k-1)];
right[400 * i + 20 * j + k] = d2[k-1] * EI[19 * j + (k-1)];
}
}
}
}
break;
default:
assert(0);
}
}
#ifndef __SIM_SSE3
static void newviewGTRCAT_BINARY( int tipCase, double *EV, int *cptr,
double *x1_start, double *x2_start, double *x3_start, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double
*le,
*ri,
*x1, *x2, *x3;
double
ump_x1, ump_x2, x1px2[2];
int i, j, k, scale, addScale = 0;
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
x1 = &(tipVector[2 * tipX1[i]]);
x2 = &(tipVector[2 * tipX2[i]]);
x3 = &x3_start[2 * i];
le = &left[cptr[i] * 4];
ri = &right[cptr[i] * 4];
for(j = 0; j < 2; j++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(k = 0; k < 2; k++)
{
ump_x1 += x1[k] * le[j * 2 + k];
ump_x2 += x2[k] * ri[j * 2 + k];
}
x1px2[j] = ump_x1 * ump_x2;
}
for(j = 0; j < 2; j++)
x3[j] = 0.0;
for(j = 0; j < 2; j++)
for(k = 0; k < 2; k++)
x3[k] += x1px2[j] * EV[j * 2 + k];
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
x1 = &(tipVector[2 * tipX1[i]]);
x2 = &x2_start[2 * i];
x3 = &x3_start[2 * i];
le = &left[cptr[i] * 4];
ri = &right[cptr[i] * 4];
for(j = 0; j < 2; j++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(k = 0; k < 2; k++)
{
ump_x1 += x1[k] * le[j * 2 + k];
ump_x2 += x2[k] * ri[j * 2 + k];
}
x1px2[j] = ump_x1 * ump_x2;
}
for(j = 0; j < 2; j++)
x3[j] = 0.0;
for(j = 0; j < 2; j++)
for(k = 0; k < 2; k++)
x3[k] += x1px2[j] * EV[2 * j + k];
scale = 1;
for(j = 0; j < 2 && scale; j++)
scale = (x3[j] < minlikelihood && x3[j] > minusminlikelihood);
if(scale)
{
for(j = 0; j < 2; j++)
x3[j] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
x1 = &x1_start[2 * i];
x2 = &x2_start[2 * i];
x3 = &x3_start[2 * i];
le = &left[cptr[i] * 4];
ri = &right[cptr[i] * 4];
for(j = 0; j < 2; j++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(k = 0; k < 2; k++)
{
ump_x1 += x1[k] * le[j * 2 + k];
ump_x2 += x2[k] * ri[j * 2 + k];
}
x1px2[j] = ump_x1 * ump_x2;
}
for(j = 0; j < 2; j++)
x3[j] = 0.0;
for(j = 0; j < 2; j++)
for(k = 0; k < 2; k++)
x3[k] += x1px2[j] * EV[2 * j + k];
scale = 1;
for(j = 0; j < 2 && scale; j++)
scale = (x3[j] < minlikelihood && x3[j] > minusminlikelihood);
if(scale)
{
for(j = 0; j < 2; j++)
x3[j] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#else
static void newviewGTRCAT_BINARY( int tipCase, double *EV, int *cptr,
double *x1_start, double *x2_start, double *x3_start, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double
*le,
*ri,
*x1, *x2, *x3;
int i, l, scale, addScale = 0;
switch(tipCase)
{
case TIP_TIP:
{
for(i = 0; i < n; i++)
{
x1 = &(tipVector[2 * tipX1[i]]);
x2 = &(tipVector[2 * tipX2[i]]);
x3 = &x3_start[2 * i];
le = &left[cptr[i] * 4];
ri = &right[cptr[i] * 4];
_mm_store_pd(x3, _mm_setzero_pd());
for(l = 0; l < 2; l++)
{
__m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
__m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
__m128d vv = _mm_load_pd(x3);
__m128d EVV = _mm_load_pd(&EV[2 * l]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(x3, vv);
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
x1 = &(tipVector[2 * tipX1[i]]);
x2 = &x2_start[2 * i];
x3 = &x3_start[2 * i];
le = &left[cptr[i] * 4];
ri = &right[cptr[i] * 4];
_mm_store_pd(x3, _mm_setzero_pd());
for(l = 0; l < 2; l++)
{
__m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
__m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
__m128d vv = _mm_load_pd(x3);
__m128d EVV = _mm_load_pd(&EV[2 * l]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(x3, vv);
}
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
__m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
if(scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
__m128d ex3v = _mm_load_pd(x3);
_mm_store_pd(x3, _mm_mul_pd(ex3v,twoto));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
x1 = &x1_start[2 * i];
x2 = &x2_start[2 * i];
x3 = &x3_start[2 * i];
le = &left[cptr[i] * 4];
ri = &right[cptr[i] * 4];
_mm_store_pd(x3, _mm_setzero_pd());
for(l = 0; l < 2; l++)
{
__m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2]));
__m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2]));
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
__m128d vv = _mm_load_pd(x3);
__m128d EVV = _mm_load_pd(&EV[2 * l]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(x3, vv);
}
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
__m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
if(scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
__m128d ex3v = _mm_load_pd(x3);
_mm_store_pd(x3, _mm_mul_pd(ex3v,twoto));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#endif
#ifdef __SIM_SSE3
static void newviewGTRGAMMA_BINARY(int tipCase,
double *x1_start, double *x2_start, double *x3_start,
double *EV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling
)
{
double
*x1, *x2, *x3;
int i, k, l, scale, addScale = 0;
switch(tipCase)
{
case TIP_TIP:
for (i = 0; i < n; i++)
{
x1 = &(tipVector[2 * tipX1[i]]);
x2 = &(tipVector[2 * tipX2[i]]);
for(k = 0; k < 4; k++)
{
x3 = &(x3_start[8 * i + 2 * k]);
_mm_store_pd(x3, _mm_setzero_pd());
for(l = 0; l < 2; l++)
{
__m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
__m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
__m128d vv = _mm_load_pd(x3);
__m128d EVV = _mm_load_pd(&EV[2 * l]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(x3, vv);
}
}
}
break;
case TIP_INNER:
for (i = 0; i < n; i++)
{
x1 = &(tipVector[2 * tipX1[i]]);
for(k = 0; k < 4; k++)
{
x2 = &(x2_start[8 * i + 2 * k]);
x3 = &(x3_start[8 * i + 2 * k]);
_mm_store_pd(x3, _mm_setzero_pd());
for(l = 0; l < 2; l++)
{
__m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
__m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
__m128d vv = _mm_load_pd(x3);
__m128d EVV = _mm_load_pd(&EV[2 * l]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(x3, vv);
}
}
x3 = &(x3_start[8 * i]);
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 8); l += 2)
{
__m128d vv = _mm_load_pd(&x3[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
if(scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 8; l+=2)
{
__m128d ex3v = _mm_load_pd(&x3[l]);
_mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto));
}
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
x1 = &(x1_start[8 * i + 2 * k]);
x2 = &(x2_start[8 * i + 2 * k]);
x3 = &(x3_start[8 * i + 2 * k]);
_mm_store_pd(x3, _mm_setzero_pd());
for(l = 0; l < 2; l++)
{
__m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2]));
__m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2]));
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
__m128d vv = _mm_load_pd(x3);
__m128d EVV = _mm_load_pd(&EV[2 * l]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(x3, vv);
}
}
x3 = &(x3_start[8 * i]);
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 8); l += 2)
{
__m128d vv = _mm_load_pd(&x3[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
if(scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 8; l+=2)
{
__m128d ex3v = _mm_load_pd(&x3[l]);
_mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto));
}
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#else
static void newviewGTRGAMMA_BINARY(int tipCase,
double *x1_start, double *x2_start, double *x3_start,
double *EV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling
)
{
double
*x1, *x2, *x3;
double
ump_x1,
ump_x2,
x1px2[4];
int i, j, k, l, scale, addScale = 0;
/* C-OPT figure out if we are at an inner node who has two tips/leaves
as descendants TIP_TIP, a tip and another inner node as descendant
TIP_INNER, or two inner nodes as descendants INNER_INNER */
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
x1 = &(tipVector[2 * tipX1[i]]);
x2 = &(tipVector[2 * tipX2[i]]);
x3 = &x3_start[i * 8];
for(j = 0; j < 8; j++)
x3[j] = 0.0;
for (j = 0; j < 4; j++)
{
for (k = 0; k < 2; k++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for (l=0; l < 2; l++)
{
ump_x1 += x1[l] * left[ j*4 + k*2 + l];
ump_x2 += x2[l] * right[j*4 + k*2 + l];
}
x1px2[k] = ump_x1 * ump_x2;
}
for(k = 0; k < 2; k++)
for (l = 0; l < 2; l++)
x3[j * 2 + l] += x1px2[k] * EV[2 * k + l];
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
x1 = &(tipVector[2 * tipX1[i]]);
x2 = &x2_start[i * 8];
x3 = &x3_start[i * 8];
for(j = 0; j < 8; j++)
x3[j] = 0.0;
for (j = 0; j < 4; j++)
{
for (k = 0; k < 2; k++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for (l=0; l < 2; l++)
{
ump_x1 += x1[l] * left[ j*4 + k*2 + l];
ump_x2 += x2[j*2 + l] * right[j*4 + k*2 + l];
}
x1px2[k] = ump_x1 * ump_x2;
}
for(k = 0; k < 2; k++)
for (l = 0; l < 2; l++)
x3[j * 2 + l] += x1px2[k] * EV[2 * k + l];
}
scale = 1;
for(l = 0; scale && (l < 8); l++)
scale = (ABS(x3[l]) < minlikelihood);
if(scale)
{
for (l=0; l < 8; l++)
x3[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
/* C-OPT here we don't do any pre-computations
This should be the most compute intensive loop of the three
cases here. If we have one or two tips as descendants
we can take a couple of shortcuts */
for (i = 0; i < n; i++)
{
x1 = &x1_start[i * 8];
x2 = &x2_start[i * 8];
x3 = &x3_start[i * 8];
for(j = 0; j < 8; j++)
x3[j] = 0.0;
for (j = 0; j < 4; j++)
{
for (k = 0; k < 2; k++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for (l=0; l < 2; l++)
{
ump_x1 += x1[j*2 + l] * left[ j*4 + k*2 + l];
ump_x2 += x2[j*2 + l] * right[j*4 + k*2 + l];
}
x1px2[k] = ump_x1 * ump_x2;
}
for(k = 0; k < 2; k++)
for (l = 0; l < 2; l++)
x3[j * 2 + l] += x1px2[k] * EV[2 * k + l];
}
scale = 1;
for(l = 0; scale && (l < 8); l++)
scale = (ABS(x3[l]) < minlikelihood);
if(scale)
{
for (l=0; l<8; l++)
x3[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#endif
#ifndef __SIM_SSE3
static void newviewGTRCAT( int tipCase, double *EV, int *cptr,
double *x1_start, double *x2_start, double *x3_start, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double
*le,
*ri,
*x1, *x2, *x3;
double
ump_x1, ump_x2, x1px2[4];
int i, j, k, scale, addScale = 0;
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
x1 = &(tipVector[4 * tipX1[i]]);
x2 = &(tipVector[4 * tipX2[i]]);
x3 = &x3_start[4 * i];
le = &left[cptr[i] * 16];
ri = &right[cptr[i] * 16];
for(j = 0; j < 4; j++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(k = 0; k < 4; k++)
{
ump_x1 += x1[k] * le[j * 4 + k];
ump_x2 += x2[k] * ri[j * 4 + k];
}
x1px2[j] = ump_x1 * ump_x2;
}
for(j = 0; j < 4; j++)
x3[j] = 0.0;
for(j = 0; j < 4; j++)
for(k = 0; k < 4; k++)
x3[k] += x1px2[j] * EV[j * 4 + k];
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
x1 = &(tipVector[4 * tipX1[i]]);
x2 = &x2_start[4 * i];
x3 = &x3_start[4 * i];
le = &left[cptr[i] * 16];
ri = &right[cptr[i] * 16];
for(j = 0; j < 4; j++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(k = 0; k < 4; k++)
{
ump_x1 += x1[k] * le[j * 4 + k];
ump_x2 += x2[k] * ri[j * 4 + k];
}
x1px2[j] = ump_x1 * ump_x2;
}
for(j = 0; j < 4; j++)
x3[j] = 0.0;
for(j = 0; j < 4; j++)
for(k = 0; k < 4; k++)
x3[k] += x1px2[j] * EV[4 * j + k];
scale = 1;
for(j = 0; j < 4 && scale; j++)
scale = (x3[j] < minlikelihood && x3[j] > minusminlikelihood);
if(scale)
{
for(j = 0; j < 4; j++)
x3[j] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
x1 = &x1_start[4 * i];
x2 = &x2_start[4 * i];
x3 = &x3_start[4 * i];
le = &left[cptr[i] * 16];
ri = &right[cptr[i] * 16];
for(j = 0; j < 4; j++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(k = 0; k < 4; k++)
{
ump_x1 += x1[k] * le[j * 4 + k];
ump_x2 += x2[k] * ri[j * 4 + k];
}
x1px2[j] = ump_x1 * ump_x2;
}
for(j = 0; j < 4; j++)
x3[j] = 0.0;
for(j = 0; j < 4; j++)
for(k = 0; k < 4; k++)
x3[k] += x1px2[j] * EV[4 * j + k];
scale = 1;
for(j = 0; j < 4 && scale; j++)
scale = (x3[j] < minlikelihood && x3[j] > minusminlikelihood);
if(scale)
{
for(j = 0; j < 4; j++)
x3[j] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#else
static void newviewGTRCAT_SAVE( int tipCase, double *EV, int *cptr,
double *x1_start, double *x2_start, double *x3_start, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
{
double
*le,
*ri,
*x1,
*x2,
*x3,
*x1_ptr = x1_start,
*x2_ptr = x2_start,
*x3_ptr = x3_start,
EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
int
i,
j,
scale,
scaleGap = 0,
addScale = 0;
__m128d
minlikelihood_sse = _mm_set1_pd( minlikelihood ),
sc = _mm_set1_pd(twotothe256),
EVV[8];
for(i = 0; i < 4; i++)
for (j=0; j < 4; j++)
EV_t[4 * j + i] = EV[4 * i + j];
for(i = 0; i < 8; i++)
EVV[i] = _mm_load_pd(&EV_t[i * 2]);
{
x1 = x1_gapColumn;
x2 = x2_gapColumn;
x3 = x3_gapColumn;
le = &left[maxCats * 16];
ri = &right[maxCats * 16];
__m128d x1_0 = _mm_load_pd( &x1[0] );
__m128d x1_2 = _mm_load_pd( &x1[2] );
__m128d left_k0_0 = _mm_load_pd( &le[0] );
__m128d left_k0_2 = _mm_load_pd( &le[2] );
__m128d left_k1_0 = _mm_load_pd( &le[4] );
__m128d left_k1_2 = _mm_load_pd( &le[6] );
__m128d left_k2_0 = _mm_load_pd( &le[8] );
__m128d left_k2_2 = _mm_load_pd( &le[10] );
__m128d left_k3_0 = _mm_load_pd( &le[12] );
__m128d left_k3_2 = _mm_load_pd( &le[14] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
__m128d x2_0 = _mm_load_pd( &x2[0] );
__m128d x2_2 = _mm_load_pd( &x2[2] );
__m128d right_k0_0 = _mm_load_pd( &ri[0] );
__m128d right_k0_2 = _mm_load_pd( &ri[2] );
__m128d right_k1_0 = _mm_load_pd( &ri[4] );
__m128d right_k1_2 = _mm_load_pd( &ri[6] );
__m128d right_k2_0 = _mm_load_pd( &ri[8] );
__m128d right_k2_2 = _mm_load_pd( &ri[10] );
__m128d right_k3_0 = _mm_load_pd( &ri[12] );
__m128d right_k3_2 = _mm_load_pd( &ri[14] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
if(tipCase != TIP_TIP)
{
scale = 1;
__m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
else
{
v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
if(scale)
{
_mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
_mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));
scaleGap = TRUE;
}
else
{
_mm_store_pd(x3, EV_t_l0_k0);
_mm_store_pd(&x3[2], EV_t_l2_k0);
}
}
else
{
_mm_store_pd(x3, EV_t_l0_k0);
_mm_store_pd(&x3[2], EV_t_l2_k0);
}
}
switch(tipCase)
{
case TIP_TIP:
for (i = 0; i < n; i++)
{
if(noGap(x3_gap, i))
{
x1 = &(tipVector[4 * tipX1[i]]);
x2 = &(tipVector[4 * tipX2[i]]);
x3 = x3_ptr;
if(isGap(x1_gap, i))
le = &left[maxCats * 16];
else
le = &left[cptr[i] * 16];
if(isGap(x2_gap, i))
ri = &right[maxCats * 16];
else
ri = &right[cptr[i] * 16];
__m128d x1_0 = _mm_load_pd( &x1[0] );
__m128d x1_2 = _mm_load_pd( &x1[2] );
__m128d left_k0_0 = _mm_load_pd( &le[0] );
__m128d left_k0_2 = _mm_load_pd( &le[2] );
__m128d left_k1_0 = _mm_load_pd( &le[4] );
__m128d left_k1_2 = _mm_load_pd( &le[6] );
__m128d left_k2_0 = _mm_load_pd( &le[8] );
__m128d left_k2_2 = _mm_load_pd( &le[10] );
__m128d left_k3_0 = _mm_load_pd( &le[12] );
__m128d left_k3_2 = _mm_load_pd( &le[14] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
__m128d x2_0 = _mm_load_pd( &x2[0] );
__m128d x2_2 = _mm_load_pd( &x2[2] );
__m128d right_k0_0 = _mm_load_pd( &ri[0] );
__m128d right_k0_2 = _mm_load_pd( &ri[2] );
__m128d right_k1_0 = _mm_load_pd( &ri[4] );
__m128d right_k1_2 = _mm_load_pd( &ri[6] );
__m128d right_k2_0 = _mm_load_pd( &ri[8] );
__m128d right_k2_2 = _mm_load_pd( &ri[10] );
__m128d right_k3_0 = _mm_load_pd( &ri[12] );
__m128d right_k3_2 = _mm_load_pd( &ri[14] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
_mm_store_pd(x3, EV_t_l0_k0);
_mm_store_pd(&x3[2], EV_t_l2_k0);
x3_ptr += 4;
}
}
break;
case TIP_INNER:
for (i = 0; i < n; i++)
{
if(isGap(x3_gap, i))
{
if(scaleGap)
{
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
else
{
x1 = &(tipVector[4 * tipX1[i]]);
x2 = x2_ptr;
x3 = x3_ptr;
if(isGap(x1_gap, i))
le = &left[maxCats * 16];
else
le = &left[cptr[i] * 16];
if(isGap(x2_gap, i))
{
ri = &right[maxCats * 16];
x2 = x2_gapColumn;
}
else
{
ri = &right[cptr[i] * 16];
x2 = x2_ptr;
x2_ptr += 4;
}
__m128d x1_0 = _mm_load_pd( &x1[0] );
__m128d x1_2 = _mm_load_pd( &x1[2] );
__m128d left_k0_0 = _mm_load_pd( &le[0] );
__m128d left_k0_2 = _mm_load_pd( &le[2] );
__m128d left_k1_0 = _mm_load_pd( &le[4] );
__m128d left_k1_2 = _mm_load_pd( &le[6] );
__m128d left_k2_0 = _mm_load_pd( &le[8] );
__m128d left_k2_2 = _mm_load_pd( &le[10] );
__m128d left_k3_0 = _mm_load_pd( &le[12] );
__m128d left_k3_2 = _mm_load_pd( &le[14] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
__m128d x2_0 = _mm_load_pd( &x2[0] );
__m128d x2_2 = _mm_load_pd( &x2[2] );
__m128d right_k0_0 = _mm_load_pd( &ri[0] );
__m128d right_k0_2 = _mm_load_pd( &ri[2] );
__m128d right_k1_0 = _mm_load_pd( &ri[4] );
__m128d right_k1_2 = _mm_load_pd( &ri[6] );
__m128d right_k2_0 = _mm_load_pd( &ri[8] );
__m128d right_k2_2 = _mm_load_pd( &ri[10] );
__m128d right_k3_0 = _mm_load_pd( &ri[12] );
__m128d right_k3_2 = _mm_load_pd( &ri[14] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
scale = 1;
__m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
else
{
v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
if(scale)
{
_mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
_mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
else
{
_mm_store_pd(x3, EV_t_l0_k0);
_mm_store_pd(&x3[2], EV_t_l2_k0);
}
x3_ptr += 4;
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
if(isGap(x3_gap, i))
{
if(scaleGap)
{
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
else
{
x3 = x3_ptr;
if(isGap(x1_gap, i))
{
x1 = x1_gapColumn;
le = &left[maxCats * 16];
}
else
{
le = &left[cptr[i] * 16];
x1 = x1_ptr;
x1_ptr += 4;
}
if(isGap(x2_gap, i))
{
x2 = x2_gapColumn;
ri = &right[maxCats * 16];
}
else
{
ri = &right[cptr[i] * 16];
x2 = x2_ptr;
x2_ptr += 4;
}
__m128d x1_0 = _mm_load_pd( &x1[0] );
__m128d x1_2 = _mm_load_pd( &x1[2] );
__m128d left_k0_0 = _mm_load_pd( &le[0] );
__m128d left_k0_2 = _mm_load_pd( &le[2] );
__m128d left_k1_0 = _mm_load_pd( &le[4] );
__m128d left_k1_2 = _mm_load_pd( &le[6] );
__m128d left_k2_0 = _mm_load_pd( &le[8] );
__m128d left_k2_2 = _mm_load_pd( &le[10] );
__m128d left_k3_0 = _mm_load_pd( &le[12] );
__m128d left_k3_2 = _mm_load_pd( &le[14] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
__m128d x2_0 = _mm_load_pd( &x2[0] );
__m128d x2_2 = _mm_load_pd( &x2[2] );
__m128d right_k0_0 = _mm_load_pd( &ri[0] );
__m128d right_k0_2 = _mm_load_pd( &ri[2] );
__m128d right_k1_0 = _mm_load_pd( &ri[4] );
__m128d right_k1_2 = _mm_load_pd( &ri[6] );
__m128d right_k2_0 = _mm_load_pd( &ri[8] );
__m128d right_k2_2 = _mm_load_pd( &ri[10] );
__m128d right_k3_0 = _mm_load_pd( &ri[12] );
__m128d right_k3_2 = _mm_load_pd( &ri[14] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
scale = 1;
__m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
else
{
v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
if(scale)
{
EV_t_l0_k0 = _mm_mul_pd(EV_t_l0_k0, sc);
EV_t_l2_k0 = _mm_mul_pd(EV_t_l2_k0, sc);
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
_mm_store_pd(&x3[0], EV_t_l0_k0);
_mm_store_pd(&x3[2], EV_t_l2_k0);
x3_ptr += 4;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRCAT( int tipCase, double *EV, int *cptr,
double *x1_start, double *x2_start, double *x3_start, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double
*le,
*ri,
*x1,
*x2,
*x3,
EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
int
i,
j,
scale,
addScale = 0;
__m128d
minlikelihood_sse = _mm_set1_pd( minlikelihood ),
sc = _mm_set1_pd(twotothe256),
EVV[8];
for(i = 0; i < 4; i++)
for (j=0; j < 4; j++)
EV_t[4 * j + i] = EV[4 * i + j];
for(i = 0; i < 8; i++)
EVV[i] = _mm_load_pd(&EV_t[i * 2]);
switch(tipCase)
{
case TIP_TIP:
for (i = 0; i < n; i++)
{
x1 = &(tipVector[4 * tipX1[i]]);
x2 = &(tipVector[4 * tipX2[i]]);
x3 = &x3_start[i * 4];
le = &left[cptr[i] * 16];
ri = &right[cptr[i] * 16];
__m128d x1_0 = _mm_load_pd( &x1[0] );
__m128d x1_2 = _mm_load_pd( &x1[2] );
__m128d left_k0_0 = _mm_load_pd( &le[0] );
__m128d left_k0_2 = _mm_load_pd( &le[2] );
__m128d left_k1_0 = _mm_load_pd( &le[4] );
__m128d left_k1_2 = _mm_load_pd( &le[6] );
__m128d left_k2_0 = _mm_load_pd( &le[8] );
__m128d left_k2_2 = _mm_load_pd( &le[10] );
__m128d left_k3_0 = _mm_load_pd( &le[12] );
__m128d left_k3_2 = _mm_load_pd( &le[14] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
__m128d x2_0 = _mm_load_pd( &x2[0] );
__m128d x2_2 = _mm_load_pd( &x2[2] );
__m128d right_k0_0 = _mm_load_pd( &ri[0] );
__m128d right_k0_2 = _mm_load_pd( &ri[2] );
__m128d right_k1_0 = _mm_load_pd( &ri[4] );
__m128d right_k1_2 = _mm_load_pd( &ri[6] );
__m128d right_k2_0 = _mm_load_pd( &ri[8] );
__m128d right_k2_2 = _mm_load_pd( &ri[10] );
__m128d right_k3_0 = _mm_load_pd( &ri[12] );
__m128d right_k3_2 = _mm_load_pd( &ri[14] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
_mm_store_pd(x3, EV_t_l0_k0);
_mm_store_pd(&x3[2], EV_t_l2_k0);
}
break;
case TIP_INNER:
for (i = 0; i < n; i++)
{
x1 = &(tipVector[4 * tipX1[i]]);
x2 = &x2_start[4 * i];
x3 = &x3_start[4 * i];
le = &left[cptr[i] * 16];
ri = &right[cptr[i] * 16];
__m128d x1_0 = _mm_load_pd( &x1[0] );
__m128d x1_2 = _mm_load_pd( &x1[2] );
__m128d left_k0_0 = _mm_load_pd( &le[0] );
__m128d left_k0_2 = _mm_load_pd( &le[2] );
__m128d left_k1_0 = _mm_load_pd( &le[4] );
__m128d left_k1_2 = _mm_load_pd( &le[6] );
__m128d left_k2_0 = _mm_load_pd( &le[8] );
__m128d left_k2_2 = _mm_load_pd( &le[10] );
__m128d left_k3_0 = _mm_load_pd( &le[12] );
__m128d left_k3_2 = _mm_load_pd( &le[14] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
__m128d x2_0 = _mm_load_pd( &x2[0] );
__m128d x2_2 = _mm_load_pd( &x2[2] );
__m128d right_k0_0 = _mm_load_pd( &ri[0] );
__m128d right_k0_2 = _mm_load_pd( &ri[2] );
__m128d right_k1_0 = _mm_load_pd( &ri[4] );
__m128d right_k1_2 = _mm_load_pd( &ri[6] );
__m128d right_k2_0 = _mm_load_pd( &ri[8] );
__m128d right_k2_2 = _mm_load_pd( &ri[10] );
__m128d right_k3_0 = _mm_load_pd( &ri[12] );
__m128d right_k3_2 = _mm_load_pd( &ri[14] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
scale = 1;
__m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
else
{
v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
if(scale)
{
_mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
_mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
else
{
_mm_store_pd(x3, EV_t_l0_k0);
_mm_store_pd(&x3[2], EV_t_l2_k0);
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
x1 = &x1_start[4 * i];
x2 = &x2_start[4 * i];
x3 = &x3_start[4 * i];
le = &left[cptr[i] * 16];
ri = &right[cptr[i] * 16];
__m128d x1_0 = _mm_load_pd( &x1[0] );
__m128d x1_2 = _mm_load_pd( &x1[2] );
__m128d left_k0_0 = _mm_load_pd( &le[0] );
__m128d left_k0_2 = _mm_load_pd( &le[2] );
__m128d left_k1_0 = _mm_load_pd( &le[4] );
__m128d left_k1_2 = _mm_load_pd( &le[6] );
__m128d left_k2_0 = _mm_load_pd( &le[8] );
__m128d left_k2_2 = _mm_load_pd( &le[10] );
__m128d left_k3_0 = _mm_load_pd( &le[12] );
__m128d left_k3_2 = _mm_load_pd( &le[14] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
__m128d x2_0 = _mm_load_pd( &x2[0] );
__m128d x2_2 = _mm_load_pd( &x2[2] );
__m128d right_k0_0 = _mm_load_pd( &ri[0] );
__m128d right_k0_2 = _mm_load_pd( &ri[2] );
__m128d right_k1_0 = _mm_load_pd( &ri[4] );
__m128d right_k1_2 = _mm_load_pd( &ri[6] );
__m128d right_k2_0 = _mm_load_pd( &ri[8] );
__m128d right_k2_2 = _mm_load_pd( &ri[10] );
__m128d right_k3_0 = _mm_load_pd( &ri[12] );
__m128d right_k3_2 = _mm_load_pd( &ri[14] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
scale = 1;
__m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
else
{
v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
if(scale)
{
_mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
_mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
else
{
_mm_store_pd(x3, EV_t_l0_k0);
_mm_store_pd(&x3[2], EV_t_l2_k0);
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#endif
#ifdef __SIM_SSE3
static void newviewGTRGAMMA(int tipCase,
double *x1_start, double *x2_start, double *x3_start,
double *EV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
const unsigned int x1_presenceMap, const unsigned int x2_presenceMap)
{
int
i,
j,
k,
l,
addScale = 0;
double
*x1,
*x2,
*x3,
max,
maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))),
EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
__m128d
values[8],
EVV[8];
for(k = 0; k < 4; k++)
for (l=0; l < 4; l++)
EV_t[4 * l + k] = EV[4 * k + l];
for(k = 0; k < 8; k++)
EVV[k] = _mm_load_pd(&EV_t[k * 2]);
switch(tipCase)
{
case TIP_TIP:
{
double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
for (i = 1; i < 16; i++)
{
__m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
__m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));
if(mask32[i] & x1_presenceMap)
{
for (j = 0; j < 4; j++)
for (k = 0; k < 4; k++)
{
__m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
__m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
__m128d acc = _mm_setzero_pd();
acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
acc = _mm_hadd_pd(acc, acc);
_mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
}
}
if(mask32[i] & x2_presenceMap)
{
for (j = 0; j < 4; j++)
for (k = 0; k < 4; k++)
{
__m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
__m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
__m128d acc = _mm_setzero_pd();
acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
acc = _mm_hadd_pd(acc, acc);
_mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
}
}
}
for (i = 0; i < n; i++)
{
x3 = &x3_start[i * 16];
uX1 = &umpX1[16 * tipX1[i]];
uX2 = &umpX2[16 * tipX2[i]];
for (j = 0; j < 4; j++)
{
__m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
__m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
__m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
__m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
/* multiply left * right */
__m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
__m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
/* multiply with EV matrix (!?) */
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
_mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
_mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
}
}
}
break;
case TIP_INNER:
{
double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
for (i = 1; i < 16; i++)
{
if(mask32[i] & x1_presenceMap)
{
__m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
__m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));
for (j = 0; j < 4; j++)
for (k = 0; k < 4; k++)
{
__m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
__m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
__m128d acc = _mm_setzero_pd();
acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
acc = _mm_hadd_pd(acc, acc);
_mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
}
}
}
for (i = 0; i < n; i++)
{
__m128d maxv =_mm_setzero_pd();
x2 = &x2_start[i * 16];
x3 = &x3_start[i * 16];
uX1 = &umpX1[16 * tipX1[i]];
for (j = 0; j < 4; j++)
{
/* multiply/add right side */
double *x2_p = &x2[j*4];
double *right_k0_p = &right[j*16];
double *right_k1_p = &right[j*16 + 1*4];
double *right_k2_p = &right[j*16 + 2*4];
double *right_k3_p = &right[j*16 + 3*4];
__m128d x2_0 = _mm_load_pd( &x2_p[0] );
__m128d x2_2 = _mm_load_pd( &x2_p[2] );
__m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
__m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
__m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
__m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
__m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
__m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
__m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
__m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
{
/* load left side from tip vector */
__m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
__m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
/* multiply left * right */
__m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
/* multiply with EV matrix (!?) */
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
values[j * 2] = EV_t_l0_k0;
values[j * 2 + 1] = EV_t_l2_k0;
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
}
}
_mm_store_pd(maxima, maxv);
max = MAX(maxima[0], maxima[1]);
if(max < minlikelihood)
{
__m128d sv = _mm_set1_pd(twotothe256);
_mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));
_mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
_mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
_mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
_mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));
_mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
_mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
_mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
else
{
_mm_store_pd(&x3[0], values[0]);
_mm_store_pd(&x3[2], values[1]);
_mm_store_pd(&x3[4], values[2]);
_mm_store_pd(&x3[6], values[3]);
_mm_store_pd(&x3[8], values[4]);
_mm_store_pd(&x3[10], values[5]);
_mm_store_pd(&x3[12], values[6]);
_mm_store_pd(&x3[14], values[7]);
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
__m128d maxv =_mm_setzero_pd();
x1 = &x1_start[i * 16];
x2 = &x2_start[i * 16];
x3 = &x3_start[i * 16];
for (j = 0; j < 4; j++)
{
double *x1_p = &x1[j*4];
double *left_k0_p = &left[j*16];
double *left_k1_p = &left[j*16 + 1*4];
double *left_k2_p = &left[j*16 + 2*4];
double *left_k3_p = &left[j*16 + 3*4];
__m128d x1_0 = _mm_load_pd( &x1_p[0] );
__m128d x1_2 = _mm_load_pd( &x1_p[2] );
__m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
__m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
__m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
__m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
__m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
__m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
__m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
__m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
/* multiply/add right side */
double *x2_p = &x2[j*4];
double *right_k0_p = &right[j*16];
double *right_k1_p = &right[j*16 + 1*4];
double *right_k2_p = &right[j*16 + 2*4];
double *right_k3_p = &right[j*16 + 3*4];
__m128d x2_0 = _mm_load_pd( &x2_p[0] );
__m128d x2_2 = _mm_load_pd( &x2_p[2] );
__m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
__m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
__m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
__m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
__m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
__m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
__m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
__m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
/* multiply left * right */
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
/* multiply with EV matrix (!?) */
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
values[j * 2] = EV_t_l0_k0;
values[j * 2 + 1] = EV_t_l2_k0;
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
}
_mm_store_pd(maxima, maxv);
max = MAX(maxima[0], maxima[1]);
if(max < minlikelihood)
{
__m128d sv = _mm_set1_pd(twotothe256);
_mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));
_mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
_mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
_mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
_mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));
_mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
_mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
_mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
else
{
_mm_store_pd(&x3[0], values[0]);
_mm_store_pd(&x3[2], values[1]);
_mm_store_pd(&x3[4], values[2]);
_mm_store_pd(&x3[6], values[3]);
_mm_store_pd(&x3[8], values[4]);
_mm_store_pd(&x3[10], values[5]);
_mm_store_pd(&x3[12], values[6]);
_mm_store_pd(&x3[14], values[7]);
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
double *x1_start, double *x2_start, double *x3_start,
double *EV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const unsigned int x1_presenceMap, const unsigned int x2_presenceMap)
{
int
i,
j,
k,
l,
addScale = 0,
scaleGap = 0;
double
*x1,
*x2,
*x3,
*x1_ptr = x1_start,
*x2_ptr = x2_start,
max,
maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))),
EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
__m128d
values[8],
EVV[8];
for(k = 0; k < 4; k++)
for (l=0; l < 4; l++)
EV_t[4 * l + k] = EV[4 * k + l];
for(k = 0; k < 8; k++)
EVV[k] = _mm_load_pd(&EV_t[k * 2]);
switch(tipCase)
{
case TIP_TIP:
{
double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
for (i = 1; i < 16; i++)
{
__m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
__m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));
if((mask32[i] & x1_presenceMap) || i == 15)
{
for (j = 0; j < 4; j++)
for (k = 0; k < 4; k++)
{
__m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
__m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
__m128d acc = _mm_setzero_pd();
acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
acc = _mm_hadd_pd(acc, acc);
_mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
}
}
if((mask32[i] & x2_presenceMap) || i == 15)
{
for (j = 0; j < 4; j++)
for (k = 0; k < 4; k++)
{
__m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
__m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
__m128d acc = _mm_setzero_pd();
acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
acc = _mm_hadd_pd(acc, acc);
_mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
}
}
}
uX1 = &umpX1[240];
uX2 = &umpX2[240];
for (j = 0; j < 4; j++)
{
__m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
__m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
__m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
__m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
__m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
__m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
_mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 );
_mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 );
}
x3 = x3_start;
for (i = 0; i < n; i++)
{
if(!(x3_gap[i / 32] & mask32[i % 32]))
{
uX1 = &umpX1[16 * tipX1[i]];
uX2 = &umpX2[16 * tipX2[i]];
for (j = 0; j < 4; j++)
{
__m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
__m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
__m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
__m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
/* multiply left * right */
__m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
__m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
/* multiply with EV matrix (!?) */
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
_mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
_mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
}
x3 += 16;
}
}
}
break;
case TIP_INNER:
{
double
*uX1,
umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
for (i = 1; i < 16; i++)
{
if((mask32[i] & x1_presenceMap) || i == 15)
{
__m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
__m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));
for (j = 0; j < 4; j++)
for (k = 0; k < 4; k++)
{
__m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
__m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
__m128d acc = _mm_setzero_pd();
acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
acc = _mm_hadd_pd(acc, acc);
_mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
}
}
}
{
__m128d maxv =_mm_setzero_pd();
scaleGap = 0;
x2 = x2_gapColumn;
x3 = x3_gapColumn;
uX1 = &umpX1[240];
for (j = 0; j < 4; j++)
{
double *x2_p = &x2[j*4];
double *right_k0_p = &right[j*16];
double *right_k1_p = &right[j*16 + 1*4];
double *right_k2_p = &right[j*16 + 2*4];
double *right_k3_p = &right[j*16 + 3*4];
__m128d x2_0 = _mm_load_pd( &x2_p[0] );
__m128d x2_2 = _mm_load_pd( &x2_p[2] );
__m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
__m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
__m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
__m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
__m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
__m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
__m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
__m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
__m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
__m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
values[j * 2] = EV_t_l0_k0;
values[j * 2 + 1] = EV_t_l2_k0;
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
}
_mm_store_pd(maxima, maxv);
max = MAX(maxima[0], maxima[1]);
if(max < minlikelihood)
{
scaleGap = 1;
__m128d sv = _mm_set1_pd(twotothe256);
_mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));
_mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
_mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
_mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
_mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));
_mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
_mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
_mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));
}
else
{
_mm_store_pd(&x3[0], values[0]);
_mm_store_pd(&x3[2], values[1]);
_mm_store_pd(&x3[4], values[2]);
_mm_store_pd(&x3[6], values[3]);
_mm_store_pd(&x3[8], values[4]);
_mm_store_pd(&x3[10], values[5]);
_mm_store_pd(&x3[12], values[6]);
_mm_store_pd(&x3[14], values[7]);
}
}
x3 = x3_start;
for (i = 0; i < n; i++)
{
if((x3_gap[i / 32] & mask32[i % 32]))
{
if(scaleGap)
{
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
else
{
__m128d maxv =_mm_setzero_pd();
if(x2_gap[i / 32] & mask32[i % 32])
x2 = x2_gapColumn;
else
{
x2 = x2_ptr;
x2_ptr += 16;
}
uX1 = &umpX1[16 * tipX1[i]];
for (j = 0; j < 4; j++)
{
double *x2_p = &x2[j*4];
double *right_k0_p = &right[j*16];
double *right_k1_p = &right[j*16 + 1*4];
double *right_k2_p = &right[j*16 + 2*4];
double *right_k3_p = &right[j*16 + 3*4];
__m128d x2_0 = _mm_load_pd( &x2_p[0] );
__m128d x2_2 = _mm_load_pd( &x2_p[2] );
__m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
__m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
__m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
__m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
__m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
__m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
__m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
__m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
{
//
// load left side from tip vector
//
__m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
__m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
//
// multiply left * right
//
__m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
//
// multiply with EV matrix (!?)
//
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
values[j * 2] = EV_t_l0_k0;
values[j * 2 + 1] = EV_t_l2_k0;
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
}
}
_mm_store_pd(maxima, maxv);
max = MAX(maxima[0], maxima[1]);
if(max < minlikelihood)
{
__m128d sv = _mm_set1_pd(twotothe256);
_mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));
_mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
_mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
_mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
_mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));
_mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
_mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
_mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
else
{
_mm_store_pd(&x3[0], values[0]);
_mm_store_pd(&x3[2], values[1]);
_mm_store_pd(&x3[4], values[2]);
_mm_store_pd(&x3[6], values[3]);
_mm_store_pd(&x3[8], values[4]);
_mm_store_pd(&x3[10], values[5]);
_mm_store_pd(&x3[12], values[6]);
_mm_store_pd(&x3[14], values[7]);
}
x3 += 16;
}
}
}
break;
case INNER_INNER:
{
__m128d maxv =_mm_setzero_pd();
scaleGap = 0;
x1 = x1_gapColumn;
x2 = x2_gapColumn;
x3 = x3_gapColumn;
for (j = 0; j < 4; j++)
{
double *x1_p = &x1[j*4];
double *left_k0_p = &left[j*16];
double *left_k1_p = &left[j*16 + 1*4];
double *left_k2_p = &left[j*16 + 2*4];
double *left_k3_p = &left[j*16 + 3*4];
__m128d x1_0 = _mm_load_pd( &x1_p[0] );
__m128d x1_2 = _mm_load_pd( &x1_p[2] );
__m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
__m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
__m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
__m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
__m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
__m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
__m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
__m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
double *x2_p = &x2[j*4];
double *right_k0_p = &right[j*16];
double *right_k1_p = &right[j*16 + 1*4];
double *right_k2_p = &right[j*16 + 2*4];
double *right_k3_p = &right[j*16 + 3*4];
__m128d x2_0 = _mm_load_pd( &x2_p[0] );
__m128d x2_2 = _mm_load_pd( &x2_p[2] );
__m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
__m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
__m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
__m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
__m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
__m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
__m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
__m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
values[j * 2] = EV_t_l0_k0;
values[j * 2 + 1] = EV_t_l2_k0;
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
}
_mm_store_pd(maxima, maxv);
max = MAX(maxima[0], maxima[1]);
if(max < minlikelihood)
{
__m128d sv = _mm_set1_pd(twotothe256);
scaleGap = 1;
_mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));
_mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
_mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
_mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
_mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));
_mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
_mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
_mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));
}
else
{
_mm_store_pd(&x3[0], values[0]);
_mm_store_pd(&x3[2], values[1]);
_mm_store_pd(&x3[4], values[2]);
_mm_store_pd(&x3[6], values[3]);
_mm_store_pd(&x3[8], values[4]);
_mm_store_pd(&x3[10], values[5]);
_mm_store_pd(&x3[12], values[6]);
_mm_store_pd(&x3[14], values[7]);
}
}
x3 = x3_start;
for (i = 0; i < n; i++)
{
if(x3_gap[i / 32] & mask32[i % 32])
{
if(scaleGap)
{
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
else
{
__m128d maxv =_mm_setzero_pd();
if(x1_gap[i / 32] & mask32[i % 32])
x1 = x1_gapColumn;
else
{
x1 = x1_ptr;
x1_ptr += 16;
}
if(x2_gap[i / 32] & mask32[i % 32])
x2 = x2_gapColumn;
else
{
x2 = x2_ptr;
x2_ptr += 16;
}
for (j = 0; j < 4; j++)
{
double *x1_p = &x1[j*4];
double *left_k0_p = &left[j*16];
double *left_k1_p = &left[j*16 + 1*4];
double *left_k2_p = &left[j*16 + 2*4];
double *left_k3_p = &left[j*16 + 3*4];
__m128d x1_0 = _mm_load_pd( &x1_p[0] );
__m128d x1_2 = _mm_load_pd( &x1_p[2] );
__m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
__m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
__m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
__m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
__m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
__m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
__m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
__m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
//
// multiply/add right side
//
double *x2_p = &x2[j*4];
double *right_k0_p = &right[j*16];
double *right_k1_p = &right[j*16 + 1*4];
double *right_k2_p = &right[j*16 + 2*4];
double *right_k3_p = &right[j*16 + 3*4];
__m128d x2_0 = _mm_load_pd( &x2_p[0] );
__m128d x2_2 = _mm_load_pd( &x2_p[2] );
__m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
__m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
__m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
__m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
__m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
__m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
__m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
__m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
//
// multiply left * right
//
__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
//
// multiply with EV matrix (!?)
//
__m128d EV_t_l0_k0 = EVV[0];
__m128d EV_t_l0_k2 = EVV[1];
__m128d EV_t_l1_k0 = EVV[2];
__m128d EV_t_l1_k2 = EVV[3];
__m128d EV_t_l2_k0 = EVV[4];
__m128d EV_t_l2_k2 = EVV[5];
__m128d EV_t_l3_k0 = EVV[6];
__m128d EV_t_l3_k2 = EVV[7];
EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
values[j * 2] = EV_t_l0_k0;
values[j * 2 + 1] = EV_t_l2_k0;
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
}
_mm_store_pd(maxima, maxv);
max = MAX(maxima[0], maxima[1]);
if(max < minlikelihood)
{
__m128d sv = _mm_set1_pd(twotothe256);
_mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));
_mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
_mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
_mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
_mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));
_mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
_mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
_mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
else
{
_mm_store_pd(&x3[0], values[0]);
_mm_store_pd(&x3[2], values[1]);
_mm_store_pd(&x3[4], values[2]);
_mm_store_pd(&x3[6], values[3]);
_mm_store_pd(&x3[8], values[4]);
_mm_store_pd(&x3[10], values[5]);
_mm_store_pd(&x3[12], values[6]);
_mm_store_pd(&x3[14], values[7]);
}
x3 += 16;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#else
static void newviewGTRGAMMA(int tipCase,
double *x1_start, double *x2_start, double *x3_start,
double *EV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
const unsigned int x1_presenceMap, const unsigned int x2_presenceMap
)
{
int
i,
j,
k,
l,
scale,
addScale = 0;
double
*x1,
*x2,
*x3,
buf,
ump_x1,
ump_x2,
x1px2[4];
switch(tipCase)
{
case TIP_TIP:
{
double
*uX1,
umpX1[256],
*uX2,
umpX2[256];
for(i = 1; i < 16; i++)
{
x1 = &(tipVector[i * 4]);
if(mask32[i] & x1_presenceMap)
{
for(j=0; j<4; j++)
for(k=0; k<4; k++)
{
umpX1[i*16 + j*4 + k] = 0.0;
for (l=0; l < 4; l++)
umpX1[i*16 + j*4 + k] += x1[l] * left[j*16 + k*4 + l];
}
}
if(mask32[i] & x2_presenceMap)
{
for(j=0; j<4; j++)
for(k=0; k<4; k++)
{
umpX2[i*16 + j*4 + k] = 0.0;
for (l=0; l < 4; l++)
umpX2[i*16 + j*4 + k] += x1[l] * right[j*16 + k*4 + l];
}
}
}
for (i = 0; i < n; i++)
{
x3 = &x3_start[i * 16];
uX1 = &umpX1[16 * tipX1[i]];
uX2 = &umpX2[16 * tipX2[i]];
for(j = 0; j < 16; j++)
x3[j] = 0.0;
for (j = 0; j < 4; j++)
for (k = 0; k < 4; k++)
{
buf = uX1[j*4 + k] * uX2[j*4 + k];
for (l=0; l<4; l++)
x3[j * 4 + l] += buf * EV[4 * k + l];
}
}
}
break;
case TIP_INNER:
{
double *uX1, umpX1[256];
for (i = 1; i < 16; i++)
{
if(mask32[i] & x1_presenceMap)
{
x1 = &(tipVector[i*4]);
for (j = 0; j < 4; j++)
for (k = 0; k < 4; k++)
{
umpX1[i*16 + j*4 + k] = 0.0;
for (l=0; l < 4; l++)
umpX1[i*16 + j*4 + k] += x1[l] * left[j*16 + k*4 + l];
}
}
}
for (i = 0; i < n; i++)
{
x2 = &x2_start[i * 16];
x3 = &x3_start[i * 16];
uX1 = &umpX1[16 * tipX1[i]];
for(j = 0; j < 16; j++)
x3[j] = 0.0;
for (j = 0; j < 4; j++)
{
for (k = 0; k < 4; k++)
{
ump_x2 = 0.0;
for (l=0; l<4; l++)
ump_x2 += x2[j*4 + l] * right[j* 16 + k*4 + l];
x1px2[k] = uX1[j * 4 + k] * ump_x2;
}
for(k = 0; k < 4; k++)
for (l=0; l<4; l++)
x3[j * 4 + l] += x1px2[k] * EV[4 * k + l];
}
scale = 1;
for(l = 0; scale && (l < 16); l++)
scale = (ABS(x3[l]) < minlikelihood);
if(scale)
{
for (l=0; l<16; l++)
x3[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
x1 = &x1_start[i * 16];
x2 = &x2_start[i * 16];
x3 = &x3_start[i * 16];
for(j = 0; j < 16; j++)
x3[j] = 0.0;
for (j = 0; j < 4; j++)
{
for (k = 0; k < 4; k++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for (l=0; l<4; l++)
{
ump_x1 += x1[j*4 + l] * left[j*16 + k*4 +l];
ump_x2 += x2[j*4 + l] * right[j*16 + k*4 +l];
}
x1px2[k] = ump_x1 * ump_x2;
}
for(k = 0; k < 4; k++)
for (l=0; l<4; l++)
x3[j * 4 + l] += x1px2[k] * EV[4 * k + l];
}
scale = 1;
for(l = 0; scale && (l < 16); l++)
scale = (ABS(x3[l]) < minlikelihood);
if(scale)
{
for (l=0; l<16; l++)
x3[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#endif
#ifdef __SIM_SSE3
static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
int *cptr,
double *x1, double *x2, double *x3, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
{
double
*le,
*ri,
*v,
*vl,
*vr,
*x1_ptr = x1,
*x2_ptr = x2,
*x3_ptr = x3;
int
i,
l,
j,
scale,
scaleGap = 0,
addScale = 0;
{
vl = x1_gapColumn;
vr = x2_gapColumn;
v = x3_gapColumn;
le = &left[maxCats * 400];
ri = &right[maxCats * 400];
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], _mm_setzero_pd());
for(l = 0; l < 20; l++)
{
__m128d x1v = _mm_setzero_pd();
__m128d x2v = _mm_setzero_pd();
double
*ev = &extEV[l * 20],
*lv = &le[l * 20],
*rv = &ri[l * 20];
for(j = 0; j < 20; j+=2)
{
x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));
x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
}
x1v = _mm_hadd_pd(x1v, x1v);
x2v = _mm_hadd_pd(x2v, x2v);
x1v = _mm_mul_pd(x1v, x2v);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
_mm_store_pd(&v[j], vv);
}
}
if(tipCase != TIP_TIP)
{
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 20); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
if(scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 20; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
scaleGap = TRUE;
}
}
}
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
if(noGap(x3_gap, i))
{
vl = &(tipVector[20 * tipX1[i]]);
vr = &(tipVector[20 * tipX2[i]]);
v = x3_ptr;
if(isGap(x1_gap, i))
le = &left[maxCats * 400];
else
le = &left[cptr[i] * 400];
if(isGap(x2_gap, i))
ri = &right[maxCats * 400];
else
ri = &right[cptr[i] * 400];
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], _mm_setzero_pd());
for(l = 0; l < 20; l++)
{
__m128d x1v = _mm_setzero_pd();
__m128d x2v = _mm_setzero_pd();
double
*ev = &extEV[l * 20],
*lv = &le[l * 20],
*rv = &ri[l * 20];
for(j = 0; j < 20; j+=2)
{
x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));
x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
}
x1v = _mm_hadd_pd(x1v, x1v);
x2v = _mm_hadd_pd(x2v, x2v);
x1v = _mm_mul_pd(x1v, x2v);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
_mm_store_pd(&v[j], vv);
}
}
x3_ptr += 20;
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
if(isGap(x3_gap, i))
{
if(scaleGap)
{
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
else
{
vl = &(tipVector[20 * tipX1[i]]);
vr = x2_ptr;
v = x3_ptr;
if(isGap(x1_gap, i))
le = &left[maxCats * 400];
else
le = &left[cptr[i] * 400];
if(isGap(x2_gap, i))
{
ri = &right[maxCats * 400];
vr = x2_gapColumn;
}
else
{
ri = &right[cptr[i] * 400];
vr = x2_ptr;
x2_ptr += 20;
}
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], _mm_setzero_pd());
for(l = 0; l < 20; l++)
{
__m128d x1v = _mm_setzero_pd();
__m128d x2v = _mm_setzero_pd();
double
*ev = &extEV[l * 20],
*lv = &le[l * 20],
*rv = &ri[l * 20];
for(j = 0; j < 20; j+=2)
{
x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));
x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
}
x1v = _mm_hadd_pd(x1v, x1v);
x2v = _mm_hadd_pd(x2v, x2v);
x1v = _mm_mul_pd(x1v, x2v);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
_mm_store_pd(&v[j], vv);
}
}
{
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 20); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
if(scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 20; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
x3_ptr += 20;
}
}
}
break;
case INNER_INNER:
for(i = 0; i < n; i++)
{
if(isGap(x3_gap, i))
{
if(scaleGap)
{
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
else
{
v = x3_ptr;
if(isGap(x1_gap, i))
{
vl = x1_gapColumn;
le = &left[maxCats * 400];
}
else
{
le = &left[cptr[i] * 400];
vl = x1_ptr;
x1_ptr += 20;
}
if(isGap(x2_gap, i))
{
vr = x2_gapColumn;
ri = &right[maxCats * 400];
}
else
{
ri = &right[cptr[i] * 400];
vr = x2_ptr;
x2_ptr += 20;
}
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], _mm_setzero_pd());
for(l = 0; l < 20; l++)
{
__m128d x1v = _mm_setzero_pd();
__m128d x2v = _mm_setzero_pd();
double
*ev = &extEV[l * 20],
*lv = &le[l * 20],
*rv = &ri[l * 20];
for(j = 0; j < 20; j+=2)
{
x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));
x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
}
x1v = _mm_hadd_pd(x1v, x1v);
x2v = _mm_hadd_pd(x2v, x2v);
x1v = _mm_mul_pd(x1v, x2v);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
_mm_store_pd(&v[j], vv);
}
}
{
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 20); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
if(scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 20; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
x3_ptr += 20;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#endif
static void newviewGTRCATPROT(int tipCase, double *extEV,
int *cptr,
double *x1, double *x2, double *x3, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double
*le, *ri, *v, *vl, *vr;
#ifndef __SIM_SSE3
double
ump_x1, ump_x2, x1px2;
#endif
int i, l, j, scale, addScale = 0;
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * 400];
ri = &right[cptr[i] * 400];
vl = &(tipVector[20 * tipX1[i]]);
vr = &(tipVector[20 * tipX2[i]]);
v = &x3[20 * i];
#ifdef __SIM_SSE3
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], _mm_setzero_pd());
#else
for(l = 0; l < 20; l++)
v[l] = 0.0;
#endif
for(l = 0; l < 20; l++)
{
#ifdef __SIM_SSE3
__m128d x1v = _mm_setzero_pd();
__m128d x2v = _mm_setzero_pd();
double
*ev = &extEV[l * 20],
*lv = &le[l * 20],
*rv = &ri[l * 20];
for(j = 0; j < 20; j+=2)
{
x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));
x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
}
x1v = _mm_hadd_pd(x1v, x1v);
x2v = _mm_hadd_pd(x2v, x2v);
x1v = _mm_mul_pd(x1v, x2v);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
_mm_store_pd(&v[j], vv);
}
#else
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 20; j++)
{
ump_x1 += vl[j] * le[l * 20 + j];
ump_x2 += vr[j] * ri[l * 20 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 20; j++)
v[j] += x1px2 * extEV[l * 20 + j];
#endif
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * 400];
ri = &right[cptr[i] * 400];
vl = &(tipVector[20 * tipX1[i]]);
vr = &x2[20 * i];
v = &x3[20 * i];
#ifdef __SIM_SSE3
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], _mm_setzero_pd());
#else
for(l = 0; l < 20; l++)
v[l] = 0.0;
#endif
for(l = 0; l < 20; l++)
{
#ifdef __SIM_SSE3
__m128d x1v = _mm_setzero_pd();
__m128d x2v = _mm_setzero_pd();
double
*ev = &extEV[l * 20],
*lv = &le[l * 20],
*rv = &ri[l * 20];
for(j = 0; j < 20; j+=2)
{
x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));
x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
}
x1v = _mm_hadd_pd(x1v, x1v);
x2v = _mm_hadd_pd(x2v, x2v);
x1v = _mm_mul_pd(x1v, x2v);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
_mm_store_pd(&v[j], vv);
}
#else
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 20; j++)
{
ump_x1 += vl[j] * le[l * 20 + j];
ump_x2 += vr[j] * ri[l * 20 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 20; j++)
v[j] += x1px2 * extEV[l * 20 + j];
#endif
}
#ifdef __SIM_SSE3
{
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 20); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
#else
scale = 1;
for(l = 0; scale && (l < 20); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
#endif
if(scale)
{
#ifdef __SIM_SSE3
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 20; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
#else
for(l = 0; l < 20; l++)
v[l] *= twotothe256;
#endif
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for(i = 0; i < n; i++)
{
le = &left[cptr[i] * 400];
ri = &right[cptr[i] * 400];
vl = &x1[20 * i];
vr = &x2[20 * i];
v = &x3[20 * i];
#ifdef __SIM_SSE3
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], _mm_setzero_pd());
#else
for(l = 0; l < 20; l++)
v[l] = 0.0;
#endif
for(l = 0; l < 20; l++)
{
#ifdef __SIM_SSE3
__m128d x1v = _mm_setzero_pd();
__m128d x2v = _mm_setzero_pd();
double
*ev = &extEV[l * 20],
*lv = &le[l * 20],
*rv = &ri[l * 20];
for(j = 0; j < 20; j+=2)
{
x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));
x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
}
x1v = _mm_hadd_pd(x1v, x1v);
x2v = _mm_hadd_pd(x2v, x2v);
x1v = _mm_mul_pd(x1v, x2v);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
_mm_store_pd(&v[j], vv);
}
#else
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 20; j++)
{
ump_x1 += vl[j] * le[l * 20 + j];
ump_x2 += vr[j] * ri[l * 20 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 20; j++)
v[j] += x1px2 * extEV[l * 20 + j];
#endif
}
#ifdef __SIM_SSE3
{
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 20); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
#else
scale = 1;
for(l = 0; scale && (l < 20); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
#endif
if(scale)
{
#ifdef __SIM_SSE3
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 20; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
#else
for(l = 0; l < 20; l++)
v[l] *= twotothe256;
#endif
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRCATSECONDARY(int tipCase, double *extEV,
int *cptr,
double *x1, double *x2, double *x3, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double
*le, *ri, *v, *vl, *vr;
double
ump_x1, ump_x2, x1px2;
int i, l, j, scale, addScale = 0;
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * 256];
ri = &right[cptr[i] * 256];
vl = &(tipVector[16 * tipX1[i]]);
vr = &(tipVector[16 * tipX2[i]]);
v = &x3[16 * i];
for(l = 0; l < 16; l++)
v[l] = 0.0;
for(l = 0; l < 16; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 16; j++)
{
ump_x1 += vl[j] * le[l * 16 + j];
ump_x2 += vr[j] * ri[l * 16 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 16; j++)
v[j] += x1px2 * extEV[l * 16 + j];
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * 256];
ri = &right[cptr[i] * 256];
vl = &(tipVector[16 * tipX1[i]]);
vr = &x2[16 * i];
v = &x3[16 * i];
for(l = 0; l < 16; l++)
v[l] = 0.0;
for(l = 0; l < 16; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 16; j++)
{
ump_x1 += vl[j] * le[l * 16 + j];
ump_x2 += vr[j] * ri[l * 16 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 16; j++)
v[j] += x1px2 * extEV[l * 16 + j];
}
scale = 1;
for(l = 0; scale && (l < 16); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < 16; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for(i = 0; i < n; i++)
{
le = &left[cptr[i] * 256];
ri = &right[cptr[i] * 256];
vl = &x1[16 * i];
vr = &x2[16 * i];
v = &x3[16 * i];
for(l = 0; l < 16; l++)
v[l] = 0.0;
for(l = 0; l < 16; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 16; j++)
{
ump_x1 += vl[j] * le[l * 16 + j];
ump_x2 += vr[j] * ri[l * 16 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 16; j++)
v[j] += x1px2 * extEV[l * 16 + j];
}
scale = 1;
for(l = 0; scale && (l < 16); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < 16; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRCATSECONDARY_6(int tipCase, double *extEV,
int *cptr,
double *x1, double *x2, double *x3, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double
*le, *ri, *v, *vl, *vr;
double
ump_x1, ump_x2, x1px2;
int i, l, j, scale, addScale = 0;
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * 36];
ri = &right[cptr[i] * 36];
vl = &(tipVector[6 * tipX1[i]]);
vr = &(tipVector[6 * tipX2[i]]);
v = &x3[6 * i];
for(l = 0; l < 6; l++)
v[l] = 0.0;
for(l = 0; l < 6; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 6; j++)
{
ump_x1 += vl[j] * le[l * 6 + j];
ump_x2 += vr[j] * ri[l * 6 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 6; j++)
v[j] += x1px2 * extEV[l * 6 + j];
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * 36];
ri = &right[cptr[i] * 36];
vl = &(tipVector[6 * tipX1[i]]);
vr = &x2[6 * i];
v = &x3[6 * i];
for(l = 0; l < 6; l++)
v[l] = 0.0;
for(l = 0; l < 6; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 6; j++)
{
ump_x1 += vl[j] * le[l * 6 + j];
ump_x2 += vr[j] * ri[l * 6 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 6; j++)
v[j] += x1px2 * extEV[l * 6 + j];
}
scale = 1;
for(l = 0; scale && (l < 6); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < 6; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for(i = 0; i < n; i++)
{
le = &left[cptr[i] * 36];
ri = &right[cptr[i] * 36];
vl = &x1[6 * i];
vr = &x2[6 * i];
v = &x3[6 * i];
for(l = 0; l < 6; l++)
v[l] = 0.0;
for(l = 0; l < 6; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 6; j++)
{
ump_x1 += vl[j] * le[l * 6 + j];
ump_x2 += vr[j] * ri[l * 6 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 6; j++)
v[j] += x1px2 * extEV[l * 6 + j];
}
scale = 1;
for(l = 0; scale && (l < 6); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < 6; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRCATSECONDARY_7(int tipCase, double *extEV,
int *cptr,
double *x1, double *x2, double *x3, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double
*le, *ri, *v, *vl, *vr;
double
ump_x1, ump_x2, x1px2;
int i, l, j, scale, addScale = 0;
switch(tipCase)
{
case TIP_TIP:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * 49];
ri = &right[cptr[i] * 49];
vl = &(tipVector[7 * tipX1[i]]);
vr = &(tipVector[7 * tipX2[i]]);
v = &x3[7 * i];
for(l = 0; l < 7; l++)
v[l] = 0.0;
for(l = 0; l < 7; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 7; j++)
{
ump_x1 += vl[j] * le[l * 7 + j];
ump_x2 += vr[j] * ri[l * 7 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 7; j++)
v[j] += x1px2 * extEV[l * 7 + j];
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
le = &left[cptr[i] * 49];
ri = &right[cptr[i] * 49];
vl = &(tipVector[7 * tipX1[i]]);
vr = &x2[7 * i];
v = &x3[7 * i];
for(l = 0; l < 7; l++)
v[l] = 0.0;
for(l = 0; l < 7; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 7; j++)
{
ump_x1 += vl[j] * le[l * 7 + j];
ump_x2 += vr[j] * ri[l * 7 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 7; j++)
v[j] += x1px2 * extEV[l * 7 + j];
}
scale = 1;
for(l = 0; scale && (l < 7); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < 7; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for(i = 0; i < n; i++)
{
le = &left[cptr[i] * 49];
ri = &right[cptr[i] * 49];
vl = &x1[7 * i];
vr = &x2[7 * i];
v = &x3[7 * i];
for(l = 0; l < 7; l++)
v[l] = 0.0;
for(l = 0; l < 7; l++)
{
ump_x1 = 0.0;
ump_x2 = 0.0;
for(j = 0; j < 7; j++)
{
ump_x1 += vl[j] * le[l * 7 + j];
ump_x2 += vr[j] * ri[l * 7 + j];
}
x1px2 = ump_x1 * ump_x2;
for(j = 0; j < 7; j++)
v[j] += x1px2 * extEV[l * 7 + j];
}
scale = 1;
for(l = 0; scale && (l < 7); l++)
scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
if(scale)
{
for(l = 0; l < 7; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRGAMMAPROT(int tipCase,
double *x1, double *x2, double *x3, double *extEV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double *uX1, *uX2, *v;
double x1px2;
int i, j, l, k, scale, addScale = 0;
double *vl, *vr;
#ifndef __SIM_SSE3
double al, ar;
#endif
switch(tipCase)
{
case TIP_TIP:
{
double umpX1[1840], umpX2[1840];
for(i = 0; i < 23; i++)
{
v = &(tipVector[20 * i]);
for(k = 0; k < 80; k++)
{
#ifdef __SIM_SSE3
double *ll = &left[k * 20];
double *rr = &right[k * 20];
__m128d umpX1v = _mm_setzero_pd();
__m128d umpX2v = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));
}
umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
#else
umpX1[80 * i + k] = 0.0;
umpX2[80 * i + k] = 0.0;
for(l = 0; l < 20; l++)
{
umpX1[80 * i + k] += v[l] * left[k * 20 + l];
umpX2[80 * i + k] += v[l] * right[k * 20 + l];
}
#endif
}
}
for(i = 0; i < n; i++)
{
uX1 = &umpX1[80 * tipX1[i]];
uX2 = &umpX2[80 * tipX2[i]];
for(j = 0; j < 4; j++)
{
v = &x3[i * 80 + j * 20];
#ifdef __SIM_SSE3
__m128d zero = _mm_setzero_pd();
for(k = 0; k < 20; k+=2)
_mm_store_pd(&v[k], zero);
for(k = 0; k < 20; k++)
{
double *eev = &extEV[k * 20];
x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
__m128d x1px2v = _mm_set1_pd(x1px2);
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d ee = _mm_load_pd(&eev[l]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
_mm_store_pd(&v[l], vv);
}
}
#else
for(k = 0; k < 20; k++)
v[k] = 0.0;
for(k = 0; k < 20; k++)
{
x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
for(l = 0; l < 20; l++)
v[l] += x1px2 * extEV[20 * k + l];
}
#endif
}
}
}
break;
case TIP_INNER:
{
double umpX1[1840], ump_x2[20];
for(i = 0; i < 23; i++)
{
v = &(tipVector[20 * i]);
for(k = 0; k < 80; k++)
{
#ifdef __SIM_SSE3
double *ll = &left[k * 20];
__m128d umpX1v = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
}
umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
#else
umpX1[80 * i + k] = 0.0;
for(l = 0; l < 20; l++)
umpX1[80 * i + k] += v[l] * left[k * 20 + l];
#endif
}
}
for (i = 0; i < n; i++)
{
uX1 = &umpX1[80 * tipX1[i]];
for(k = 0; k < 4; k++)
{
v = &(x2[80 * i + k * 20]);
#ifdef __SIM_SSE3
for(l = 0; l < 20; l++)
{
double *r = &right[k * 400 + l * 20];
__m128d ump_x2v = _mm_setzero_pd();
for(j = 0; j < 20; j+= 2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d rr = _mm_load_pd(&r[j]);
ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
}
ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
_mm_storel_pd(&ump_x2[l], ump_x2v);
}
v = &(x3[80 * i + 20 * k]);
__m128d zero = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], zero);
for(l = 0; l < 20; l++)
{
double *eev = &extEV[l * 20];
x1px2 = uX1[k * 20 + l] * ump_x2[l];
__m128d x1px2v = _mm_set1_pd(x1px2);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d ee = _mm_load_pd(&eev[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
_mm_store_pd(&v[j], vv);
}
}
#else
for(l = 0; l < 20; l++)
{
ump_x2[l] = 0.0;
for(j = 0; j < 20; j++)
ump_x2[l] += v[j] * right[k * 400 + l * 20 + j];
}
v = &(x3[80 * i + 20 * k]);
for(l = 0; l < 20; l++)
v[l] = 0;
for(l = 0; l < 20; l++)
{
x1px2 = uX1[k * 20 + l] * ump_x2[l];
for(j = 0; j < 20; j++)
v[j] += x1px2 * extEV[l * 20 + j];
}
#endif
}
#ifdef __SIM_SSE3
{
v = &(x3[80 * i]);
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 80); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
#else
v = &x3[80 * i];
scale = 1;
for(l = 0; scale && (l < 80); l++)
scale = (ABS(v[l]) < minlikelihood);
#endif
if (scale)
{
#ifdef __SIM_SSE3
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 80; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
#else
for(l = 0; l < 80; l++)
v[l] *= twotothe256;
#endif
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(x1[80 * i + 20 * k]);
vr = &(x2[80 * i + 20 * k]);
v = &(x3[80 * i + 20 * k]);
#ifdef __SIM_SSE3
__m128d zero = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], zero);
#else
for(l = 0; l < 20; l++)
v[l] = 0;
#endif
for(l = 0; l < 20; l++)
{
#ifdef __SIM_SSE3
{
__m128d al = _mm_setzero_pd();
__m128d ar = _mm_setzero_pd();
double *ll = &left[k * 400 + l * 20];
double *rr = &right[k * 400 + l * 20];
double *EVEV = &extEV[20 * l];
for(j = 0; j < 20; j+=2)
{
__m128d lv = _mm_load_pd(&ll[j]);
__m128d rv = _mm_load_pd(&rr[j]);
__m128d vll = _mm_load_pd(&vl[j]);
__m128d vrr = _mm_load_pd(&vr[j]);
al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
}
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d EVV = _mm_load_pd(&EVEV[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(&v[j], vv);
}
}
#else
al = 0.0;
ar = 0.0;
for(j = 0; j < 20; j++)
{
al += vl[j] * left[k * 400 + l * 20 + j];
ar += vr[j] * right[k * 400 + l * 20 + j];
}
x1px2 = al * ar;
for(j = 0; j < 20; j++)
v[j] += x1px2 * extEV[20 * l + j];
#endif
}
}
#ifdef __SIM_SSE3
{
v = &(x3[80 * i]);
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 80); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
#else
v = &(x3[80 * i]);
scale = 1;
for(l = 0; scale && (l < 80); l++)
scale = ((ABS(v[l]) < minlikelihood));
#endif
if (scale)
{
#ifdef __SIM_SSE3
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 80; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
#else
for(l = 0; l < 80; l++)
v[l] *= twotothe256;
#endif
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRGAMMAPROT_LG4(int tipCase,
double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double *uX1, *uX2, *v;
double x1px2;
int i, j, l, k, scale, addScale = 0;
double *vl, *vr;
#ifndef __SIM_SSE3
double al, ar;
#endif
switch(tipCase)
{
case TIP_TIP:
{
double umpX1[1840], umpX2[1840];
for(i = 0; i < 23; i++)
{
for(k = 0; k < 80; k++)
{
v = &(tipVector[k / 20][20 * i]);
#ifdef __SIM_SSE3
double *ll = &left[k * 20];
double *rr = &right[k * 20];
__m128d umpX1v = _mm_setzero_pd();
__m128d umpX2v = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));
}
umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
#else
umpX1[80 * i + k] = 0.0;
umpX2[80 * i + k] = 0.0;
for(l = 0; l < 20; l++)
{
umpX1[80 * i + k] += v[l] * left[k * 20 + l];
umpX2[80 * i + k] += v[l] * right[k * 20 + l];
}
#endif
}
}
for(i = 0; i < n; i++)
{
uX1 = &umpX1[80 * tipX1[i]];
uX2 = &umpX2[80 * tipX2[i]];
for(j = 0; j < 4; j++)
{
v = &x3[i * 80 + j * 20];
#ifdef __SIM_SSE3
__m128d zero = _mm_setzero_pd();
for(k = 0; k < 20; k+=2)
_mm_store_pd(&v[k], zero);
for(k = 0; k < 20; k++)
{
double *eev = &extEV[j][k * 20];
x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
__m128d x1px2v = _mm_set1_pd(x1px2);
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d ee = _mm_load_pd(&eev[l]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
_mm_store_pd(&v[l], vv);
}
}
#else
for(k = 0; k < 20; k++)
v[k] = 0.0;
for(k = 0; k < 20; k++)
{
x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
for(l = 0; l < 20; l++)
v[l] += x1px2 * extEV[j][20 * k + l];
}
#endif
}
}
}
break;
case TIP_INNER:
{
double umpX1[1840], ump_x2[20];
for(i = 0; i < 23; i++)
{
for(k = 0; k < 80; k++)
{
v = &(tipVector[k / 20][20 * i]);
#ifdef __SIM_SSE3
double *ll = &left[k * 20];
__m128d umpX1v = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
}
umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
#else
umpX1[80 * i + k] = 0.0;
for(l = 0; l < 20; l++)
umpX1[80 * i + k] += v[l] * left[k * 20 + l];
#endif
}
}
for (i = 0; i < n; i++)
{
uX1 = &umpX1[80 * tipX1[i]];
for(k = 0; k < 4; k++)
{
v = &(x2[80 * i + k * 20]);
#ifdef __SIM_SSE3
for(l = 0; l < 20; l++)
{
double *r = &right[k * 400 + l * 20];
__m128d ump_x2v = _mm_setzero_pd();
for(j = 0; j < 20; j+= 2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d rr = _mm_load_pd(&r[j]);
ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
}
ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
_mm_storel_pd(&ump_x2[l], ump_x2v);
}
v = &(x3[80 * i + 20 * k]);
__m128d zero = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], zero);
for(l = 0; l < 20; l++)
{
double *eev = &extEV[k][l * 20];
x1px2 = uX1[k * 20 + l] * ump_x2[l];
__m128d x1px2v = _mm_set1_pd(x1px2);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d ee = _mm_load_pd(&eev[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
_mm_store_pd(&v[j], vv);
}
}
#else
for(l = 0; l < 20; l++)
{
ump_x2[l] = 0.0;
for(j = 0; j < 20; j++)
ump_x2[l] += v[j] * right[k * 400 + l * 20 + j];
}
v = &(x3[80 * i + 20 * k]);
for(l = 0; l < 20; l++)
v[l] = 0;
for(l = 0; l < 20; l++)
{
x1px2 = uX1[k * 20 + l] * ump_x2[l];
for(j = 0; j < 20; j++)
v[j] += x1px2 * extEV[k][l * 20 + j];
}
#endif
}
#ifdef __SIM_SSE3
{
v = &(x3[80 * i]);
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 80); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
#else
v = &x3[80 * i];
scale = 1;
for(l = 0; scale && (l < 80); l++)
scale = (ABS(v[l]) < minlikelihood);
#endif
if (scale)
{
#ifdef __SIM_SSE3
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 80; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
#else
for(l = 0; l < 80; l++)
v[l] *= twotothe256;
#endif
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(x1[80 * i + 20 * k]);
vr = &(x2[80 * i + 20 * k]);
v = &(x3[80 * i + 20 * k]);
#ifdef __SIM_SSE3
__m128d zero = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], zero);
#else
for(l = 0; l < 20; l++)
v[l] = 0;
#endif
for(l = 0; l < 20; l++)
{
#ifdef __SIM_SSE3
{
__m128d al = _mm_setzero_pd();
__m128d ar = _mm_setzero_pd();
double *ll = &left[k * 400 + l * 20];
double *rr = &right[k * 400 + l * 20];
double *EVEV = &extEV[k][20 * l];
for(j = 0; j < 20; j+=2)
{
__m128d lv = _mm_load_pd(&ll[j]);
__m128d rv = _mm_load_pd(&rr[j]);
__m128d vll = _mm_load_pd(&vl[j]);
__m128d vrr = _mm_load_pd(&vr[j]);
al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
}
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d EVV = _mm_load_pd(&EVEV[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(&v[j], vv);
}
}
#else
al = 0.0;
ar = 0.0;
for(j = 0; j < 20; j++)
{
al += vl[j] * left[k * 400 + l * 20 + j];
ar += vr[j] * right[k * 400 + l * 20 + j];
}
x1px2 = al * ar;
for(j = 0; j < 20; j++)
v[j] += x1px2 * extEV[k][20 * l + j];
#endif
}
}
#ifdef __SIM_SSE3
{
v = &(x3[80 * i]);
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 80); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
#else
v = &(x3[80 * i]);
scale = 1;
for(l = 0; scale && (l < 80); l++)
scale = ((ABS(v[l]) < minlikelihood));
#endif
if (scale)
{
#ifdef __SIM_SSE3
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 80; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
#else
for(l = 0; l < 80; l++)
v[l] *= twotothe256;
#endif
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#ifdef __SIM_SSE3
static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
double *x1, double *x2, double *x3, double *extEV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
)
{
double *uX1, *uX2, *v;
double x1px2;
int i, j, l, k, scale, addScale = 0,
gapScaling = 0;
double
*vl, *vr, *x1v, *x2v,
*x1_ptr = x1,
*x2_ptr = x2,
*x3_ptr = x3;
switch(tipCase)
{
case TIP_TIP:
{
double umpX1[1840], umpX2[1840];
for(i = 0; i < 23; i++)
{
v = &(tipVector[20 * i]);
for(k = 0; k < 80; k++)
{
double *ll = &left[k * 20];
double *rr = &right[k * 20];
__m128d umpX1v = _mm_setzero_pd();
__m128d umpX2v = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));
}
umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
}
}
{
uX1 = &umpX1[1760];
uX2 = &umpX2[1760];
for(j = 0; j < 4; j++)
{
v = &x3_gapColumn[j * 20];
__m128d zero = _mm_setzero_pd();
for(k = 0; k < 20; k+=2)
_mm_store_pd(&v[k], zero);
for(k = 0; k < 20; k++)
{
double *eev = &extEV[k * 20];
x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
__m128d x1px2v = _mm_set1_pd(x1px2);
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d ee = _mm_load_pd(&eev[l]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
_mm_store_pd(&v[l], vv);
}
}
}
}
for(i = 0; i < n; i++)
{
if(!(x3_gap[i / 32] & mask32[i % 32]))
{
uX1 = &umpX1[80 * tipX1[i]];
uX2 = &umpX2[80 * tipX2[i]];
for(j = 0; j < 4; j++)
{
v = &x3_ptr[j * 20];
__m128d zero = _mm_setzero_pd();
for(k = 0; k < 20; k+=2)
_mm_store_pd(&v[k], zero);
for(k = 0; k < 20; k++)
{
double *eev = &extEV[k * 20];
x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
__m128d x1px2v = _mm_set1_pd(x1px2);
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d ee = _mm_load_pd(&eev[l]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
_mm_store_pd(&v[l], vv);
}
}
}
x3_ptr += 80;
}
}
}
break;
case TIP_INNER:
{
double umpX1[1840], ump_x2[20];
for(i = 0; i < 23; i++)
{
v = &(tipVector[20 * i]);
for(k = 0; k < 80; k++)
{
double *ll = &left[k * 20];
__m128d umpX1v = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
{
__m128d vv = _mm_load_pd(&v[l]);
umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
}
umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
}
}
{
uX1 = &umpX1[1760];
for(k = 0; k < 4; k++)
{
v = &(x2_gapColumn[k * 20]);
for(l = 0; l < 20; l++)
{
double *r = &right[k * 400 + l * 20];
__m128d ump_x2v = _mm_setzero_pd();
for(j = 0; j < 20; j+= 2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d rr = _mm_load_pd(&r[j]);
ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
}
ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
_mm_storel_pd(&ump_x2[l], ump_x2v);
}
v = &(x3_gapColumn[20 * k]);
__m128d zero = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], zero);
for(l = 0; l < 20; l++)
{
double *eev = &extEV[l * 20];
x1px2 = uX1[k * 20 + l] * ump_x2[l];
__m128d x1px2v = _mm_set1_pd(x1px2);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d ee = _mm_load_pd(&eev[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
_mm_store_pd(&v[j], vv);
}
}
}
{
v = x3_gapColumn;
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 80); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
if (scale)
{
gapScaling = 1;
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 80; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
}
}
for (i = 0; i < n; i++)
{
if((x3_gap[i / 32] & mask32[i % 32]))
{
if(gapScaling)
{
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
else
{
uX1 = &umpX1[80 * tipX1[i]];
if(x2_gap[i / 32] & mask32[i % 32])
x2v = x2_gapColumn;
else
{
x2v = x2_ptr;
x2_ptr += 80;
}
for(k = 0; k < 4; k++)
{
v = &(x2v[k * 20]);
for(l = 0; l < 20; l++)
{
double *r = &right[k * 400 + l * 20];
__m128d ump_x2v = _mm_setzero_pd();
for(j = 0; j < 20; j+= 2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d rr = _mm_load_pd(&r[j]);
ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
}
ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
_mm_storel_pd(&ump_x2[l], ump_x2v);
}
v = &x3_ptr[20 * k];
__m128d zero = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], zero);
for(l = 0; l < 20; l++)
{
double *eev = &extEV[l * 20];
x1px2 = uX1[k * 20 + l] * ump_x2[l];
__m128d x1px2v = _mm_set1_pd(x1px2);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d ee = _mm_load_pd(&eev[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
_mm_store_pd(&v[j], vv);
}
}
}
{
v = x3_ptr;
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 80); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
if (scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 80; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
x3_ptr += 80;
}
}
}
break;
case INNER_INNER:
{
for(k = 0; k < 4; k++)
{
vl = &(x1_gapColumn[20 * k]);
vr = &(x2_gapColumn[20 * k]);
v = &(x3_gapColumn[20 * k]);
__m128d zero = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], zero);
for(l = 0; l < 20; l++)
{
{
__m128d al = _mm_setzero_pd();
__m128d ar = _mm_setzero_pd();
double *ll = &left[k * 400 + l * 20];
double *rr = &right[k * 400 + l * 20];
double *EVEV = &extEV[20 * l];
for(j = 0; j < 20; j+=2)
{
__m128d lv = _mm_load_pd(&ll[j]);
__m128d rv = _mm_load_pd(&rr[j]);
__m128d vll = _mm_load_pd(&vl[j]);
__m128d vrr = _mm_load_pd(&vr[j]);
al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
}
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d EVV = _mm_load_pd(&EVEV[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(&v[j], vv);
}
}
}
}
{
v = x3_gapColumn;
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 80); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
if (scale)
{
gapScaling = 1;
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 80; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
}
}
for (i = 0; i < n; i++)
{
if(x3_gap[i / 32] & mask32[i % 32])
{
if(gapScaling)
{
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
else
{
if(x1_gap[i / 32] & mask32[i % 32])
x1v = x1_gapColumn;
else
{
x1v = x1_ptr;
x1_ptr += 80;
}
if(x2_gap[i / 32] & mask32[i % 32])
x2v = x2_gapColumn;
else
{
x2v = x2_ptr;
x2_ptr += 80;
}
for(k = 0; k < 4; k++)
{
vl = &(x1v[20 * k]);
vr = &(x2v[20 * k]);
v = &x3_ptr[20 * k];
__m128d zero = _mm_setzero_pd();
for(l = 0; l < 20; l+=2)
_mm_store_pd(&v[l], zero);
for(l = 0; l < 20; l++)
{
{
__m128d al = _mm_setzero_pd();
__m128d ar = _mm_setzero_pd();
double *ll = &left[k * 400 + l * 20];
double *rr = &right[k * 400 + l * 20];
double *EVEV = &extEV[20 * l];
for(j = 0; j < 20; j+=2)
{
__m128d lv = _mm_load_pd(&ll[j]);
__m128d rv = _mm_load_pd(&rr[j]);
__m128d vll = _mm_load_pd(&vl[j]);
__m128d vrr = _mm_load_pd(&vr[j]);
al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
}
al = _mm_hadd_pd(al, al);
ar = _mm_hadd_pd(ar, ar);
al = _mm_mul_pd(al, ar);
for(j = 0; j < 20; j+=2)
{
__m128d vv = _mm_load_pd(&v[j]);
__m128d EVV = _mm_load_pd(&EVEV[j]);
vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
_mm_store_pd(&v[j], vv);
}
}
}
}
{
v = x3_ptr;
__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
scale = 1;
for(l = 0; scale && (l < 80); l += 2)
{
__m128d vv = _mm_load_pd(&v[l]);
__m128d v1 = _mm_and_pd(vv, absMask.m);
v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
if(_mm_movemask_pd( v1 ) != 3)
scale = 0;
}
}
if (scale)
{
__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
for(l = 0; l < 80; l+=2)
{
__m128d ex3v = _mm_load_pd(&v[l]);
_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));
}
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
x3_ptr += 80;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
#else
#endif
static void newviewGTRGAMMASECONDARY(int tipCase,
double *x1, double *x2, double *x3, double *extEV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double *v;
double x1px2;
int i, j, l, k, scale, addScale = 0;
double *vl, *vr, al, ar;
switch(tipCase)
{
case TIP_TIP:
{
for(i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[16 * tipX1[i]]);
vr = &(tipVector[16 * tipX2[i]]);
v = &(x3[64 * i + 16 * k]);
for(l = 0; l < 16; l++)
v[l] = 0;
for(l = 0; l < 16; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 16; j++)
{
al += vl[j] * left[k * 256 + l * 16 + j];
ar += vr[j] * right[k * 256 + l * 16 + j];
}
x1px2 = al * ar;
for(j = 0; j < 16; j++)
v[j] += x1px2 * extEV[16 * l + j];
}
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[16 * tipX1[i]]);
vr = &(x2[64 * i + 16 * k]);
v = &(x3[64 * i + 16 * k]);
for(l = 0; l < 16; l++)
v[l] = 0;
for(l = 0; l < 16; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 16; j++)
{
al += vl[j] * left[k * 256 + l * 16 + j];
ar += vr[j] * right[k * 256 + l * 16 + j];
}
x1px2 = al * ar;
for(j = 0; j < 16; j++)
v[j] += x1px2 * extEV[16 * l + j];
}
}
v = &x3[64 * i];
scale = 1;
for(l = 0; scale && (l < 64); l++)
scale = (ABS(v[l]) < minlikelihood);
if (scale)
{
for(l = 0; l < 64; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(x1[64 * i + 16 * k]);
vr = &(x2[64 * i + 16 * k]);
v = &(x3[64 * i + 16 * k]);
for(l = 0; l < 16; l++)
v[l] = 0;
for(l = 0; l < 16; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 16; j++)
{
al += vl[j] * left[k * 256 + l * 16 + j];
ar += vr[j] * right[k * 256 + l * 16 + j];
}
x1px2 = al * ar;
for(j = 0; j < 16; j++)
v[j] += x1px2 * extEV[16 * l + j];
}
}
v = &(x3[64 * i]);
scale = 1;
for(l = 0; scale && (l < 64); l++)
scale = ((ABS(v[l]) < minlikelihood));
if (scale)
{
for(l = 0; l < 64; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRGAMMASECONDARY_6(int tipCase,
double *x1, double *x2, double *x3, double *extEV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double *v;
double x1px2;
int i, j, l, k, scale, addScale = 0;
double *vl, *vr, al, ar;
switch(tipCase)
{
case TIP_TIP:
{
for(i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[6 * tipX1[i]]);
vr = &(tipVector[6 * tipX2[i]]);
v = &(x3[24 * i + 6 * k]);
for(l = 0; l < 6; l++)
v[l] = 0;
for(l = 0; l < 6; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 6; j++)
{
al += vl[j] * left[k * 36 + l * 6 + j];
ar += vr[j] * right[k * 36 + l * 6 + j];
}
x1px2 = al * ar;
for(j = 0; j < 6; j++)
v[j] += x1px2 * extEV[6 * l + j];
}
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[6 * tipX1[i]]);
vr = &(x2[24 * i + 6 * k]);
v = &(x3[24 * i + 6 * k]);
for(l = 0; l < 6; l++)
v[l] = 0;
for(l = 0; l < 6; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 6; j++)
{
al += vl[j] * left[k * 36 + l * 6 + j];
ar += vr[j] * right[k * 36 + l * 6 + j];
}
x1px2 = al * ar;
for(j = 0; j < 6; j++)
v[j] += x1px2 * extEV[6 * l + j];
}
}
v = &x3[24 * i];
scale = 1;
for(l = 0; scale && (l < 24); l++)
scale = (ABS(v[l]) < minlikelihood);
if(scale)
{
for(l = 0; l < 24; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(x1[24 * i + 6 * k]);
vr = &(x2[24 * i + 6 * k]);
v = &(x3[24 * i + 6 * k]);
for(l = 0; l < 6; l++)
v[l] = 0;
for(l = 0; l < 6; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 6; j++)
{
al += vl[j] * left[k * 36 + l * 6 + j];
ar += vr[j] * right[k * 36 + l * 6 + j];
}
x1px2 = al * ar;
for(j = 0; j < 6; j++)
v[j] += x1px2 * extEV[6 * l + j];
}
}
v = &(x3[24 * i]);
scale = 1;
for(l = 0; scale && (l < 24); l++)
scale = ((ABS(v[l]) < minlikelihood));
if (scale)
{
for(l = 0; l < 24; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
static void newviewGTRGAMMASECONDARY_7(int tipCase,
double *x1, double *x2, double *x3, double *extEV, double *tipVector,
int *ex3, unsigned char *tipX1, unsigned char *tipX2,
int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
{
double *v;
double x1px2;
int i, j, l, k, scale, addScale = 0;
double *vl, *vr, al, ar;
switch(tipCase)
{
case TIP_TIP:
{
for(i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[7 * tipX1[i]]);
vr = &(tipVector[7 * tipX2[i]]);
v = &(x3[28 * i + 7 * k]);
for(l = 0; l < 7; l++)
v[l] = 0;
for(l = 0; l < 7; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 7; j++)
{
al += vl[j] * left[k * 49 + l * 7 + j];
ar += vr[j] * right[k * 49 + l * 7 + j];
}
x1px2 = al * ar;
for(j = 0; j < 7; j++)
v[j] += x1px2 * extEV[7 * l + j];
}
}
}
}
break;
case TIP_INNER:
{
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(tipVector[7 * tipX1[i]]);
vr = &(x2[28 * i + 7 * k]);
v = &(x3[28 * i + 7 * k]);
for(l = 0; l < 7; l++)
v[l] = 0;
for(l = 0; l < 7; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 7; j++)
{
al += vl[j] * left[k * 49 + l * 7 + j];
ar += vr[j] * right[k * 49 + l * 7 + j];
}
x1px2 = al * ar;
for(j = 0; j < 7; j++)
v[j] += x1px2 * extEV[7 * l + j];
}
}
v = &x3[28 * i];
scale = 1;
for(l = 0; scale && (l < 28); l++)
scale = (ABS(v[l]) < minlikelihood);
if (scale)
{
for(l = 0; l < 28; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
}
break;
case INNER_INNER:
for (i = 0; i < n; i++)
{
for(k = 0; k < 4; k++)
{
vl = &(x1[28 * i + 7 * k]);
vr = &(x2[28 * i + 7 * k]);
v = &(x3[28 * i + 7 * k]);
for(l = 0; l < 7; l++)
v[l] = 0;
for(l = 0; l < 7; l++)
{
al = 0.0;
ar = 0.0;
for(j = 0; j < 7; j++)
{
al += vl[j] * left[k * 49 + l * 7 + j];
ar += vr[j] * right[k * 49 + l * 7 + j];
}
x1px2 = al * ar;
for(j = 0; j < 7; j++)
v[j] += x1px2 * extEV[7 * l + j];
}
}
v = &(x3[28 * i]);
scale = 1;
for(l = 0; scale && (l < 28); l++)
scale = ((ABS(v[l]) < minlikelihood));
if (scale)
{
for(l = 0; l < 28; l++)
v[l] *= twotothe256;
if(useFastScaling)
addScale += wgt[i];
else
ex3[i] += 1;
}
}
break;
default:
assert(0);
}
if(useFastScaling)
*scalerIncrement = addScale;
}
void computeTraversalInfo(tree *tr, nodeptr p, traversalInfo *ti, int *counter, int maxTips, int numBranches)
{
if(isTip(p->number, maxTips))
return;
{
int i;
nodeptr q = p->next->back;
nodeptr r = p->next->next->back;
#ifdef _HET
boolean
parentIsTip;
if(isTip(p->back->number, maxTips))
parentIsTip = TRUE;
else
parentIsTip = FALSE;
#endif
if(isTip(r->number, maxTips) && isTip(q->number, maxTips))
{
while (! p->x)
{
if (! p->x)
getxnode(p);
}
ti[*counter].tipCase = TIP_TIP;
ti[*counter].pNumber = p->number;
ti[*counter].qNumber = q->number;
ti[*counter].rNumber = r->number;
#ifdef _HET
ti[*counter].parentIsTip = parentIsTip;
#endif
for(i = 0; i < numBranches; i++)
{
double
z;
z = q->z[i];
z = (z > zmin) ? log(z) : log(zmin);
ti[*counter].qz[i] = z;
z = r->z[i];
z = (z > zmin) ? log(z) : log(zmin);
ti[*counter].rz[i] = z;
#ifdef _BASTIEN
if(tr->doBastienStuff)
{
assert(q->secondDerivative[i] == q->back->secondDerivative[i]);
assert(r->secondDerivative[i] == r->back->secondDerivative[i]);
assert(q->secondDerivativeValid[i] && q->back->secondDerivativeValid[i]);
assert(r->secondDerivativeValid[i] && r->back->secondDerivativeValid[i]);
}
ti[*counter].secondDerivativeQ[i] = q->secondDerivative[i];
ti[*counter].secondDerivativeR[i] = r->secondDerivative[i];
#endif
}
*counter = *counter + 1;
}
else
{
if(isTip(r->number, maxTips) || isTip(q->number, maxTips))
{
nodeptr tmp;
if(isTip(r->number, maxTips))
{
tmp = r;
r = q;
q = tmp;
}
while ((! p->x) || (! r->x))
{
if (! r->x)
computeTraversalInfo(tr, r, ti, counter, maxTips, numBranches);
if (! p->x)
getxnode(p);
}
ti[*counter].tipCase = TIP_INNER;
ti[*counter].pNumber = p->number;
ti[*counter].qNumber = q->number;
ti[*counter].rNumber = r->number;
#ifdef _HET
ti[*counter].parentIsTip = parentIsTip;
#endif
for(i = 0; i < numBranches; i++)
{
double z;
z = q->z[i];
z = (z > zmin) ? log(z) : log(zmin);
ti[*counter].qz[i] = z;
z = r->z[i];
z = (z > zmin) ? log(z) : log(zmin);
ti[*counter].rz[i] = z;
#ifdef _BASTIEN
if(tr->doBastienStuff)
{
assert(q->secondDerivative[i] == q->back->secondDerivative[i]);
assert(r->secondDerivative[i] == r->back->secondDerivative[i]);
assert(q->secondDerivativeValid[i] && q->back->secondDerivativeValid[i]);
assert(r->secondDerivativeValid[i] && r->back->secondDerivativeValid[i]);
}
ti[*counter].secondDerivativeQ[i] = q->secondDerivative[i];
ti[*counter].secondDerivativeR[i] = r->secondDerivative[i];
#endif
}
*counter = *counter + 1;
}
else
{
while ((! p->x) || (! q->x) || (! r->x))
{
if (! q->x)
computeTraversalInfo(tr, q, ti, counter, maxTips, numBranches);
if (! r->x)
computeTraversalInfo(tr, r, ti, counter, maxTips, numBranches);
if (! p->x)
getxnode(p);
}
ti[*counter].tipCase = INNER_INNER;
ti[*counter].pNumber = p->number;
ti[*counter].qNumber = q->number;
ti[*counter].rNumber = r->number;
#ifdef _HET
ti[*counter].parentIsTip = parentIsTip;
#endif
for(i = 0; i < numBranches; i++)
{
double z;
z = q->z[i];
z = (z > zmin) ? log(z) : log(zmin);
ti[*counter].qz[i] = z;
z = r->z[i];
z = (z > zmin) ? log(z) : log(zmin);
ti[*counter].rz[i] = z;
#ifdef _BASTIEN
if(tr->doBastienStuff)
{
assert(q->secondDerivative[i] == q->back->secondDerivative[i]);
assert(r->secondDerivative[i] == r->back->secondDerivative[i]);
assert(q->secondDerivativeValid[i] && q->back->secondDerivativeValid[i]);
assert(r->secondDerivativeValid[i] && r->back->secondDerivativeValid[i]);
}
ti[*counter].secondDerivativeQ[i] = q->secondDerivative[i];
ti[*counter].secondDerivativeR[i] = r->secondDerivative[i];
#endif
}
*counter = *counter + 1;
}
}
}
}
void newviewIterative (tree *tr)
{
traversalInfo
*ti = tr->td[0].ti;
int
i,
model;
for(i = 1; i < tr->td[0].count; i++)
{
traversalInfo
*tInfo = &ti[i];
for(model = 0; model < tr->NumberOfModels; model++)
{
if(tr->executeModel[model])
{
double
*x1_start = (double*)NULL,
*x2_start = (double*)NULL,
*x3_start = tr->partitionData[model].xVector[tInfo->pNumber - tr->mxtips - 1],
*left = (double*)NULL,
*right = (double*)NULL,
*x1_gapColumn = (double*)NULL,
*x2_gapColumn = (double*)NULL,
*x3_gapColumn = (double*)NULL,
*x1_ascColumn = (double*)NULL,
*x2_ascColumn = (double*)NULL,
*x3_ascColumn = (double*)NULL,
qz,
rz;
unsigned int
x1_presenceMap = 0,
x2_presenceMap = 0,
*x1_gap = (unsigned int*)NULL,
*x2_gap = (unsigned int*)NULL,
*x3_gap = (unsigned int*)NULL;
int
scalerIncrement = 0,
*wgt = (int*)NULL,
*ex3 = (int*)NULL,
*ex3_asc = (int*)NULL;
unsigned char
*tipX1 = (unsigned char *)NULL,
*tipX2 = (unsigned char *)NULL;
size_t
gapOffset = 0,
rateHet,
states = (size_t)tr->partitionData[model].states,
width = tr->partitionData[model].width,
ascWidth = (size_t)tr->partitionData[model].states,
availableLength = tr->partitionData[model].xSpaceVector[(tInfo->pNumber - tr->mxtips - 1)],
requiredLength;
if(tr->rateHetModel == CAT)
rateHet = 1;
else
rateHet = 4;
if(tr->saveMemory)
{
size_t
j,
setBits = 0;
x1_gap = &(tr->partitionData[model].gapVector[tInfo->qNumber * tr->partitionData[model].gapVectorLength]);
x2_gap = &(tr->partitionData[model].gapVector[tInfo->rNumber * tr->partitionData[model].gapVectorLength]);
x3_gap = &(tr->partitionData[model].gapVector[tInfo->pNumber * tr->partitionData[model].gapVectorLength]);
gapOffset = states * (size_t)getUndetermined(tr->partitionData[model].dataType);
availableLength = tr->partitionData[model].xSpaceVector[(tInfo->pNumber - tr->mxtips - 1)];
for(j = 0; j < (size_t)tr->partitionData[model].gapVectorLength; j++)
{
x3_gap[j] = x1_gap[j] & x2_gap[j];
setBits += (size_t)(precomputed16_bitcount(x3_gap[j]));
}
requiredLength = (width - setBits) * rateHet * states * sizeof(double);
}
else
requiredLength = width * rateHet * states * sizeof(double);
if(requiredLength != availableLength)
{
if(x3_start)
rax_free(x3_start);
x3_start = (double*)rax_malloc(requiredLength);
tr->partitionData[model].xVector[tInfo->pNumber - tr->mxtips - 1] = x3_start;
tr->partitionData[model].xSpaceVector[(tInfo->pNumber - tr->mxtips - 1)] = requiredLength;
}
if(tr->useFastScaling)
wgt = tr->partitionData[model].wgt;
else
{
size_t
availableExpLength = tr->partitionData[model].expSpaceVector[(tInfo->pNumber - tr->mxtips - 1)],
requiredExpLength = width * sizeof(int);
ex3 = tr->partitionData[model].expVector[tInfo->pNumber - tr->mxtips - 1];
if(requiredExpLength != availableExpLength)
{
if(ex3)
rax_free(ex3);
ex3 = (int*)rax_malloc(requiredExpLength);
tr->partitionData[model].expVector[tInfo->pNumber - tr->mxtips - 1] = ex3;
tr->partitionData[model].expSpaceVector[(tInfo->pNumber - tr->mxtips - 1)] = requiredExpLength;
}
}
switch(tInfo->tipCase)
{
case TIP_TIP:
tipX1 = tr->partitionData[model].yVector[tInfo->qNumber];
tipX2 = tr->partitionData[model].yVector[tInfo->rNumber];
x1_presenceMap = tr->partitionData[model].presenceMap[tInfo->qNumber];
x2_presenceMap = tr->partitionData[model].presenceMap[tInfo->rNumber];
if(tr->saveMemory)
{
assert(gapOffset > 0);
x1_gapColumn = &(tr->partitionData[model].tipVector[gapOffset]);
x2_gapColumn = &(tr->partitionData[model].tipVector[gapOffset]);
x3_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet];
}
#ifdef _USE_PTHREADS
if(tr->partitionData[model].ascBias && tr->threadID == 0)
#else
if(tr->partitionData[model].ascBias)
#endif
{
size_t
k;
x3_ascColumn = &tr->partitionData[model].ascVector[(tInfo->pNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset];
ex3_asc = &tr->partitionData[model].ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
for(k = 0; k < ascWidth; k++)
ex3_asc[k] = 0;
}
if(!tr->useFastScaling)
{
size_t
k;
for(k = 0; k < width; k++)
ex3[k] = 0;
}
break;
case TIP_INNER:
tipX1 = tr->partitionData[model].yVector[tInfo->qNumber];
x2_start = tr->partitionData[model].xVector[tInfo->rNumber - tr->mxtips - 1];
x1_presenceMap = tr->partitionData[model].presenceMap[tInfo->qNumber];
if(tr->saveMemory)
{
assert(gapOffset > 0);
x1_gapColumn = &(tr->partitionData[model].tipVector[gapOffset]);
x2_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->rNumber - tr->mxtips - 1) * states * rateHet];
x3_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet];
}
#ifdef _USE_PTHREADS
if(tr->partitionData[model].ascBias && tr->threadID == 0)
#else
if(tr->partitionData[model].ascBias)
#endif
{
size_t
k;
int
*ex2_asc;
x2_ascColumn = &tr->partitionData[model].ascVector[(tInfo->rNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset];
x3_ascColumn = &tr->partitionData[model].ascVector[(tInfo->pNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset];
ex2_asc = &tr->partitionData[model].ascExpVector[(tInfo->rNumber - tr->mxtips - 1) * ascWidth];
ex3_asc = &tr->partitionData[model].ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
for(k = 0; k < ascWidth; k++)
ex3_asc[k] = ex2_asc[k];
}
if(!tr->useFastScaling)
{
size_t
k;
int
*ex2 = tr->partitionData[model].expVector[tInfo->rNumber - tr->mxtips - 1];
for(k = 0; k < width; k++)
ex3[k] = ex2[k];
}
break;
case INNER_INNER:
x1_start = tr->partitionData[model].xVector[tInfo->qNumber - tr->mxtips - 1];
x2_start = tr->partitionData[model].xVector[tInfo->rNumber - tr->mxtips - 1];
if(tr->saveMemory)
{
x1_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->qNumber - tr->mxtips - 1) * states * rateHet];
x2_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->rNumber - tr->mxtips - 1) * states * rateHet];
x3_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet];
}
#ifdef _USE_PTHREADS
if(tr->partitionData[model].ascBias && tr->threadID == 0)
#else
if(tr->partitionData[model].ascBias)
#endif
{
size_t
k;
int
*ex1_asc,
*ex2_asc;
x1_ascColumn = &tr->partitionData[model].ascVector[(tInfo->qNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset];
x2_ascColumn = &tr->partitionData[model].ascVector[(tInfo->rNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset];
x3_ascColumn = &tr->partitionData[model].ascVector[(tInfo->pNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset];
ex1_asc = &tr->partitionData[model].ascExpVector[(tInfo->qNumber - tr->mxtips - 1) * ascWidth];
ex2_asc = &tr->partitionData[model].ascExpVector[(tInfo->rNumber - tr->mxtips - 1) * ascWidth];
ex3_asc = &tr->partitionData[model].ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
for(k = 0; k < ascWidth; k++)
ex3_asc[k] = ex1_asc[k] + ex2_asc[k];
}
if(!tr->useFastScaling)
{
size_t
k;
int
*ex1,
*ex2;
ex1 = tr->partitionData[model].expVector[tInfo->qNumber - tr->mxtips - 1];
ex2 = tr->partitionData[model].expVector[tInfo->rNumber - tr->mxtips - 1];
for(k = 0; k < width; k++)
ex3[k] = ex1[k] + ex2[k];
}
break;
default:
assert(0);
}
left = tr->partitionData[model].left;
right = tr->partitionData[model].right;
if(tr->multiBranch)
{
qz = tInfo->qz[model];
rz = tInfo->rz[model];
}
else
{
qz = tInfo->qz[0];
rz = tInfo->rz[0];
}
#ifdef _BASTIEN
if(tr->doBastienStuff)
{
if(tr->multiBranch)
assert(0);
else
{
printf("\nnewview\n");
printf("Hello, I am the second derivative at the q branch %1.40f\n", tInfo->secondDerivativeQ[0]);
printf("Hello, I am the second derivative at the r branch %1.40f\n", tInfo->secondDerivativeR[0]);
}
}
#endif
switch(tr->partitionData[model].dataType)
{
case BINARY_DATA:
switch(tr->rateHetModel)
{
case CAT:
{
makeP(qz, rz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI,
tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories,
left, right, BINARY_DATA, tr->saveMemory, tr->maxCategories);
newviewGTRCAT_BINARY(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling
);
}
break;
case GAMMA:
case GAMMA_I:
{
makeP(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI, tr->partitionData[model].EIGN,
4, left, right, BINARY_DATA, tr->saveMemory, tr->maxCategories);
newviewGTRGAMMA_BINARY(tInfo->tipCase,
x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
}
break;
default:
assert(0);
}
break;
case DNA_DATA:
switch(tr->rateHetModel)
{
case CAT:
{
makeP(qz, rz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI,
tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories,
left, right, DNA_DATA, tr->saveMemory, tr->maxCategories);
#if (defined(__SIM_SSE3) || defined(__AVX))
if(tr->saveMemory)
{
#ifdef __AVX
newviewGTRCAT_AVX_GAPPED_SAVE(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap,
x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
#else
newviewGTRCAT_SAVE(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap,
x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
#endif
}
else
#endif
{
#ifdef __AVX
newviewGTRCAT_AVX(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#else
newviewGTRCAT(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#endif
}
}
break;
case GAMMA:
case GAMMA_I:
#ifdef _HET
{
double
*parentEV;
if(tInfo->parentIsTip)
parentEV = tr->partitionData[model].EV_TIP;
else
parentEV = tr->partitionData[model].EV;
switch(tInfo->tipCase)
{
case TIP_TIP:
makeP_FlexHet(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI_TIP, tr->partitionData[model].EIGN_TIP, tr->partitionData[model].EI_TIP, tr->partitionData[model].EIGN_TIP,
4, left, right, 4);
newviewGTRGAMMA(tInfo->tipCase,
x1_start, x2_start, x3_start, parentEV, tr->partitionData[model].tipVector_TIP,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap);
break;
case TIP_INNER:
assert(tInfo->qNumber <= tr->mxtips && tInfo->rNumber > tr->mxtips);
makeP_FlexHet(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI_TIP, tr->partitionData[model].EIGN_TIP, tr->partitionData[model].EI, tr->partitionData[model].EIGN,
4, left, right, 4);
newviewGTRGAMMA(tInfo->tipCase,
x1_start, x2_start, x3_start, parentEV, tr->partitionData[model].tipVector_TIP,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap);
break;
case INNER_INNER:
makeP_FlexHet(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].EI, tr->partitionData[model].EIGN,
4, left, right, 4);
newviewGTRGAMMA(tInfo->tipCase,
x1_start, x2_start, x3_start, parentEV, (double *)NULL,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap);
break;
default:
assert(0);
}
}
#else
{
makeP(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI, tr->partitionData[model].EIGN,
4, left, right, DNA_DATA, tr->saveMemory, tr->maxCategories);
#if (defined(__SIM_SSE3) || defined(__AVX))
if(tr->saveMemory)
{
#ifdef __AVX
newviewGTRGAMMA_AVX_GAPPED_SAVE(tInfo->tipCase,
x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling,
x1_gap, x2_gap, x3_gap,
x1_gapColumn, x2_gapColumn, x3_gapColumn, x1_presenceMap, x2_presenceMap);
#else
newviewGTRGAMMA_GAPPED_SAVE(tInfo->tipCase,
x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling,
x1_gap, x2_gap, x3_gap,
x1_gapColumn, x2_gapColumn, x3_gapColumn, x1_presenceMap, x2_presenceMap);
#endif
}
else
#endif
{
#ifdef __AVX
newviewGTRGAMMA_AVX(tInfo->tipCase,
x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap);
#else
newviewGTRGAMMA(tInfo->tipCase,
x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap);
#endif
}
}
#endif
break;
default:
assert(0);
}
break;
case AA_DATA:
switch(tr->rateHetModel)
{
case CAT:
{
makeP(qz, rz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, AA_DATA, tr->saveMemory, tr->maxCategories);
#if (defined(__SIM_SSE3) || defined(__AVX))
if(tr->saveMemory)
#ifdef __AVX
newviewGTRCATPROT_AVX_GAPPED_SAVE(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap,
x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
#else
newviewGTRCATPROT_SAVE(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap,
x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
#endif
else
#endif
{
#ifdef __AVX
newviewGTRCATPROT_AVX(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#else
newviewGTRCATPROT(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#endif
}
}
break;
case GAMMA:
case GAMMA_I:
if(tr->partitionData[model].protModels == LG4 || tr->partitionData[model].protModels == LG4X)
{
makeP_FlexLG4(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI_LG4,
tr->partitionData[model].EIGN_LG4,
4, left, right, 20);
#ifdef __AVX
newviewGTRGAMMAPROT_AVX_LG4(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV_LG4,
tr->partitionData[model].tipVector_LG4,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#else
newviewGTRGAMMAPROT_LG4(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV_LG4,
tr->partitionData[model].tipVector_LG4,
ex3, tipX1, tipX2,
width, left, right,
wgt, &scalerIncrement, tr->useFastScaling);
#endif
}
else
{
makeP(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, AA_DATA, tr->saveMemory, tr->maxCategories);
#if (defined(__SIM_SSE3) || defined(__AVX))
if(tr->saveMemory)
#ifdef __AVX
newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling,
x1_gap, x2_gap, x3_gap,
x1_gapColumn, x2_gapColumn, x3_gapColumn);
#else
newviewGTRGAMMAPROT_GAPPED_SAVE(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling,
x1_gap, x2_gap, x3_gap,
x1_gapColumn, x2_gapColumn, x3_gapColumn);
#endif
else
#endif
{
#ifdef __AVX
newviewGTRGAMMAPROT_AVX(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#else
newviewGTRGAMMAPROT(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#endif
}
}
break;
default:
assert(0);
}
break;
case SECONDARY_DATA_6:
switch(tr->rateHetModel)
{
case CAT:
{
makeP(qz, rz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA_6, tr->saveMemory, tr->maxCategories);
newviewGTRCATSECONDARY_6(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
}
break;
case GAMMA:
case GAMMA_I:
{
makeP(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, SECONDARY_DATA_6, tr->saveMemory, tr->maxCategories);
newviewGTRGAMMASECONDARY_6(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
}
break;
default:
assert(0);
}
break;
case SECONDARY_DATA_7:
switch(tr->rateHetModel)
{
case CAT:
{
makeP(qz, rz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA_7, tr->saveMemory, tr->maxCategories);
newviewGTRCATSECONDARY_7(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
}
break;
case GAMMA:
case GAMMA_I:
{
makeP(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, SECONDARY_DATA_7, tr->saveMemory, tr->maxCategories);
newviewGTRGAMMASECONDARY_7(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
}
break;
default:
assert(0);
}
break;
case SECONDARY_DATA:
switch(tr->rateHetModel)
{
case CAT:
{
makeP(qz, rz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA, tr->saveMemory, tr->maxCategories);
newviewGTRCATSECONDARY(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
}
break;
case GAMMA:
case GAMMA_I:
{
makeP(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, SECONDARY_DATA, tr->saveMemory, tr->maxCategories);
newviewGTRGAMMASECONDARY(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
}
break;
default:
assert(0);
}
break;
case GENERIC_32:
switch(tr->rateHetModel)
{
case CAT:
{
makeP_Flex(qz, rz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, states);
newviewFlexCat(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, states);
}
break;
case GAMMA:
case GAMMA_I:
{
makeP_Flex(qz, rz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, states);
newviewFlexGamma(tInfo->tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, states);
}
break;
default:
assert(0);
}
break;
case GENERIC_64:
break;
default:
assert(0);
}
#ifdef _USE_PTHREADS
if(tr->partitionData[model].ascBias && tr->threadID == 0)
#else
if(tr->partitionData[model].ascBias)
#endif
{
switch(tr->rateHetModel)
{
case CAT:
{
double
rates = 1.0;
//need to re-calculate transition probabilities assuming a rate of 1.0
makeP_Flex(qz, rz, &rates, tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
1, left, right, states);
newviewAscCat(tInfo->tipCase,
x1_ascColumn, x2_ascColumn, x3_ascColumn,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector, ex3_asc,
states, left, right, states, tr->partitionData[model].dataType,
tInfo->qNumber, tInfo->rNumber, tr->mxtips);
}
break;
case GAMMA:
newviewAscGamma(tInfo->tipCase,
x1_ascColumn, x2_ascColumn, x3_ascColumn,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3_asc,
states, left, right, states, tr->partitionData[model].dataType,
tInfo->qNumber, tInfo->rNumber, tr->mxtips);
break;
default:
assert(0);
}
}
if(tr->useFastScaling)
{
tr->partitionData[model].globalScaler[tInfo->pNumber] =
tr->partitionData[model].globalScaler[tInfo->qNumber] +
tr->partitionData[model].globalScaler[tInfo->rNumber] +
(unsigned int)scalerIncrement;
assert(tr->partitionData[model].globalScaler[tInfo->pNumber] < INT_MAX);
}
}
}
#ifdef _DEBUG_MULTI_EPA
printf("\n");
#endif
}
}
void newviewGeneric (tree *tr, nodeptr p)
{
if(isTip(p->number, tr->mxtips))
return;
tr->td[0].count = 1;
computeTraversalInfo(tr, p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, tr->numBranches);
if(tr->td[0].count > 1)
{
#ifdef _USE_PTHREADS
masterBarrier(THREAD_NEWVIEW, tr);
#else
newviewIterative(tr);
#endif
}
}
void newviewGenericMasked(tree *tr, nodeptr p)
{
if(isTip(p->number, tr->mxtips))
return;
{
int i;
for(i = 0; i < tr->NumberOfModels; i++)
{
if(tr->partitionConverged[i])
tr->executeModel[i] = FALSE;
else
tr->executeModel[i] = TRUE;
}
{
tr->td[0].count = 1;
computeTraversalInfo(tr, p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, tr->numBranches);
if(tr->td[0].count > 1)
{
#ifdef _USE_PTHREADS
masterBarrier(THREAD_NEWVIEW_MASKED, tr);
#else
newviewIterative(tr);
#endif
}
}
for(i = 0; i < tr->NumberOfModels; i++)
tr->executeModel[i] = TRUE;
}
}
#ifdef _USE_PTHREADS
void newviewMultiGrain(tree *tr, double *x1, double *x2, double *x3, int *_ex1, int *_ex2, int *_ex3, unsigned char *_tipX1, unsigned char *_tipX2,
int tipCase, double *_pz, double *_qz, int insertion)
{
int
scalerIncrement = 0,
model,
columnCounter = 0,
offsetCounter = 0;
setPartitionMask(tr, insertion, tr->executeModel);
#ifdef _DEBUG_MULTI_EPA
if(tr->threadID == THREAD_TO_DEBUG)
printf("NV %s: ", tr->nameList[tr->inserts[insertion]]);
#endif
for(model = 0; model < tr->NumberOfModels; model++)
{
int
width = tr->partitionData[model].upper - tr->partitionData[model].lower;
#ifdef _DEBUG_MULTI_EPA
if(tr->threadID == THREAD_TO_DEBUG)
printf("%d", tr->executeModel[model]);
#endif
if(tr->executeModel[model])
{
double
*x1_start = (double*)NULL,
*x2_start = (double*)NULL,
*x3_start = (double*)NULL,
*left = tr->partitionData[model].left,
*right = tr->partitionData[model].right,
pz, qz;
int
*wgt = &tr->contiguousWgt[columnCounter],
*rateCategory = &tr->contiguousRateCategory[columnCounter],
*ex1 = (int*)NULL,
*ex2 = (int*)NULL,
*ex3 = (int*)NULL;
unsigned char
*tipX1 = (unsigned char *)NULL,
*tipX2 = (unsigned char *)NULL;
//TODO fix this or not?
unsigned int
x1_presenceMap = ~0,
x2_presenceMap = ~0;
switch(tipCase)
{
case TIP_TIP:
tipX1 = &_tipX1[columnCounter];
tipX2 = &_tipX2[columnCounter];
ex3 = &_ex3[columnCounter];
x3_start = &x3[offsetCounter];
if(!tr->useFastScaling)
{
int k;
for(k = 0; k < width; k++)
ex3[k] = 0;
}
break;
case TIP_INNER:
tipX1 = &_tipX1[columnCounter];
ex2 = &_ex2[columnCounter];
x2_start = &x2[offsetCounter];
ex3 = &_ex3[columnCounter];
x3_start = &x3[offsetCounter];
if(!tr->useFastScaling)
{
int k;
for(k = 0; k < width; k++)
ex3[k] = ex2[k];
}
break;
case INNER_INNER:
ex1 = &_ex1[columnCounter];
x1_start = &x1[offsetCounter];
ex2 = &_ex2[columnCounter];
x2_start = &x2[offsetCounter];
ex3 = &_ex3[columnCounter];
x3_start = &x3[offsetCounter];
if(!tr->useFastScaling)
{
int k;
for(k = 0; k < width; k++)
ex3[k] = ex1[k] + ex2[k];
}
break;
default:
assert(0);
}
if(tr->multiBranch)
{
pz = _pz[model];
pz = (pz > zmin) ? log(pz) : log(zmin);
qz = _qz[model];
qz = (qz > zmin) ? log(qz) : log(zmin);
}
else
{
pz = _pz[0];
pz = (pz > zmin) ? log(pz) : log(zmin);
qz = _qz[0];
qz = (qz > zmin) ? log(qz) : log(zmin);
}
switch(tr->partitionData[model].dataType)
{
case BINARY_DATA:
switch(tr->rateHetModel)
{
case CAT:
makeP(pz, qz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI,
tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories,
left, right, BINARY_DATA, tr->saveMemory, tr->maxCategories);
newviewGTRCAT_BINARY(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling
);
break;
case GAMMA:
case GAMMA_I:
makeP(pz, qz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI, tr->partitionData[model].EIGN,
4, left, right, BINARY_DATA, tr->saveMemory, tr->maxCategories);
newviewGTRGAMMA_BINARY(tipCase,
x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
break;
default:
assert(0);
}
break;
case DNA_DATA:
switch(tr->rateHetModel)
{
case CAT:
makeP(pz, qz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI,
tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories,
left, right, DNA_DATA, tr->saveMemory, tr->maxCategories);
#ifdef __AVX
newviewGTRCAT_AVX(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#else
newviewGTRCAT(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling
);
#endif
break;
case GAMMA:
case GAMMA_I:
makeP(pz, qz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI, tr->partitionData[model].EIGN,
4, left, right, DNA_DATA, tr->saveMemory, tr->maxCategories);
#ifdef __AVX
newviewGTRGAMMA_AVX(tipCase,
x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap);
#else
newviewGTRGAMMA(tipCase,
x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap);
#endif
break;
default:
assert(0);
}
break;
case AA_DATA:
switch(tr->rateHetModel)
{
case CAT:
makeP(pz, qz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, AA_DATA, tr->saveMemory, tr->maxCategories);
#ifdef __AVX
newviewGTRCATPROT_AVX(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#else
newviewGTRCATPROT(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#endif
break;
case GAMMA:
case GAMMA_I:
makeP(pz, qz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, AA_DATA, tr->saveMemory, tr->maxCategories);
#ifdef __AVX
newviewGTRGAMMAPROT_AVX(tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#else
newviewGTRGAMMAPROT(tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
#endif
break;
default:
assert(0);
}
break;
case SECONDARY_DATA:
switch(tr->rateHetModel)
{
case CAT:
makeP(pz, qz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA, tr->saveMemory, tr->maxCategories);
newviewGTRCATSECONDARY(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
break;
case GAMMA:
case GAMMA_I:
makeP(pz, qz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, SECONDARY_DATA, tr->saveMemory, tr->maxCategories);
newviewGTRGAMMASECONDARY(tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
break;
default:
assert(0);
}
break;
case SECONDARY_DATA_6:
switch(tr->rateHetModel)
{
case CAT:
makeP(pz, qz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA_6, tr->saveMemory, tr->maxCategories);
newviewGTRCATSECONDARY_6(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
break;
case GAMMA:
case GAMMA_I:
makeP(pz, qz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, SECONDARY_DATA_6, tr->saveMemory, tr->maxCategories);
newviewGTRGAMMASECONDARY_6(tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
break;
default:
assert(0);
}
break;
case SECONDARY_DATA_7:
switch(tr->rateHetModel)
{
case CAT:
makeP(pz, qz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA_7, tr->saveMemory, tr->maxCategories);
newviewGTRCATSECONDARY_7(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
break;
case GAMMA:
case GAMMA_I:
makeP(pz, qz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, SECONDARY_DATA_7, tr->saveMemory, tr->maxCategories);
newviewGTRGAMMASECONDARY_7(tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling);
break;
default:
assert(0);
}
break;
case GENERIC_32:
{
int
states = tr->partitionData[model].states;
switch(tr->rateHetModel)
{
case CAT:
{
makeP_Flex(pz, qz, tr->partitionData[model].perSiteRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
tr->partitionData[model].numberOfCategories, left, right, states);
newviewFlexCat(tipCase, tr->partitionData[model].EV, rateCategory,
x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, states);
}
break;
case GAMMA:
case GAMMA_I:
{
makeP_Flex(pz, qz, tr->partitionData[model].gammaRates,
tr->partitionData[model].EI,
tr->partitionData[model].EIGN,
4, left, right, states);
newviewFlexGamma(tipCase,
x1_start, x2_start, x3_start,
tr->partitionData[model].EV,
tr->partitionData[model].tipVector,
ex3, tipX1, tipX2,
width, left, right, wgt, &scalerIncrement, tr->useFastScaling, states);
}
break;
default:
assert(0);
}
}
break;
default:
assert(0);
}
}
columnCounter += width;
offsetCounter += width * tr->partitionData[model].states * tr->discreteRateCategories;
}
#ifdef _DEBUG_MULTI_EPA
if(tr->threadID == THREAD_TO_DEBUG)
printf("\n");
#endif
resetPartitionMask(tr, tr->executeModel);
}
void newviewClassify(tree *tr, branchInfo *b, double *z, int insertion)
{
int
leftNumber = b->epa->leftNodeNumber,
rightNumber = b->epa->rightNodeNumber,
tipCase = -1,
*ex1 = (int*)NULL,
*ex2 = (int*)NULL,
*ex3 = tr->temporaryScaling;
double
*x1_start = (double*)NULL,
*x2_start = (double*)NULL,
*x3_start = tr->temporaryVector;
unsigned char
*tipX1 = (unsigned char*)NULL,
*tipX2 = (unsigned char*)NULL;
if (isTip(leftNumber, tr->mxtips) && isTip(rightNumber, tr->mxtips))
{
tipCase = TIP_TIP;
tipX1 = tr->contiguousTips[leftNumber];
tipX2 = tr->contiguousTips[rightNumber];
}
else
{
if (isTip(leftNumber, tr->mxtips))
{
tipCase = TIP_INNER;
tipX1 = tr->contiguousTips[leftNumber];
x2_start = b->epa->right;
ex2 = b->epa->rightScaling;
}
else
{
if(isTip(rightNumber, tr->mxtips))
{
tipCase = TIP_INNER;
tipX1 = tr->contiguousTips[rightNumber];
x2_start = b->epa->left;
ex2 = b->epa->leftScaling;
}
else
{
tipCase = INNER_INNER;
x1_start = b->epa->left;
ex1 = b->epa->leftScaling;
x2_start = b->epa->right;
ex2 = b->epa->rightScaling;
}
}
}
newviewMultiGrain(tr, x1_start, x2_start, x3_start, ex1, ex2, ex3, tipX1, tipX2,
tipCase, z, z, insertion);
}
#endif