/*  RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees
 *  Copyright August 2006 by Alexandros Stamatakis
 *
 *  Partially derived from
 *  fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
 *
 *  and
 *
 *  Programs of the PHYLIP package by Joe Felsenstein.
 *
 *  This program is free software; you may redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  For any other enquiries send an Email to Alexandros Stamatakis
 *  Alexandros.Stamatakis@epfl.ch
 *
 *  When publishing work that is based on the results from RAxML-VI-HPC please cite:
 *
 *  Alexandros Stamatakis: "RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses
 *  with thousands of taxa and mixed models".
 *  Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
 */

#ifndef WIN32
#include <unistd.h>
#endif

#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>

#include "axml.h"

#ifdef __SIM_SSE3

#include <stdint.h>
#include <xmmintrin.h>
#include <pmmintrin.h>

/* all-ones mask except for the sign bit of each double lane: ANDing an
   __m128d with this computes fabs() of both lanes in one instruction */

const union __attribute__ ((aligned (BYTE_ALIGNMENT)))
{
  uint64_t i[2];
  __m128d m;
} absMask = {{0x7fffffffffffffffULL, 0x7fffffffffffffffULL}};

#endif

#ifdef _USE_PTHREADS
#include <pthread.h>
extern volatile int NumberOfThreads;
extern pthread_mutex_t mutex;
#endif

extern const unsigned int mask32[32];

/* gap bit-vectors: bit pos of x is set iff the column at site pos consists
   entirely of gap/undetermined characters in the respective subtree (used by
   the memory-saving *_SAVE kernels); mask32[k] selects bit k of a word */

boolean isGap(unsigned int *x, int pos)
{
  return (x[pos / 32] & mask32[pos % 32]);
}

boolean noGap(unsigned int *x, int pos)
{
  return (!(x[pos / 32] & mask32[pos % 32]));
}

/* compute the P-matrix terms for the two branches z1 and z2, one
   numStates x numStates block per rate category: column 0 is 1.0 (the zero
   eigenvalue of the rate matrix; EIGN holds only the numStates - 1 non-zero
   eigenvalues), column k is exp(r_c * EIGN[k-1] * z) times the
   inverse-eigenvector entries EI.  The multiplication with the eigenvectors
   (extEV) happens later, inside the newview functions. */

void makeP_Flex(double z1, double z2, double *rptr, double *EI, double *EIGN,
		int numberOfCategories, double *left, double *right, const int numStates)
{
  int
    i, j, k;

  const int
    rates = numStates - 1,
    statesSquare = numStates * numStates;

  double
    lz1[64], lz2[64], d1[64], d2[64];

  assert(numStates <= 64);

  for(i = 0; i < rates; i++)
    {
      lz1[i] = EIGN[i] * z1;
      lz2[i] = EIGN[i] * z2;
    }

  for(i = 0; i < numberOfCategories; i++)
    {
      for(j = 0; j < rates; j++)
	{
	  d1[j] = EXP (rptr[i] * lz1[j]);
	  d2[j] = EXP (rptr[i] * lz2[j]);
	}

      for(j = 0; j < numStates; j++)
	{
	  left[statesSquare * i + numStates * j]  = 1.0;
	  right[statesSquare * i + numStates * j] = 1.0;

	  for(k = 1; k < numStates; k++)
	    {
	      left[statesSquare * i + numStates * j + k]  = d1[k-1] * EI[rates * j + (k-1)];
	      right[statesSquare * i + numStates * j + k] = d2[k-1] * EI[rates * j + (k-1)];
	    }
	}
    }
}

#ifdef _HET

/* as makeP_Flex, but with separate eigen-decompositions (EI_1/EIGN_1 and
   EI_2/EIGN_2) for the two branches: the heterotachy variant */

static void makeP_FlexHet(double z1, double z2, double *rptr,
			  double *EI_1, double *EIGN_1, double *EI_2, double *EIGN_2,
			  int numberOfCategories, double *left, double *right, const int numStates)
{
  int
    i, j, k;

  const int
    rates = numStates - 1,
    statesSquare = numStates * numStates;

  double
    lz1[64], lz2[64], d1[64], d2[64];

  assert(numStates <= 64);

  for(i = 0; i < rates; i++)
    {
      lz1[i] = EIGN_1[i] * z1;
      lz2[i] = EIGN_2[i] * z2;
    }

  for(i = 0; i < numberOfCategories; i++)
    {
      for(j = 0; j < rates; j++)
	{
	  d1[j] = EXP (rptr[i] * lz1[j]);
	  d2[j] = EXP (rptr[i] * lz2[j]);
	}

      for(j = 0; j < numStates; j++)
	{
	  left[statesSquare * i + numStates * j]  = 1.0;
	  right[statesSquare * i + numStates * j] = 1.0;

	  for(k = 1; k < numStates; k++)
	    {
	      left[statesSquare * i + numStates * j + k]  = d1[k-1] * EI_1[rates * j + (k-1)];
	      right[statesSquare * i + numStates * j + k] = d2[k-1] *
EI_2[rates * j + (k-1)]; } } } } #endif void makeP_FlexLG4(double z1, double z2, double *rptr, double *EI[4], double *EIGN[4], int numberOfCategories, double *left, double *right, const int numStates) { int i, j, k; const int rates = numStates - 1, statesSquare = numStates * numStates; double d1[64], d2[64]; assert(numStates <= 64); for(i = 0; i < numberOfCategories; i++) { for(j = 0; j < rates; j++) { d1[j] = EXP (rptr[i] * EIGN[i][j] * z1); d2[j] = EXP (rptr[i] * EIGN[i][j] * z2); } for(j = 0; j < numStates; j++) { left[statesSquare * i + numStates * j] = 1.0; right[statesSquare * i + numStates * j] = 1.0; for(k = 1; k < numStates; k++) { left[statesSquare * i + numStates * j + k] = d1[k-1] * EI[i][rates * j + (k-1)]; right[statesSquare * i + numStates * j + k] = d2[k-1] * EI[i][rates * j + (k-1)]; } } } } static void newviewFlexCat(int tipCase, double *extEV, int *cptr, double *x1, double *x2, double *x3, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, const int numStates) { double *le, *ri, *v, *vl, *vr, ump_x1, ump_x2, x1px2; int i, l, j, scale, addScale = 0; const int statesSquare = numStates * numStates; switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { le = &left[cptr[i] * statesSquare]; ri = &right[cptr[i] * statesSquare]; vl = &(tipVector[numStates * tipX1[i]]); vr = &(tipVector[numStates * tipX2[i]]); v = &x3[numStates * i]; for(l = 0; l < numStates; l++) v[l] = 0.0; for(l = 0; l < numStates; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < numStates; j++) { ump_x1 += vl[j] * le[l * numStates + j]; ump_x2 += vr[j] * ri[l * numStates + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[l * numStates + j]; } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { le = &left[cptr[i] * statesSquare]; ri = &right[cptr[i] * statesSquare]; vl = &(tipVector[numStates * tipX1[i]]); vr = &x2[numStates * i]; v = &x3[numStates * i]; for(l = 0; l < numStates; l++) v[l] = 0.0; for(l = 0; l < numStates; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < numStates; j++) { ump_x1 += vl[j] * le[l * numStates + j]; ump_x2 += vr[j] * ri[l * numStates + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[l * numStates + j]; } scale = 1; for(l = 0; scale && (l < numStates); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < numStates; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for(i = 0; i < n; i++) { le = &left[cptr[i] * statesSquare]; ri = &right[cptr[i] * statesSquare]; vl = &x1[numStates * i]; vr = &x2[numStates * i]; v = &x3[numStates * i]; for(l = 0; l < numStates; l++) v[l] = 0.0; for(l = 0; l < numStates; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < numStates; j++) { ump_x1 += vl[j] * le[l * numStates + j]; ump_x2 += vr[j] * ri[l * numStates + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[l * numStates + j]; } scale = 1; for(l = 0; scale && (l < numStates); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < numStates; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewFlexGamma(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector, int 
*ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, const int numStates) { double *v; double x1px2; int i, j, l, k, scale, addScale = 0; double *vl, *vr, al, ar; const int statesSquare = numStates * numStates, gammaStates = 4 * numStates; switch(tipCase) { case TIP_TIP: { for(i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[numStates * tipX1[i]]); vr = &(tipVector[numStates * tipX2[i]]); v = &(x3[gammaStates * i + numStates * k]); for(l = 0; l < numStates; l++) v[l] = 0; for(l = 0; l < numStates; l++) { al = 0.0; ar = 0.0; for(j = 0; j < numStates; j++) { al += vl[j] * left[k * statesSquare + l * numStates + j]; ar += vr[j] * right[k * statesSquare + l * numStates + j]; } x1px2 = al * ar; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[numStates * l + j]; } } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[numStates * tipX1[i]]); vr = &(x2[gammaStates * i + numStates * k]); v = &(x3[gammaStates * i + numStates * k]); for(l = 0; l < numStates; l++) v[l] = 0; for(l = 0; l < numStates; l++) { al = 0.0; ar = 0.0; for(j = 0; j < numStates; j++) { al += vl[j] * left[k * statesSquare + l * numStates + j]; ar += vr[j] * right[k * statesSquare + l * numStates + j]; } x1px2 = al * ar; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[numStates * l + j]; } } v = &x3[gammaStates * i]; scale = 1; for(l = 0; scale && (l < gammaStates); l++) scale = (ABS(v[l]) < minlikelihood); if(scale) { for(l = 0; l < gammaStates; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(x1[gammaStates * i + numStates * k]); vr = &(x2[gammaStates * i + numStates * k]); v = &(x3[gammaStates * i + numStates * k]); for(l = 0; l < numStates; l++) v[l] = 0; for(l = 0; l < numStates; l++) { al = 0.0; ar = 0.0; for(j = 0; j < numStates; j++) { al += vl[j] * left[k * statesSquare + l * numStates + j]; ar += vr[j] * right[k * statesSquare + l * numStates + j]; } x1px2 = al * ar; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[numStates * l + j]; } } v = &(x3[gammaStates * i]); scale = 1; for(l = 0; scale && (l < gammaStates); l++) scale = ((ABS(v[l]) < minlikelihood)); if (scale) { for(l = 0; l < gammaStates; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewAscCat(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector, int *ex3, const int n, double *left, double *right, const int numStates, int dataType, const int qNumber, const int rNumber, int *ascMissingVector, int maxtips) { double *le, *ri, *v, *vl, *vr, ump_x1, ump_x2, x1px2; int i, l, j, scale; switch(tipCase) { case TIP_TIP: { unsigned char tip1[32], tip2[32]; ascertainmentBiasSequence(tip1, numStates, dataType, qNumber, ascMissingVector); ascertainmentBiasSequence(tip2, numStates, dataType, rNumber, ascMissingVector); assert(qNumber <= maxtips && rNumber <= maxtips); for (i = 0; i < n; i++) { le = &left[0]; ri = &right[0]; vl = &(tipVector[numStates * tip1[i]]); vr = &(tipVector[numStates * tip2[i]]); v = &x3[numStates * i]; for(l = 0; l < numStates; l++) v[l] = 0.0; for(l = 0; l < numStates; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < numStates; j++) { ump_x1 += vl[j] * le[l * numStates + j]; ump_x2 += vr[j] * ri[l * 
numStates + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[l * numStates + j]; } } } break; case TIP_INNER: { unsigned char tip[32]; if(rNumber <= maxtips) ascertainmentBiasSequence(tip, numStates, dataType, rNumber, ascMissingVector); else ascertainmentBiasSequence(tip, numStates, dataType, qNumber, ascMissingVector); for (i = 0; i < n; i++) { le = &left[0]; ri = &right[0]; vl = &(tipVector[numStates * tip[i]]); vr = &x2[numStates * i]; v = &x3[numStates * i]; for(l = 0; l < numStates; l++) v[l] = 0.0; for(l = 0; l < numStates; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < numStates; j++) { ump_x1 += vl[j] * le[l * numStates + j]; ump_x2 += vr[j] * ri[l * numStates + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[l * numStates + j]; } scale = 1; for(l = 0; scale && (l < numStates); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < numStates; l++) v[l] *= twotothe256; ex3[i] += 1; } } } break; case INNER_INNER: for(i = 0; i < n; i++) { le = &left[0]; ri = &right[0]; vl = &x1[numStates * i]; vr = &x2[numStates * i]; v = &x3[numStates * i]; for(l = 0; l < numStates; l++) v[l] = 0.0; for(l = 0; l < numStates; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < numStates; j++) { ump_x1 += vl[j] * le[l * numStates + j]; ump_x2 += vr[j] * ri[l * numStates + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[l * numStates + j]; } scale = 1; for(l = 0; scale && (l < numStates); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < numStates; l++) v[l] *= twotothe256; ex3[i] += 1; } } break; default: assert(0); } } static void newviewAscGamma(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector, int *ex3, const int n, double *left, double *right, const int numStates, int dataType, const int qNumber, const int rNumber, int *ascMissingVector, int maxtips) { int i, j, l, k, scale; const int statesSquare = numStates * numStates, gammaStates = 4 * numStates; double *vl, *vr, al, ar, *v, x1px2; switch(tipCase) { case TIP_TIP: { unsigned char tip1[32], tip2[32]; ascertainmentBiasSequence(tip1, numStates, dataType, qNumber, ascMissingVector); ascertainmentBiasSequence(tip2, numStates, dataType, rNumber, ascMissingVector); assert(qNumber <= maxtips && rNumber <= maxtips); for(i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[numStates * tip1[i]]); vr = &(tipVector[numStates * tip2[i]]); v = &(x3[gammaStates * i + numStates * k]); for(l = 0; l < numStates; l++) v[l] = 0; for(l = 0; l < numStates; l++) { al = 0.0; ar = 0.0; for(j = 0; j < numStates; j++) { al += vl[j] * left[k * statesSquare + l * numStates + j]; ar += vr[j] * right[k * statesSquare + l * numStates + j]; } x1px2 = al * ar; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[numStates * l + j]; } } } } break; case TIP_INNER: { unsigned char tip[32]; if(rNumber <= maxtips) ascertainmentBiasSequence(tip, numStates, dataType, rNumber, ascMissingVector); else ascertainmentBiasSequence(tip, numStates, dataType, qNumber, ascMissingVector); for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[numStates * tip[i]]); vr = &(x2[gammaStates * i + numStates * k]); v = &(x3[gammaStates * i + numStates * k]); for(l = 0; l < numStates; l++) v[l] = 0; for(l = 0; l < numStates; l++) { al = 0.0; ar = 0.0; for(j = 0; j < numStates; j++) { al += vl[j] * left[k * statesSquare + l * numStates + j]; ar += vr[j] * 
right[k * statesSquare + l * numStates + j]; } x1px2 = al * ar; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[numStates * l + j]; } } v = &x3[gammaStates * i]; scale = 1; for(l = 0; scale && (l < gammaStates); l++) scale = (ABS(v[l]) < minlikelihood); if(scale) { for(l = 0; l < gammaStates; l++) v[l] *= twotothe256; ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(x1[gammaStates * i + numStates * k]); vr = &(x2[gammaStates * i + numStates * k]); v = &(x3[gammaStates * i + numStates * k]); for(l = 0; l < numStates; l++) v[l] = 0; for(l = 0; l < numStates; l++) { al = 0.0; ar = 0.0; for(j = 0; j < numStates; j++) { al += vl[j] * left[k * statesSquare + l * numStates + j]; ar += vr[j] * right[k * statesSquare + l * numStates + j]; } x1px2 = al * ar; for(j = 0; j < numStates; j++) v[j] += x1px2 * extEV[numStates * l + j]; } } v = &(x3[gammaStates * i]); scale = 1; for(l = 0; scale && (l < gammaStates); l++) scale = ((ABS(v[l]) < minlikelihood)); if(scale) { for(l = 0; l < gammaStates; l++) v[l] *= twotothe256; ex3[i] += 1; } } break; default: assert(0); } } static void makeP(double z1, double z2, double *rptr, double *EI, double *EIGN, int numberOfCategories, double *left, double *right, int data, boolean saveMem, int maxCat) { int i, j, k; switch(data) { case BINARY_DATA: { double d1, d2; for(i = 0; i < numberOfCategories; i++) { d1 = EXP(rptr[i] * EIGN[0] * z1); d2 = EXP(rptr[i] * EIGN[0] * z2); for(j = 0; j < 2; j++) { left[i * 4 + j * 2] = 1.0; right[i * 4 + j * 2] = 1.0; left[i * 4 + j * 2 + 1] = d1 * EI[j]; right[i * 4 + j * 2 + 1] = d2 * EI[j]; } } } break; case DNA_DATA: { #ifdef __SIM_SSE3 double d1[4] __attribute__ ((aligned (BYTE_ALIGNMENT))), d2[4] __attribute__ ((aligned (BYTE_ALIGNMENT))), ez1[3], ez2[3], EI_16[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); for(j = 0; j < 4; j++) { EI_16[j * 4] = 1.0; for(k = 0; k < 3; k++) EI_16[j * 4 + k + 1] = EI[3 * j + k]; } for(j = 0; j < 3; j++) { ez1[j] = EIGN[j] * z1; ez2[j] = EIGN[j] * z2; } for(i = 0; i < numberOfCategories; i++) { __m128d d1_0, d1_1, d2_0, d2_1; d1[0] = 1.0; d2[0] = 1.0; for(j = 0; j < 3; j++) { d1[j+1] = EXP(rptr[i] * ez1[j]); d2[j+1] = EXP(rptr[i] * ez2[j]); } d1_0 = _mm_load_pd(&d1[0]); d1_1 = _mm_load_pd(&d1[2]); d2_0 = _mm_load_pd(&d2[0]); d2_1 = _mm_load_pd(&d2[2]); for(j = 0; j < 4; j++) { double *ll = &left[i * 16 + j * 4]; double *rr = &right[i * 16 + j * 4]; __m128d eev = _mm_load_pd(&EI_16[4 * j]); _mm_store_pd(&ll[0], _mm_mul_pd(d1_0, eev)); _mm_store_pd(&rr[0], _mm_mul_pd(d2_0, eev)); eev = _mm_load_pd(&EI_16[4 * j + 2]); _mm_store_pd(&ll[2], _mm_mul_pd(d1_1, eev)); _mm_store_pd(&rr[2], _mm_mul_pd(d2_1, eev)); } } if(saveMem) { i = maxCat; { __m128d d1_0, d1_1, d2_0, d2_1; d1[0] = 1.0; d2[0] = 1.0; for(j = 0; j < 3; j++) { d1[j+1] = EXP(ez1[j]); d2[j+1] = EXP(ez2[j]); } d1_0 = _mm_load_pd(&d1[0]); d1_1 = _mm_load_pd(&d1[2]); d2_0 = _mm_load_pd(&d2[0]); d2_1 = _mm_load_pd(&d2[2]); for(j = 0; j < 4; j++) { double *ll = &left[i * 16 + j * 4]; double *rr = &right[i * 16 + j * 4]; __m128d eev = _mm_load_pd(&EI_16[4 * j]); _mm_store_pd(&ll[0], _mm_mul_pd(d1_0, eev)); _mm_store_pd(&rr[0], _mm_mul_pd(d2_0, eev)); eev = _mm_load_pd(&EI_16[4 * j + 2]); _mm_store_pd(&ll[2], _mm_mul_pd(d1_1, eev)); _mm_store_pd(&rr[2], _mm_mul_pd(d2_1, eev)); } } } #else double d1[3], d2[3]; for(i = 0; i < numberOfCategories; i++) { for(j = 0; j < 3; j++) { d1[j] = EXP(rptr[i] * EIGN[j] * z1); d2[j] = EXP(rptr[i] * EIGN[j] * z2); } for(j = 0; j < 4; j++) { left[i * 16 + j * 
4] = 1.0; right[i * 16 + j * 4] = 1.0; for(k = 0; k < 3; k++) { left[i * 16 + j * 4 + k + 1] = d1[k] * EI[3 * j + k]; right[i * 16 + j * 4 + k + 1] = d2[k] * EI[3 * j + k]; } } } #endif } break; case SECONDARY_DATA: { double lz1[15], lz2[15], d1[15], d2[15]; for(i = 0; i < 15; i++) { lz1[i] = EIGN[i] * z1; lz2[i] = EIGN[i] * z2; } for(i = 0; i < numberOfCategories; i++) { for(j = 0; j < 15; j++) { d1[j] = EXP (rptr[i] * lz1[j]); d2[j] = EXP (rptr[i] * lz2[j]); } for(j = 0; j < 16; j++) { left[256 * i + 16 * j] = 1.0; right[256 * i + 16 * j] = 1.0; for(k = 1; k < 16; k++) { left[256 * i + 16 * j + k] = d1[k-1] * EI[15 * j + (k-1)]; right[256 * i + 16 * j + k] = d2[k-1] * EI[15 * j + (k-1)]; } } } } break; case SECONDARY_DATA_6: { double lz1[5], lz2[5], d1[5], d2[5]; for(i = 0; i < 5; i++) { lz1[i] = EIGN[i] * z1; lz2[i] = EIGN[i] * z2; } for(i = 0; i < numberOfCategories; i++) { for(j = 0; j < 5; j++) { d1[j] = EXP (rptr[i] * lz1[j]); d2[j] = EXP (rptr[i] * lz2[j]); } for(j = 0; j < 6; j++) { left[36 * i + 6 * j] = 1.0; right[36 * i + 6 * j] = 1.0; for(k = 1; k < 6; k++) { left[36 * i + 6 * j + k] = d1[k-1] * EI[5 * j + (k-1)]; right[36 * i + 6 * j + k] = d2[k-1] * EI[5 * j + (k-1)]; } } } } break; case SECONDARY_DATA_7: { double lz1[6], lz2[6], d1[6], d2[6]; for(i = 0; i < 6; i++) { lz1[i] = EIGN[i] * z1; lz2[i] = EIGN[i] * z2; } for(i = 0; i < numberOfCategories; i++) { for(j = 0; j < 6; j++) { d1[j] = EXP (rptr[i] * lz1[j]); d2[j] = EXP (rptr[i] * lz2[j]); } for(j = 0; j < 7; j++) { left[49 * i + 7 * j] = 1.0; right[49 * i + 7 * j] = 1.0; for(k = 1; k < 7; k++) { left[49 * i + 7 * j + k] = d1[k-1] * EI[6 * j + (k-1)]; right[49 * i + 7 * j + k] = d2[k-1] * EI[6 * j + (k-1)]; } } } } break; case AA_DATA: { double lz1[19], lz2[19], d1[19], d2[19]; for(i = 0; i < 19; i++) { lz1[i] = EIGN[i] * z1; lz2[i] = EIGN[i] * z2; } for(i = 0; i < numberOfCategories; i++) { for(j = 0; j < 19; j++) { d1[j] = EXP (rptr[i] * lz1[j]); d2[j] = EXP (rptr[i] * lz2[j]); } for(j = 0; j < 20; j++) { left[400 * i + 20 * j] = 1.0; right[400 * i + 20 * j] = 1.0; for(k = 1; k < 20; k++) { left[400 * i + 20 * j + k] = d1[k-1] * EI[19 * j + (k-1)]; right[400 * i + 20 * j + k] = d2[k-1] * EI[19 * j + (k-1)]; } } } if(saveMem) { i = maxCat; for(j = 0; j < 19; j++) { d1[j] = EXP (lz1[j]); d2[j] = EXP (lz2[j]); } for(j = 0; j < 20; j++) { left[400 * i + 20 * j] = 1.0; right[400 * i + 20 * j] = 1.0; for(k = 1; k < 20; k++) { left[400 * i + 20 * j + k] = d1[k-1] * EI[19 * j + (k-1)]; right[400 * i + 20 * j + k] = d2[k-1] * EI[19 * j + (k-1)]; } } } } break; default: assert(0); } } #ifndef __SIM_SSE3 static void newviewGTRCAT_BINARY( int tipCase, double *EV, int *cptr, double *x1_start, double *x2_start, double *x3_start, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *le, *ri, *x1, *x2, *x3; double ump_x1, ump_x2, x1px2[2]; int i, j, k, scale, addScale = 0; switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { x1 = &(tipVector[2 * tipX1[i]]); x2 = &(tipVector[2 * tipX2[i]]); x3 = &x3_start[2 * i]; le = &left[cptr[i] * 4]; ri = &right[cptr[i] * 4]; for(j = 0; j < 2; j++) { ump_x1 = 0.0; ump_x2 = 0.0; for(k = 0; k < 2; k++) { ump_x1 += x1[k] * le[j * 2 + k]; ump_x2 += x2[k] * ri[j * 2 + k]; } x1px2[j] = ump_x1 * ump_x2; } for(j = 0; j < 2; j++) x3[j] = 0.0; for(j = 0; j < 2; j++) for(k = 0; k < 2; k++) x3[k] += x1px2[j] * EV[j * 2 + k]; } } break; case TIP_INNER: { for (i = 0; i < n; i++) { 
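/* TIP_INNER: the left child is a tip, so its conditional vector is a
   precomputed tipVector[] lookup on the tip character code, while the right
   child is a full inner-node vector.  Per site and state j the update is

     x1px2[j] = (sum_k x1[k]*le[j*2+k]) * (sum_k x2[k]*ri[j*2+k]),

   projected back with the eigenvectors EV.  If both entries of the resulting
   x3 fall below minlikelihood, the site is rescaled by 2^256 (twotothe256)
   and the scaling event is recorded, either aggregated over site weights
   (useFastScaling) or per site in ex3[]. */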
x1 = &(tipVector[2 * tipX1[i]]); x2 = &x2_start[2 * i]; x3 = &x3_start[2 * i]; le = &left[cptr[i] * 4]; ri = &right[cptr[i] * 4]; for(j = 0; j < 2; j++) { ump_x1 = 0.0; ump_x2 = 0.0; for(k = 0; k < 2; k++) { ump_x1 += x1[k] * le[j * 2 + k]; ump_x2 += x2[k] * ri[j * 2 + k]; } x1px2[j] = ump_x1 * ump_x2; } for(j = 0; j < 2; j++) x3[j] = 0.0; for(j = 0; j < 2; j++) for(k = 0; k < 2; k++) x3[k] += x1px2[j] * EV[2 * j + k]; scale = 1; for(j = 0; j < 2 && scale; j++) scale = (x3[j] < minlikelihood && x3[j] > minusminlikelihood); if(scale) { for(j = 0; j < 2; j++) x3[j] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { x1 = &x1_start[2 * i]; x2 = &x2_start[2 * i]; x3 = &x3_start[2 * i]; le = &left[cptr[i] * 4]; ri = &right[cptr[i] * 4]; for(j = 0; j < 2; j++) { ump_x1 = 0.0; ump_x2 = 0.0; for(k = 0; k < 2; k++) { ump_x1 += x1[k] * le[j * 2 + k]; ump_x2 += x2[k] * ri[j * 2 + k]; } x1px2[j] = ump_x1 * ump_x2; } for(j = 0; j < 2; j++) x3[j] = 0.0; for(j = 0; j < 2; j++) for(k = 0; k < 2; k++) x3[k] += x1px2[j] * EV[2 * j + k]; scale = 1; for(j = 0; j < 2 && scale; j++) scale = (x3[j] < minlikelihood && x3[j] > minusminlikelihood); if(scale) { for(j = 0; j < 2; j++) x3[j] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #else static void newviewGTRCAT_BINARY( int tipCase, double *EV, int *cptr, double *x1_start, double *x2_start, double *x3_start, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *le, *ri, *x1, *x2, *x3; int i, l, scale, addScale = 0; switch(tipCase) { case TIP_TIP: { for(i = 0; i < n; i++) { x1 = &(tipVector[2 * tipX1[i]]); x2 = &(tipVector[2 * tipX2[i]]); x3 = &x3_start[2 * i]; le = &left[cptr[i] * 4]; ri = &right[cptr[i] * 4]; _mm_store_pd(x3, _mm_setzero_pd()); for(l = 0; l < 2; l++) { __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2])); __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2])); al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); __m128d vv = _mm_load_pd(x3); __m128d EVV = _mm_load_pd(&EV[2 * l]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(x3, vv); } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { x1 = &(tipVector[2 * tipX1[i]]); x2 = &x2_start[2 * i]; x3 = &x3_start[2 * i]; le = &left[cptr[i] * 4]; ri = &right[cptr[i] * 4]; _mm_store_pd(x3, _mm_setzero_pd()); for(l = 0; l < 2; l++) { __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2])); __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2])); al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); __m128d vv = _mm_load_pd(x3); __m128d EVV = _mm_load_pd(&EV[2 * l]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(x3, vv); } __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; __m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; if(scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); __m128d ex3v = _mm_load_pd(x3); _mm_store_pd(x3, _mm_mul_pd(ex3v,twoto)); if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { x1 = &x1_start[2 * i]; x2 = &x2_start[2 * i]; x3 = &x3_start[2 * i]; le = &left[cptr[i] * 4]; ri = &right[cptr[i] 
* 4]; _mm_store_pd(x3, _mm_setzero_pd()); for(l = 0; l < 2; l++) { __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[l * 2])); __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[l * 2])); al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); __m128d vv = _mm_load_pd(x3); __m128d EVV = _mm_load_pd(&EV[2 * l]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(x3, vv); } __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; __m128d v1 = _mm_and_pd(_mm_load_pd(x3), absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; if(scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); __m128d ex3v = _mm_load_pd(x3); _mm_store_pd(x3, _mm_mul_pd(ex3v,twoto)); if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #endif #ifdef __SIM_SSE3 static void newviewGTRGAMMA_BINARY(int tipCase, double *x1_start, double *x2_start, double *x3_start, double *EV, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling ) { double *x1, *x2, *x3; int i, k, l, scale, addScale = 0; switch(tipCase) { case TIP_TIP: for (i = 0; i < n; i++) { x1 = &(tipVector[2 * tipX1[i]]); x2 = &(tipVector[2 * tipX2[i]]); for(k = 0; k < 4; k++) { x3 = &(x3_start[8 * i + 2 * k]); _mm_store_pd(x3, _mm_setzero_pd()); for(l = 0; l < 2; l++) { __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2])); __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2])); al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); __m128d vv = _mm_load_pd(x3); __m128d EVV = _mm_load_pd(&EV[2 * l]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(x3, vv); } } } break; case TIP_INNER: for (i = 0; i < n; i++) { x1 = &(tipVector[2 * tipX1[i]]); for(k = 0; k < 4; k++) { x2 = &(x2_start[8 * i + 2 * k]); x3 = &(x3_start[8 * i + 2 * k]); _mm_store_pd(x3, _mm_setzero_pd()); for(l = 0; l < 2; l++) { __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2])); __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2])); al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); __m128d vv = _mm_load_pd(x3); __m128d EVV = _mm_load_pd(&EV[2 * l]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(x3, vv); } } x3 = &(x3_start[8 * i]); __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 8); l += 2) { __m128d vv = _mm_load_pd(&x3[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } if(scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 8; l+=2) { __m128d ex3v = _mm_load_pd(&x3[l]); _mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto)); } if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; case INNER_INNER: for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { x1 = &(x1_start[8 * i + 2 * k]); x2 = &(x2_start[8 * i + 2 * k]); x3 = &(x3_start[8 * i + 2 * k]); _mm_store_pd(x3, _mm_setzero_pd()); for(l = 0; l < 2; l++) { __m128d al = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&left[k * 4 + l * 2])); __m128d ar = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&right[k * 4 + l * 2])); al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); __m128d vv = _mm_load_pd(x3); __m128d EVV = _mm_load_pd(&EV[2 * 
l]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(x3, vv); } } x3 = &(x3_start[8 * i]); __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 8); l += 2) { __m128d vv = _mm_load_pd(&x3[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } if(scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 8; l+=2) { __m128d ex3v = _mm_load_pd(&x3[l]); _mm_store_pd(&x3[l], _mm_mul_pd(ex3v,twoto)); } if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #else static void newviewGTRGAMMA_BINARY(int tipCase, double *x1_start, double *x2_start, double *x3_start, double *EV, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling ) { double *x1, *x2, *x3; double ump_x1, ump_x2, x1px2[4]; int i, j, k, l, scale, addScale = 0; /* C-OPT figure out if we are at an inner node who has two tips/leaves as descendants TIP_TIP, a tip and another inner node as descendant TIP_INNER, or two inner nodes as descendants INNER_INNER */ switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { x1 = &(tipVector[2 * tipX1[i]]); x2 = &(tipVector[2 * tipX2[i]]); x3 = &x3_start[i * 8]; for(j = 0; j < 8; j++) x3[j] = 0.0; for (j = 0; j < 4; j++) { for (k = 0; k < 2; k++) { ump_x1 = 0.0; ump_x2 = 0.0; for (l=0; l < 2; l++) { ump_x1 += x1[l] * left[ j*4 + k*2 + l]; ump_x2 += x2[l] * right[j*4 + k*2 + l]; } x1px2[k] = ump_x1 * ump_x2; } for(k = 0; k < 2; k++) for (l = 0; l < 2; l++) x3[j * 2 + l] += x1px2[k] * EV[2 * k + l]; } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { x1 = &(tipVector[2 * tipX1[i]]); x2 = &x2_start[i * 8]; x3 = &x3_start[i * 8]; for(j = 0; j < 8; j++) x3[j] = 0.0; for (j = 0; j < 4; j++) { for (k = 0; k < 2; k++) { ump_x1 = 0.0; ump_x2 = 0.0; for (l=0; l < 2; l++) { ump_x1 += x1[l] * left[ j*4 + k*2 + l]; ump_x2 += x2[j*2 + l] * right[j*4 + k*2 + l]; } x1px2[k] = ump_x1 * ump_x2; } for(k = 0; k < 2; k++) for (l = 0; l < 2; l++) x3[j * 2 + l] += x1px2[k] * EV[2 * k + l]; } scale = 1; for(l = 0; scale && (l < 8); l++) scale = (ABS(x3[l]) < minlikelihood); if(scale) { for (l=0; l < 8; l++) x3[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: /* C-OPT here we don't do any pre-computations This should be the most compute intensive loop of the three cases here. 
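Both child vectors are full inner-node vectors that must be read in their
entirety, so no tipVector[] lookup shortcut applies.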
If we have one or two tips as descendants we can take a couple of shortcuts */ for (i = 0; i < n; i++) { x1 = &x1_start[i * 8]; x2 = &x2_start[i * 8]; x3 = &x3_start[i * 8]; for(j = 0; j < 8; j++) x3[j] = 0.0; for (j = 0; j < 4; j++) { for (k = 0; k < 2; k++) { ump_x1 = 0.0; ump_x2 = 0.0; for (l=0; l < 2; l++) { ump_x1 += x1[j*2 + l] * left[ j*4 + k*2 + l]; ump_x2 += x2[j*2 + l] * right[j*4 + k*2 + l]; } x1px2[k] = ump_x1 * ump_x2; } for(k = 0; k < 2; k++) for (l = 0; l < 2; l++) x3[j * 2 + l] += x1px2[k] * EV[2 * k + l]; } scale = 1; for(l = 0; scale && (l < 8); l++) scale = (ABS(x3[l]) < minlikelihood); if(scale) { for (l=0; l<8; l++) x3[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #endif #ifndef __SIM_SSE3 static void newviewGTRCAT( int tipCase, double *EV, int *cptr, double *x1_start, double *x2_start, double *x3_start, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *le, *ri, *x1, *x2, *x3; double ump_x1, ump_x2, x1px2[4]; int i, j, k, scale, addScale = 0; switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { x1 = &(tipVector[4 * tipX1[i]]); x2 = &(tipVector[4 * tipX2[i]]); x3 = &x3_start[4 * i]; le = &left[cptr[i] * 16]; ri = &right[cptr[i] * 16]; for(j = 0; j < 4; j++) { ump_x1 = 0.0; ump_x2 = 0.0; for(k = 0; k < 4; k++) { ump_x1 += x1[k] * le[j * 4 + k]; ump_x2 += x2[k] * ri[j * 4 + k]; } x1px2[j] = ump_x1 * ump_x2; } for(j = 0; j < 4; j++) x3[j] = 0.0; for(j = 0; j < 4; j++) for(k = 0; k < 4; k++) x3[k] += x1px2[j] * EV[j * 4 + k]; } } break; case TIP_INNER: { for (i = 0; i < n; i++) { x1 = &(tipVector[4 * tipX1[i]]); x2 = &x2_start[4 * i]; x3 = &x3_start[4 * i]; le = &left[cptr[i] * 16]; ri = &right[cptr[i] * 16]; for(j = 0; j < 4; j++) { ump_x1 = 0.0; ump_x2 = 0.0; for(k = 0; k < 4; k++) { ump_x1 += x1[k] * le[j * 4 + k]; ump_x2 += x2[k] * ri[j * 4 + k]; } x1px2[j] = ump_x1 * ump_x2; } for(j = 0; j < 4; j++) x3[j] = 0.0; for(j = 0; j < 4; j++) for(k = 0; k < 4; k++) x3[k] += x1px2[j] * EV[4 * j + k]; scale = 1; for(j = 0; j < 4 && scale; j++) scale = (x3[j] < minlikelihood && x3[j] > minusminlikelihood); if(scale) { for(j = 0; j < 4; j++) x3[j] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { x1 = &x1_start[4 * i]; x2 = &x2_start[4 * i]; x3 = &x3_start[4 * i]; le = &left[cptr[i] * 16]; ri = &right[cptr[i] * 16]; for(j = 0; j < 4; j++) { ump_x1 = 0.0; ump_x2 = 0.0; for(k = 0; k < 4; k++) { ump_x1 += x1[k] * le[j * 4 + k]; ump_x2 += x2[k] * ri[j * 4 + k]; } x1px2[j] = ump_x1 * ump_x2; } for(j = 0; j < 4; j++) x3[j] = 0.0; for(j = 0; j < 4; j++) for(k = 0; k < 4; k++) x3[k] += x1px2[j] * EV[4 * j + k]; scale = 1; for(j = 0; j < 4 && scale; j++) scale = (x3[j] < minlikelihood && x3[j] > minusminlikelihood); if(scale) { for(j = 0; j < 4; j++) x3[j] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #else static void newviewGTRCAT_SAVE( int tipCase, double *EV, int *cptr, double *x1_start, double *x2_start, double *x3_start, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, unsigned int *x1_gap, unsigned int *x2_gap, unsigned int 
*x3_gap, double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats) { double *le, *ri, *x1, *x2, *x3, *x1_ptr = x1_start, *x2_ptr = x2_start, *x3_ptr = x3_start, EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); int i, j, scale, scaleGap = 0, addScale = 0; __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ), sc = _mm_set1_pd(twotothe256), EVV[8]; for(i = 0; i < 4; i++) for (j=0; j < 4; j++) EV_t[4 * j + i] = EV[4 * i + j]; for(i = 0; i < 8; i++) EVV[i] = _mm_load_pd(&EV_t[i * 2]); { x1 = x1_gapColumn; x2 = x2_gapColumn; x3 = x3_gapColumn; le = &left[maxCats * 16]; ri = &right[maxCats * 16]; __m128d x1_0 = _mm_load_pd( &x1[0] ); __m128d x1_2 = _mm_load_pd( &x1[2] ); __m128d left_k0_0 = _mm_load_pd( &le[0] ); __m128d left_k0_2 = _mm_load_pd( &le[2] ); __m128d left_k1_0 = _mm_load_pd( &le[4] ); __m128d left_k1_2 = _mm_load_pd( &le[6] ); __m128d left_k2_0 = _mm_load_pd( &le[8] ); __m128d left_k2_2 = _mm_load_pd( &le[10] ); __m128d left_k3_0 = _mm_load_pd( &le[12] ); __m128d left_k3_2 = _mm_load_pd( &le[14] ); left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); __m128d x2_0 = _mm_load_pd( &x2[0] ); __m128d x2_2 = _mm_load_pd( &x2[2] ); __m128d right_k0_0 = _mm_load_pd( &ri[0] ); __m128d right_k0_2 = _mm_load_pd( &ri[2] ); __m128d right_k1_0 = _mm_load_pd( &ri[4] ); __m128d right_k1_2 = _mm_load_pd( &ri[6] ); __m128d right_k2_0 = _mm_load_pd( &ri[8] ); __m128d right_k2_2 = _mm_load_pd( &ri[10] ); __m128d right_k3_0 = _mm_load_pd( &ri[12] ); __m128d right_k3_2 = _mm_load_pd( &ri[14] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); 
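/* EV was transposed into EV_t above precisely so that the four entries of
   the result vector become row-wise dot products,

     x3[l] = sum_j x1px2[j] * EV_t[4*l + j]    (l = 0..3),

   which the _mm_hadd_pd cascade evaluates two doubles at a time instead of
   gathering a column of EV per output entry. */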
EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); if(tipCase != TIP_TIP) { scale = 1; __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; else { v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } if(scale) { _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); scaleGap = TRUE; } else { _mm_store_pd(x3, EV_t_l0_k0); _mm_store_pd(&x3[2], EV_t_l2_k0); } } else { _mm_store_pd(x3, EV_t_l0_k0); _mm_store_pd(&x3[2], EV_t_l2_k0); } } switch(tipCase) { case TIP_TIP: for (i = 0; i < n; i++) { if(noGap(x3_gap, i)) { x1 = &(tipVector[4 * tipX1[i]]); x2 = &(tipVector[4 * tipX2[i]]); x3 = x3_ptr; if(isGap(x1_gap, i)) le = &left[maxCats * 16]; else le = &left[cptr[i] * 16]; if(isGap(x2_gap, i)) ri = &right[maxCats * 16]; else ri = &right[cptr[i] * 16]; __m128d x1_0 = _mm_load_pd( &x1[0] ); __m128d x1_2 = _mm_load_pd( &x1[2] ); __m128d left_k0_0 = _mm_load_pd( &le[0] ); __m128d left_k0_2 = _mm_load_pd( &le[2] ); __m128d left_k1_0 = _mm_load_pd( &le[4] ); __m128d left_k1_2 = _mm_load_pd( &le[6] ); __m128d left_k2_0 = _mm_load_pd( &le[8] ); __m128d left_k2_2 = _mm_load_pd( &le[10] ); __m128d left_k3_0 = _mm_load_pd( &le[12] ); __m128d left_k3_2 = _mm_load_pd( &le[14] ); left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); __m128d x2_0 = _mm_load_pd( &x2[0] ); __m128d x2_2 = _mm_load_pd( &x2[2] ); __m128d right_k0_0 = _mm_load_pd( &ri[0] ); __m128d right_k0_2 = _mm_load_pd( &ri[2] ); __m128d right_k1_0 = _mm_load_pd( &ri[4] ); __m128d right_k1_2 = _mm_load_pd( &ri[6] ); __m128d right_k2_0 = _mm_load_pd( &ri[8] ); __m128d right_k2_2 = _mm_load_pd( &ri[10] ); __m128d right_k3_0 = _mm_load_pd( &ri[12] ); __m128d right_k3_2 = _mm_load_pd( &ri[14] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d 
EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); _mm_store_pd(x3, EV_t_l0_k0); _mm_store_pd(&x3[2], EV_t_l2_k0); x3_ptr += 4; } } break; case TIP_INNER: for (i = 0; i < n; i++) { if(isGap(x3_gap, i)) { if(scaleGap) { if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } else { x1 = &(tipVector[4 * tipX1[i]]); x2 = x2_ptr; x3 = x3_ptr; if(isGap(x1_gap, i)) le = &left[maxCats * 16]; else le = &left[cptr[i] * 16]; if(isGap(x2_gap, i)) { ri = &right[maxCats * 16]; x2 = x2_gapColumn; } else { ri = &right[cptr[i] * 16]; x2 = x2_ptr; x2_ptr += 4; } __m128d x1_0 = _mm_load_pd( &x1[0] ); __m128d x1_2 = _mm_load_pd( &x1[2] ); __m128d left_k0_0 = _mm_load_pd( &le[0] ); __m128d left_k0_2 = _mm_load_pd( &le[2] ); __m128d left_k1_0 = _mm_load_pd( &le[4] ); __m128d left_k1_2 = _mm_load_pd( &le[6] ); __m128d left_k2_0 = _mm_load_pd( &le[8] ); __m128d left_k2_2 = _mm_load_pd( &le[10] ); __m128d left_k3_0 = _mm_load_pd( &le[12] ); __m128d left_k3_2 = _mm_load_pd( &le[14] ); left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); __m128d x2_0 = _mm_load_pd( &x2[0] ); __m128d x2_2 = _mm_load_pd( &x2[2] ); __m128d right_k0_0 = _mm_load_pd( &ri[0] ); __m128d right_k0_2 = _mm_load_pd( &ri[2] ); __m128d right_k1_0 = _mm_load_pd( &ri[4] ); __m128d right_k1_2 = _mm_load_pd( &ri[6] ); __m128d right_k2_0 = _mm_load_pd( &ri[8] ); __m128d right_k2_2 = _mm_load_pd( &ri[10] ); __m128d right_k3_0 = _mm_load_pd( &ri[12] ); __m128d right_k3_2 = _mm_load_pd( &ri[14] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); __m128d EV_t_l0_k0 = 
EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); scale = 1; __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; else { v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } if(scale) { _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } else { _mm_store_pd(x3, EV_t_l0_k0); _mm_store_pd(&x3[2], EV_t_l2_k0); } x3_ptr += 4; } } break; case INNER_INNER: for (i = 0; i < n; i++) { if(isGap(x3_gap, i)) { if(scaleGap) { if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } else { x3 = x3_ptr; if(isGap(x1_gap, i)) { x1 = x1_gapColumn; le = &left[maxCats * 16]; } else { le = &left[cptr[i] * 16]; x1 = x1_ptr; x1_ptr += 4; } if(isGap(x2_gap, i)) { x2 = x2_gapColumn; ri = &right[maxCats * 16]; } else { ri = &right[cptr[i] * 16]; x2 = x2_ptr; x2_ptr += 4; } __m128d x1_0 = _mm_load_pd( &x1[0] ); __m128d x1_2 = _mm_load_pd( &x1[2] ); __m128d left_k0_0 = _mm_load_pd( &le[0] ); __m128d left_k0_2 = _mm_load_pd( &le[2] ); __m128d left_k1_0 = _mm_load_pd( &le[4] ); __m128d left_k1_2 = _mm_load_pd( &le[6] ); __m128d left_k2_0 = _mm_load_pd( &le[8] ); __m128d left_k2_2 = _mm_load_pd( &le[10] ); __m128d left_k3_0 = _mm_load_pd( &le[12] ); __m128d left_k3_2 = _mm_load_pd( &le[14] ); left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); __m128d x2_0 = _mm_load_pd( &x2[0] ); __m128d x2_2 = _mm_load_pd( &x2[2] ); __m128d right_k0_0 = _mm_load_pd( &ri[0] ); __m128d right_k0_2 = _mm_load_pd( &ri[2] ); __m128d right_k1_0 = _mm_load_pd( &ri[4] ); __m128d right_k1_2 = _mm_load_pd( &ri[6] ); __m128d right_k2_0 = _mm_load_pd( &ri[8] ); __m128d right_k2_2 = _mm_load_pd( &ri[10] ); __m128d right_k3_0 = _mm_load_pd( &ri[12] ); __m128d right_k3_2 = _mm_load_pd( &ri[14] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = 
_mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); scale = 1; __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; else { v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } if(scale) { EV_t_l0_k0 = _mm_mul_pd(EV_t_l0_k0, sc); EV_t_l2_k0 = _mm_mul_pd(EV_t_l2_k0, sc); if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } _mm_store_pd(&x3[0], EV_t_l0_k0); _mm_store_pd(&x3[2], EV_t_l2_k0); x3_ptr += 4; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRCAT( int tipCase, double *EV, int *cptr, double *x1_start, double *x2_start, double *x3_start, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *le, *ri, *x1, *x2, *x3, EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); int i, j, scale, addScale = 0; __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ), sc = _mm_set1_pd(twotothe256), EVV[8]; for(i = 0; i < 4; i++) for (j=0; j < 4; j++) EV_t[4 * j + i] = EV[4 * i + j]; for(i = 0; i < 8; i++) EVV[i] = _mm_load_pd(&EV_t[i * 2]); switch(tipCase) { case TIP_TIP: for (i = 0; i < n; i++) { x1 = &(tipVector[4 * tipX1[i]]); x2 = &(tipVector[4 * tipX2[i]]); x3 = &x3_start[i * 4]; le = &left[cptr[i] * 16]; ri = &right[cptr[i] * 16]; __m128d x1_0 = _mm_load_pd( &x1[0] ); __m128d x1_2 = _mm_load_pd( &x1[2] ); __m128d left_k0_0 = _mm_load_pd( &le[0] ); __m128d left_k0_2 = _mm_load_pd( &le[2] ); __m128d left_k1_0 = _mm_load_pd( &le[4] ); __m128d left_k1_2 = _mm_load_pd( &le[6] ); __m128d left_k2_0 = _mm_load_pd( &le[8] ); __m128d left_k2_2 = _mm_load_pd( &le[10] ); __m128d left_k3_0 = _mm_load_pd( &le[12] ); __m128d left_k3_2 = _mm_load_pd( &le[14] ); left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); left_k0_0 = 
_mm_hadd_pd( left_k0_0, left_k0_2 ); left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); __m128d x2_0 = _mm_load_pd( &x2[0] ); __m128d x2_2 = _mm_load_pd( &x2[2] ); __m128d right_k0_0 = _mm_load_pd( &ri[0] ); __m128d right_k0_2 = _mm_load_pd( &ri[2] ); __m128d right_k1_0 = _mm_load_pd( &ri[4] ); __m128d right_k1_2 = _mm_load_pd( &ri[6] ); __m128d right_k2_0 = _mm_load_pd( &ri[8] ); __m128d right_k2_2 = _mm_load_pd( &ri[10] ); __m128d right_k3_0 = _mm_load_pd( &ri[12] ); __m128d right_k3_2 = _mm_load_pd( &ri[14] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); _mm_store_pd(x3, EV_t_l0_k0); _mm_store_pd(&x3[2], EV_t_l2_k0); } break; case TIP_INNER: for (i = 0; i < n; i++) { x1 = &(tipVector[4 * tipX1[i]]); x2 = &x2_start[4 * i]; x3 = &x3_start[4 * i]; le = &left[cptr[i] * 16]; ri = &right[cptr[i] * 16]; __m128d x1_0 = _mm_load_pd( &x1[0] ); __m128d x1_2 = _mm_load_pd( &x1[2] ); __m128d left_k0_0 = _mm_load_pd( &le[0] ); __m128d left_k0_2 = _mm_load_pd( &le[2] ); __m128d left_k1_0 = _mm_load_pd( &le[4] ); __m128d left_k1_2 = _mm_load_pd( &le[6] ); __m128d left_k2_0 = _mm_load_pd( &le[8] ); __m128d left_k2_2 = _mm_load_pd( &le[10] ); __m128d left_k3_0 = _mm_load_pd( &le[12] ); __m128d left_k3_2 = _mm_load_pd( &le[14] ); left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, 
left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); __m128d x2_0 = _mm_load_pd( &x2[0] ); __m128d x2_2 = _mm_load_pd( &x2[2] ); __m128d right_k0_0 = _mm_load_pd( &ri[0] ); __m128d right_k0_2 = _mm_load_pd( &ri[2] ); __m128d right_k1_0 = _mm_load_pd( &ri[4] ); __m128d right_k1_2 = _mm_load_pd( &ri[6] ); __m128d right_k2_0 = _mm_load_pd( &ri[8] ); __m128d right_k2_2 = _mm_load_pd( &ri[10] ); __m128d right_k3_0 = _mm_load_pd( &ri[12] ); __m128d right_k3_2 = _mm_load_pd( &ri[14] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); scale = 1; __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; else { v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } if(scale) { _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } else { _mm_store_pd(x3, EV_t_l0_k0); _mm_store_pd(&x3[2], EV_t_l2_k0); } } break; case INNER_INNER: for (i = 0; i < n; i++) { x1 = &x1_start[4 * i]; x2 = &x2_start[4 * i]; x3 = &x3_start[4 * i]; le = &left[cptr[i] * 16]; ri = &right[cptr[i] * 16]; __m128d x1_0 = _mm_load_pd( &x1[0] ); __m128d x1_2 = _mm_load_pd( &x1[2] ); __m128d left_k0_0 = _mm_load_pd( &le[0] ); __m128d left_k0_2 = _mm_load_pd( &le[2] ); __m128d left_k1_0 = _mm_load_pd( &le[4] ); __m128d left_k1_2 = _mm_load_pd( &le[6] ); __m128d left_k2_0 = _mm_load_pd( &le[8] ); __m128d left_k2_2 = _mm_load_pd( &le[10] ); __m128d left_k3_0 = _mm_load_pd( &le[12] ); __m128d left_k3_2 = 
_mm_load_pd( &le[14] ); left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); __m128d x2_0 = _mm_load_pd( &x2[0] ); __m128d x2_2 = _mm_load_pd( &x2[2] ); __m128d right_k0_0 = _mm_load_pd( &ri[0] ); __m128d right_k0_2 = _mm_load_pd( &ri[2] ); __m128d right_k1_0 = _mm_load_pd( &ri[4] ); __m128d right_k1_2 = _mm_load_pd( &ri[6] ); __m128d right_k2_0 = _mm_load_pd( &ri[8] ); __m128d right_k2_2 = _mm_load_pd( &ri[10] ); __m128d right_k3_0 = _mm_load_pd( &ri[12] ); __m128d right_k3_2 = _mm_load_pd( &ri[14] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); scale = 1; __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; else { v1 = _mm_and_pd(EV_t_l2_k0, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } if(scale) { _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc)); _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc)); if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } else { _mm_store_pd(x3, EV_t_l0_k0); _mm_store_pd(&x3[2], EV_t_l2_k0); } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #endif #ifdef __SIM_SSE3 static void newviewGTRGAMMA(int tipCase, double *x1_start, double *x2_start, double *x3_start, double *EV, double *tipVector, int *ex3, unsigned char 
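/* tipX1/tipX2 below hold 4-bit nucleotide ambiguity codes that index into
   tipVector; x1_presenceMap/x2_presenceMap are bitmaps of the codes that
   actually occur at the tips, so the per-code lookup tables are only filled
   for those.  For every site the kernel updates a 16-double conditional
   likelihood vector (4 gamma rate categories times 4 states) as

     x3[c] = EV^T * ( (left[c] * x1[c]) .* (right[c] * x2[c]) )

   for each rate category c, where left[c]/right[c] are the 4x4 P-matrix
   blocks.  The scaling test ANDs each value with absMask to clear the sign
   bit and compares against minlikelihood_sse; _mm_movemask_pd() == 3 means
   both lanes are below the threshold, in which case the site vector is
   multiplied by twotothe256 and a scaling event is recorded.

   A sketch of the recurring SSE3 mul/hadd idiom, assuming 16-byte-aligned
   inputs; the helper name and signature are illustrative only and not part
   of this file: */
#if 0
static __m128d twoDotProducts4(const double *x, const double *row0, const double *row1)
{
  /* elementwise products of x with two 4-entry matrix rows */
  __m128d a0 = _mm_mul_pd(_mm_load_pd(&x[0]), _mm_load_pd(&row0[0]));
  __m128d a2 = _mm_mul_pd(_mm_load_pd(&x[2]), _mm_load_pd(&row0[2]));
  __m128d b0 = _mm_mul_pd(_mm_load_pd(&x[0]), _mm_load_pd(&row1[0]));
  __m128d b2 = _mm_mul_pd(_mm_load_pd(&x[2]), _mm_load_pd(&row1[2]));

  a0 = _mm_hadd_pd(a0, a2);   /* { x0*r00 + x1*r01, x2*r02 + x3*r03 } */
  b0 = _mm_hadd_pd(b0, b2);   /* { x0*r10 + x1*r11, x2*r12 + x3*r13 } */

  return _mm_hadd_pd(a0, b0); /* { dot(x, row0), dot(x, row1) } */
}
#endif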
*tipX1, unsigned char *tipX2, const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, const unsigned int x1_presenceMap, const unsigned int x2_presenceMap) { int i, j, k, l, addScale = 0; double *x1, *x2, *x3, max, maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))), EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); __m128d values[8], EVV[8]; for(k = 0; k < 4; k++) for (l=0; l < 4; l++) EV_t[4 * l + k] = EV[4 * k + l]; for(k = 0; k < 8; k++) EVV[k] = _mm_load_pd(&EV_t[k * 2]); switch(tipCase) { case TIP_TIP: { double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT))); for (i = 1; i < 16; i++) { __m128d x1_1 = _mm_load_pd(&(tipVector[i*4])); __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2])); if(mask32[i] & x1_presenceMap) { for (j = 0; j < 4; j++) for (k = 0; k < 4; k++) { __m128d left1 = _mm_load_pd(&left[j*16 + k*4]); __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]); __m128d acc = _mm_setzero_pd(); acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); acc = _mm_hadd_pd(acc, acc); _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc); } } if(mask32[i] & x2_presenceMap) { for (j = 0; j < 4; j++) for (k = 0; k < 4; k++) { __m128d left1 = _mm_load_pd(&right[j*16 + k*4]); __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]); __m128d acc = _mm_setzero_pd(); acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); acc = _mm_hadd_pd(acc, acc); _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc); } } } for (i = 0; i < n; i++) { x3 = &x3_start[i * 16]; uX1 = &umpX1[16 * tipX1[i]]; uX2 = &umpX2[16 * tipX2[i]]; for (j = 0; j < 4; j++) { __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] ); __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] ); /* multiply left * right */ __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse ); __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse ); /* multiply with EV matrix (!?) 
*/ __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 ); _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 ); } } } break; case TIP_INNER: { double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))); for (i = 1; i < 16; i++) { if(mask32[i] & x1_presenceMap) { __m128d x1_1 = _mm_load_pd(&(tipVector[i*4])); __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2])); for (j = 0; j < 4; j++) for (k = 0; k < 4; k++) { __m128d left1 = _mm_load_pd(&left[j*16 + k*4]); __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]); __m128d acc = _mm_setzero_pd(); acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); acc = _mm_hadd_pd(acc, acc); _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc); } } } for (i = 0; i < n; i++) { __m128d maxv =_mm_setzero_pd(); x2 = &x2_start[i * 16]; x3 = &x3_start[i * 16]; uX1 = &umpX1[16 * tipX1[i]]; for (j = 0; j < 4; j++) { /* multiply/add right side */ double *x2_p = &x2[j*4]; double *right_k0_p = &right[j*16]; double *right_k1_p = &right[j*16 + 1*4]; double *right_k2_p = &right[j*16 + 2*4]; double *right_k3_p = &right[j*16 + 3*4]; __m128d x2_0 = _mm_load_pd( &x2_p[0] ); __m128d x2_2 = _mm_load_pd( &x2_p[2] ); __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); { /* load left side from tip vector */ __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); /* multiply left * right */ __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 ); /* multiply with EV matrix (!?) 
*/ __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); values[j * 2] = EV_t_l0_k0; values[j * 2 + 1] = EV_t_l2_k0; maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); } } _mm_store_pd(maxima, maxv); max = MAX(maxima[0], maxima[1]); if(max < minlikelihood) { __m128d sv = _mm_set1_pd(twotothe256); _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } else { _mm_store_pd(&x3[0], values[0]); _mm_store_pd(&x3[2], values[1]); _mm_store_pd(&x3[4], values[2]); _mm_store_pd(&x3[6], values[3]); _mm_store_pd(&x3[8], values[4]); _mm_store_pd(&x3[10], values[5]); _mm_store_pd(&x3[12], values[6]); _mm_store_pd(&x3[14], values[7]); } } } break; case INNER_INNER: for (i = 0; i < n; i++) { __m128d maxv =_mm_setzero_pd(); x1 = &x1_start[i * 16]; x2 = &x2_start[i * 16]; x3 = &x3_start[i * 16]; for (j = 0; j < 4; j++) { double *x1_p = &x1[j*4]; double *left_k0_p = &left[j*16]; double *left_k1_p = &left[j*16 + 1*4]; double *left_k2_p = &left[j*16 + 2*4]; double *left_k3_p = &left[j*16 + 3*4]; __m128d x1_0 = _mm_load_pd( &x1_p[0] ); __m128d x1_2 = _mm_load_pd( &x1_p[2] ); __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] ); __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] ); __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] ); __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] ); __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] ); __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] ); __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] ); __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] ); left_k0_0 = _mm_mul_pd(x1_0, left_k0_0); left_k0_2 = _mm_mul_pd(x1_2, left_k0_2); left_k1_0 = _mm_mul_pd(x1_0, left_k1_0); left_k1_2 = _mm_mul_pd(x1_2, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 ); left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); /* multiply/add right side */ double *x2_p = &x2[j*4]; double *right_k0_p = &right[j*16]; double *right_k1_p = 
&right[j*16 + 1*4]; double *right_k2_p = &right[j*16 + 2*4]; double *right_k3_p = &right[j*16 + 3*4]; __m128d x2_0 = _mm_load_pd( &x2_p[0] ); __m128d x2_2 = _mm_load_pd( &x2_p[2] ); __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); /* multiply left * right */ __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); /* multiply with EV matrix (!?) */ __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); values[j * 2] = EV_t_l0_k0; values[j * 2 + 1] = EV_t_l2_k0; maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); } _mm_store_pd(maxima, maxv); max = MAX(maxima[0], maxima[1]); if(max < minlikelihood) { __m128d sv = _mm_set1_pd(twotothe256); _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } else { _mm_store_pd(&x3[0], values[0]); _mm_store_pd(&x3[2], values[1]); _mm_store_pd(&x3[4], values[2]); _mm_store_pd(&x3[6], values[3]); _mm_store_pd(&x3[8], values[4]); _mm_store_pd(&x3[10], values[5]); _mm_store_pd(&x3[12], values[6]); _mm_store_pd(&x3[14], values[7]); } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase, double 
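/* gapped-save variant: sites whose subtree column consists only of
   undetermined characters are not stored individually.  x1_gap/x2_gap/x3_gap
   are bit vectors with one bit per site (tested through mask32[]), and
   x1_gapColumn/x2_gapColumn/x3_gapColumn hold the single shared 16-double
   likelihood vector for such sites.  The shared column is computed once per
   invocation (for tips it is the entry of ambiguity code 15, i.e. offset
   240 = 15 * 16 in the lookup table, which is why the precomputation loops
   also fire for i == 15); the per-site loops then skip every gap site, and
   scaleGap records whether the shared column itself had to be scaled so the
   scaler bookkeeping stays correct for the skipped sites. */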
*x1_start, double *x2_start, double *x3_start, double *EV, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const unsigned int x1_presenceMap, const unsigned int x2_presenceMap) { int i, j, k, l, addScale = 0, scaleGap = 0; double *x1, *x2, *x3, *x1_ptr = x1_start, *x2_ptr = x2_start, max, maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))), EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT))); __m128d values[8], EVV[8]; for(k = 0; k < 4; k++) for (l=0; l < 4; l++) EV_t[4 * l + k] = EV[4 * k + l]; for(k = 0; k < 8; k++) EVV[k] = _mm_load_pd(&EV_t[k * 2]); switch(tipCase) { case TIP_TIP: { double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT))); for (i = 1; i < 16; i++) { __m128d x1_1 = _mm_load_pd(&(tipVector[i*4])); __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2])); if((mask32[i] & x1_presenceMap) || i == 15) { for (j = 0; j < 4; j++) for (k = 0; k < 4; k++) { __m128d left1 = _mm_load_pd(&left[j*16 + k*4]); __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]); __m128d acc = _mm_setzero_pd(); acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); acc = _mm_hadd_pd(acc, acc); _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc); } } if((mask32[i] & x2_presenceMap) || i == 15) { for (j = 0; j < 4; j++) for (k = 0; k < 4; k++) { __m128d left1 = _mm_load_pd(&right[j*16 + k*4]); __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]); __m128d acc = _mm_setzero_pd(); acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); acc = _mm_hadd_pd(acc, acc); _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc); } } } uX1 = &umpX1[240]; uX2 = &umpX2[240]; for (j = 0; j < 4; j++) { __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] ); __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] ); __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse ); __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); _mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 ); _mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 ); } x3 = x3_start; for (i = 0; i < n; i++) { if(!(x3_gap[i / 32] & mask32[i % 32])) { uX1 = &umpX1[16 * tipX1[i]]; uX2 = &umpX2[16 * tipX2[i]]; for (j = 0; j < 4; j++) { __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 
4] ); __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] ); __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] ); __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] ); /* multiply left * right */ __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse ); __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse ); /* multiply with EV matrix (!?) */ __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 ); _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 ); } x3 += 16; } } } break; case TIP_INNER: { double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))); for (i = 1; i < 16; i++) { if((mask32[i] & x1_presenceMap) || i == 15) { __m128d x1_1 = _mm_load_pd(&(tipVector[i*4])); __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2])); for (j = 0; j < 4; j++) for (k = 0; k < 4; k++) { __m128d left1 = _mm_load_pd(&left[j*16 + k*4]); __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]); __m128d acc = _mm_setzero_pd(); acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1)); acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2)); acc = _mm_hadd_pd(acc, acc); _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc); } } } { __m128d maxv =_mm_setzero_pd(); scaleGap = 0; x2 = x2_gapColumn; x3 = x3_gapColumn; uX1 = &umpX1[240]; for (j = 0; j < 4; j++) { double *x2_p = &x2[j*4]; double *right_k0_p = &right[j*16]; double *right_k1_p = &right[j*16 + 1*4]; double *right_k2_p = &right[j*16 + 2*4]; double *right_k3_p = &right[j*16 + 3*4]; __m128d x2_0 = _mm_load_pd( &x2_p[0] ); __m128d x2_2 = _mm_load_pd( &x2_p[2] ); __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] ); __m128d uX1_k2_sse = _mm_load_pd( 
&uX1[j * 4 + 2] ); __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); values[j * 2] = EV_t_l0_k0; values[j * 2 + 1] = EV_t_l2_k0; maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); } _mm_store_pd(maxima, maxv); max = MAX(maxima[0], maxima[1]); if(max < minlikelihood) { scaleGap = 1; __m128d sv = _mm_set1_pd(twotothe256); _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); } else { _mm_store_pd(&x3[0], values[0]); _mm_store_pd(&x3[2], values[1]); _mm_store_pd(&x3[4], values[2]); _mm_store_pd(&x3[6], values[3]); _mm_store_pd(&x3[8], values[4]); _mm_store_pd(&x3[10], values[5]); _mm_store_pd(&x3[12], values[6]); _mm_store_pd(&x3[14], values[7]); } } x3 = x3_start; for (i = 0; i < n; i++) { if((x3_gap[i / 32] & mask32[i % 32])) { if(scaleGap) { if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } else { __m128d maxv =_mm_setzero_pd(); if(x2_gap[i / 32] & mask32[i % 32]) x2 = x2_gapColumn; else { x2 = x2_ptr; x2_ptr += 16; } uX1 = &umpX1[16 * tipX1[i]]; for (j = 0; j < 4; j++) { double *x2_p = &x2[j*4]; double *right_k0_p = &right[j*16]; double *right_k1_p = &right[j*16 + 1*4]; double *right_k2_p = &right[j*16 + 2*4]; double *right_k3_p = &right[j*16 + 3*4]; __m128d x2_0 = _mm_load_pd( &x2_p[0] ); __m128d x2_2 = _mm_load_pd( &x2_p[2] ); __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); 
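/* the remaining multiply and the hadd cascade below pack the four
   per-state dot products of the right child into two __m128d registers */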
	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);

	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);

	      {
		/* load left side from tip vector */

		__m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
		__m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );

		/* multiply left * right */

		__m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
		__m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );

		/* multiply with the transposed eigenvector matrix EV */

		__m128d EV_t_l0_k0 = EVV[0];
		__m128d EV_t_l0_k2 = EVV[1];
		__m128d EV_t_l1_k0 = EVV[2];
		__m128d EV_t_l1_k2 = EVV[3];
		__m128d EV_t_l2_k0 = EVV[4];
		__m128d EV_t_l2_k2 = EVV[5];
		__m128d EV_t_l3_k0 = EVV[6];
		__m128d EV_t_l3_k2 = EVV[7];

		EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
		EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
		EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );

		EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
		EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
		EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
		EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );

		EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
		EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
		EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );

		EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
		EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
		EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
		EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );

		values[j * 2]     = EV_t_l0_k0;
		values[j * 2 + 1] = EV_t_l2_k0;

		maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
		maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
	      }
	    }

	  _mm_store_pd(maxima, maxv);

	  max = MAX(maxima[0], maxima[1]);

	  if(max < minlikelihood)
	    {
	      __m128d sv = _mm_set1_pd(twotothe256);

	      _mm_store_pd(&x3[0],  _mm_mul_pd(values[0], sv));
	      _mm_store_pd(&x3[2],  _mm_mul_pd(values[1], sv));
	      _mm_store_pd(&x3[4],  _mm_mul_pd(values[2], sv));
	      _mm_store_pd(&x3[6],  _mm_mul_pd(values[3], sv));
	      _mm_store_pd(&x3[8],  _mm_mul_pd(values[4], sv));
	      _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
	      _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
	      _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));

	      if(useFastScaling)
		addScale += wgt[i];
	      else
		ex3[i] += 1;
	    }
	  else
	    {
	      _mm_store_pd(&x3[0],  values[0]);
	      _mm_store_pd(&x3[2],  values[1]);
	      _mm_store_pd(&x3[4],  values[2]);
	      _mm_store_pd(&x3[6],  values[3]);
	      _mm_store_pd(&x3[8],  values[4]);
	      _mm_store_pd(&x3[10], values[5]);
	      _mm_store_pd(&x3[12], values[6]);
	      _mm_store_pd(&x3[14], values[7]);
	    }

	  x3 += 16;
	}
      }
    }
    break;
  case INNER_INNER:
    {
      __m128d maxv = _mm_setzero_pd();

      scaleGap = 0;

      x1 = x1_gapColumn;
      x2 = x2_gapColumn;
      x3 = x3_gapColumn;

      for (j = 0; j < 4; j++)
	{
	  double *x1_p = &x1[j*4];
	  double *left_k0_p = &left[j*16];
	  double *left_k1_p = &left[j*16 + 1*4];
	  double *left_k2_p = &left[j*16 + 2*4];
	  double *left_k3_p = &left[j*16 + 3*4];

	  __m128d x1_0 = _mm_load_pd( &x1_p[0] );
	  __m128d x1_2 = _mm_load_pd( &x1_p[2] );

	  __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
	  __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
	  __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
	  __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
	  __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
	  __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
	  __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
	  __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );

	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);

	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
	  left_k1_0 =
_mm_hadd_pd( left_k1_0, left_k1_2); left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0); left_k2_0 = _mm_mul_pd(x1_0, left_k2_0); left_k2_2 = _mm_mul_pd(x1_2, left_k2_2); left_k3_0 = _mm_mul_pd(x1_0, left_k3_0); left_k3_2 = _mm_mul_pd(x1_2, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2); left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2); left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0); double *x2_p = &x2[j*4]; double *right_k0_p = &right[j*16]; double *right_k1_p = &right[j*16 + 1*4]; double *right_k2_p = &right[j*16 + 2*4]; double *right_k3_p = &right[j*16 + 3*4]; __m128d x2_0 = _mm_load_pd( &x2_p[0] ); __m128d x2_2 = _mm_load_pd( &x2_p[2] ); __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] ); __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] ); __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] ); __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] ); __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] ); __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] ); __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] ); __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] ); right_k0_0 = _mm_mul_pd( x2_0, right_k0_0); right_k0_2 = _mm_mul_pd( x2_2, right_k0_2); right_k1_0 = _mm_mul_pd( x2_0, right_k1_0); right_k1_2 = _mm_mul_pd( x2_2, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2); right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2); right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0); right_k2_0 = _mm_mul_pd( x2_0, right_k2_0); right_k2_2 = _mm_mul_pd( x2_2, right_k2_2); right_k3_0 = _mm_mul_pd( x2_0, right_k3_0); right_k3_2 = _mm_mul_pd( x2_2, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2); right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2); right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0); __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 ); __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 ); __m128d EV_t_l0_k0 = EVV[0]; __m128d EV_t_l0_k2 = EVV[1]; __m128d EV_t_l1_k0 = EVV[2]; __m128d EV_t_l1_k2 = EVV[3]; __m128d EV_t_l2_k0 = EVV[4]; __m128d EV_t_l2_k2 = EVV[5]; __m128d EV_t_l3_k0 = EVV[6]; __m128d EV_t_l3_k2 = EVV[7]; EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 ); EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 ); EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 ); EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 ); EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 ); EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 ); EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 ); EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 ); EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 ); EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 ); EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 ); EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 ); values[j * 2] = EV_t_l0_k0; values[j * 2 + 1] = EV_t_l2_k0; maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m)); maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m)); } _mm_store_pd(maxima, maxv); max = MAX(maxima[0], maxima[1]); if(max < minlikelihood) { __m128d sv = _mm_set1_pd(twotothe256); scaleGap = 1; _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv)); _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv)); _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv)); _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv)); _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv)); _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv)); _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv)); _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv)); } else { _mm_store_pd(&x3[0], values[0]); 
	  _mm_store_pd(&x3[2],  values[1]);
	  _mm_store_pd(&x3[4],  values[2]);
	  _mm_store_pd(&x3[6],  values[3]);
	  _mm_store_pd(&x3[8],  values[4]);
	  _mm_store_pd(&x3[10], values[5]);
	  _mm_store_pd(&x3[12], values[6]);
	  _mm_store_pd(&x3[14], values[7]);
	}
    }

    x3 = x3_start;

    for (i = 0; i < n; i++)
      {
	if(x3_gap[i / 32] & mask32[i % 32])
	  {
	    if(scaleGap)
	      {
		if(useFastScaling)
		  addScale += wgt[i];
		else
		  ex3[i] += 1;
	      }
	  }
	else
	  {
	    __m128d maxv = _mm_setzero_pd();

	    if(x1_gap[i / 32] & mask32[i % 32])
	      x1 = x1_gapColumn;
	    else
	      {
		x1 = x1_ptr;
		x1_ptr += 16;
	      }

	    if(x2_gap[i / 32] & mask32[i % 32])
	      x2 = x2_gapColumn;
	    else
	      {
		x2 = x2_ptr;
		x2_ptr += 16;
	      }

	    for (j = 0; j < 4; j++)
	      {
		double *x1_p = &x1[j*4];
		double *left_k0_p = &left[j*16];
		double *left_k1_p = &left[j*16 + 1*4];
		double *left_k2_p = &left[j*16 + 2*4];
		double *left_k3_p = &left[j*16 + 3*4];

		__m128d x1_0 = _mm_load_pd( &x1_p[0] );
		__m128d x1_2 = _mm_load_pd( &x1_p[2] );

		__m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
		__m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
		__m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
		__m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
		__m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
		__m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
		__m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
		__m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );

		left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
		left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
		left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
		left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);

		left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
		left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
		left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);

		left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
		left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
		left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
		left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);

		left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
		left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
		left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);

		/* multiply/add right side */

		double *x2_p = &x2[j*4];
		double *right_k0_p = &right[j*16];
		double *right_k1_p = &right[j*16 + 1*4];
		double *right_k2_p = &right[j*16 + 2*4];
		double *right_k3_p = &right[j*16 + 3*4];

		__m128d x2_0 = _mm_load_pd( &x2_p[0] );
		__m128d x2_2 = _mm_load_pd( &x2_p[2] );

		__m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
		__m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
		__m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
		__m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
		__m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
		__m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
		__m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
		__m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );

		right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
		right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
		right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
		right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);

		right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
		right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
		right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);

		right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
		right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
		right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
		right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);

		right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
		right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
		right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);

		/* multiply left * right */

		__m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
		__m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );

		/* multiply with the transposed eigenvector matrix EV */

		__m128d EV_t_l0_k0 = EVV[0];
		__m128d EV_t_l0_k2 = EVV[1];
		__m128d EV_t_l1_k0 = EVV[2];
		__m128d EV_t_l1_k2 = EVV[3];
		__m128d EV_t_l2_k0 = EVV[4];
		__m128d EV_t_l2_k2 = EVV[5];
		__m128d EV_t_l3_k0 = EVV[6];
		__m128d EV_t_l3_k2 = EVV[7];

		EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
		EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
		EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );

		EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
		EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
		EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
		EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );

		EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
		EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
		EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );

		EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
		EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
		EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
		EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );

		values[j * 2]     = EV_t_l0_k0;
		values[j * 2 + 1] = EV_t_l2_k0;

		maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
		maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
	      }

	    _mm_store_pd(maxima, maxv);

	    max = MAX(maxima[0], maxima[1]);

	    if(max < minlikelihood)
	      {
		__m128d sv = _mm_set1_pd(twotothe256);

		_mm_store_pd(&x3[0],  _mm_mul_pd(values[0], sv));
		_mm_store_pd(&x3[2],  _mm_mul_pd(values[1], sv));
		_mm_store_pd(&x3[4],  _mm_mul_pd(values[2], sv));
		_mm_store_pd(&x3[6],  _mm_mul_pd(values[3], sv));
		_mm_store_pd(&x3[8],  _mm_mul_pd(values[4], sv));
		_mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
		_mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
		_mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));

		if(useFastScaling)
		  addScale += wgt[i];
		else
		  ex3[i] += 1;
	      }
	    else
	      {
		_mm_store_pd(&x3[0],  values[0]);
		_mm_store_pd(&x3[2],  values[1]);
		_mm_store_pd(&x3[4],  values[2]);
		_mm_store_pd(&x3[6],  values[3]);
		_mm_store_pd(&x3[8],  values[4]);
		_mm_store_pd(&x3[10], values[5]);
		_mm_store_pd(&x3[12], values[6]);
		_mm_store_pd(&x3[14], values[7]);
	      }

	    x3 += 16;
	  }
      }
    break;
  default:
    assert(0);
  }

  if(useFastScaling)
    *scalerIncrement = addScale;
}
#else
static void newviewGTRGAMMA(int tipCase,
			    double *x1_start, double *x2_start, double *x3_start,
			    double *EV, double *tipVector,
			    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
			    const int n, double *left, double *right, int *wgt,
			    int *scalerIncrement, const boolean useFastScaling,
			    const unsigned int x1_presenceMap, const unsigned int x2_presenceMap)
{
  int i, j, k, l, scale, addScale = 0;
  double *x1, *x2, *x3, buf, ump_x1, ump_x2, x1px2[4];

  switch(tipCase)
    {
    case TIP_TIP:
      {
	double *uX1, umpX1[256], *uX2, umpX2[256];

	for(i = 1; i < 16; i++)
	  {
	    x1 = &(tipVector[i * 4]);

	    if(mask32[i] & x1_presenceMap)
	      {
		for(j = 0; j < 4; j++)
		  for(k = 0; k < 4; k++)
		    {
		      umpX1[i*16 + j*4 + k] = 0.0;
		      for(l = 0; l < 4; l++)
			umpX1[i*16 + j*4 + k] += x1[l] * left[j*16 + k*4 + l];
		    }
	      }

	    if(mask32[i] & x2_presenceMap)
	      {
		for(j = 0; j < 4; j++)
		  for(k = 0; k < 4; k++)
		    {
		      umpX2[i*16 + j*4 + k] = 0.0;
		      for(l = 0; l < 4; l++)
			umpX2[i*16 + j*4 + k] += x1[l] * right[j*16 + k*4 + l];
		    }
	      }
	  }

	for (i = 0; i < n; i++)
	  {
	    x3 = &x3_start[i * 16];

	    uX1 = &umpX1[16 * tipX1[i]];
	    uX2 = &umpX2[16 * tipX2[i]];

	    for(j = 0; j < 16; j++)
	      x3[j] = 0.0;

	    for (j = 0; j < 4; j++)
	      for (k = 0; k < 4; k++)
		{
		  buf = uX1[j*4 + k] * uX2[j*4 + k];

		  for (l = 0; l < 4; l++)
		    x3[j * 4 + l] += buf * EV[4 * k + l];
		}
	  }
      }
      break;
    case TIP_INNER:
      {
	double *uX1, umpX1[256];

	for (i = 1; i < 16; i++)
	  {
	    if(mask32[i] & x1_presenceMap)
	      {
		x1 = &(tipVector[i*4]);

		for (j = 0; j < 4; j++)
		  for (k = 0; k < 4; k++)
		    {
		      umpX1[i*16 + j*4 + k] = 0.0;
		      for (l=0; l < 4; l++)
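			/* accumulate entry (tip code i, rate category j, target state k)
			   of the 16 x 16 double tip lookup table */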
umpX1[i*16 + j*4 + k] += x1[l] * left[j*16 + k*4 + l]; } } } for (i = 0; i < n; i++) { x2 = &x2_start[i * 16]; x3 = &x3_start[i * 16]; uX1 = &umpX1[16 * tipX1[i]]; for(j = 0; j < 16; j++) x3[j] = 0.0; for (j = 0; j < 4; j++) { for (k = 0; k < 4; k++) { ump_x2 = 0.0; for (l=0; l<4; l++) ump_x2 += x2[j*4 + l] * right[j* 16 + k*4 + l]; x1px2[k] = uX1[j * 4 + k] * ump_x2; } for(k = 0; k < 4; k++) for (l=0; l<4; l++) x3[j * 4 + l] += x1px2[k] * EV[4 * k + l]; } scale = 1; for(l = 0; scale && (l < 16); l++) scale = (ABS(x3[l]) < minlikelihood); if(scale) { for (l=0; l<16; l++) x3[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { x1 = &x1_start[i * 16]; x2 = &x2_start[i * 16]; x3 = &x3_start[i * 16]; for(j = 0; j < 16; j++) x3[j] = 0.0; for (j = 0; j < 4; j++) { for (k = 0; k < 4; k++) { ump_x1 = 0.0; ump_x2 = 0.0; for (l=0; l<4; l++) { ump_x1 += x1[j*4 + l] * left[j*16 + k*4 +l]; ump_x2 += x2[j*4 + l] * right[j*16 + k*4 +l]; } x1px2[k] = ump_x1 * ump_x2; } for(k = 0; k < 4; k++) for (l=0; l<4; l++) x3[j * 4 + l] += x1px2[k] * EV[4 * k + l]; } scale = 1; for(l = 0; scale && (l < 16); l++) scale = (ABS(x3[l]) < minlikelihood); if(scale) { for (l=0; l<16; l++) x3[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #endif #ifdef __SIM_SSE3 static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV, int *cptr, double *x1, double *x2, double *x3, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats) { double *le, *ri, *v, *vl, *vr, *x1_ptr = x1, *x2_ptr = x2, *x3_ptr = x3; int i, l, j, scale, scaleGap = 0, addScale = 0; { vl = x1_gapColumn; vr = x2_gapColumn; v = x3_gapColumn; le = &left[maxCats * 400]; ri = &right[maxCats * 400]; for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], _mm_setzero_pd()); for(l = 0; l < 20; l++) { __m128d x1v = _mm_setzero_pd(); __m128d x2v = _mm_setzero_pd(); double *ev = &extEV[l * 20], *lv = &le[l * 20], *rv = &ri[l * 20]; for(j = 0; j < 20; j+=2) { x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); } x1v = _mm_hadd_pd(x1v, x1v); x2v = _mm_hadd_pd(x2v, x2v); x1v = _mm_mul_pd(x1v, x2v); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); _mm_store_pd(&v[j], vv); } } if(tipCase != TIP_TIP) { __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 20); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } if(scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 20; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } scaleGap = TRUE; } } } switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { if(noGap(x3_gap, i)) { vl = &(tipVector[20 * tipX1[i]]); vr = &(tipVector[20 * tipX2[i]]); v = x3_ptr; if(isGap(x1_gap, i)) le = &left[maxCats * 400]; else le = &left[cptr[i] * 400]; if(isGap(x2_gap, i)) ri = &right[maxCats * 400]; else ri = &right[cptr[i] 
* 400]; for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], _mm_setzero_pd()); for(l = 0; l < 20; l++) { __m128d x1v = _mm_setzero_pd(); __m128d x2v = _mm_setzero_pd(); double *ev = &extEV[l * 20], *lv = &le[l * 20], *rv = &ri[l * 20]; for(j = 0; j < 20; j+=2) { x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); } x1v = _mm_hadd_pd(x1v, x1v); x2v = _mm_hadd_pd(x2v, x2v); x1v = _mm_mul_pd(x1v, x2v); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); _mm_store_pd(&v[j], vv); } } x3_ptr += 20; } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { if(isGap(x3_gap, i)) { if(scaleGap) { if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } else { vl = &(tipVector[20 * tipX1[i]]); vr = x2_ptr; v = x3_ptr; if(isGap(x1_gap, i)) le = &left[maxCats * 400]; else le = &left[cptr[i] * 400]; if(isGap(x2_gap, i)) { ri = &right[maxCats * 400]; vr = x2_gapColumn; } else { ri = &right[cptr[i] * 400]; vr = x2_ptr; x2_ptr += 20; } for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], _mm_setzero_pd()); for(l = 0; l < 20; l++) { __m128d x1v = _mm_setzero_pd(); __m128d x2v = _mm_setzero_pd(); double *ev = &extEV[l * 20], *lv = &le[l * 20], *rv = &ri[l * 20]; for(j = 0; j < 20; j+=2) { x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); } x1v = _mm_hadd_pd(x1v, x1v); x2v = _mm_hadd_pd(x2v, x2v); x1v = _mm_mul_pd(x1v, x2v); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); _mm_store_pd(&v[j], vv); } } { __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 20); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } if(scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 20; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } x3_ptr += 20; } } } break; case INNER_INNER: for(i = 0; i < n; i++) { if(isGap(x3_gap, i)) { if(scaleGap) { if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } else { v = x3_ptr; if(isGap(x1_gap, i)) { vl = x1_gapColumn; le = &left[maxCats * 400]; } else { le = &left[cptr[i] * 400]; vl = x1_ptr; x1_ptr += 20; } if(isGap(x2_gap, i)) { vr = x2_gapColumn; ri = &right[maxCats * 400]; } else { ri = &right[cptr[i] * 400]; vr = x2_ptr; x2_ptr += 20; } for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], _mm_setzero_pd()); for(l = 0; l < 20; l++) { __m128d x1v = _mm_setzero_pd(); __m128d x2v = _mm_setzero_pd(); double *ev = &extEV[l * 20], *lv = &le[l * 20], *rv = &ri[l * 20]; for(j = 0; j < 20; j+=2) { x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); } x1v = _mm_hadd_pd(x1v, x1v); x2v = _mm_hadd_pd(x2v, x2v); x1v = _mm_mul_pd(x1v, x2v); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); _mm_store_pd(&v[j], vv); } } { __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 20); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, 
minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } if(scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 20; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } x3_ptr += 20; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #endif static void newviewGTRCATPROT(int tipCase, double *extEV, int *cptr, double *x1, double *x2, double *x3, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *le, *ri, *v, *vl, *vr; #ifndef __SIM_SSE3 double ump_x1, ump_x2, x1px2; #endif int i, l, j, scale, addScale = 0; switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { le = &left[cptr[i] * 400]; ri = &right[cptr[i] * 400]; vl = &(tipVector[20 * tipX1[i]]); vr = &(tipVector[20 * tipX2[i]]); v = &x3[20 * i]; #ifdef __SIM_SSE3 for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], _mm_setzero_pd()); #else for(l = 0; l < 20; l++) v[l] = 0.0; #endif for(l = 0; l < 20; l++) { #ifdef __SIM_SSE3 __m128d x1v = _mm_setzero_pd(); __m128d x2v = _mm_setzero_pd(); double *ev = &extEV[l * 20], *lv = &le[l * 20], *rv = &ri[l * 20]; for(j = 0; j < 20; j+=2) { x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); } x1v = _mm_hadd_pd(x1v, x1v); x2v = _mm_hadd_pd(x2v, x2v); x1v = _mm_mul_pd(x1v, x2v); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); _mm_store_pd(&v[j], vv); } #else ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 20; j++) { ump_x1 += vl[j] * le[l * 20 + j]; ump_x2 += vr[j] * ri[l * 20 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 20; j++) v[j] += x1px2 * extEV[l * 20 + j]; #endif } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { le = &left[cptr[i] * 400]; ri = &right[cptr[i] * 400]; vl = &(tipVector[20 * tipX1[i]]); vr = &x2[20 * i]; v = &x3[20 * i]; #ifdef __SIM_SSE3 for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], _mm_setzero_pd()); #else for(l = 0; l < 20; l++) v[l] = 0.0; #endif for(l = 0; l < 20; l++) { #ifdef __SIM_SSE3 __m128d x1v = _mm_setzero_pd(); __m128d x2v = _mm_setzero_pd(); double *ev = &extEV[l * 20], *lv = &le[l * 20], *rv = &ri[l * 20]; for(j = 0; j < 20; j+=2) { x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); } x1v = _mm_hadd_pd(x1v, x1v); x2v = _mm_hadd_pd(x2v, x2v); x1v = _mm_mul_pd(x1v, x2v); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); _mm_store_pd(&v[j], vv); } #else ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 20; j++) { ump_x1 += vl[j] * le[l * 20 + j]; ump_x2 += vr[j] * ri[l * 20 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 20; j++) v[j] += x1px2 * extEV[l * 20 + j]; #endif } #ifdef __SIM_SSE3 { __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 20); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } #else scale = 1; for(l = 0; scale && (l < 20); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); #endif if(scale) { #ifdef __SIM_SSE3 __m128d twoto = _mm_set_pd(twotothe256, 
twotothe256); for(l = 0; l < 20; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } #else for(l = 0; l < 20; l++) v[l] *= twotothe256; #endif if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for(i = 0; i < n; i++) { le = &left[cptr[i] * 400]; ri = &right[cptr[i] * 400]; vl = &x1[20 * i]; vr = &x2[20 * i]; v = &x3[20 * i]; #ifdef __SIM_SSE3 for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], _mm_setzero_pd()); #else for(l = 0; l < 20; l++) v[l] = 0.0; #endif for(l = 0; l < 20; l++) { #ifdef __SIM_SSE3 __m128d x1v = _mm_setzero_pd(); __m128d x2v = _mm_setzero_pd(); double *ev = &extEV[l * 20], *lv = &le[l * 20], *rv = &ri[l * 20]; for(j = 0; j < 20; j+=2) { x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j]))); x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j]))); } x1v = _mm_hadd_pd(x1v, x1v); x2v = _mm_hadd_pd(x2v, x2v); x1v = _mm_mul_pd(x1v, x2v); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j]))); _mm_store_pd(&v[j], vv); } #else ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 20; j++) { ump_x1 += vl[j] * le[l * 20 + j]; ump_x2 += vr[j] * ri[l * 20 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 20; j++) v[j] += x1px2 * extEV[l * 20 + j]; #endif } #ifdef __SIM_SSE3 { __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 20); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } #else scale = 1; for(l = 0; scale && (l < 20); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); #endif if(scale) { #ifdef __SIM_SSE3 __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 20; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } #else for(l = 0; l < 20; l++) v[l] *= twotothe256; #endif if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRCATSECONDARY(int tipCase, double *extEV, int *cptr, double *x1, double *x2, double *x3, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *le, *ri, *v, *vl, *vr; double ump_x1, ump_x2, x1px2; int i, l, j, scale, addScale = 0; switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { le = &left[cptr[i] * 256]; ri = &right[cptr[i] * 256]; vl = &(tipVector[16 * tipX1[i]]); vr = &(tipVector[16 * tipX2[i]]); v = &x3[16 * i]; for(l = 0; l < 16; l++) v[l] = 0.0; for(l = 0; l < 16; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 16; j++) { ump_x1 += vl[j] * le[l * 16 + j]; ump_x2 += vr[j] * ri[l * 16 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 16; j++) v[j] += x1px2 * extEV[l * 16 + j]; } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { le = &left[cptr[i] * 256]; ri = &right[cptr[i] * 256]; vl = &(tipVector[16 * tipX1[i]]); vr = &x2[16 * i]; v = &x3[16 * i]; for(l = 0; l < 16; l++) v[l] = 0.0; for(l = 0; l < 16; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 16; j++) { ump_x1 += vl[j] * le[l * 16 + j]; ump_x2 += vr[j] * ri[l * 16 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 16; j++) v[j] += x1px2 * extEV[l * 16 + j]; } scale = 1; for(l = 0; scale && (l < 16); l++) scale = ((v[l] < minlikelihood) && (v[l] > 
minusminlikelihood)); if(scale) { for(l = 0; l < 16; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for(i = 0; i < n; i++) { le = &left[cptr[i] * 256]; ri = &right[cptr[i] * 256]; vl = &x1[16 * i]; vr = &x2[16 * i]; v = &x3[16 * i]; for(l = 0; l < 16; l++) v[l] = 0.0; for(l = 0; l < 16; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 16; j++) { ump_x1 += vl[j] * le[l * 16 + j]; ump_x2 += vr[j] * ri[l * 16 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 16; j++) v[j] += x1px2 * extEV[l * 16 + j]; } scale = 1; for(l = 0; scale && (l < 16); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < 16; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRCATSECONDARY_6(int tipCase, double *extEV, int *cptr, double *x1, double *x2, double *x3, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *le, *ri, *v, *vl, *vr; double ump_x1, ump_x2, x1px2; int i, l, j, scale, addScale = 0; switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { le = &left[cptr[i] * 36]; ri = &right[cptr[i] * 36]; vl = &(tipVector[6 * tipX1[i]]); vr = &(tipVector[6 * tipX2[i]]); v = &x3[6 * i]; for(l = 0; l < 6; l++) v[l] = 0.0; for(l = 0; l < 6; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 6; j++) { ump_x1 += vl[j] * le[l * 6 + j]; ump_x2 += vr[j] * ri[l * 6 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 6; j++) v[j] += x1px2 * extEV[l * 6 + j]; } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { le = &left[cptr[i] * 36]; ri = &right[cptr[i] * 36]; vl = &(tipVector[6 * tipX1[i]]); vr = &x2[6 * i]; v = &x3[6 * i]; for(l = 0; l < 6; l++) v[l] = 0.0; for(l = 0; l < 6; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 6; j++) { ump_x1 += vl[j] * le[l * 6 + j]; ump_x2 += vr[j] * ri[l * 6 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 6; j++) v[j] += x1px2 * extEV[l * 6 + j]; } scale = 1; for(l = 0; scale && (l < 6); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < 6; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for(i = 0; i < n; i++) { le = &left[cptr[i] * 36]; ri = &right[cptr[i] * 36]; vl = &x1[6 * i]; vr = &x2[6 * i]; v = &x3[6 * i]; for(l = 0; l < 6; l++) v[l] = 0.0; for(l = 0; l < 6; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 6; j++) { ump_x1 += vl[j] * le[l * 6 + j]; ump_x2 += vr[j] * ri[l * 6 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 6; j++) v[j] += x1px2 * extEV[l * 6 + j]; } scale = 1; for(l = 0; scale && (l < 6); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < 6; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRCATSECONDARY_7(int tipCase, double *extEV, int *cptr, double *x1, double *x2, double *x3, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *le, *ri, *v, *vl, *vr; double ump_x1, ump_x2, x1px2; int i, l, j, scale, addScale = 0; switch(tipCase) { case TIP_TIP: { for (i = 0; i < n; i++) { le 
= &left[cptr[i] * 49]; ri = &right[cptr[i] * 49]; vl = &(tipVector[7 * tipX1[i]]); vr = &(tipVector[7 * tipX2[i]]); v = &x3[7 * i]; for(l = 0; l < 7; l++) v[l] = 0.0; for(l = 0; l < 7; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 7; j++) { ump_x1 += vl[j] * le[l * 7 + j]; ump_x2 += vr[j] * ri[l * 7 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 7; j++) v[j] += x1px2 * extEV[l * 7 + j]; } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { le = &left[cptr[i] * 49]; ri = &right[cptr[i] * 49]; vl = &(tipVector[7 * tipX1[i]]); vr = &x2[7 * i]; v = &x3[7 * i]; for(l = 0; l < 7; l++) v[l] = 0.0; for(l = 0; l < 7; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 7; j++) { ump_x1 += vl[j] * le[l * 7 + j]; ump_x2 += vr[j] * ri[l * 7 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 7; j++) v[j] += x1px2 * extEV[l * 7 + j]; } scale = 1; for(l = 0; scale && (l < 7); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < 7; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for(i = 0; i < n; i++) { le = &left[cptr[i] * 49]; ri = &right[cptr[i] * 49]; vl = &x1[7 * i]; vr = &x2[7 * i]; v = &x3[7 * i]; for(l = 0; l < 7; l++) v[l] = 0.0; for(l = 0; l < 7; l++) { ump_x1 = 0.0; ump_x2 = 0.0; for(j = 0; j < 7; j++) { ump_x1 += vl[j] * le[l * 7 + j]; ump_x2 += vr[j] * ri[l * 7 + j]; } x1px2 = ump_x1 * ump_x2; for(j = 0; j < 7; j++) v[j] += x1px2 * extEV[l * 7 + j]; } scale = 1; for(l = 0; scale && (l < 7); l++) scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood)); if(scale) { for(l = 0; l < 7; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRGAMMAPROT(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *uX1, *uX2, *v; double x1px2; int i, j, l, k, scale, addScale = 0; double *vl, *vr; #ifndef __SIM_SSE3 double al, ar; #endif switch(tipCase) { case TIP_TIP: { double umpX1[1840], umpX2[1840]; for(i = 0; i < 23; i++) { v = &(tipVector[20 * i]); for(k = 0; k < 80; k++) { #ifdef __SIM_SSE3 double *ll = &left[k * 20]; double *rr = &right[k * 20]; __m128d umpX1v = _mm_setzero_pd(); __m128d umpX2v = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l]))); } umpX1v = _mm_hadd_pd(umpX1v, umpX1v); umpX2v = _mm_hadd_pd(umpX2v, umpX2v); _mm_storel_pd(&umpX1[80 * i + k], umpX1v); _mm_storel_pd(&umpX2[80 * i + k], umpX2v); #else umpX1[80 * i + k] = 0.0; umpX2[80 * i + k] = 0.0; for(l = 0; l < 20; l++) { umpX1[80 * i + k] += v[l] * left[k * 20 + l]; umpX2[80 * i + k] += v[l] * right[k * 20 + l]; } #endif } } for(i = 0; i < n; i++) { uX1 = &umpX1[80 * tipX1[i]]; uX2 = &umpX2[80 * tipX2[i]]; for(j = 0; j < 4; j++) { v = &x3[i * 80 + j * 20]; #ifdef __SIM_SSE3 __m128d zero = _mm_setzero_pd(); for(k = 0; k < 20; k+=2) _mm_store_pd(&v[k], zero); for(k = 0; k < 20; k++) { double *eev = &extEV[k * 20]; x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; __m128d x1px2v = _mm_set1_pd(x1px2); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); __m128d ee = _mm_load_pd(&eev[l]); vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); 
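/* The multiply-add above accumulates v[l..l+1] += x1px2 * extEV[k * 20 + l..l+1]
   two doubles at a time; the _mm_store_pd that follows writes the updated pair
   back. A scalar sketch of the same TIP_TIP update (a paraphrase of the #else
   branch just below, not new logic):

     for(l = 0; l < 20; l++)
       v[l] += x1px2 * extEV[20 * k + l];

   i.e. the product of the two per-state tip sums uX1 * uX2 is mapped back from
   eigenvector space into likelihood space through extEV. */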
_mm_store_pd(&v[l], vv); } } #else for(k = 0; k < 20; k++) v[k] = 0.0; for(k = 0; k < 20; k++) { x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; for(l = 0; l < 20; l++) v[l] += x1px2 * extEV[20 * k + l]; } #endif } } } break; case TIP_INNER: { double umpX1[1840], ump_x2[20]; for(i = 0; i < 23; i++) { v = &(tipVector[20 * i]); for(k = 0; k < 80; k++) { #ifdef __SIM_SSE3 double *ll = &left[k * 20]; __m128d umpX1v = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); } umpX1v = _mm_hadd_pd(umpX1v, umpX1v); _mm_storel_pd(&umpX1[80 * i + k], umpX1v); #else umpX1[80 * i + k] = 0.0; for(l = 0; l < 20; l++) umpX1[80 * i + k] += v[l] * left[k * 20 + l]; #endif } } for (i = 0; i < n; i++) { uX1 = &umpX1[80 * tipX1[i]]; for(k = 0; k < 4; k++) { v = &(x2[80 * i + k * 20]); #ifdef __SIM_SSE3 for(l = 0; l < 20; l++) { double *r = &right[k * 400 + l * 20]; __m128d ump_x2v = _mm_setzero_pd(); for(j = 0; j < 20; j+= 2) { __m128d vv = _mm_load_pd(&v[j]); __m128d rr = _mm_load_pd(&r[j]); ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr)); } ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); _mm_storel_pd(&ump_x2[l], ump_x2v); } v = &(x3[80 * i + 20 * k]); __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], zero); for(l = 0; l < 20; l++) { double *eev = &extEV[l * 20]; x1px2 = uX1[k * 20 + l] * ump_x2[l]; __m128d x1px2v = _mm_set1_pd(x1px2); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); __m128d ee = _mm_load_pd(&eev[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); _mm_store_pd(&v[j], vv); } } #else for(l = 0; l < 20; l++) { ump_x2[l] = 0.0; for(j = 0; j < 20; j++) ump_x2[l] += v[j] * right[k * 400 + l * 20 + j]; } v = &(x3[80 * i + 20 * k]); for(l = 0; l < 20; l++) v[l] = 0; for(l = 0; l < 20; l++) { x1px2 = uX1[k * 20 + l] * ump_x2[l]; for(j = 0; j < 20; j++) v[j] += x1px2 * extEV[l * 20 + j]; } #endif } #ifdef __SIM_SSE3 { v = &(x3[80 * i]); __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 80); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } #else v = &x3[80 * i]; scale = 1; for(l = 0; scale && (l < 80); l++) scale = (ABS(v[l]) < minlikelihood); #endif if (scale) { #ifdef __SIM_SSE3 __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } #else for(l = 0; l < 80; l++) v[l] *= twotothe256; #endif if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(x1[80 * i + 20 * k]); vr = &(x2[80 * i + 20 * k]); v = &(x3[80 * i + 20 * k]); #ifdef __SIM_SSE3 __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], zero); #else for(l = 0; l < 20; l++) v[l] = 0; #endif for(l = 0; l < 20; l++) { #ifdef __SIM_SSE3 { __m128d al = _mm_setzero_pd(); __m128d ar = _mm_setzero_pd(); double *ll = &left[k * 400 + l * 20]; double *rr = &right[k * 400 + l * 20]; double *EVEV = &extEV[20 * l]; for(j = 0; j < 20; j+=2) { __m128d lv = _mm_load_pd(&ll[j]); __m128d rv = _mm_load_pd(&rr[j]); __m128d vll = _mm_load_pd(&vl[j]); __m128d vrr = _mm_load_pd(&vr[j]); al = _mm_add_pd(al, _mm_mul_pd(vll, lv)); ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv)); } al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); for(j = 0; j < 20; j+=2) { 
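/* The two horizontal adds above leave the full dot products
   sum_j vl[j] * left[k * 400 + l * 20 + j] and the matching right-hand sum
   replicated in both lanes of al and ar, so al = _mm_mul_pd(al, ar) now carries
   the scalar product x1px2 broadcast across the register. The loop body below
   therefore computes, two states at a time,
   v[j..j+1] += x1px2 * extEV[20 * l + j..j+1], exactly mirroring the scalar
   #else branch of this INNER_INNER case. */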
__m128d vv = _mm_load_pd(&v[j]); __m128d EVV = _mm_load_pd(&EVEV[j]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(&v[j], vv); } } #else al = 0.0; ar = 0.0; for(j = 0; j < 20; j++) { al += vl[j] * left[k * 400 + l * 20 + j]; ar += vr[j] * right[k * 400 + l * 20 + j]; } x1px2 = al * ar; for(j = 0; j < 20; j++) v[j] += x1px2 * extEV[20 * l + j]; #endif } } #ifdef __SIM_SSE3 { v = &(x3[80 * i]); __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 80); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } #else v = &(x3[80 * i]); scale = 1; for(l = 0; scale && (l < 80); l++) scale = ((ABS(v[l]) < minlikelihood)); #endif if (scale) { #ifdef __SIM_SSE3 __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } #else for(l = 0; l < 80; l++) v[l] *= twotothe256; #endif if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRGAMMAPROT_LG4(int tipCase, double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4], int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *uX1, *uX2, *v; double x1px2; int i, j, l, k, scale, addScale = 0; double *vl, *vr; #ifndef __SIM_SSE3 double al, ar; #endif switch(tipCase) { case TIP_TIP: { double umpX1[1840], umpX2[1840]; for(i = 0; i < 23; i++) { for(k = 0; k < 80; k++) { v = &(tipVector[k / 20][20 * i]); #ifdef __SIM_SSE3 double *ll = &left[k * 20]; double *rr = &right[k * 20]; __m128d umpX1v = _mm_setzero_pd(); __m128d umpX2v = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l]))); } umpX1v = _mm_hadd_pd(umpX1v, umpX1v); umpX2v = _mm_hadd_pd(umpX2v, umpX2v); _mm_storel_pd(&umpX1[80 * i + k], umpX1v); _mm_storel_pd(&umpX2[80 * i + k], umpX2v); #else umpX1[80 * i + k] = 0.0; umpX2[80 * i + k] = 0.0; for(l = 0; l < 20; l++) { umpX1[80 * i + k] += v[l] * left[k * 20 + l]; umpX2[80 * i + k] += v[l] * right[k * 20 + l]; } #endif } } for(i = 0; i < n; i++) { uX1 = &umpX1[80 * tipX1[i]]; uX2 = &umpX2[80 * tipX2[i]]; for(j = 0; j < 4; j++) { v = &x3[i * 80 + j * 20]; #ifdef __SIM_SSE3 __m128d zero = _mm_setzero_pd(); for(k = 0; k < 20; k+=2) _mm_store_pd(&v[k], zero); for(k = 0; k < 20; k++) { double *eev = &extEV[j][k * 20]; x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; __m128d x1px2v = _mm_set1_pd(x1px2); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); __m128d ee = _mm_load_pd(&eev[l]); vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); _mm_store_pd(&v[l], vv); } } #else for(k = 0; k < 20; k++) v[k] = 0.0; for(k = 0; k < 20; k++) { x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; for(l = 0; l < 20; l++) v[l] += x1px2 * extEV[j][20 * k + l]; } #endif } } } break; case TIP_INNER: { double umpX1[1840], ump_x2[20]; for(i = 0; i < 23; i++) { for(k = 0; k < 80; k++) { v = &(tipVector[k / 20][20 * i]); #ifdef __SIM_SSE3 double *ll = &left[k * 20]; __m128d umpX1v = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); } umpX1v = _mm_hadd_pd(umpX1v, 
umpX1v); _mm_storel_pd(&umpX1[80 * i + k], umpX1v); #else umpX1[80 * i + k] = 0.0; for(l = 0; l < 20; l++) umpX1[80 * i + k] += v[l] * left[k * 20 + l]; #endif } } for (i = 0; i < n; i++) { uX1 = &umpX1[80 * tipX1[i]]; for(k = 0; k < 4; k++) { v = &(x2[80 * i + k * 20]); #ifdef __SIM_SSE3 for(l = 0; l < 20; l++) { double *r = &right[k * 400 + l * 20]; __m128d ump_x2v = _mm_setzero_pd(); for(j = 0; j < 20; j+= 2) { __m128d vv = _mm_load_pd(&v[j]); __m128d rr = _mm_load_pd(&r[j]); ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr)); } ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); _mm_storel_pd(&ump_x2[l], ump_x2v); } v = &(x3[80 * i + 20 * k]); __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], zero); for(l = 0; l < 20; l++) { double *eev = &extEV[k][l * 20]; x1px2 = uX1[k * 20 + l] * ump_x2[l]; __m128d x1px2v = _mm_set1_pd(x1px2); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); __m128d ee = _mm_load_pd(&eev[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); _mm_store_pd(&v[j], vv); } } #else for(l = 0; l < 20; l++) { ump_x2[l] = 0.0; for(j = 0; j < 20; j++) ump_x2[l] += v[j] * right[k * 400 + l * 20 + j]; } v = &(x3[80 * i + 20 * k]); for(l = 0; l < 20; l++) v[l] = 0; for(l = 0; l < 20; l++) { x1px2 = uX1[k * 20 + l] * ump_x2[l]; for(j = 0; j < 20; j++) v[j] += x1px2 * extEV[k][l * 20 + j]; } #endif } #ifdef __SIM_SSE3 { v = &(x3[80 * i]); __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 80); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } #else v = &x3[80 * i]; scale = 1; for(l = 0; scale && (l < 80); l++) scale = (ABS(v[l]) < minlikelihood); #endif if (scale) { #ifdef __SIM_SSE3 __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } #else for(l = 0; l < 80; l++) v[l] *= twotothe256; #endif if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(x1[80 * i + 20 * k]); vr = &(x2[80 * i + 20 * k]); v = &(x3[80 * i + 20 * k]); #ifdef __SIM_SSE3 __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], zero); #else for(l = 0; l < 20; l++) v[l] = 0; #endif for(l = 0; l < 20; l++) { #ifdef __SIM_SSE3 { __m128d al = _mm_setzero_pd(); __m128d ar = _mm_setzero_pd(); double *ll = &left[k * 400 + l * 20]; double *rr = &right[k * 400 + l * 20]; double *EVEV = &extEV[k][20 * l]; for(j = 0; j < 20; j+=2) { __m128d lv = _mm_load_pd(&ll[j]); __m128d rv = _mm_load_pd(&rr[j]); __m128d vll = _mm_load_pd(&vl[j]); __m128d vrr = _mm_load_pd(&vr[j]); al = _mm_add_pd(al, _mm_mul_pd(vll, lv)); ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv)); } al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); __m128d EVV = _mm_load_pd(&EVEV[j]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(&v[j], vv); } } #else al = 0.0; ar = 0.0; for(j = 0; j < 20; j++) { al += vl[j] * left[k * 400 + l * 20 + j]; ar += vr[j] * right[k * 400 + l * 20 + j]; } x1px2 = al * ar; for(j = 0; j < 20; j++) v[j] += x1px2 * extEV[k][20 * l + j]; #endif } } #ifdef __SIM_SSE3 { v = &(x3[80 * i]); __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 80); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, 
absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } #else v = &(x3[80 * i]); scale = 1; for(l = 0; scale && (l < 80); l++) scale = ((ABS(v[l]) < minlikelihood)); #endif if (scale) { #ifdef __SIM_SSE3 __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } #else for(l = 0; l < 80; l++) v[l] *= twotothe256; #endif if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #ifdef __SIM_SSE3 static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling, unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn ) { double *uX1, *uX2, *v; double x1px2; int i, j, l, k, scale, addScale = 0, gapScaling = 0; double *vl, *vr, *x1v, *x2v, *x1_ptr = x1, *x2_ptr = x2, *x3_ptr = x3; switch(tipCase) { case TIP_TIP: { double umpX1[1840], umpX2[1840]; for(i = 0; i < 23; i++) { v = &(tipVector[20 * i]); for(k = 0; k < 80; k++) { double *ll = &left[k * 20]; double *rr = &right[k * 20]; __m128d umpX1v = _mm_setzero_pd(); __m128d umpX2v = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l]))); } umpX1v = _mm_hadd_pd(umpX1v, umpX1v); umpX2v = _mm_hadd_pd(umpX2v, umpX2v); _mm_storel_pd(&umpX1[80 * i + k], umpX1v); _mm_storel_pd(&umpX2[80 * i + k], umpX2v); } } { uX1 = &umpX1[1760]; uX2 = &umpX2[1760]; for(j = 0; j < 4; j++) { v = &x3_gapColumn[j * 20]; __m128d zero = _mm_setzero_pd(); for(k = 0; k < 20; k+=2) _mm_store_pd(&v[k], zero); for(k = 0; k < 20; k++) { double *eev = &extEV[k * 20]; x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; __m128d x1px2v = _mm_set1_pd(x1px2); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); __m128d ee = _mm_load_pd(&eev[l]); vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); _mm_store_pd(&v[l], vv); } } } } for(i = 0; i < n; i++) { if(!(x3_gap[i / 32] & mask32[i % 32])) { uX1 = &umpX1[80 * tipX1[i]]; uX2 = &umpX2[80 * tipX2[i]]; for(j = 0; j < 4; j++) { v = &x3_ptr[j * 20]; __m128d zero = _mm_setzero_pd(); for(k = 0; k < 20; k+=2) _mm_store_pd(&v[k], zero); for(k = 0; k < 20; k++) { double *eev = &extEV[k * 20]; x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k]; __m128d x1px2v = _mm_set1_pd(x1px2); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); __m128d ee = _mm_load_pd(&eev[l]); vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); _mm_store_pd(&v[l], vv); } } } x3_ptr += 80; } } } break; case TIP_INNER: { double umpX1[1840], ump_x2[20]; for(i = 0; i < 23; i++) { v = &(tipVector[20 * i]); for(k = 0; k < 80; k++) { double *ll = &left[k * 20]; __m128d umpX1v = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d vv = _mm_load_pd(&v[l]); umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l]))); } umpX1v = _mm_hadd_pd(umpX1v, umpX1v); _mm_storel_pd(&umpX1[80 * i + k], umpX1v); } } { uX1 = &umpX1[1760]; for(k = 0; k < 4; k++) { v = &(x2_gapColumn[k * 20]); for(l = 0; l < 20; l++) { double *r = &right[k * 400 + l * 20]; __m128d ump_x2v = _mm_setzero_pd(); for(j = 0; j < 20; j+= 2) { __m128d vv = _mm_load_pd(&v[j]); 
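/* This dot product runs on x2_gapColumn rather than on a per-site vector: in
   save-memory mode the block anchored at uX1 = &umpX1[1760] (1760 = 80 * 22,
   which should correspond to the undetermined, i.e. all-gap, tip pattern,
   matching gapOffset = states * getUndetermined(...) in newviewIterative)
   computes the parent's shared gap column exactly once. In the per-site loop
   further down, sites whose bit is set in x3_gap skip the arithmetic entirely
   and only replay the gapScaling bookkeeping; that is what lets the
   save-memory mode keep one column per node instead of one per gapped site. */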
__m128d rr = _mm_load_pd(&r[j]); ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr)); } ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); _mm_storel_pd(&ump_x2[l], ump_x2v); } v = &(x3_gapColumn[20 * k]); __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], zero); for(l = 0; l < 20; l++) { double *eev = &extEV[l * 20]; x1px2 = uX1[k * 20 + l] * ump_x2[l]; __m128d x1px2v = _mm_set1_pd(x1px2); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); __m128d ee = _mm_load_pd(&eev[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); _mm_store_pd(&v[j], vv); } } } { v = x3_gapColumn; __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 80); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } if (scale) { gapScaling = 1; __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } } } for (i = 0; i < n; i++) { if((x3_gap[i / 32] & mask32[i % 32])) { if(gapScaling) { if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } else { uX1 = &umpX1[80 * tipX1[i]]; if(x2_gap[i / 32] & mask32[i % 32]) x2v = x2_gapColumn; else { x2v = x2_ptr; x2_ptr += 80; } for(k = 0; k < 4; k++) { v = &(x2v[k * 20]); for(l = 0; l < 20; l++) { double *r = &right[k * 400 + l * 20]; __m128d ump_x2v = _mm_setzero_pd(); for(j = 0; j < 20; j+= 2) { __m128d vv = _mm_load_pd(&v[j]); __m128d rr = _mm_load_pd(&r[j]); ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr)); } ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); _mm_storel_pd(&ump_x2[l], ump_x2v); } v = &x3_ptr[20 * k]; __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], zero); for(l = 0; l < 20; l++) { double *eev = &extEV[l * 20]; x1px2 = uX1[k * 20 + l] * ump_x2[l]; __m128d x1px2v = _mm_set1_pd(x1px2); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); __m128d ee = _mm_load_pd(&eev[j]); vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee)); _mm_store_pd(&v[j], vv); } } } { v = x3_ptr; __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 80); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } if (scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } x3_ptr += 80; } } } break; case INNER_INNER: { for(k = 0; k < 4; k++) { vl = &(x1_gapColumn[20 * k]); vr = &(x2_gapColumn[20 * k]); v = &(x3_gapColumn[20 * k]); __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], zero); for(l = 0; l < 20; l++) { { __m128d al = _mm_setzero_pd(); __m128d ar = _mm_setzero_pd(); double *ll = &left[k * 400 + l * 20]; double *rr = &right[k * 400 + l * 20]; double *EVEV = &extEV[20 * l]; for(j = 0; j < 20; j+=2) { __m128d lv = _mm_load_pd(&ll[j]); __m128d rv = _mm_load_pd(&rr[j]); __m128d vll = _mm_load_pd(&vl[j]); __m128d vrr = _mm_load_pd(&vr[j]); al = _mm_add_pd(al, _mm_mul_pd(vll, lv)); ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv)); } al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); __m128d EVV = _mm_load_pd(&EVEV[j]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); 
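/* Once this accumulation completes, the 80-entry gap column is checked against
   minlikelihood (the absMask/_mm_cmplt_pd test below); if every entry has
   underflowed, the whole column is multiplied by 2^256 and gapScaling is set,
   so each gapped site later charges one scaling event (wgt[i] under fast
   scaling, ex3[i] otherwise) without any recomputation. The scalar equivalent
   of that test, as it appears in the non-SSE kernels of this file:

     scale = 1;
     for(l = 0; scale && (l < 80); l++)
       scale = (ABS(v[l]) < minlikelihood);
     if(scale)
       for(l = 0; l < 80; l++)
         v[l] *= twotothe256;
*/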
_mm_store_pd(&v[j], vv); } } } } { v = x3_gapColumn; __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 80); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } if (scale) { gapScaling = 1; __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } } } for (i = 0; i < n; i++) { if(x3_gap[i / 32] & mask32[i % 32]) { if(gapScaling) { if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } else { if(x1_gap[i / 32] & mask32[i % 32]) x1v = x1_gapColumn; else { x1v = x1_ptr; x1_ptr += 80; } if(x2_gap[i / 32] & mask32[i % 32]) x2v = x2_gapColumn; else { x2v = x2_ptr; x2_ptr += 80; } for(k = 0; k < 4; k++) { vl = &(x1v[20 * k]); vr = &(x2v[20 * k]); v = &x3_ptr[20 * k]; __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&v[l], zero); for(l = 0; l < 20; l++) { { __m128d al = _mm_setzero_pd(); __m128d ar = _mm_setzero_pd(); double *ll = &left[k * 400 + l * 20]; double *rr = &right[k * 400 + l * 20]; double *EVEV = &extEV[20 * l]; for(j = 0; j < 20; j+=2) { __m128d lv = _mm_load_pd(&ll[j]); __m128d rv = _mm_load_pd(&rr[j]); __m128d vll = _mm_load_pd(&vl[j]); __m128d vrr = _mm_load_pd(&vr[j]); al = _mm_add_pd(al, _mm_mul_pd(vll, lv)); ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv)); } al = _mm_hadd_pd(al, al); ar = _mm_hadd_pd(ar, ar); al = _mm_mul_pd(al, ar); for(j = 0; j < 20; j+=2) { __m128d vv = _mm_load_pd(&v[j]); __m128d EVV = _mm_load_pd(&EVEV[j]); vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV)); _mm_store_pd(&v[j], vv); } } } } { v = x3_ptr; __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood ); scale = 1; for(l = 0; scale && (l < 80); l += 2) { __m128d vv = _mm_load_pd(&v[l]); __m128d v1 = _mm_and_pd(vv, absMask.m); v1 = _mm_cmplt_pd(v1, minlikelihood_sse); if(_mm_movemask_pd( v1 ) != 3) scale = 0; } } if (scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_load_pd(&v[l]); _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); } if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } x3_ptr += 80; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } #else #endif static void newviewGTRGAMMASECONDARY(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *v; double x1px2; int i, j, l, k, scale, addScale = 0; double *vl, *vr, al, ar; switch(tipCase) { case TIP_TIP: { for(i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[16 * tipX1[i]]); vr = &(tipVector[16 * tipX2[i]]); v = &(x3[64 * i + 16 * k]); for(l = 0; l < 16; l++) v[l] = 0; for(l = 0; l < 16; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 16; j++) { al += vl[j] * left[k * 256 + l * 16 + j]; ar += vr[j] * right[k * 256 + l * 16 + j]; } x1px2 = al * ar; for(j = 0; j < 16; j++) v[j] += x1px2 * extEV[16 * l + j]; } } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[16 * tipX1[i]]); vr = &(x2[64 * i + 16 * k]); v = &(x3[64 * i + 16 * k]); for(l = 0; l < 16; l++) v[l] = 0; for(l = 0; l < 16; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 16; j++) { al += vl[j] * left[k * 256 + l * 16 + j]; ar += vr[j] * right[k * 256 + l * 16 + j]; } x1px2 = al * ar; for(j 
= 0; j < 16; j++) v[j] += x1px2 * extEV[16 * l + j]; } } v = &x3[64 * i]; scale = 1; for(l = 0; scale && (l < 64); l++) scale = (ABS(v[l]) < minlikelihood); if (scale) { for(l = 0; l < 64; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(x1[64 * i + 16 * k]); vr = &(x2[64 * i + 16 * k]); v = &(x3[64 * i + 16 * k]); for(l = 0; l < 16; l++) v[l] = 0; for(l = 0; l < 16; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 16; j++) { al += vl[j] * left[k * 256 + l * 16 + j]; ar += vr[j] * right[k * 256 + l * 16 + j]; } x1px2 = al * ar; for(j = 0; j < 16; j++) v[j] += x1px2 * extEV[16 * l + j]; } } v = &(x3[64 * i]); scale = 1; for(l = 0; scale && (l < 64); l++) scale = ((ABS(v[l]) < minlikelihood)); if (scale) { for(l = 0; l < 64; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRGAMMASECONDARY_6(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *v; double x1px2; int i, j, l, k, scale, addScale = 0; double *vl, *vr, al, ar; switch(tipCase) { case TIP_TIP: { for(i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[6 * tipX1[i]]); vr = &(tipVector[6 * tipX2[i]]); v = &(x3[24 * i + 6 * k]); for(l = 0; l < 6; l++) v[l] = 0; for(l = 0; l < 6; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 6; j++) { al += vl[j] * left[k * 36 + l * 6 + j]; ar += vr[j] * right[k * 36 + l * 6 + j]; } x1px2 = al * ar; for(j = 0; j < 6; j++) v[j] += x1px2 * extEV[6 * l + j]; } } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[6 * tipX1[i]]); vr = &(x2[24 * i + 6 * k]); v = &(x3[24 * i + 6 * k]); for(l = 0; l < 6; l++) v[l] = 0; for(l = 0; l < 6; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 6; j++) { al += vl[j] * left[k * 36 + l * 6 + j]; ar += vr[j] * right[k * 36 + l * 6 + j]; } x1px2 = al * ar; for(j = 0; j < 6; j++) v[j] += x1px2 * extEV[6 * l + j]; } } v = &x3[24 * i]; scale = 1; for(l = 0; scale && (l < 24); l++) scale = (ABS(v[l]) < minlikelihood); if(scale) { for(l = 0; l < 24; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(x1[24 * i + 6 * k]); vr = &(x2[24 * i + 6 * k]); v = &(x3[24 * i + 6 * k]); for(l = 0; l < 6; l++) v[l] = 0; for(l = 0; l < 6; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 6; j++) { al += vl[j] * left[k * 36 + l * 6 + j]; ar += vr[j] * right[k * 36 + l * 6 + j]; } x1px2 = al * ar; for(j = 0; j < 6; j++) v[j] += x1px2 * extEV[6 * l + j]; } } v = &(x3[24 * i]); scale = 1; for(l = 0; scale && (l < 24); l++) scale = ((ABS(v[l]) < minlikelihood)); if (scale) { for(l = 0; l < 24; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } static void newviewGTRGAMMASECONDARY_7(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling) { double *v; double x1px2; int i, j, l, k, scale, addScale = 0; double *vl, *vr, al, 
ar; switch(tipCase) { case TIP_TIP: { for(i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[7 * tipX1[i]]); vr = &(tipVector[7 * tipX2[i]]); v = &(x3[28 * i + 7 * k]); for(l = 0; l < 7; l++) v[l] = 0; for(l = 0; l < 7; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 7; j++) { al += vl[j] * left[k * 49 + l * 7 + j]; ar += vr[j] * right[k * 49 + l * 7 + j]; } x1px2 = al * ar; for(j = 0; j < 7; j++) v[j] += x1px2 * extEV[7 * l + j]; } } } } break; case TIP_INNER: { for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(tipVector[7 * tipX1[i]]); vr = &(x2[28 * i + 7 * k]); v = &(x3[28 * i + 7 * k]); for(l = 0; l < 7; l++) v[l] = 0; for(l = 0; l < 7; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 7; j++) { al += vl[j] * left[k * 49 + l * 7 + j]; ar += vr[j] * right[k * 49 + l * 7 + j]; } x1px2 = al * ar; for(j = 0; j < 7; j++) v[j] += x1px2 * extEV[7 * l + j]; } } v = &x3[28 * i]; scale = 1; for(l = 0; scale && (l < 28); l++) scale = (ABS(v[l]) < minlikelihood); if (scale) { for(l = 0; l < 28; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } } break; case INNER_INNER: for (i = 0; i < n; i++) { for(k = 0; k < 4; k++) { vl = &(x1[28 * i + 7 * k]); vr = &(x2[28 * i + 7 * k]); v = &(x3[28 * i + 7 * k]); for(l = 0; l < 7; l++) v[l] = 0; for(l = 0; l < 7; l++) { al = 0.0; ar = 0.0; for(j = 0; j < 7; j++) { al += vl[j] * left[k * 49 + l * 7 + j]; ar += vr[j] * right[k * 49 + l * 7 + j]; } x1px2 = al * ar; for(j = 0; j < 7; j++) v[j] += x1px2 * extEV[7 * l + j]; } } v = &(x3[28 * i]); scale = 1; for(l = 0; scale && (l < 28); l++) scale = ((ABS(v[l]) < minlikelihood)); if (scale) { for(l = 0; l < 28; l++) v[l] *= twotothe256; if(useFastScaling) addScale += wgt[i]; else ex3[i] += 1; } } break; default: assert(0); } if(useFastScaling) *scalerIncrement = addScale; } void computeTraversalInfo(tree *tr, nodeptr p, traversalInfo *ti, int *counter, int maxTips, int numBranches) { if(isTip(p->number, maxTips)) return; { int i; nodeptr q = p->next->back; nodeptr r = p->next->next->back; #ifdef _HET boolean parentIsTip; if(isTip(p->back->number, maxTips)) parentIsTip = TRUE; else parentIsTip = FALSE; #endif if(isTip(r->number, maxTips) && isTip(q->number, maxTips)) { while (! p->x) { if (! p->x) getxnode(p); } ti[*counter].tipCase = TIP_TIP; ti[*counter].pNumber = p->number; ti[*counter].qNumber = q->number; ti[*counter].rNumber = r->number; #ifdef _HET ti[*counter].parentIsTip = parentIsTip; #endif for(i = 0; i < numBranches; i++) { double z; z = q->z[i]; z = (z > zmin) ? log(z) : log(zmin); ti[*counter].qz[i] = z; z = r->z[i]; z = (z > zmin) ? log(z) : log(zmin); ti[*counter].rz[i] = z; #ifdef _BASTIEN if(tr->doBastienStuff) { assert(q->secondDerivative[i] == q->back->secondDerivative[i]); assert(r->secondDerivative[i] == r->back->secondDerivative[i]); assert(q->secondDerivativeValid[i] && q->back->secondDerivativeValid[i]); assert(r->secondDerivativeValid[i] && r->back->secondDerivativeValid[i]); } ti[*counter].secondDerivativeQ[i] = q->secondDerivative[i]; ti[*counter].secondDerivativeR[i] = r->secondDerivative[i]; #endif } *counter = *counter + 1; } else { if(isTip(r->number, maxTips) || isTip(q->number, maxTips)) { nodeptr tmp; if(isTip(r->number, maxTips)) { tmp = r; r = q; q = tmp; } while ((! p->x) || (! r->x)) { if (! r->x) computeTraversalInfo(tr, r, ti, counter, maxTips, numBranches); if (! 
p->x) getxnode(p); } ti[*counter].tipCase = TIP_INNER; ti[*counter].pNumber = p->number; ti[*counter].qNumber = q->number; ti[*counter].rNumber = r->number; #ifdef _HET ti[*counter].parentIsTip = parentIsTip; #endif for(i = 0; i < numBranches; i++) { double z; z = q->z[i]; z = (z > zmin) ? log(z) : log(zmin); ti[*counter].qz[i] = z; z = r->z[i]; z = (z > zmin) ? log(z) : log(zmin); ti[*counter].rz[i] = z; #ifdef _BASTIEN if(tr->doBastienStuff) { assert(q->secondDerivative[i] == q->back->secondDerivative[i]); assert(r->secondDerivative[i] == r->back->secondDerivative[i]); assert(q->secondDerivativeValid[i] && q->back->secondDerivativeValid[i]); assert(r->secondDerivativeValid[i] && r->back->secondDerivativeValid[i]); } ti[*counter].secondDerivativeQ[i] = q->secondDerivative[i]; ti[*counter].secondDerivativeR[i] = r->secondDerivative[i]; #endif } *counter = *counter + 1; } else { while ((! p->x) || (! q->x) || (! r->x)) { if (! q->x) computeTraversalInfo(tr, q, ti, counter, maxTips, numBranches); if (! r->x) computeTraversalInfo(tr, r, ti, counter, maxTips, numBranches); if (! p->x) getxnode(p); } ti[*counter].tipCase = INNER_INNER; ti[*counter].pNumber = p->number; ti[*counter].qNumber = q->number; ti[*counter].rNumber = r->number; #ifdef _HET ti[*counter].parentIsTip = parentIsTip; #endif for(i = 0; i < numBranches; i++) { double z; z = q->z[i]; z = (z > zmin) ? log(z) : log(zmin); ti[*counter].qz[i] = z; z = r->z[i]; z = (z > zmin) ? log(z) : log(zmin); ti[*counter].rz[i] = z; #ifdef _BASTIEN if(tr->doBastienStuff) { assert(q->secondDerivative[i] == q->back->secondDerivative[i]); assert(r->secondDerivative[i] == r->back->secondDerivative[i]); assert(q->secondDerivativeValid[i] && q->back->secondDerivativeValid[i]); assert(r->secondDerivativeValid[i] && r->back->secondDerivativeValid[i]); } ti[*counter].secondDerivativeQ[i] = q->secondDerivative[i]; ti[*counter].secondDerivativeR[i] = r->secondDerivative[i]; #endif } *counter = *counter + 1; } } } } void newviewIterative (tree *tr) { traversalInfo *ti = tr->td[0].ti; int i, model; for(i = 1; i < tr->td[0].count; i++) { traversalInfo *tInfo = &ti[i]; for(model = 0; model < tr->NumberOfModels; model++) { if(tr->executeModel[model]) { double *x1_start = (double*)NULL, *x2_start = (double*)NULL, *x3_start = tr->partitionData[model].xVector[tInfo->pNumber - tr->mxtips - 1], *left = (double*)NULL, *right = (double*)NULL, *x1_gapColumn = (double*)NULL, *x2_gapColumn = (double*)NULL, *x3_gapColumn = (double*)NULL, *x1_ascColumn = (double*)NULL, *x2_ascColumn = (double*)NULL, *x3_ascColumn = (double*)NULL, qz, rz; unsigned int x1_presenceMap = 0, x2_presenceMap = 0, *x1_gap = (unsigned int*)NULL, *x2_gap = (unsigned int*)NULL, *x3_gap = (unsigned int*)NULL; int scalerIncrement = 0, *wgt = (int*)NULL, *ex3 = (int*)NULL, *ex3_asc = (int*)NULL; unsigned char *tipX1 = (unsigned char *)NULL, *tipX2 = (unsigned char *)NULL; size_t gapOffset = 0, rateHet, states = (size_t)tr->partitionData[model].states, width = tr->partitionData[model].width, ascWidth = (size_t)tr->partitionData[model].states, availableLength = tr->partitionData[model].xSpaceVector[(tInfo->pNumber - tr->mxtips - 1)], requiredLength; if(tr->rateHetModel == CAT) rateHet = 1; else rateHet = 4; if(tr->saveMemory) { size_t j, setBits = 0; x1_gap = &(tr->partitionData[model].gapVector[tInfo->qNumber * tr->partitionData[model].gapVectorLength]); x2_gap = &(tr->partitionData[model].gapVector[tInfo->rNumber * tr->partitionData[model].gapVectorLength]); x3_gap = 
&(tr->partitionData[model].gapVector[tInfo->pNumber * tr->partitionData[model].gapVectorLength]); gapOffset = states * (size_t)getUndetermined(tr->partitionData[model].dataType); availableLength = tr->partitionData[model].xSpaceVector[(tInfo->pNumber - tr->mxtips - 1)]; for(j = 0; j < (size_t)tr->partitionData[model].gapVectorLength; j++) { x3_gap[j] = x1_gap[j] & x2_gap[j]; setBits += (size_t)(precomputed16_bitcount(x3_gap[j])); } requiredLength = (width - setBits) * rateHet * states * sizeof(double); } else requiredLength = width * rateHet * states * sizeof(double); if(requiredLength != availableLength) { if(x3_start) rax_free(x3_start); x3_start = (double*)rax_malloc(requiredLength); tr->partitionData[model].xVector[tInfo->pNumber - tr->mxtips - 1] = x3_start; tr->partitionData[model].xSpaceVector[(tInfo->pNumber - tr->mxtips - 1)] = requiredLength; } if(tr->useFastScaling) wgt = tr->partitionData[model].wgt; else { size_t availableExpLength = tr->partitionData[model].expSpaceVector[(tInfo->pNumber - tr->mxtips - 1)], requiredExpLength = width * sizeof(int); ex3 = tr->partitionData[model].expVector[tInfo->pNumber - tr->mxtips - 1]; if(requiredExpLength != availableExpLength) { if(ex3) rax_free(ex3); ex3 = (int*)rax_malloc(requiredExpLength); tr->partitionData[model].expVector[tInfo->pNumber - tr->mxtips - 1] = ex3; tr->partitionData[model].expSpaceVector[(tInfo->pNumber - tr->mxtips - 1)] = requiredExpLength; } } switch(tInfo->tipCase) { case TIP_TIP: tipX1 = tr->partitionData[model].yVector[tInfo->qNumber]; tipX2 = tr->partitionData[model].yVector[tInfo->rNumber]; x1_presenceMap = tr->partitionData[model].presenceMap[tInfo->qNumber]; x2_presenceMap = tr->partitionData[model].presenceMap[tInfo->rNumber]; if(tr->saveMemory) { assert(gapOffset > 0); x1_gapColumn = &(tr->partitionData[model].tipVector[gapOffset]); x2_gapColumn = &(tr->partitionData[model].tipVector[gapOffset]); x3_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet]; } #ifdef _USE_PTHREADS if(tr->partitionData[model].ascBias && tr->threadID == 0) #else if(tr->partitionData[model].ascBias) #endif { size_t k; x3_ascColumn = &tr->partitionData[model].ascVector[(tInfo->pNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset]; ex3_asc = &tr->partitionData[model].ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth]; for(k = 0; k < ascWidth; k++) ex3_asc[k] = 0; } if(!tr->useFastScaling) { size_t k; for(k = 0; k < width; k++) ex3[k] = 0; } break; case TIP_INNER: tipX1 = tr->partitionData[model].yVector[tInfo->qNumber]; x2_start = tr->partitionData[model].xVector[tInfo->rNumber - tr->mxtips - 1]; x1_presenceMap = tr->partitionData[model].presenceMap[tInfo->qNumber]; if(tr->saveMemory) { assert(gapOffset > 0); x1_gapColumn = &(tr->partitionData[model].tipVector[gapOffset]); x2_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->rNumber - tr->mxtips - 1) * states * rateHet]; x3_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet]; } #ifdef _USE_PTHREADS if(tr->partitionData[model].ascBias && tr->threadID == 0) #else if(tr->partitionData[model].ascBias) #endif { size_t k; int *ex2_asc; x2_ascColumn = &tr->partitionData[model].ascVector[(tInfo->rNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset]; x3_ascColumn = &tr->partitionData[model].ascVector[(tInfo->pNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset]; ex2_asc = &tr->partitionData[model].ascExpVector[(tInfo->rNumber - tr->mxtips - 1) * 
ascWidth]; ex3_asc = &tr->partitionData[model].ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth]; for(k = 0; k < ascWidth; k++) ex3_asc[k] = ex2_asc[k]; } if(!tr->useFastScaling) { size_t k; int *ex2 = tr->partitionData[model].expVector[tInfo->rNumber - tr->mxtips - 1]; for(k = 0; k < width; k++) ex3[k] = ex2[k]; } break; case INNER_INNER: x1_start = tr->partitionData[model].xVector[tInfo->qNumber - tr->mxtips - 1]; x2_start = tr->partitionData[model].xVector[tInfo->rNumber - tr->mxtips - 1]; if(tr->saveMemory) { x1_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->qNumber - tr->mxtips - 1) * states * rateHet]; x2_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->rNumber - tr->mxtips - 1) * states * rateHet]; x3_gapColumn = &tr->partitionData[model].gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet]; } #ifdef _USE_PTHREADS if(tr->partitionData[model].ascBias && tr->threadID == 0) #else if(tr->partitionData[model].ascBias) #endif { size_t k; int *ex1_asc, *ex2_asc; x1_ascColumn = &tr->partitionData[model].ascVector[(tInfo->qNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset]; x2_ascColumn = &tr->partitionData[model].ascVector[(tInfo->rNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset]; x3_ascColumn = &tr->partitionData[model].ascVector[(tInfo->pNumber - tr->mxtips - 1) * tr->partitionData[model].ascOffset]; ex1_asc = &tr->partitionData[model].ascExpVector[(tInfo->qNumber - tr->mxtips - 1) * ascWidth]; ex2_asc = &tr->partitionData[model].ascExpVector[(tInfo->rNumber - tr->mxtips - 1) * ascWidth]; ex3_asc = &tr->partitionData[model].ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth]; for(k = 0; k < ascWidth; k++) ex3_asc[k] = ex1_asc[k] + ex2_asc[k]; } if(!tr->useFastScaling) { size_t k; int *ex1, *ex2; ex1 = tr->partitionData[model].expVector[tInfo->qNumber - tr->mxtips - 1]; ex2 = tr->partitionData[model].expVector[tInfo->rNumber - tr->mxtips - 1]; for(k = 0; k < width; k++) ex3[k] = ex1[k] + ex2[k]; } break; default: assert(0); } left = tr->partitionData[model].left; right = tr->partitionData[model].right; if(tr->multiBranch) { qz = tInfo->qz[model]; rz = tInfo->rz[model]; } else { qz = tInfo->qz[0]; rz = tInfo->rz[0]; } #ifdef _BASTIEN if(tr->doBastienStuff) { if(tr->multiBranch) assert(0); else { printf("\nnewview\n"); printf("Hello, I am the second derivative at the q branch %1.40f\n", tInfo->secondDerivativeQ[0]); printf("Hello, I am the second derivative at the r branch %1.40f\n", tInfo->secondDerivativeR[0]); } } #endif switch(tr->partitionData[model].dataType) { case BINARY_DATA: switch(tr->rateHetModel) { case CAT: { makeP(qz, rz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories, left, right, BINARY_DATA, tr->saveMemory, tr->maxCategories); newviewGTRCAT_BINARY(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling ); } break; case GAMMA: case GAMMA_I: { makeP(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, BINARY_DATA, tr->saveMemory, tr->maxCategories); newviewGTRGAMMA_BINARY(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); } break; 
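/* Every data-type/rate-model case in this dispatch follows the same two-step
   pattern: makeP() (or a makeP_Flex* variant) folds the per-category factors
   e^(EIGN[k] * r * z) into the inverse-eigenvector terms EI for the two child
   branches (z = log branch length from computeTraversalInfo, r = category
   rate), producing left and right; the state-count-specific newview*() kernel
   then combines the two child vectors and multiplies by the eigenvector
   matrix EV (extEV) to return to likelihood space in x3_start. */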
default: assert(0); } break; case DNA_DATA: switch(tr->rateHetModel) { case CAT: { makeP(qz, rz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories, left, right, DNA_DATA, tr->saveMemory, tr->maxCategories); #if (defined(__SIM_SSE3) || defined(__AVX)) if(tr->saveMemory) { #ifdef __AVX newviewGTRCAT_AVX_GAPPED_SAVE(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories); #else newviewGTRCAT_SAVE(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories); #endif } else #endif { #ifdef __AVX newviewGTRCAT_AVX(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); #else newviewGTRCAT(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); #endif } } break; case GAMMA: case GAMMA_I: #ifdef _HET { double *parentEV; if(tInfo->parentIsTip) parentEV = tr->partitionData[model].EV_TIP; else parentEV = tr->partitionData[model].EV; switch(tInfo->tipCase) { case TIP_TIP: makeP_FlexHet(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI_TIP, tr->partitionData[model].EIGN_TIP, tr->partitionData[model].EI_TIP, tr->partitionData[model].EIGN_TIP, 4, left, right, 4); newviewGTRGAMMA(tInfo->tipCase, x1_start, x2_start, x3_start, parentEV, tr->partitionData[model].tipVector_TIP, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap); break; case TIP_INNER: assert(tInfo->qNumber <= tr->mxtips && tInfo->rNumber > tr->mxtips); makeP_FlexHet(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI_TIP, tr->partitionData[model].EIGN_TIP, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, 4); newviewGTRGAMMA(tInfo->tipCase, x1_start, x2_start, x3_start, parentEV, tr->partitionData[model].tipVector_TIP, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap); break; case INNER_INNER: makeP_FlexHet(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, 4); newviewGTRGAMMA(tInfo->tipCase, x1_start, x2_start, x3_start, parentEV, (double *)NULL, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap); break; default: assert(0); } } #else { makeP(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, DNA_DATA, tr->saveMemory, tr->maxCategories); #if (defined(__SIM_SSE3) || defined(__AVX)) if(tr->saveMemory) { #ifdef __AVX 
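/* save-memory variant: the *_GAPPED_SAVE kernels additionally receive the
   per-node gap bit vectors (x1_gap, x2_gap, and x3_gap = x1_gap & x2_gap,
   computed above) plus the shared gap columns, so fully gapped sites occupy
   no per-site storage in x3_start; requiredLength was shrunk accordingly to
   (width - setBits) * rateHet * states * sizeof(double). */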
newviewGTRGAMMA_AVX_GAPPED_SAVE(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn, x1_presenceMap, x2_presenceMap); #else newviewGTRGAMMA_GAPPED_SAVE(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn, x1_presenceMap, x2_presenceMap); #endif } else #endif { #ifdef __AVX newviewGTRGAMMA_AVX(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap); #else newviewGTRGAMMA(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_presenceMap, x2_presenceMap); #endif } } #endif break; default: assert(0); } break; case AA_DATA: switch(tr->rateHetModel) { case CAT: { makeP(qz, rz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories, left, right, AA_DATA, tr->saveMemory, tr->maxCategories); #if (defined(__SIM_SSE3) || defined(__AVX)) if(tr->saveMemory) #ifdef __AVX newviewGTRCATPROT_AVX_GAPPED_SAVE(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories); #else newviewGTRCATPROT_SAVE(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories); #endif else #endif { #ifdef __AVX newviewGTRCATPROT_AVX(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); #else newviewGTRCATPROT(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); #endif } } break; case GAMMA: case GAMMA_I: if(tr->partitionData[model].protModels == LG4 || tr->partitionData[model].protModels == LG4X) { makeP_FlexLG4(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI_LG4, tr->partitionData[model].EIGN_LG4, 4, left, right, 20); #ifdef __AVX newviewGTRGAMMAPROT_AVX_LG4(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV_LG4, tr->partitionData[model].tipVector_LG4, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); #else newviewGTRGAMMAPROT_LG4(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV_LG4, tr->partitionData[model].tipVector_LG4, ex3, tipX1, tipX2, width, left, right, wgt, 
&scalerIncrement, tr->useFastScaling); #endif } else { makeP(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, AA_DATA, tr->saveMemory, tr->maxCategories); #if (defined(__SIM_SSE3) || defined(__AVX)) if(tr->saveMemory) #ifdef __AVX newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn); #else newviewGTRGAMMAPROT_GAPPED_SAVE(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, x1_gap, x2_gap, x3_gap, x1_gapColumn, x2_gapColumn, x3_gapColumn); #endif else #endif { #ifdef __AVX newviewGTRGAMMAPROT_AVX(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); #else newviewGTRGAMMAPROT(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); #endif } } break; default: assert(0); } break; case SECONDARY_DATA_6: switch(tr->rateHetModel) { case CAT: { makeP(qz, rz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA_6, tr->saveMemory, tr->maxCategories); newviewGTRCATSECONDARY_6(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); } break; case GAMMA: case GAMMA_I: { makeP(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, SECONDARY_DATA_6, tr->saveMemory, tr->maxCategories); newviewGTRGAMMASECONDARY_6(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); } break; default: assert(0); } break; case SECONDARY_DATA_7: switch(tr->rateHetModel) { case CAT: { makeP(qz, rz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA_7, tr->saveMemory, tr->maxCategories); newviewGTRCATSECONDARY_7(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); } break; case GAMMA: case GAMMA_I: { makeP(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, SECONDARY_DATA_7, tr->saveMemory, tr->maxCategories); newviewGTRGAMMASECONDARY_7(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); } break; default: assert(0); } break; case SECONDARY_DATA: switch(tr->rateHetModel) { case CAT: { makeP(qz, rz, 
tr->partitionData[model].perSiteRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories, left, right, SECONDARY_DATA, tr->saveMemory, tr->maxCategories); newviewGTRCATSECONDARY(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); } break; case GAMMA: case GAMMA_I: { makeP(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, SECONDARY_DATA, tr->saveMemory, tr->maxCategories); newviewGTRGAMMASECONDARY(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling); } break; default: assert(0); } break; case GENERIC_32: switch(tr->rateHetModel) { case CAT: { makeP_Flex(qz, rz, tr->partitionData[model].perSiteRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, tr->partitionData[model].numberOfCategories, left, right, states); newviewFlexCat(tInfo->tipCase, tr->partitionData[model].EV, tr->partitionData[model].rateCategory, x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, states); } break; case GAMMA: case GAMMA_I: { makeP_Flex(qz, rz, tr->partitionData[model].gammaRates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 4, left, right, states); newviewFlexGamma(tInfo->tipCase, x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, tr->useFastScaling, states); } break; default: assert(0); } break; case GENERIC_64: break; default: assert(0); } #ifdef _USE_PTHREADS if(tr->partitionData[model].ascBias && tr->threadID == 0) #else if(tr->partitionData[model].ascBias) #endif { switch(tr->rateHetModel) { case CAT: { double rates = 1.0; //need to re-calculate transition probabilities assuming a rate of 1.0 makeP_Flex(qz, rz, &rates, tr->partitionData[model].EI, tr->partitionData[model].EIGN, 1, left, right, states); newviewAscCat(tInfo->tipCase, x1_ascColumn, x2_ascColumn, x3_ascColumn, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3_asc, states, left, right, states, tr->partitionData[model].dataType, tInfo->qNumber, tInfo->rNumber, tr->partitionData[model].ascMissingVector, tr->mxtips); } break; case GAMMA: newviewAscGamma(tInfo->tipCase, x1_ascColumn, x2_ascColumn, x3_ascColumn, tr->partitionData[model].EV, tr->partitionData[model].tipVector, ex3_asc, states, left, right, states, tr->partitionData[model].dataType, tInfo->qNumber, tInfo->rNumber, tr->partitionData[model].ascMissingVector, tr->mxtips); break; default: assert(0); } } if(tr->useFastScaling) { tr->partitionData[model].globalScaler[tInfo->pNumber] = tr->partitionData[model].globalScaler[tInfo->qNumber] + tr->partitionData[model].globalScaler[tInfo->rNumber] + (unsigned int)scalerIncrement; assert(tr->partitionData[model].globalScaler[tInfo->pNumber] < INT_MAX); } } } #ifdef _DEBUG_MULTI_EPA printf("\n"); #endif } } void newviewGeneric (tree *tr, nodeptr p) { if(isTip(p->number, tr->mxtips)) return; tr->td[0].count = 1; computeTraversalInfo(tr, p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, tr->numBranches); if(tr->td[0].count > 1) { #ifdef _USE_PTHREADS 
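/* In the Pthreads build, the traversal descriptor assembled by
   computeTraversalInfo() is handed to the worker threads through
   masterBarrier(THREAD_NEWVIEW, ...); each thread then executes
   newviewIterative() on its own stripe of alignment columns. The sequential
   build below calls newviewIterative() directly. */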
masterBarrier(THREAD_NEWVIEW, tr); #else newviewIterative(tr); #endif } } void newviewGenericMasked(tree *tr, nodeptr p) { if(isTip(p->number, tr->mxtips)) return; { int i; for(i = 0; i < tr->NumberOfModels; i++) { if(tr->partitionConverged[i]) tr->executeModel[i] = FALSE; else tr->executeModel[i] = TRUE; } { tr->td[0].count = 1; computeTraversalInfo(tr, p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, tr->numBranches); if(tr->td[0].count > 1) { #ifdef _USE_PTHREADS masterBarrier(THREAD_NEWVIEW_MASKED, tr); #else newviewIterative(tr); #endif } } for(i = 0; i < tr->NumberOfModels; i++) tr->executeModel[i] = TRUE; } } #ifdef _USE_PTHREADS void newviewMultiGrain(tree *tr, double *x1, double *x2, double *x3, int *_ex1, int *_ex2, int *_ex3, unsigned char *_tipX1, unsigned char *_tipX2, int tipCase, double *_pz, double *_qz, int insertion) { int scalerIncrement = 0, model, columnCounter = 0, offsetCounter = 0; setPartitionMask(tr, insertion, tr->executeModel); #ifdef _DEBUG_MULTI_EPA if(tr->threadID == THREAD_TO_DEBUG) printf("NV %s: ", tr->nameList[tr->inserts[insertion]]); #endif for(model = 0; model < tr->NumberOfModels; model++) { int width = tr->partitionData[model].upper - tr->partitionData[model].lower; #ifdef _DEBUG_MULTI_EPA if(tr->threadID == THREAD_TO_DEBUG) printf("%d", tr->executeModel[model]); #endif if(tr->executeModel[model]) { double *x1_start = (double*)NULL, *x2_start = (double*)NULL, *x3_start = (double*)NULL, *left = tr->partitionData[model].left, *right = tr->partitionData[model].right, pz, qz; int *wgt = &tr->contiguousWgt[columnCounter], *rateCategory = &tr->contiguousRateCategory[columnCounter], *ex1 = (int*)NULL, *ex2 = (int*)NULL, *ex3 = (int*)NULL; unsigned char *tipX1 = (unsigned char *)NULL, *tipX2 = (unsigned char *)NULL; //TODO fix this or not? unsigned int x1_presenceMap = ~0, x2_presenceMap = ~0; switch(tipCase) { case TIP_TIP: tipX1 = &_tipX1[columnCounter]; tipX2 = &_tipX2[columnCounter]; ex3 = &_ex3[columnCounter]; x3_start = &x3[offsetCounter]; if(!tr->useFastScaling) { int k; for(k = 0; k < width; k++) ex3[k] = 0; } break; case TIP_INNER: tipX1 = &_tipX1[columnCounter]; ex2 = &_ex2[columnCounter]; x2_start = &x2[offsetCounter]; ex3 = &_ex3[columnCounter]; x3_start = &x3[offsetCounter]; if(!tr->useFastScaling) { int k; for(k = 0; k < width; k++) ex3[k] = ex2[k]; } break; case INNER_INNER: ex1 = &_ex1[columnCounter]; x1_start = &x1[offsetCounter]; ex2 = &_ex2[columnCounter]; x2_start = &x2[offsetCounter]; ex3 = &_ex3[columnCounter]; x3_start = &x3[offsetCounter]; if(!tr->useFastScaling) { int k; for(k = 0; k < width; k++) ex3[k] = ex1[k] + ex2[k]; } break; default: assert(0); } if(tr->multiBranch) { pz = _pz[model]; pz = (pz > zmin) ? log(pz) : log(zmin); qz = _qz[model]; qz = (qz > zmin) ? log(qz) : log(zmin); } else { pz = _pz[0]; pz = (pz > zmin) ? log(pz) : log(zmin); qz = _qz[0]; qz = (qz > zmin) ? 
	  switch(tr->partitionData[model].dataType)
	    {
	    case BINARY_DATA:
	      switch(tr->rateHetModel)
		{
		case CAT:
		  makeP(pz, qz, tr->partitionData[model].perSiteRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			tr->partitionData[model].numberOfCategories, left, right,
			BINARY_DATA, tr->saveMemory, tr->maxCategories);

		  newviewGTRCAT_BINARY(tipCase, tr->partitionData[model].EV, rateCategory,
				       x1_start, x2_start, x3_start,
				       tr->partitionData[model].tipVector,
				       ex3, tipX1, tipX2, width, left, right, wgt,
				       &scalerIncrement, tr->useFastScaling);
		  break;
		case GAMMA:
		case GAMMA_I:
		  makeP(pz, qz, tr->partitionData[model].gammaRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			4, left, right, BINARY_DATA, tr->saveMemory, tr->maxCategories);

		  newviewGTRGAMMA_BINARY(tipCase, x1_start, x2_start, x3_start,
					 tr->partitionData[model].EV,
					 tr->partitionData[model].tipVector,
					 ex3, tipX1, tipX2, width, left, right, wgt,
					 &scalerIncrement, tr->useFastScaling);
		  break;
		default:
		  assert(0);
		}
	      break;
	    case DNA_DATA:
	      switch(tr->rateHetModel)
		{
		case CAT:
		  makeP(pz, qz, tr->partitionData[model].perSiteRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			tr->partitionData[model].numberOfCategories, left, right,
			DNA_DATA, tr->saveMemory, tr->maxCategories);

#ifdef __AVX
		  newviewGTRCAT_AVX(tipCase, tr->partitionData[model].EV, rateCategory,
				    x1_start, x2_start, x3_start,
				    tr->partitionData[model].tipVector,
				    ex3, tipX1, tipX2, width, left, right, wgt,
				    &scalerIncrement, tr->useFastScaling);
#else
		  newviewGTRCAT(tipCase, tr->partitionData[model].EV, rateCategory,
				x1_start, x2_start, x3_start,
				tr->partitionData[model].tipVector,
				ex3, tipX1, tipX2, width, left, right, wgt,
				&scalerIncrement, tr->useFastScaling);
#endif
		  break;
		case GAMMA:
		case GAMMA_I:
		  makeP(pz, qz, tr->partitionData[model].gammaRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			4, left, right, DNA_DATA, tr->saveMemory, tr->maxCategories);

#ifdef __AVX
		  newviewGTRGAMMA_AVX(tipCase, x1_start, x2_start, x3_start,
				      tr->partitionData[model].EV,
				      tr->partitionData[model].tipVector,
				      ex3, tipX1, tipX2, width, left, right, wgt,
				      &scalerIncrement, tr->useFastScaling,
				      x1_presenceMap, x2_presenceMap);
#else
		  newviewGTRGAMMA(tipCase, x1_start, x2_start, x3_start,
				  tr->partitionData[model].EV,
				  tr->partitionData[model].tipVector,
				  ex3, tipX1, tipX2, width, left, right, wgt,
				  &scalerIncrement, tr->useFastScaling,
				  x1_presenceMap, x2_presenceMap);
#endif
		  break;
		default:
		  assert(0);
		}
	      break;
	    case AA_DATA:
	      switch(tr->rateHetModel)
		{
		case CAT:
		  makeP(pz, qz, tr->partitionData[model].perSiteRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			tr->partitionData[model].numberOfCategories, left, right,
			AA_DATA, tr->saveMemory, tr->maxCategories);

#ifdef __AVX
		  newviewGTRCATPROT_AVX(tipCase, tr->partitionData[model].EV, rateCategory,
					x1_start, x2_start, x3_start,
					tr->partitionData[model].tipVector,
					ex3, tipX1, tipX2, width, left, right, wgt,
					&scalerIncrement, tr->useFastScaling);
#else
		  newviewGTRCATPROT(tipCase, tr->partitionData[model].EV, rateCategory,
				    x1_start, x2_start, x3_start,
				    tr->partitionData[model].tipVector,
				    ex3, tipX1, tipX2, width, left, right, wgt,
				    &scalerIncrement, tr->useFastScaling);
#endif
		  break;
		case GAMMA:
		case GAMMA_I:
		  makeP(pz, qz, tr->partitionData[model].gammaRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			4, left, right, AA_DATA, tr->saveMemory, tr->maxCategories);

#ifdef __AVX
		  newviewGTRGAMMAPROT_AVX(tipCase, x1_start, x2_start, x3_start,
					  tr->partitionData[model].EV,
					  tr->partitionData[model].tipVector,
					  ex3, tipX1, tipX2, width, left, right, wgt,
					  &scalerIncrement, tr->useFastScaling);
#else
		  newviewGTRGAMMAPROT(tipCase, x1_start, x2_start, x3_start,
				      tr->partitionData[model].EV,
				      tr->partitionData[model].tipVector,
				      ex3, tipX1, tipX2, width, left, right, wgt,
				      &scalerIncrement, tr->useFastScaling);
#endif
		  break;
		default:
		  assert(0);
		}
	      break;
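	    /* Secondary-structure partitions: SECONDARY_DATA is the 16-state
	       pair alphabet, SECONDARY_DATA_6 and SECONDARY_DATA_7 the reduced
	       6- and 7-state encodings. */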
	    case SECONDARY_DATA:
	      switch(tr->rateHetModel)
		{
		case CAT:
		  makeP(pz, qz, tr->partitionData[model].perSiteRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			tr->partitionData[model].numberOfCategories, left, right,
			SECONDARY_DATA, tr->saveMemory, tr->maxCategories);

		  newviewGTRCATSECONDARY(tipCase, tr->partitionData[model].EV, rateCategory,
					 x1_start, x2_start, x3_start,
					 tr->partitionData[model].tipVector,
					 ex3, tipX1, tipX2, width, left, right, wgt,
					 &scalerIncrement, tr->useFastScaling);
		  break;
		case GAMMA:
		case GAMMA_I:
		  makeP(pz, qz, tr->partitionData[model].gammaRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			4, left, right, SECONDARY_DATA, tr->saveMemory, tr->maxCategories);

		  newviewGTRGAMMASECONDARY(tipCase, x1_start, x2_start, x3_start,
					   tr->partitionData[model].EV,
					   tr->partitionData[model].tipVector,
					   ex3, tipX1, tipX2, width, left, right, wgt,
					   &scalerIncrement, tr->useFastScaling);
		  break;
		default:
		  assert(0);
		}
	      break;
	    case SECONDARY_DATA_6:
	      switch(tr->rateHetModel)
		{
		case CAT:
		  makeP(pz, qz, tr->partitionData[model].perSiteRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			tr->partitionData[model].numberOfCategories, left, right,
			SECONDARY_DATA_6, tr->saveMemory, tr->maxCategories);

		  newviewGTRCATSECONDARY_6(tipCase, tr->partitionData[model].EV, rateCategory,
					   x1_start, x2_start, x3_start,
					   tr->partitionData[model].tipVector,
					   ex3, tipX1, tipX2, width, left, right, wgt,
					   &scalerIncrement, tr->useFastScaling);
		  break;
		case GAMMA:
		case GAMMA_I:
		  makeP(pz, qz, tr->partitionData[model].gammaRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			4, left, right, SECONDARY_DATA_6, tr->saveMemory, tr->maxCategories);

		  newviewGTRGAMMASECONDARY_6(tipCase, x1_start, x2_start, x3_start,
					     tr->partitionData[model].EV,
					     tr->partitionData[model].tipVector,
					     ex3, tipX1, tipX2, width, left, right, wgt,
					     &scalerIncrement, tr->useFastScaling);
		  break;
		default:
		  assert(0);
		}
	      break;
	    case SECONDARY_DATA_7:
	      switch(tr->rateHetModel)
		{
		case CAT:
		  makeP(pz, qz, tr->partitionData[model].perSiteRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			tr->partitionData[model].numberOfCategories, left, right,
			SECONDARY_DATA_7, tr->saveMemory, tr->maxCategories);

		  newviewGTRCATSECONDARY_7(tipCase, tr->partitionData[model].EV, rateCategory,
					   x1_start, x2_start, x3_start,
					   tr->partitionData[model].tipVector,
					   ex3, tipX1, tipX2, width, left, right, wgt,
					   &scalerIncrement, tr->useFastScaling);
		  break;
		case GAMMA:
		case GAMMA_I:
		  makeP(pz, qz, tr->partitionData[model].gammaRates,
			tr->partitionData[model].EI, tr->partitionData[model].EIGN,
			4, left, right, SECONDARY_DATA_7, tr->saveMemory, tr->maxCategories);

		  newviewGTRGAMMASECONDARY_7(tipCase, x1_start, x2_start, x3_start,
					     tr->partitionData[model].EV,
					     tr->partitionData[model].tipVector,
					     ex3, tipX1, tipX2, width, left, right, wgt,
					     &scalerIncrement, tr->useFastScaling);
		  break;
		default:
		  assert(0);
		}
	      break;
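	    /* Generic fall-back for alphabets with up to 32 states: the
	       flexible kernels take the state count as a run-time parameter
	       instead of hard-coding it. */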
	    case GENERIC_32:
	      {
		int
		  states = tr->partitionData[model].states;

		switch(tr->rateHetModel)
		  {
		  case CAT:
		    {
		      makeP_Flex(pz, qz, tr->partitionData[model].perSiteRates,
				 tr->partitionData[model].EI, tr->partitionData[model].EIGN,
				 tr->partitionData[model].numberOfCategories, left, right, states);

		      newviewFlexCat(tipCase, tr->partitionData[model].EV, rateCategory,
				     x1_start, x2_start, x3_start,
				     tr->partitionData[model].tipVector,
				     ex3, tipX1, tipX2, width, left, right, wgt,
				     &scalerIncrement, tr->useFastScaling, states);
		    }
		    break;
		  case GAMMA:
		  case GAMMA_I:
		    {
		      makeP_Flex(pz, qz, tr->partitionData[model].gammaRates,
				 tr->partitionData[model].EI, tr->partitionData[model].EIGN,
				 4, left, right, states);

		      newviewFlexGamma(tipCase, x1_start, x2_start, x3_start,
				       tr->partitionData[model].EV,
				       tr->partitionData[model].tipVector,
				       ex3, tipX1, tipX2, width, left, right, wgt,
				       &scalerIncrement, tr->useFastScaling, states);
		    }
		    break;
		  default:
		    assert(0);
		  }
	      }
	      break;
	    default:
	      assert(0);
	    }
	}

      columnCounter += width;
      offsetCounter += width * tr->partitionData[model].states * tr->discreteRateCategories;
    }

#ifdef _DEBUG_MULTI_EPA
  if(tr->threadID == THREAD_TO_DEBUG)
    printf("\n");
#endif

  resetPartitionMask(tr, tr->executeModel);
}

/* Computes the conditional likelihood vector at the virtual root placed on
   branch b for an EPA insertion; the left/right vectors stored in b->epa are
   combined via newviewMultiGrain() and the result is written into
   tr->temporaryVector (with scaling counts in tr->temporaryScaling). */

void newviewClassify(tree *tr, branchInfo *b, double *z, int insertion)
{
  int
    leftNumber = b->epa->leftNodeNumber,
    rightNumber = b->epa->rightNodeNumber,
    tipCase = -1,
    *ex1 = (int*)NULL,
    *ex2 = (int*)NULL,
    *ex3 = tr->temporaryScaling;

  double
    *x1_start = (double*)NULL,
    *x2_start = (double*)NULL,
    *x3_start = tr->temporaryVector;

  unsigned char
    *tipX1 = (unsigned char*)NULL,
    *tipX2 = (unsigned char*)NULL;

  if(isTip(leftNumber, tr->mxtips) && isTip(rightNumber, tr->mxtips))
    {
      tipCase = TIP_TIP;

      tipX1 = tr->contiguousTips[leftNumber];
      tipX2 = tr->contiguousTips[rightNumber];
    }
  else
    {
      if(isTip(leftNumber, tr->mxtips))
	{
	  tipCase = TIP_INNER;

	  tipX1 = tr->contiguousTips[leftNumber];

	  x2_start = b->epa->right;
	  ex2 = b->epa->rightScaling;
	}
      else
	{
	  if(isTip(rightNumber, tr->mxtips))
	    {
	      tipCase = TIP_INNER;

	      tipX1 = tr->contiguousTips[rightNumber];

	      x2_start = b->epa->left;
	      ex2 = b->epa->leftScaling;
	    }
	  else
	    {
	      tipCase = INNER_INNER;

	      x1_start = b->epa->left;
	      ex1 = b->epa->leftScaling;

	      x2_start = b->epa->right;
	      ex2 = b->epa->rightScaling;
	    }
	}
    }

  newviewMultiGrain(tr, x1_start, x2_start, x3_start, ex1, ex2, ex3,
		    tipX1, tipX2, tipCase, z, z, insertion);
}

#endif
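/* Usage sketch (illustrative only, not part of the original code base): after
   a topology or branch-length change that invalidates the vectors below an
   inner node p, a caller brings them up to date before evaluating anything at
   p's branch, e.g.:

     nodeptr p = ...;          // inner node whose subtree was modified
     newviewGeneric(tr, p);    // recompute conditional likelihood vectors
     // ... then evaluate the likelihood or optimize the branch at p ...

   Which routine follows depends on the calling context (full likelihood
   evaluation, branch-length optimization, or EPA insertion scoring). */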