/** @file TreePiece.cpp */

#include <cstdio>
#include <algorithm>
#include <fstream>
#include <assert.h>

// jetley
#include "limits.h"

#include "ParallelGravity.h"
#include "DataManager.h"
#include "Reductions.h"
// jetley
#include "MultistepLB.h"
#include "MultistepLB_notopo.h"
#include "MultistepNodeLB_notopo.h"
#include "Orb3dLB.h"
#include "Orb3dLB_notopo.h"
#include "HierarchOrbLB.h"
// jetley - refactoring
//#include "codes.h"
#include "Opt.h"
#include "Compute.h"
#include "TreeWalk.h"
//#include "State.h"

#include "Space.h"
#include "gravity.h"
#include "smooth.h"

#include "PETreeMerger.h"
#include "IntraNodeLBManager.h"

#include "CkLoopAPI.h"

#if !CMK_LB_USER_DATA
#error "Please recompile charm with --enable-lbuserdata"
#endif

#ifdef CUDA
#ifdef CUDA_INSTRUMENT_WRS
#define GPU_INSTRUMENT_WRS
#include "wr.h"
#endif
#endif

/*
// uncomment when using cuda version of charm but running without GPUs
struct workRequest;
void kernelSelect(workRequest *wr) {
}
*/

#ifdef PUSH_GRAVITY
#include "ckmulticast.h"
#endif

using namespace std;
using namespace SFC;
using namespace TreeStuff;
using namespace TypeHandling;

int TreeStuff::maxBucketSize;

#ifdef PUSH_GRAVITY
extern CkGroupID ckMulticastGrpId;
#endif

CkpvExtern(int, _lb_obj_index);

//forward declarations
string getColor(GenericTreeNode*);
const char *typeString(NodeType type);

/*
 * Set periodic information in all the TreePieces.
 */
void TreePiece::setPeriodic(int nRepsPar,      // Number of replicas in
                                               // each direction
                            Vector3D<double> fPeriodPar, // Size of periodic box
                            int bEwaldPar,     // Use Ewald summation
                            double fEwCutPar,  // Cutoff on real summation
                            double dEwhCutPar, // Cutoff on Fourier summation
                            int bPeriodPar,    // Periodic boundaries
                            int bComovePar,    // Comoving coordinates
                            double dRhoFacPar  // Background density
                            )
{
    nReplicas = nRepsPar;
    fPeriod = fPeriodPar;
    bEwald = bEwaldPar;
    fEwCut = fEwCutPar;
    dEwhCut = dEwhCutPar;
    bPeriodic = bPeriodPar;
    bComove = bComovePar;
    dRhoFac = dRhoFacPar;
    if(ewt == NULL) {
        ewt = new EWT[nMaxEwhLoop];
    }
}

// Scale velocities (needed to convert to canonical momenta for
// comoving coordinates).
void TreePiece::velScale(double dScale, const CkCallback& cb)
{
    for(unsigned int i = 0; i < myNumParticles; ++i) {
        myParticles[i+1].velocity *= dScale;
        if(TYPETest(&myParticles[i+1], TYPE_GAS))
            myParticles[i+1].vPred() *= dScale;
    }
    contribute(cb);
}
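// Illustrative sketch (not compiled): how a particle position can be mapped
// to a space-filling-curve key once the bounding box has been made cubical,
// in the spirit of the generateKey() call used by assignKeys() below.  The
// names exampleMakeKey and BITS_PER_DIM are hypothetical; the real
// implementation lives in the SFC module and may differ in bit layout.
#if 0
#include <cstdint>

static uint64_t exampleMakeKey(float x, float y, float z,
                               const float corner[3], float size) {
    const int BITS_PER_DIM = 21;  // 3*21 = 63 key bits
    const uint64_t maxCoord = (1ULL << BITS_PER_DIM) - 1;
    // Normalize each coordinate to [0,1) inside the cubical box, then
    // quantize onto an integer grid.
    uint64_t ix = (uint64_t)(((x - corner[0]) / size) * maxCoord);
    uint64_t iy = (uint64_t)(((y - corner[1]) / size) * maxCoord);
    uint64_t iz = (uint64_t)(((z - corner[2]) / size) * maxCoord);
    // Interleave the bits (Morton order) so nearby positions get nearby keys.
    uint64_t key = 0;
    for (int b = BITS_PER_DIM - 1; b >= 0; --b) {
        key = (key << 3) | (((ix >> b) & 1) << 2)
                         | (((iy >> b) & 1) << 1)
                         |  ((iz >> b) & 1);
    }
    return key;
}
#endif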
/// After the bounding box has been found, we can assign keys to the particles
void TreePiece::assignKeys(CkReductionMsg* m) {
    if(m->getSize() != sizeof(OrientedBox<float>)) {
        ckerr << thisIndex << ": TreePiece: Fatal: Wrong size reduction message received!" << endl;
        CkAssert(0);
        callback.send(0);
        delete m;
        return;
    }

    boundingBox = *static_cast<OrientedBox<float> *>(m->getData());
    delete m;
    if(thisIndex == 0 && verbosity > 1)
        ckout << "TreePiece: Bounding box originally: " << boundingBox << endl;
    //give particles keys, using bounding box to scale
    if((domainDecomposition != ORB_dec)
       && (domainDecomposition != ORB_space_dec)
       && myNumParticles > 0) {
        // get longest axis
        Vector3D<float> bsize = boundingBox.size();
        float max = (bsize.x > bsize.y) ? bsize.x : bsize.y;
        max = (max > bsize.z) ? max : bsize.z;
        //
        // Make the bounding box cubical.
        //
        Vector3D<float> bcenter = boundingBox.center();
        const float fEps = 1.0 + 9.5e-7; // slop to ensure keys fall
                                         // between 0 and 1.
        bsize = Vector3D<float>(fEps*0.5*max);
        boundingBox = OrientedBox<float>(bcenter-bsize, bcenter+bsize);
        if(thisIndex == 0 && verbosity > 1)
            ckout << "TreePiece: Bounding box now: " << boundingBox << endl;

        myParticles[0].key = firstPossibleKey;
        myParticles[myNumParticles+1].key = lastPossibleKey;
        for(unsigned int i = 0; i < myNumParticles; ++i) {
            myParticles[i+1].key = generateKey(myParticles[i+1].position,
                                               boundingBox);
        }
        sort(&myParticles[1], &myParticles[myNumParticles+1]);
    }

#if COSMO_DEBUG > 1
    char fout[100];
    sprintf(fout,"tree.%d.%d.before",thisIndex,iterationNo);
    ofstream ofs(fout);
    for (int i=1; i<=myNumParticles; ++i)
        ofs << keyBits(myParticles[i].key,KeyBits) << " "
            << myParticles[i].position[0] << " "
            << myParticles[i].position[1] << " "
            << myParticles[i].position[2] << endl;
    ofs.close();
#endif

    if(verbosity >= 5)
        cout << thisIndex << ": TreePiece: Assigned keys to all my particles" << endl;

    contribute(callback);
}

/**************ORB Decomposition***************/

/// Three comparison routines used in sort and upper_bound
/// to order particles in each of three dimensions, respectively
bool comp_dim0(GravityParticle p1, GravityParticle p2) {
    return p1.position[0] < p2.position[0];
}
bool comp_dim1(GravityParticle p1, GravityParticle p2) {
    return p1.position[1] < p2.position[1];
}
bool comp_dim2(GravityParticle p1, GravityParticle p2) {
    return p1.position[2] < p2.position[2];
}

///Initialize stuff before doing ORB decomposition
void TreePiece::initORBPieces(const CkCallback& cb){

    OrientedBox<float> box = boundingBox;
    orbBoundaries.clear();
    orbBoundaries.push_back(myParticles+1);
    orbBoundaries.push_back(myParticles+myNumParticles+1);
    firstTime = true;

    phase = 0;

    //Initialize function pointers
    compFuncPtr[0] = &comp_dim0;
    compFuncPtr[1] = &comp_dim1;
    compFuncPtr[2] = &comp_dim2;

    myBinCountsORB.clear();
    myBinCountsORB.push_back(myNumParticles);

    //Find out how many levels will be there before we go into the tree owned
    //completely by the TreePiece
    chunkRootLevel = 0;
    unsigned int tmp = numTreePieces;
    while(tmp){
        tmp >>= 1;
        chunkRootLevel++;
    }
    chunkRootLevel--;

    boxes = new OrientedBox<float>[chunkRootLevel+1];
    splitDims = new char[chunkRootLevel+1];

    boxes[0] = boundingBox;

    contribute(sizeof(OrientedBox<float>), &box, boxReduction, cb);
}
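// Illustrative sketch (not compiled): the idea behind the ORB (Orthogonal
// Recursive Bisection) decomposition set up in initORBPieces() above.  Each
// phase splits every active domain along one coordinate so that the two
// halves hold (nearly) equal particle counts.  exampleOrbSplit and
// ExamplePart are hypothetical; the real code does this collectively via
// ORBSplittersMsg exchanges and evaluateParticleCounts().
#if 0
#include <algorithm>

struct ExamplePart { double pos[3]; };

// Partition [begin,end) about the median along dimension dim and return the
// split point; the caller recurses on both halves with the next dimension.
static ExamplePart *exampleOrbSplit(ExamplePart *begin, ExamplePart *end,
                                    int dim) {
    ExamplePart *mid = begin + (end - begin) / 2;
    std::nth_element(begin, mid, end,
                     [dim](const ExamplePart &a, const ExamplePart &b) {
                         return a.pos[dim] < b.pos[dim];
                     });
    return mid;
}
#endif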
/// Allocate memory for sorted particles.
/// @param cb callback after everything is sorted.
/// @param cback callback to perform now.
void TreePiece::initBeforeORBSend(unsigned int myCount, unsigned int myCountGas,
                                  unsigned int myCountStar,
                                  const CkCallback& cb,
                                  const CkCallback& cback){

    callback = cb;
    CkCallback nextCallback = cback;
    if(numTreePieces == 1) {
        myCount = myNumParticles;
        myCountGas = myNumSPH;
        myCountStar = myNumStar;
    }
    myExpectedCount = myCount;
    myExpectedCountSPH = myCountGas;
    myExpectedCountStar = myCountStar;

    mySortedParticles.clear();
    mySortedParticles.reserve(myExpectedCount);
    mySortedParticlesSPH.clear();
    mySortedParticlesSPH.reserve(myExpectedCountSPH);
    mySortedParticlesStar.clear();
    mySortedParticlesStar.reserve(myExpectedCountStar);

    contribute(nextCallback);
}

void TreePiece::sendORBParticles(){

    std::list<GravityParticle *>::iterator iter;
    std::list<GravityParticle *>::iterator iter2;

    int i = 0;
    int nCounts = myBinCountsORB.size()/3; // Gas counts are in second
                                           // third and star counts are
                                           // in last third
    for(iter = orbBoundaries.begin(); iter != orbBoundaries.end(); iter++, i++){
        iter2 = iter;
        iter2++;
        if(iter2 == orbBoundaries.end())
            break;
        if(myBinCountsORB[i] > 0) {
            extraSPHData *pGasOut = NULL;
            if(myBinCountsORB[nCounts+i] > 0) {
                pGasOut = new extraSPHData[myBinCountsORB[nCounts+i]];
                if (verbosity>=3)
                    CkPrintf("me:%d to:%d nPart :%d, nGas:%d\n", thisIndex, i,
                             *iter2 - *iter, myBinCountsORB[nCounts+i]);
                int iGasOut = 0;
                for(GravityParticle *pPart = *iter; pPart < *iter2; pPart++) {
                    if(TYPETest(pPart, TYPE_GAS)) {
                        pGasOut[iGasOut] = *(extraSPHData *)pPart->extraData;
                        iGasOut++;
                    }
                }
            }
            extraStarData *pStarOut = NULL;
            if(myBinCountsORB[2*nCounts+i] > 0) {
                pStarOut = new extraStarData[myBinCountsORB[2*nCounts+i]];
                int iStarOut = 0;
                for(GravityParticle *pPart = *iter; pPart < *iter2; pPart++) {
                    if(pPart->isStar()) {
                        pStarOut[iStarOut] = *(extraStarData *)pPart->extraData;
                        iStarOut++;
                    }
                }
            }

            if(i == thisIndex){
                acceptORBParticles(*iter, myBinCountsORB[i], pGasOut,
                                   myBinCountsORB[nCounts+i], pStarOut,
                                   myBinCountsORB[2*nCounts+i]);
            }
            else{
                pieces[i].acceptORBParticles(*iter, myBinCountsORB[i], pGasOut,
                                             myBinCountsORB[nCounts+i], pStarOut,
                                             myBinCountsORB[2*nCounts+i]);
            }
            if(pGasOut != NULL)
                delete[] pGasOut;
            if(pStarOut != NULL)
                delete[] pStarOut;
        }
    }

    if(myExpectedCount > (int) myNumParticles){
        delete [] myParticles;
        nStore = (int)((myExpectedCount + 2)*(1.0 + dExtraStore));
        myParticles = new GravityParticle[nStore];
    }
    myNumParticles = myExpectedCount;

    if(myExpectedCountSPH > (int) myNumSPH){
        if(nStoreSPH > 0) delete [] mySPHParticles;
        nStoreSPH = (int)(myExpectedCountSPH*(1.0 + dExtraStore));
        mySPHParticles = new extraSPHData[nStoreSPH];
    }
    myNumSPH = myExpectedCountSPH;

    if(myExpectedCountStar > (int) myNumStar){
        delete [] myStarParticles;
        allocateStars();
    }
    myNumStar = myExpectedCountStar;

    if(myExpectedCount == 0) // No particles.  Make sure transfer is
                             // complete
        acceptORBParticles(NULL, 0, NULL, 0, NULL, 0);
}
/// Accept particles from other TreePieces once the sorting has finished
void TreePiece::acceptORBParticles(const GravityParticle* particles, const int n,
                                   const extraSPHData *pGas, const int nGasIn,
                                   const extraStarData *pStar, const int nStarIn) {

    copy(particles, particles + n, back_inserter(mySortedParticles));
    copy(pGas, pGas + nGasIn, back_inserter(mySortedParticlesSPH));
    copy(pStar, pStar + nStarIn, back_inserter(mySortedParticlesStar));

    if(myExpectedCount == mySortedParticles.size()) {
        //I've got all my particles
        //Assigning keys to particles
        for(int i = 0; i < myExpectedCount; i++){
            mySortedParticles[i].key = thisIndex;
        }
        copy(mySortedParticles.begin(), mySortedParticles.end(),
             &myParticles[1]);
        //Copy the gas and star data and point extraData at it
        int iGas = 0;
        int iStar = 0;
        for(int i = 0; i < myExpectedCount; i++){
            if(TYPETest(&myParticles[i+1], TYPE_GAS)) {
                mySPHParticles[iGas] = mySortedParticlesSPH[iGas];
                myParticles[i+1].extraData = &mySPHParticles[iGas];
                iGas++;
            }
            else if(TYPETest(&myParticles[i+1], TYPE_STAR)) {
                myStarParticles[iStar] = mySortedParticlesStar[iStar];
                myParticles[i+1].extraData = &myStarParticles[iStar];
                iStar++;
            }
            else
                myParticles[i+1].extraData = NULL;
        }
        //signify completion with a reduction
        if(verbosity > 1)
            ckout << thisIndex << " contributing to accept particles" << endl;
        contribute(callback);
    }
}

/// Update the ORB bounding boxes and particle boundaries once the
/// final split positions for this phase have been decided.
void TreePiece::finalizeBoundaries(ORBSplittersMsg *splittersMsg){

    CkCallback& cback = splittersMsg->cb;

    std::list<GravityParticle *>::iterator iter;
    std::list<GravityParticle *>::iterator iter2;

    iter = orbBoundaries.begin();
    iter2 = orbBoundaries.begin();
    iter2++;

    phase++;

    int index = thisIndex >> (chunkRootLevel-phase+1);

    Key lastBit;
    lastBit = thisIndex >> (chunkRootLevel-phase);
    lastBit = lastBit & 0x1;

    boxes[phase] = boxes[phase-1];
    if(lastBit){
        boxes[phase].lesser_corner[splittersMsg->dim[index]] = splittersMsg->pos[index];
    }
    else{
        boxes[phase].greater_corner[splittersMsg->dim[index]] = splittersMsg->pos[index];
    }

    splitDims[phase-1] = splittersMsg->dim[index];

    for(int i = 0; i < splittersMsg->length; i++){
        int dimen = (int)splittersMsg->dim[i];
        //Current location of the division is stored in a variable
        //Evaluate the number of particles in each division
        GravityParticle dummy;
        Vector3D<double> divide(0.0,0.0,0.0);
        divide[dimen] = splittersMsg->pos[i];
        dummy.position = divide;
        GravityParticle* divEnd = upper_bound(*iter, *iter2, dummy,
                                              compFuncPtr[dimen]);

        orbBoundaries.insert(iter2, divEnd);
        iter = iter2;
        iter2++;
    }

    firstTime = true;

    // First part is total particles, second part is gas counts, third
    // part is star counts
    myBinCountsORB.assign(6*splittersMsg->length, 0);
    copy(tempBinCounts.begin(), tempBinCounts.end(), myBinCountsORB.begin());

    delete splittersMsg;
    contribute(cback);
}
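// Illustrative sketch (not compiled): the layout of the count array filled by
// evaluateParticleCounts() below and consumed by finalizeBoundaries() above.
// For nSplits proposed splits, the array holds 2*nSplits (low,high) pairs for
// all particles, then the same for gas, then for stars.  The example*Idx
// helper names are hypothetical.
#if 0
static inline int exampleAllIdx(int i, bool high)
    { return 2*i + (high ? 1 : 0); }
static inline int exampleGasIdx(int nSplits, int i, bool high)
    { return 2*nSplits + 2*i + (high ? 1 : 0); }
static inline int exampleStarIdx(int nSplits, int i, bool high)
    { return 4*nSplits + 2*i + (high ? 1 : 0); }
#endif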
/// @brief Evaluate particle counts for ORB decomposition
/// @param splittersMsg A message containing splitting dimensions and
/// positions, and the callback to contribute to.
///
/// Counts the particles of this treepiece on each side of the
/// splits.  These counts are summed in a contribution to the
/// specified callback.
void TreePiece::evaluateParticleCounts(ORBSplittersMsg *splittersMsg)
{
    CkCallback& cback = splittersMsg->cb;

    // For each split, BinCounts has total lower, total higher.
    // The second third of the array has the counts for gas particles.
    // The last third of the array has the counts for star particles.
    tempBinCounts.assign(6*splittersMsg->length, 0);

    std::list<GravityParticle *>::iterator iter;
    std::list<GravityParticle *>::iterator iter2;

    iter = orbBoundaries.begin();
    iter2 = orbBoundaries.begin();
    iter2++;

    for(int i = 0; i < splittersMsg->length; i++){
        int dimen = (int)splittersMsg->dim[i];
        if(firstTime){
            sort(*iter, *iter2, compFuncPtr[dimen]);
        }
        //Evaluate the number of particles in each division
        GravityParticle dummy;
        GravityParticle* divStart = *iter;
        Vector3D<double> divide(0.0,0.0,0.0);
        divide[dimen] = splittersMsg->pos[i];
        dummy.position = divide;
        GravityParticle* divEnd = upper_bound(*iter, *iter2, dummy,
                                              compFuncPtr[dimen]);
        tempBinCounts[2*i] = divEnd - divStart;
        tempBinCounts[2*i + 1] = myBinCountsORB[i] - (divEnd - divStart);

        int nGasLow = 0;
        int nGasHigh = 0;
        int nStarLow = 0;
        int nStarHigh = 0;
        for(GravityParticle *pPart = divStart; pPart < divEnd; pPart++) {
            // Count gas
            if(TYPETest(pPart, TYPE_GAS)) nGasLow++;
            // Count stars
            if(TYPETest(pPart, TYPE_STAR)) nStarLow++;
        }
        for(GravityParticle *pPart = divEnd; pPart < *iter2; pPart++) {
            // Count gas
            if(TYPETest(pPart, TYPE_GAS)) nGasHigh++;
            // Count stars
            if(TYPETest(pPart, TYPE_STAR)) nStarHigh++;
        }
        tempBinCounts[2*splittersMsg->length + 2*i] = nGasLow;
        tempBinCounts[2*splittersMsg->length + 2*i + 1] = nGasHigh;
        tempBinCounts[4*splittersMsg->length + 2*i] = nStarLow;
        tempBinCounts[4*splittersMsg->length + 2*i + 1] = nStarHigh;

        iter++;
        iter2++;
    }

    if(firstTime)
        firstTime = false;
    contribute(6*splittersMsg->length*sizeof(int), &(*tempBinCounts.begin()),
               CkReduction::sum_int, cback);
    delete splittersMsg;
}

#ifdef REDUCTION_HELPER

void ReductionHelper::evaluateBoundaries(SFC::Key* keys, const int n,
                                         int skipEvery, const CkCallback& cb){
    splitters.assign(keys, keys + n);

    if(localTreePieces.presentTreePieces.size() == 0){
        int numBins = skipEvery ? n - (n-1)/(skipEvery+1) - 1 : n - 1;
        int64_t *dummy = new int64_t[numBins];
        for(int i = 0; i < numBins; i++)
            dummy[i] = 0;
        contribute(sizeof(int64_t)*numBins, dummy, CkReduction::sum_long, cb);
        delete [] dummy;
        return;
    }

    for(int i = 0; i < localTreePieces.presentTreePieces.size(); i++){
        localTreePieces.presentTreePieces[i]->evaluateBoundaries(keys, n,
                                                                 skipEvery, cb);
    }
}

void ReductionHelper::evaluateBoundaries(const CkBitVector &binsToSplit,
                                         const CkCallback& cb) {
    std::vector<SFC::Key> newSplitters;
    SFC::Key leftBound, rightBound;

    newSplitters.reserve(splitters.size() * 4);
    newSplitters.push_back(SFC::firstPossibleKey);
    for (int i = 0; i < binsToSplit.Length(); i++) {
        if (binsToSplit.Test(i) == true) {
            leftBound = splitters[i];
            rightBound = splitters[i + 1];
            if (newSplitters.back() != (rightBound | 7L)) {
                if (newSplitters.back() != (leftBound | 7L)) {
                    newSplitters.push_back(leftBound | 7L);
                }
                newSplitters.push_back((leftBound / 4 * 3 + rightBound / 4) | 7L);
                newSplitters.push_back((leftBound / 2 + rightBound / 2) | 7L);
                newSplitters.push_back((leftBound / 4 + rightBound / 4 * 3) | 7L);
                newSplitters.push_back(rightBound | 7L);
            }
        }
    }
    if (newSplitters.back() != lastPossibleKey) {
        newSplitters.push_back(lastPossibleKey);
    }

    evaluateBoundaries(&newSplitters[0], newSplitters.size(), 0, cb);
}

#endif
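// Illustrative sketch (not compiled): refining one histogram bin into
// quarters, as ReductionHelper::evaluateBoundaries() does above.  OR-ing a
// candidate key with 7 sets its low three bits, keeping refined splitters
// aligned the same way as the originals; the midpoints use the
// divide-then-add form to avoid 64-bit overflow.  exampleRefineBin is a
// hypothetical name.
#if 0
#include <cstdint>
#include <vector>

static void exampleRefineBin(uint64_t lo, uint64_t hi,
                             std::vector<uint64_t> &out) {
    out.push_back(lo | 7ULL);
    out.push_back((lo / 4 * 3 + hi / 4) | 7ULL); // 1/4 point
    out.push_back((lo / 2 + hi / 2) | 7ULL);     // midpoint
    out.push_back((lo / 4 + hi / 4 * 3) | 7ULL); // 3/4 point
    out.push_back(hi | 7ULL);
}
#endif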
/// Determine my part of the sorting histograms by counting the number
/// of my particles in each bin.
/// This routine assumes the particles are in key order.
/// The parameter skipEvery means that for every "skipEvery" bins
/// counted, one must be skipped.  When skipEvery is set, the keys are
/// in groups of "skipEvery" size, and only splits within each group
/// need to be evaluated.  Hence the counts between the end of one
/// group and the start of the next group are not evaluated.  This
/// feature is used by the Oct decomposition.
void TreePiece::evaluateBoundaries(SFC::Key* keys, const int n, int skipEvery,
                                   const CkCallback& cb){
#ifdef COSMO_EVENTS
    double startTimer = CmiWallTimer();
#endif

    int numBins = skipEvery ? n - (n-1)/(skipEvery+1) - 1 : n - 1;

    //this array will contain the number of particles I own in each bin
    int64_t *myCounts;
#ifdef REDUCTION_HELPER
    myCounts = new int64_t[numBins];
#else
    //myBinCounts.assign(numBins, 0);
    myBinCounts.resize(numBins);
    myCounts = myBinCounts.getVec();
#endif
    memset(myCounts, 0, numBins*sizeof(int64_t));

    if (myNumParticles > 0) {
        Key* endKeys = keys+n;
        GravityParticle *binBegin = &myParticles[1];
        GravityParticle *binEnd;
        GravityParticle dummy;
        GravityParticle *interpolatedBound;
        GravityParticle *refinedLowerBound;
        GravityParticle *refinedUpperBound;
        //int binIter = 0;
        //vector<int64_t>::iterator binIter = myBinCounts.begin();
        //vector<Key>::iterator keyIter = dm->boundaryKeys.begin();
        Key* keyIter = lower_bound(keys, keys+n, binBegin->key);
        int binIter = skipEvery
            ? (keyIter-keys) - (keyIter-keys-1) / (skipEvery+1) - 1
            : keyIter - keys - 1;
        int skip = skipEvery ? skipEvery - (keyIter-keys-1) % (skipEvery+1)
            : -1;
        if (binIter == -1) {
            dummy.key = keys[0];
            binBegin = upper_bound(binBegin, &myParticles[myNumParticles+1],
                                   dummy);
            keyIter++;
            binIter++;
            skip = skipEvery ? skipEvery : -1;
        }
        for( ; keyIter != endKeys; ++keyIter) {
            dummy.key = *keyIter;

            if (domainDecomposition == SFC_dec ||
                domainDecomposition == SFC_peano_dec ||
                domainDecomposition == SFC_peano_dec_3D ||
                domainDecomposition == SFC_peano_dec_2D) {
                // try to guess a better upper bound
                ptrdiff_t remainingParticles =
                    &myParticles[myNumParticles + 1] - binBegin;
                ptrdiff_t remainingBins = endKeys - keyIter;
                ptrdiff_t interpolationInterval = remainingParticles
                    / remainingBins;
                ptrdiff_t scaledInterval = (ptrdiff_t)
                    ((double) interpolationInterval * 1.5);

                if (remainingParticles > scaledInterval) {
                    interpolatedBound = binBegin + scaledInterval;
                } else {
                    interpolatedBound = binBegin + interpolationInterval;
                }

                if (interpolatedBound->key <= dummy.key) {
                    refinedLowerBound = interpolatedBound;
                    refinedUpperBound = &myParticles[myNumParticles + 1];
                } else {
                    refinedLowerBound = binBegin;
                    refinedUpperBound = interpolatedBound;
                }

                /// find the last place I could put this splitter key in
                /// my array of particles
                binEnd = upper_bound(refinedLowerBound, refinedUpperBound,
                                     dummy);
            } else {
                binEnd = upper_bound(binBegin, &myParticles[myNumParticles+1],
                                     dummy);
            }

            /// this tells me the number of particles between the
            /// last two splitter keys
            if (skip != 0) {
                myCounts[binIter] = ((int64_t)(binEnd - binBegin));
                ++binIter;
                --skip;
            } else {
                skip = skipEvery;
            }
            if(&myParticles[myNumParticles+1] <= binEnd)
                break;
            binBegin = binEnd;
        }

#ifdef COSMO_EVENTS
        traceUserBracketEvent(boundaryEvaluationUE, startTimer,
                              CmiWallTimer());
#endif
    }

    //send my bin counts back in a reduction
#ifdef REDUCTION_HELPER
    reductionHelperProxy.ckLocalBranch()->reduceBinCounts(numBins, myCounts,
                                                          cb);
    delete[] myCounts;
#else
    contribute(numBins * sizeof(int64_t), myCounts, CkReduction::sum_long, cb);
#endif
}
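// Illustrative sketch (not compiled): the core of the histogramming loop in
// evaluateBoundaries() above.  With particles sorted by key, the count for
// each bin is the distance between successive upper_bound() positions; the
// interpolation in the real code merely narrows the range that upper_bound
// searches.  exampleCountBins is a hypothetical standalone version.
#if 0
#include <algorithm>
#include <cstdint>
#include <vector>

static void exampleCountBins(const std::vector<uint64_t> &sortedKeys,
                             const std::vector<uint64_t> &splitters,
                             std::vector<int64_t> &counts) {
    counts.assign(splitters.size() - 1, 0);
    // Keys at or below a splitter fall into the bin that splitter closes.
    auto binBegin = std::upper_bound(sortedKeys.begin(), sortedKeys.end(),
                                     splitters.front());
    for (size_t i = 1; i < splitters.size(); ++i) {
        auto binEnd = std::upper_bound(binBegin, sortedKeys.end(),
                                       splitters[i]);
        counts[i-1] = binEnd - binBegin;
        binBegin = binEnd;
    }
}
#endif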
void TreePiece::unshuffleParticlesWoDD(const CkCallback& callback) {
    double tpLoad;
    myShuffleMsg = NULL;
    after_dd_callback = callback;

    if (dm == NULL) {
        dm = (DataManager*)CkLocalNodeBranch(dataManagerID);
    }

    tpLoad = getObjTime();
    populateSavedPhaseData(prevLARung, tpLoad, treePieceActivePartsTmp);

    //find my responsibility
    myPlace = find(dm->responsibleIndex.begin(), dm->responsibleIndex.end(),
                   thisIndex) - dm->responsibleIndex.begin();
    if (myPlace == dm->responsibleIndex.size()) {
        myPlace = -2;
    }

    setNumExpectedNeighborMsgs();

    if (myNumParticles == 0) {
        incomingParticlesSelf = true;
        if (thisIndex == 0) {
            CkCallback cbqd(CkIndex_TreePiece::shuffleAfterQD(), thisProxy);
            CkStartQD(cbqd);
        }
        return;
    }

    sendParticlesDuringDD(true);

    if (thisIndex == 0) {
        CkCallback cbqd(CkIndex_TreePiece::shuffleAfterQD(), thisProxy);
        CkStartQD(cbqd);
    }
}

/*
 * Accepts sorted particles from external TreePieces.
 */
void TreePiece::acceptSortedParticlesFromOther(ParticleShuffleMsg *shuffleMsg) {
    if(shuffleMsg == NULL) {
        return;
    }

    // Copy the particles from shuffleMsg to the tmpShuffle array
    myTmpShuffleParticle.insert(myTmpShuffleParticle.end(),
                                shuffleMsg->particles,
                                shuffleMsg->particles + shuffleMsg->n);

    // Copy the SPH particles from shuffleMsg to the tmpShuffle array
    myTmpShuffleSphParticle.insert(myTmpShuffleSphParticle.end(),
                                   shuffleMsg->pGas,
                                   shuffleMsg->pGas + shuffleMsg->nSPH);

    // Copy the Star particles from shuffleMsg to the tmpShuffle array
    myTmpShuffleStarParticle.insert(myTmpShuffleStarParticle.end(),
                                    shuffleMsg->pStar,
                                    shuffleMsg->pStar + shuffleMsg->nStar);

    incomingParticlesArrived += shuffleMsg->n;
    treePieceLoadTmp += shuffleMsg->load;
    savePhaseData(savedPhaseLoadTmp, savedPhaseParticleTmp, shuffleMsg->loads,
                  shuffleMsg->parts_per_phase, shuffleMsg->nloads);
    delete shuffleMsg;
}
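// Illustrative sketch (not compiled): the quiescence-detection pattern used
// by unshuffleParticlesWoDD() above and shuffleAfterQD() below.  Migration
// messages are sent with no per-message acknowledgement; element 0 registers
// a callback that Charm++ fires once no messages remain in flight anywhere.
// ExampleChare is hypothetical; CkStartQD and the callback are the real
// Charm++ API already used above.
#if 0
void ExampleChare::startShuffle() {
    // ... send all migration messages; no acks required ...
    if (thisIndex == 0) {
        CkCallback cbqd(CkIndex_TreePiece::shuffleAfterQD(), thisProxy);
        CkStartQD(cbqd); // broadcast shuffleAfterQD once the system is idle
    }
}
#endif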
/*
 * When reusing the splitters, we perform the migration of particles and then
 * wait for quiescence.  This method is called once quiescence is detected.
 */
void TreePiece::shuffleAfterQD() {
    // myShuffleMsg holds the particles that came from within this TreePiece
    // (internal transfer).
    if (myShuffleMsg != NULL) {
        incomingParticlesArrived += myShuffleMsg->n;
        treePieceLoadTmp += myShuffleMsg->load;
        savePhaseData(savedPhaseLoadTmp, savedPhaseParticleTmp,
                      myShuffleMsg->loads, myShuffleMsg->parts_per_phase,
                      myShuffleMsg->nloads);
    }

    // This function is called after QD, which means all the particles have
    // arrived; therefore set the particleCounts in the DataManager based on
    // the total external particles that arrived plus the ones that moved
    // within the TreePiece.
    if (myPlace != -2) {
        dm->particleCounts[myPlace] = incomingParticlesArrived;
    }

    if (myPlace == -2 || dm->particleCounts[myPlace] == 0) {
        // Special case where no particle is assigned to this TreePiece
        if (myNumParticles > 0){
            delete[] myParticles;
            myParticles = NULL;
        }
        myNumParticles = 0;
        nStore = 0;
        if (nStoreSPH > 0){
            delete[] mySPHParticles;
            mySPHParticles = NULL;
        }
        myNumSPH = 0;
        nStoreSPH = 0;
        if (nStoreStar > 0){
            delete[] myStarParticles;
            myStarParticles = NULL;
        }
        myNumStar = 0;
        nStoreStar = 0;
        incomingParticlesSelf = false;
        incomingParticlesMsg.clear();

        treePieceLoad = treePieceLoadTmp;
        treePieceLoadTmp = 0.0;
        savedPhaseLoad.swap(savedPhaseLoadTmp);
        savedPhaseParticle.swap(savedPhaseParticleTmp);
        savedPhaseLoadTmp.clear();
        savedPhaseParticleTmp.clear();

        if(verbosity > 1)
            ckout << thisIndex << " no particles assigned" << endl;
        contribute(after_dd_callback);
        return;
    }

    if (myNumParticles > 0)
        delete[] myParticles;
    nStore = (int)((dm->particleCounts[myPlace] + 2)*(1.0 + dExtraStore));
    myParticles = new GravityParticle[nStore];
    myNumParticles = dm->particleCounts[myPlace];
    incomingParticlesArrived = 0;
    incomingParticlesSelf = false;
    treePieceLoad = treePieceLoadTmp;
    treePieceLoadTmp = 0.0;
    savedPhaseLoad.swap(savedPhaseLoadTmp);
    savedPhaseParticle.swap(savedPhaseParticleTmp);
    savedPhaseLoadTmp.clear();
    savedPhaseParticleTmp.clear();

    // Merge all the particles, which includes the ones received from within
    // and from outside
    mergeAllParticlesAndSaveCentroid();

    //signify completion with a reduction
    if(verbosity > 1)
        ckout << thisIndex << " contributing to accept particles" << endl;
    contribute(after_dd_callback);
}
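// Illustrative sketch (not compiled): stripped of the extraData pointer
// fix-up, the particle merge in mergeAllParticlesAndSaveCentroid() below is
// a classic two-way merge of two sorted runs, i.e. std::merge.
// exampleMergeRuns is a hypothetical name.
#if 0
#include <algorithm>
#include <vector>

static void exampleMergeRuns(const std::vector<int> &fromSelf,
                             const std::vector<int> &fromOthers,
                             std::vector<int> &dest) {
    dest.resize(fromSelf.size() + fromOthers.size());
    std::merge(fromSelf.begin(), fromSelf.end(),
               fromOthers.begin(), fromOthers.end(), dest.begin());
}
#endif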
void TreePiece::mergeAllParticlesAndSaveCentroid() {
    int nSPH = myTmpShuffleSphParticle.size();
    int nStar = myTmpShuffleStarParticle.size();
    if (myShuffleMsg != NULL) {
        nSPH += myShuffleMsg->nSPH;
        nStar += myShuffleMsg->nStar;
    }

    if (nStoreSPH > 0) delete[] mySPHParticles;
    myNumSPH = nSPH;
    nStoreSPH = (int)(myNumSPH*(1.0 + dExtraStore));
    if(nStoreSPH > 0)
        mySPHParticles = new extraSPHData[nStoreSPH];
    else
        mySPHParticles = NULL;

    if (nStoreStar > 0) delete[] myStarParticles;
    myNumStar = nStar;
    allocateStars();

    nSPH = 0;
    nStar = 0;

    // myTmpShuffle*Particle holds the particles that came from external
    // TreePieces.
    // Copy the gas and star data from the tmpShuffle arrays.
    memcpy(&mySPHParticles[nSPH], &myTmpShuffleSphParticle[0],
           myTmpShuffleSphParticle.size()*sizeof(extraSPHData));
    nSPH += myTmpShuffleSphParticle.size();

    memcpy(&myStarParticles[nStar], &myTmpShuffleStarParticle[0],
           myTmpShuffleStarParticle.size()*sizeof(extraStarData));
    nStar += myTmpShuffleStarParticle.size();

    int iGas = 0;
    int iStar = 0;
    // Set the location of the gas/star data in the particle data for the
    // particles that came in from an external TreePiece.
    for (int i = 0; i < myTmpShuffleParticle.size(); i++) {
        if (myTmpShuffleParticle[i].isGas()) {
            myTmpShuffleParticle[i].extraData =
                (extraSPHData *) &mySPHParticles[iGas];
            iGas++;
        } else if (myTmpShuffleParticle[i].isStar()) {
            myTmpShuffleParticle[i].extraData =
                (extraStarData *) &myStarParticles[iStar];
            iStar++;
        } else {
            myTmpShuffleParticle[i].extraData = NULL;
        }
    }

    // Now copy the SPH and Star data for particles that moved within the
    // TreePiece
    if (myShuffleMsg != NULL) {
        memcpy(&mySPHParticles[nSPH], myShuffleMsg->pGas,
               (myShuffleMsg->nSPH)*sizeof(extraSPHData));
        nSPH += (myShuffleMsg->nSPH);

        memcpy(&myStarParticles[nStar], myShuffleMsg->pStar,
               (myShuffleMsg->nStar)*sizeof(extraStarData));
        nStar += (myShuffleMsg->nStar);
    }

    // sort is [first, last)
    // Note that the particles received from outside were just appended to
    // myTmpShuffleParticle.  Though the particles within each shuffleMsg are
    // sorted, the concatenation in myTmpShuffleParticle need not be sorted.
    // Sort the items that came from outside; since we expect them to be
    // few, this sort is cheap.
    sort(myTmpShuffleParticle.begin(), myTmpShuffleParticle.end());

    int left, right, tmp;
    left = right = 0;
    tmp = 1;
    int leftend = 0;
    if (myShuffleMsg != NULL) {
        leftend = myShuffleMsg->n;
    }
    int rightend = myTmpShuffleParticle.size();
    Vector3D<double> vCenter(0.0, 0.0, 0.0);

    // Merge sort: myShuffleMsg contains sorted particles and so does
    // myTmpShuffleParticle.  myTmpShuffleParticle contains the particles
    // that were transferred from outside this TreePiece.
    // For the particles that came from outside, the extraData field is
    // already set above.
    // When merging, the pointer to the gas/star data has to be set for the
    // particles that moved within.
    while (left < leftend && right < rightend) {
        if (myShuffleMsg->particles[left] < myTmpShuffleParticle[right]) {
            myParticles[tmp] = myShuffleMsg->particles[left++];
            if (myParticles[tmp].isGas()) {
                myParticles[tmp].extraData =
                    (extraSPHData *) &mySPHParticles[iGas++];
            } else if (myParticles[tmp].isStar()) {
                myParticles[tmp].extraData =
                    (extraStarData *) &myStarParticles[iStar++];
            } else {
                myParticles[tmp].extraData = NULL;
            }
        } else {
            myParticles[tmp] = myTmpShuffleParticle[right++];
        }
        vCenter += myParticles[tmp].position;
        tmp++;
    }

    while (left < leftend) {
        myParticles[tmp] = myShuffleMsg->particles[left++];
        if (myParticles[tmp].isGas()) {
            myParticles[tmp].extraData =
                (extraSPHData *) &mySPHParticles[iGas++];
        } else if (myParticles[tmp].isStar()) {
            myParticles[tmp].extraData =
                (extraStarData *) &myStarParticles[iStar++];
        } else {
            myParticles[tmp].extraData = NULL;
        }
        vCenter += myParticles[tmp].position;
        tmp++;
    }

    while (right < rightend) {
        myParticles[tmp] = myTmpShuffleParticle[right++];
        vCenter += myParticles[tmp].position;
        tmp++;
    }

    // Clear all the tmp data structures which were holding the migrated
    // particles
    myTmpShuffleParticle.clear();
    myTmpShuffleSphParticle.clear();
    myTmpShuffleStarParticle.clear();

    // Save the centroid
    savedCentroid = vCenter/(double)myNumParticles;
}

void TreePiece::setNumExpectedNeighborMsgs() {
    nbor_msgs_count_ = 2;
    // This TreePiece is out of the responsible index range
    if (myPlace == -2) {
        nbor_msgs_count_ = 0;
    }
    // This TreePiece is the first one, so it will only hear from its right
    // neighbor
    if (myPlace == 0) {
        nbor_msgs_count_--;
    }
    // This TreePiece is the last one, so it will only hear from its left
    // neighbor
    if (myPlace == (dm->responsibleIndex.size()-1)) {
        nbor_msgs_count_--;
    }
}

/// Once final splitter keys have been decided, I need to give my
/// particles out to the TreePiece responsible for them
void TreePiece::unshuffleParticles(CkReductionMsg* m){
    double tpLoad;

    if (dm == NULL) {
        dm = (DataManager*)CkLocalNodeBranch(dataManagerID);
    }
    tpLoad = getObjTime();
    populateSavedPhaseData(prevLARung, tpLoad, treePieceActivePartsTmp);
    callback = *static_cast<CkCallback *>(m->getData());

    myPlace = find(dm->responsibleIndex.begin(), dm->responsibleIndex.end(),
                   thisIndex) - dm->responsibleIndex.begin();
    if (myPlace == dm->responsibleIndex.size()) {
        myPlace = -2;
    }

    setNumExpectedNeighborMsgs();
    // CkPrintf("[%d] myplace %d\n", thisIndex, myPlace);

    /*
    if(myNumParticles > 0){
        for(int i = 0; i <= myNumParticles+1; i++){
            CkPrintf("[%d] part %d key %llx\n", thisIndex, i,
                     myParticles[i].key);
        }
    }
    */

    if (myNumParticles == 0) {
        incomingParticlesSelf = true;
        acceptSortedParticles(NULL);
        delete m;
        return;
    }

    sendParticlesDuringDD(false);
    acceptSortedParticles(NULL);
    delete m;
}
void TreePiece::sendParticlesDuringDD(bool withqd) {
    double tpLoad;
    tpLoad = getObjTime();

    GravityParticle *binBegin = &myParticles[1];
    vector<Key>::iterator iter =
        lower_bound(dm->boundaryKeys.begin(), dm->boundaryKeys.end(),
                    binBegin->key);
    vector<Key>::const_iterator endKeys = dm->boundaryKeys.end();
    int offset = iter - dm->boundaryKeys.begin() - 1;
    vector<int>::iterator responsibleIter =
        dm->responsibleIndex.begin() + offset;
    GravityParticle *binEnd;
    GravityParticle dummy;

    for( ; iter != endKeys; ++iter, ++responsibleIter) {
        dummy.key = *iter;
        //find particles between this and the last key
        binEnd = upper_bound(binBegin, &myParticles[myNumParticles+1], dummy);
        // If I have any particles in this bin, send them to
        // the responsible TreePiece
        int nPartOut = binEnd - binBegin;
        int saved_phase_len = savedPhaseLoad.size();
        if(nPartOut > 0) {
            int nGasOut = 0;
            int nStarOut = 0;
            for(GravityParticle *pPart = binBegin; pPart < binEnd; pPart++) {
                if(pPart->isGas())
                    nGasOut++;
                if(pPart->isStar())
                    nStarOut++;
            }

            ParticleShuffleMsg *shuffleMsg =
                new (saved_phase_len, saved_phase_len, nPartOut, nGasOut,
                     nStarOut)
                ParticleShuffleMsg(saved_phase_len, nPartOut, nGasOut,
                                   nStarOut, 0.0);
            memset(shuffleMsg->parts_per_phase, 0,
                   saved_phase_len*sizeof(unsigned int));

            // Calculate the number of particles leaving the treepiece per
            // phase
            for(GravityParticle *pPart = binBegin; pPart < binEnd; pPart++) {
                for(int i = 0; i < saved_phase_len; i++) {
                    if (pPart->rung >= i) {
                        shuffleMsg->parts_per_phase[i] =
                            shuffleMsg->parts_per_phase[i] + 1;
                    }
                }
                if(havePhaseData(PHASE_FEEDBACK)
                   && (pPart->isGas() || pPart->isStar()))
                    shuffleMsg->parts_per_phase[PHASE_FEEDBACK] += 1;
            }

            shuffleMsg->load = tpLoad * nPartOut / myNumParticles;
            memset(shuffleMsg->loads, 0, saved_phase_len*sizeof(double));

            // Calculate the partial load per phase
            for (int i = 0; i < saved_phase_len; i++) {
                if (havePhaseData(i) && savedPhaseParticle[i] != 0) {
                    double dLoadFrac = shuffleMsg->parts_per_phase[i]
                        / (float) savedPhaseParticle[i];
                    /*
                     * The following can happen if the number of particles on
                     * a given rung increases significantly because of a
                     * timestep adjustment.
                     */
                    if (dLoadFrac > 1.0)
                        dLoadFrac = 1.0;
                    shuffleMsg->loads[i] = savedPhaseLoad[i] * dLoadFrac;
                } else if (havePhaseData(0) && myNumParticles != 0) {
                    shuffleMsg->loads[i] = savedPhaseLoad[0]
                        * (shuffleMsg->parts_per_phase[i]
                           / (float) myNumParticles);
                }
            }

            if (verbosity>=3)
                CkPrintf("me:%d to:%d nPart :%d, nGas:%d, nStar: %d\n",
                         thisIndex, *responsibleIter, nPartOut, nGasOut,
                         nStarOut);

            int iGasOut = 0;
            int iStarOut = 0;
            GravityParticle *pPartOut = shuffleMsg->particles;
            for(GravityParticle *pPart = binBegin; pPart < binEnd;
                pPart++, pPartOut++) {
                *pPartOut = *pPart;
                if(pPart->isGas()) {
                    shuffleMsg->pGas[iGasOut]
                        = *(extraSPHData *)pPart->extraData;
                    iGasOut++;
                }
                if(pPart->isStar()) {
                    shuffleMsg->pStar[iStarOut]
                        = *(extraStarData *)pPart->extraData;
                    iStarOut++;
                }
            }

            if(*responsibleIter == thisIndex) {
                if (verbosity > 1)
                    CkPrintf("TreePiece %d: keeping %d / %d particles: %d\n",
                             thisIndex, nPartOut, myNumParticles,
                             nPartOut*10000/myNumParticles);
                if (withqd) {
                    myShuffleMsg = shuffleMsg;
                } else {
                    acceptSortedParticles(shuffleMsg);
                }
            } else {
                if (withqd) {
                    treeProxy[*responsibleIter].acceptSortedParticlesFromOther(shuffleMsg);
                } else {
                    treeProxy[*responsibleIter].acceptSortedParticles(shuffleMsg);
                }
            }
        }
        if(&myParticles[myNumParticles + 1] <= binEnd)
            break;
        binBegin = binEnd;
    }

    incomingParticlesSelf = true;
}
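// Illustrative sketch (not compiled): the proportional load accounting used
// when particles leave in sendParticlesDuringDD() above.  A departing bin
// carries a share of the measured TreePiece load equal to its share of the
// particles; the per-phase version additionally caps the fraction at 1, as
// the code above does.  exampleLoadShare is a hypothetical name.
#if 0
static double exampleLoadShare(double tpLoad, int nOut, int nTotal) {
    double frac = (double)nOut / (double)nTotal;
    if (frac > 1.0)
        frac = 1.0; // guard against rung counts growing between phases
    return tpLoad * frac;
}
#endif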
/// Accept particles from other TreePieces once the sorting has finished
void TreePiece::acceptSortedParticles(ParticleShuffleMsg *shuffleMsg) {
    //Need to get the place here again.  Getting the place in
    //unshuffleParticles and using it here results in a race condition.
    if (dm == NULL)
        dm = (DataManager*)CkLocalNodeBranch(dataManagerID);
    myPlace = find(dm->responsibleIndex.begin(), dm->responsibleIndex.end(),
                   thisIndex) - dm->responsibleIndex.begin();
    if (myPlace == dm->responsibleIndex.size())
        myPlace = -2;

    // The following assert no longer works when TreePieces can have 0
    // particles assigned
    //assert(myPlace >= 0 && myPlace < dm->particleCounts.size());
    if (myPlace == -2 || dm->particleCounts[myPlace] == 0) {
        // Special case where no particle is assigned to this TreePiece
        if (myNumParticles > 0){
            delete[] myParticles;
            myParticles = NULL;
        }
        myNumParticles = 0;
        nStore = 0;
        if (nStoreSPH > 0){
            delete[] mySPHParticles;
            mySPHParticles = NULL;
        }
        myNumSPH = 0;
        nStoreSPH = 0;
        if (nStoreStar > 0){
            delete[] myStarParticles;
            myStarParticles = NULL;
        }
        myNumStar = 0;
        nStoreStar = 0;
        incomingParticlesSelf = false;
        incomingParticlesMsg.clear();

        if(verbosity > 1)
            ckout << thisIndex << " no particles assigned" << endl;

        treePieceLoad = treePieceLoadTmp;
        treePieceLoadTmp = 0.0;
        savedPhaseLoad.swap(savedPhaseLoadTmp);
        savedPhaseParticle.swap(savedPhaseParticleTmp);
        savedPhaseLoadTmp.clear();
        savedPhaseParticleTmp.clear();

        contribute(callback);
        return;
    }

    if(shuffleMsg != NULL) {
        incomingParticlesMsg.push_back(shuffleMsg);
        incomingParticlesArrived += shuffleMsg->n;
        treePieceLoadTmp += shuffleMsg->load;
        savePhaseData(savedPhaseLoadTmp, savedPhaseParticleTmp,
                      shuffleMsg->loads, shuffleMsg->parts_per_phase,
                      shuffleMsg->nloads);
    }

    if (verbosity>=3)
        ckout << thisIndex << " waiting for "
              << dm->particleCounts[myPlace]-incomingParticlesArrived
              << " particles (" << dm->particleCounts[myPlace] << "-"
              << incomingParticlesArrived << ")"
              << (incomingParticlesSelf?" self":"") << endl;

    if(dm->particleCounts[myPlace] == incomingParticlesArrived
       && incomingParticlesSelf) {
        //I've got all my particles
        if (myNumParticles > 0)
            delete[] myParticles;
        nStore = (int)((dm->particleCounts[myPlace] + 2)*(1.0 + dExtraStore));
        myParticles = new GravityParticle[nStore];
        myNumParticles = dm->particleCounts[myPlace];
        incomingParticlesArrived = 0;
        incomingParticlesSelf = false;
        treePieceLoad = treePieceLoadTmp;
        treePieceLoadTmp = 0.0;
        savedPhaseLoad.swap(savedPhaseLoadTmp);
        savedPhaseParticle.swap(savedPhaseParticleTmp);
        savedPhaseLoadTmp.clear();
        savedPhaseParticleTmp.clear();

        int nSPH = 0;
        int nStar = 0;
        int iMsg;
        for(iMsg = 0; iMsg < incomingParticlesMsg.size(); iMsg++) {
            nSPH += incomingParticlesMsg[iMsg]->nSPH;
            nStar += incomingParticlesMsg[iMsg]->nStar;
        }
        if (nStoreSPH > 0)
            delete[] mySPHParticles;
        myNumSPH = nSPH;
        nStoreSPH = (int)(myNumSPH*(1.0 + dExtraStore));
        if(nStoreSPH > 0)
            mySPHParticles = new extraSPHData[nStoreSPH];
        else
            mySPHParticles = NULL;

        if (nStoreStar > 0)
            delete[] myStarParticles;
        myNumStar = nStar;
        allocateStars();

        int nPart = 0;
        nSPH = 0;
        nStar = 0;
        for(iMsg = 0; iMsg < incomingParticlesMsg.size(); iMsg++) {
            memcpy(&myParticles[nPart+1],
                   incomingParticlesMsg[iMsg]->particles,
                   incomingParticlesMsg[iMsg]->n*sizeof(GravityParticle));
            nPart += incomingParticlesMsg[iMsg]->n;
            memcpy(&mySPHParticles[nSPH], incomingParticlesMsg[iMsg]->pGas,
                   incomingParticlesMsg[iMsg]->nSPH*sizeof(extraSPHData));
            nSPH += incomingParticlesMsg[iMsg]->nSPH;
            memcpy(&myStarParticles[nStar], incomingParticlesMsg[iMsg]->pStar,
                   incomingParticlesMsg[iMsg]->nStar*sizeof(extraStarData));
            nStar += incomingParticlesMsg[iMsg]->nStar;
            delete incomingParticlesMsg[iMsg];
        }

        incomingParticlesMsg.clear();
        // assign gas data pointers and determine centroid
        int iGas = 0;
        int iStar = 0;
        Vector3D<double> vCenter(0.0, 0.0, 0.0);
        for(int iPart = 0; iPart < myNumParticles; iPart++) {
            vCenter += myParticles[iPart+1].position;
            if(myParticles[iPart+1].isGas()) {
                myParticles[iPart+1].extraData =
                    (extraSPHData *)&mySPHParticles[iGas];
                iGas++;
            }
            else if(myParticles[iPart+1].isStar()) {
                myParticles[iPart+1].extraData =
                    (extraStarData *)&myStarParticles[iStar];
                iStar++;
            }
            else
                myParticles[iPart+1].extraData = NULL;
        }
        sort(myParticles+1, myParticles+myNumParticles+1);
        savedCentroid = vCenter/(double)myNumParticles;

        //signify completion with a reduction
        if(verbosity > 1)
            ckout << thisIndex << " contributing to accept particles" << endl;
        contribute(callback);
    }
}

void TreePiece::savePhaseData(std::vector<double> &loads,
                              std::vector<unsigned int> &parts_per_phase,
                              double* shuffleloads,
                              unsigned int *shuffleparts, int shufflelen) {
    int len = 0;
    int num_additional;

    for (int i = 0; i < shufflelen; i++) {
        len = loads.size();
        if (i >= len) {
            num_additional = i - len + 1;
            while (num_additional > 0) {
                loads.push_back(-1.0);
                parts_per_phase.push_back(0);
                num_additional--;
            }
        }
        if (loads[i] == -1) {
            loads[i] = 0.0;
        }
        loads[i] += shuffleloads[i];
        parts_per_phase[i] += shuffleparts[i];
    }
}

void TreePiece::populateSavedPhaseData(int phase, double tp_load,
                                       unsigned int activeparts) {
    int len = savedPhaseLoad.size();
    int num_additional;

    if (phase == -1) {
        phase = 0;
        activeparts = myNumParticles;
        //return;
    }

    if (phase > len-1) {
        num_additional = phase - len + 1;
        while (num_additional > 0) {
            savedPhaseLoad.push_back(-1.0);
            savedPhaseParticle.push_back(0);
            num_additional--;
        }
        len = savedPhaseLoad.size();
    }
    savedPhaseLoad[phase] = tp_load;
    savedPhaseParticle[phase] = activeparts;
}

bool TreePiece::havePhaseData(int phase) {
    return (savedPhaseLoad.size() > phase && savedPhaseLoad[phase] > -0.5);
}

#if 0
void TreePiece::checkin(){
    if(myDecomposer == NULL){
        myDecomposer = decomposerProxy.ckLocalBranch();
    }
    CkPrintf("[%d] checkin\n", thisIndex);
    myDecomposer->checkin();
}

void Decomposer::checkin(){
    numTreePiecesCheckedIn++;
    // +1 for self checkin(), since Decomposer::unshuffleParticles
    // (in which myNumTreePieces is set) may be called
    // after all local TreePieces have called Decomposer::checkin()
    // through TreePiece::acceptSortedParticles();
    CkPrintf("decomposer %d checked in %d/%d\n", CkMyPe(),
             numTreePiecesCheckedIn, myNumTreePieces);
    if(numTreePiecesCheckedIn == myNumTreePieces){
        numTreePiecesCheckedIn = 0;
        myNumTreePieces = -1;

        // return control to mainchare
        //contribute(callback);
    }
}
#endif
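// Illustrative sketch (not compiled): starCenterOfMass() below packs the
// mass-weighted position sums and the mass sum into one array so that a
// single sum reduction delivers both; the division happens only after the
// reduction.  exampleFinishCOM is a hypothetical name for that last step.
#if 0
static void exampleFinishCOM(const double dMassPos[4], double com[3]) {
    for (int j = 0; j < 3; ++j)
        com[j] = dMassPos[j] / dMassPos[3]; // sum(m*x) / sum(m)
}
#endif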
// Find the center of mass of the star particles.  This is needed for
// cooling_planet, so that gas particles know how far they are from the
// central star.
void TreePiece::starCenterOfMass(const CkCallback& cb) {
    // Initialize sum of mass*position and mass.  In order to contribute
    // just one variable, both values are contained in a single array:
    // dMassPos[0,1,2] = sum(m*{x,y,z})
    // dMassPos[3] = sum(m)
    double dMassPos[4] = {0};

    // Loop over all particles to sum mass and mass*position for only star
    // particles
    for (unsigned int i = 0; i < myNumParticles; ++i) {
        GravityParticle *p = &myParticles[i+1];
        if (TYPETest(p, TYPE_STAR)) {
            // Loop over x,y,z
            for (int j = 0; j < 3; ++j) {
                dMassPos[j] += p->mass * p->position[j];
            }
            // Add particle mass
            dMassPos[3] += p->mass;
        }
    }

    contribute(4*sizeof(double), dMassPos, CkReduction::sum_double, cb);
}

// Sum energies for diagnostics
void TreePiece::calcEnergy(const CkCallback& cb) {
    double dEnergy[7]; // 0 -> kinetic; 1 -> virial; 2 -> potential;
                       // 3-5 -> L; 6 -> thermal
    Vector3D<double> L;

    dEnergy[0] = 0.0;
    dEnergy[1] = 0.0;
    dEnergy[2] = 0.0;
    dEnergy[6] = 0.0;
    for(unsigned int i = 0; i < myNumParticles; ++i) {
        GravityParticle *p = &myParticles[i+1];

        dEnergy[0] += p->mass*p->velocity.lengthSquared();
        dEnergy[1] += p->mass*dot(p->treeAcceleration, p->position);
        dEnergy[2] += p->mass*p->potential;
        L += p->mass*cross(p->position, p->velocity);
        if (TYPETest(p, TYPE_GAS))
            dEnergy[6] += p->mass*p->u();
    }
    dEnergy[0] *= 0.5;
    dEnergy[2] *= 0.5;
    dEnergy[3] = L.x;
    dEnergy[4] = L.y;
    dEnergy[5] = L.z;

    contribute(7*sizeof(double), dEnergy, CkReduction::sum_double, cb);
}

void TreePiece::kick(int iKickRung, double dDelta[MAXRUNG+1],
                     int bClosing,       // Are we at the end of a timestep
                     int bNeedVPred,     // do we need to update vpred
                     int bGasIsothermal, // Isothermal EOS
                     double duDelta[MAXRUNG+1], // dts for energy
                     const CkCallback& cb) {
    // LBTurnInstrumentOff();
    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        GravityParticle *p = &myParticles[i];
        if(p->rung >= iKickRung) {
            if(bNeedVPred && TYPETest(p, TYPE_GAS)) {
                if(bClosing) { // update predicted quantities to end of step
                    p->vPred() = p->velocity
                        + dDelta[p->rung]*p->treeAcceleration;
                    if(!bGasIsothermal) {
#ifndef COOLING_NONE
                        p->u() = p->u() + p->uDot()*duDelta[p->rung];
                        if (p->u() < 0) {
                            double uold = p->u() - p->uDot()*duDelta[p->rung];
                            p->u() = uold*exp(p->uDot()*duDelta[p->rung]/uold);
                        }
#else /* COOLING_NONE */
                        p->u() += p->PdV()*duDelta[p->rung];
                        if (p->u() < 0) {
                            double uold = p->u() - p->PdV()*duDelta[p->rung];
                            p->u() = uold*exp(p->PdV()*duDelta[p->rung]/uold);
                        }
#endif /* COOLING_NONE */
                        p->uPred() = p->u();
                    }
                }
                else { // predicted quantities are at the beginning
                       // of step
                    p->vPred() = p->velocity;
                    if(!bGasIsothermal) {
                        p->uPred() = p->u();
#ifndef COOLING_NONE
                        p->u() += p->uDot()*duDelta[p->rung];
                        if (p->u() < 0) {
                            double uold = p->u() - p->uDot()*duDelta[p->rung];
                            p->u() = uold*exp(p->uDot()*duDelta[p->rung]/uold);
                        }
#else /* COOLING_NONE */
                        p->u() += p->PdV()*duDelta[p->rung];
                        if (p->u() < 0) {
                            double uold = p->u() - p->PdV()*duDelta[p->rung];
                            p->u() = uold*exp(p->PdV()*duDelta[p->rung]/uold);
                        }
#endif /* COOLING_NONE */
                    }
                }
                CkAssert(p->u() >= 0.0);
                CkAssert(p->uPred() >= 0.0);
            }
            p->velocity += dDelta[p->rung]*p->treeAcceleration;
        }
    }
    contribute(cb);
}

void TreePiece::initAccel(int iKickRung, const CkCallback& cb)
{
    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        if(myParticles[i].rung >= iKickRung) {
            myParticles[i].treeAcceleration = 0;
            myParticles[i].potential = 0;
            myParticles[i].dtGrav = 0;
        }
    }
    contribute(cb);
}
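// Illustrative sketch (not compiled): kick() above and drift() further below
// together implement kick-drift-kick leapfrog integration,
//   v(t+dt/2) = v(t) + (dt/2) a(t)
//   x(t+dt)   = x(t) + dt v(t+dt/2)
//   v(t+dt)   = v(t+dt/2) + (dt/2) a(t+dt)
// with the per-rung dDelta[] arrays supplying each particle's half-steps.
// exampleKDK is a hypothetical single-particle, one-dimensional version.
#if 0
static void exampleKDK(double &x, double &v, double a0, double dt,
                       double (*accel)(double)) {
    v += 0.5 * dt * a0;       // opening kick with old acceleration
    x += dt * v;              // drift
    v += 0.5 * dt * accel(x); // closing kick with new acceleration
}
#endif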
/**
 * Adjust timesteps of active particles.
 * @param iKickRung The rung we are on.
 * @param bEpsAccStep Use sqrt(eps/acc) timestepping
 * @param bGravStep Use sqrt(r^3/GM) timestepping
 * @param bSphStep Use Courant condition
 * @param bViscosityLimitdt Use viscosity in Courant condition
 * @param dEta Factor to use in determining timestep
 * @param dEtaCourant Courant factor to use in determining timestep
 * @param dEtauDot Factor to use in uDot-based timestep
 * @param dDelta Base timestep
 * @param dAccFac Acceleration scaling for cosmology
 * @param dCosmoFac Cosmo scaling for Courant
 * @param dhMinOverSoft minimum smoothing parameter.
 * @param dResolveJeans multiple of Jeans length to be resolved.
 * @param bDoGas We are calculating gas forces.
 * @param cb Callback function reduces current maximum rung
 */
void TreePiece::adjust(int iKickRung, int bEpsAccStep, int bGravStep,
                       int bSphStep, int bViscosityLimitdt,
                       double dEta, double dEtaCourant, double dEtauDot,
                       double dDelta, double dAccFac,
                       double dCosmoFac, double dhMinOverSoft,
                       double dResolveJeans,
                       int bDoGas,
                       const CkCallback& cb) {
    int iCurrMaxRung = 0;
    int nMaxRung = 0;  // number of particles in maximum rung

    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        GravityParticle *p = &myParticles[i];
        if(p->rung >= iKickRung) {
            CkAssert(p->soft > 0.0);
            double dTIdeal = dDelta;
            double dTGrav, dTCourant, dTEdot;
            if(bEpsAccStep) {
                double acc = dAccFac*p->treeAcceleration.length();
                double dt;
#ifdef EPSACCH
                // Set gas timestep based on SPH smoothing.
                if(bDoGas && p->isGas()) {
                    dt = dEta*sqrt(0.5*p->fBall/acc);
                }
#else
                // Set gas timestep based on gravity softening and SPH
                // smoothing.
                if(bDoGas && p->isGas() && dhMinOverSoft < 1
                   && p->fBall < 2.0*p->soft) {
                    if(p->fBall > 2.0*dhMinOverSoft*p->soft)
                        dt = dEta*sqrt(0.5*p->fBall/acc);
                    else
                        dt = dEta*sqrt(dhMinOverSoft*p->soft/acc);
                }
#endif
                else
                    dt = dEta*sqrt(p->soft/acc);
                dTGrav = dt;
                if(dt < dTIdeal)
                    dTIdeal = dt;
            }
            if(bGravStep) {
                double dt = dEta/sqrt(dAccFac*p->dtGrav);
                if(dt < dTIdeal)
                    dTIdeal = dt;
            }
            if(bSphStep && TYPETest(p, TYPE_GAS)) {
                double dt;
                double ph = sqrt(0.25*p->fBall*p->fBall);
                if (p->mumax() > 0.0) {
                    if (bViscosityLimitdt)
                        dt = dEtaCourant*dCosmoFac*(ph/(p->c()
                             + 0.6*(p->c() + 2*p->BalsaraSwitch()*p->mumax())));
                    else
                        dt = dEtaCourant*dCosmoFac*(ph/(p->c()
                             + 0.6*(p->c() + 2*p->mumax())));
                }
                else
                    dt = dEtaCourant*dCosmoFac*(ph/(1.6*p->c()));
                dTCourant = dt;
                if(dt < dTIdeal)
                    dTIdeal = dt;

                if (dEtauDot > 0.0 && p->PdV() < 0.0) {
                    /* Prevent rapid adiabatic cooling */
                    assert(p->PoverRho2() > 0.0);
                    // Use P/rho as internal energy estimate since "u" may
                    // be driven to 0 with cooling.
                    double dPoverRhoJeans = PoverRhoFloorJeans(dResolveJeans,
                                                               p);
                    double uEff = dPoverRhoJeans/(GAMMA_JEANS-1)
                        + p->fDensity*p->PoverRho2();
                    dt = dEtauDot*uEff/fabs(p->PdV());
                    if (dt < dTIdeal)
                        dTIdeal = dt;
                }
                dTEdot = dt;
            }

            int iNewRung = DtToRung(dDelta, dTIdeal);
            if(iNewRung > 29) {
                CkError("Small timestep dt: %g, soft: %g, accel: %g\n",
                        dTIdeal, p->soft, p->treeAcceleration.length());
                if(p->isGas())
                    CkError("Small gas dt: %g, dtgrav: %g, dtcourant: %g, dtEdot: %g\n",
                            dTIdeal, dTGrav, dTCourant, dTEdot);
            }
            if(iNewRung > MAXRUNG) {
                CkError("dt: %g, soft: %g, accel: %g\n", dTIdeal, p->soft,
                        p->treeAcceleration.length());
                CkAbort("Timestep too small");
            }
            if(iNewRung < iKickRung)
                iNewRung = iKickRung;
            if(iNewRung > iCurrMaxRung) {
                iCurrMaxRung = iNewRung;
                nMaxRung = 1;
            }
            else if(iNewRung == iCurrMaxRung)
                nMaxRung++;
            myParticles[i].rung = iNewRung;
#ifdef NEED_DT
            myParticles[i].dt = dTIdeal;
#endif
        }
    }
    // Pack into array for reduction
    int64_t newcount[2];
    newcount[0] = iCurrMaxRung;
    newcount[1] = nMaxRung;
    contribute(2*sizeof(int64_t), newcount, max_count, cb);
}

void TreePiece::truncateRung(int iCurrMaxRung, const CkCallback& cb) {
    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        GravityParticle *p = &myParticles[i];
        if(p->rung > iCurrMaxRung)
            p->rung--;
        CkAssert(p->rung <= iCurrMaxRung);
    }
    contribute(cb);
}

void TreePiece::rungStats(const CkCallback& cb) {
    int64_t nInRung[MAXRUNG+1];

    for(int iRung = 0; iRung <= MAXRUNG; iRung++)
        nInRung[iRung] = 0;
    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        nInRung[myParticles[i].rung]++;
    }
    contribute((MAXRUNG+1)*sizeof(int64_t), nInRung, CkReduction::sum_long,
               cb);
}

void TreePiece::countActive(int activeRung, const CkCallback& cb) {
    int64_t nActive[2];

    nActive[0] = nActive[1] = 0;
    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        if(myParticles[i].rung >= activeRung) {
            nActive[0]++;
            if(TYPETest(&myParticles[i], TYPE_GAS)) {
                nActive[1]++;
            }
        }
    }
    contribute(2*sizeof(int64_t), nActive, CkReduction::sum_long, cb);
}

/// @brief assign domain number to each particle for diagnostics
void TreePiece::assignDomain(const CkCallback &cb) {
    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        myParticles[i].interMass = thisIndex;
    }
    contribute(cb);
}
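// Illustrative sketch (not compiled): what a DtToRung()-style mapping from
// ideal timestep to rung looks like.  Rung r corresponds to a step of
// dDelta/2^r, so a particle gets the smallest r whose step does not exceed
// its dTIdeal; adjust() above then range-checks the result against MAXRUNG.
// exampleDtToRung is a hypothetical standalone version.
#if 0
static int exampleDtToRung(double dDelta, double dTIdeal) {
    int iRung = 0;
    while (dDelta > dTIdeal) { // halve until the step fits
        dDelta *= 0.5;
        ++iRung;
    }
    return iRung;
}
#endif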
void TreePiece::drift(double dDelta,  // time step in x containing
                                      // cosmo scaling
                      int bNeedVpred, // Update predicted velocities
                      int bGasIsothermal, // Isothermal EOS
                      double dvDelta, // time step in v containing
                                      // cosmo scaling
                      double duDelta, // time step for internal energy
                      int nGrowMass,  // GrowMass particles are locked
                                      // in place
                      bool buildTree, // is a treebuild happening before
                                      // the next drift?
                      const CkCallback& cb) {
    callback = cb; // called by assignKeys()
    deleteTree();

    if(bucketReqs != NULL) {
        delete[] bucketReqs;
        bucketReqs = NULL;
    }

    boundingBox.reset();
    int bInBox = 1;

    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        GravityParticle *p = &myParticles[i];
        if (p->iOrder >= nGrowMass)
            p->position += dDelta*p->velocity;
        if(bPeriodic) {
            for(int j = 0; j < 3; j++) {
                if(p->position[j] >= 0.5*fPeriod[j]){
                    p->position[j] -= fPeriod[j];
                }
                if(p->position[j] < -0.5*fPeriod[j]){
                    p->position[j] += fPeriod[j];
                }

                bool a = (p->position[j] >= -0.5*fPeriod[j]);
                bool b = (p->position[j] < 0.5*fPeriod[j]);

                // Sanity checks
                bInBox = bInBox && a;
                bInBox = bInBox && b;
            }
            if(!bInBox){
                CkPrintf("[%d] p: %f,%f,%f\n", thisIndex, p->position[0],
                         p->position[1], p->position[2]);
                CkAbort("binbox failed\n");
            }
        }
        boundingBox.grow(p->position);
        if(bNeedVpred && TYPETest(p, TYPE_GAS)) {
            p->vPred() += dvDelta*p->treeAcceleration;
            if(!bGasIsothermal) {
#ifndef COOLING_NONE
                p->uPred() += p->uDot()*duDelta;
                if (p->uPred() < 0) {
                    // Back out the update to uPred
                    double uold = p->uPred() - p->uDot()*duDelta;
                    // uold could be negative because of round-off
                    // error.  If this is the case then uDot*Delta/u is
                    // large, and the final uPred will be zero.
                    if(uold <= 0.0) p->uPred() = 0.0;
                    // Cooling rate is large: use an exponential decay
                    // of timescale u/uDot.
                    else p->uPred() = uold*exp(p->uDot()*duDelta/uold);
                }
#else
                p->uPred() += p->PdV()*duDelta;
                if (p->uPred() < 0) {
                    double uold = p->uPred() - p->PdV()*duDelta;
                    p->uPred() = uold*exp(p->PdV()*duDelta/uold);
                }
#endif
                CkAssert(p->uPred() >= 0.0);
            }
        }
    }
    CkAssert(bInBox);
    if(!bInBox){
        CkAbort("binbox2 failed\n");
    }
    if(buildTree)
        contribute(sizeof(OrientedBox<float>), &boundingBox,
                   growOrientedBox_float,
                   CkCallback(CkIndex_TreePiece::assignKeys(0), pieces));
    else
        contribute(cb);
}

/// @brief Adjust the particlePointer attribute of all the nodes in the
/// tree.
///
/// Call this if "myParticles" is about to change, but you still need
/// the tree.
void TreePiece::adjustTreePointers(GenericTreeNode *node,
                                   GravityParticle *newParts)
{
    // Does particlePointer point to memory we are about to delete?
    if(node->particlePointer >= myParticles
       && node->particlePointer < myParticles + myNumParticles + 2) {
        node->particlePointer = newParts
            + (node->particlePointer - myParticles);
    }
    GenericTreeNode *child;
    for (unsigned int i = 0; i < node->numChildren(); ++i) {
        child = node->getChildren(i);
        if (child != NULL)
            adjustTreePointers(child, newParts);
    }
}
void TreePiece::newParticle(GravityParticle *p)
{
    // N.B. there are 2 boundary particles
    if(myNumParticles + 2 >= nStore) {
        int nTmpStore = (int) ((nStore + 1)*(1.0 + dExtraStore));
        CkError("WARNING: Increasing particle store to %d\n", nTmpStore);
        CkAssert(nTmpStore > nStore);
        GravityParticle *myTmpParticles = new GravityParticle[nTmpStore];
        memcpy(myTmpParticles, myParticles,
               (2 + myNumParticles)*sizeof(GravityParticle));
        adjustTreePointers(root, myTmpParticles);
        delete[] myParticles;
        myParticles = myTmpParticles;
        nStore = nTmpStore;
    }
    // Move boundary particle
    myParticles[myNumParticles+2] = myParticles[myNumParticles+1];
    myNumParticles++;
    myParticles[myNumParticles] = *p;
    myParticles[myNumParticles].iOrder = -1;
    if(p->isGas()) {
        if(myNumSPH >= nStoreSPH) {
            int nTmpStore = (int) ((nStoreSPH + 1)*(1.0 + dExtraStore));
            CkError("WARNING: Increasing gas particle store to %d\n",
                    nTmpStore);
            CkAssert(nTmpStore > nStoreSPH);
            extraSPHData *myTmpParticles = new extraSPHData[nTmpStore];
            memcpy(myTmpParticles, mySPHParticles,
                   (myNumSPH)*sizeof(extraSPHData));
            for(int i = 1; i <= myNumParticles; i++) {
                // reassign pointers
                if(myParticles[i].isGas()) {
                    int iSPH = (extraSPHData *)myParticles[i].extraData
                        - mySPHParticles;
                    myParticles[i].extraData = myTmpParticles + iSPH;
                }
            }
            delete[] mySPHParticles;
            mySPHParticles = myTmpParticles;
            nStoreSPH = nTmpStore;
        }
        mySPHParticles[myNumSPH] = *((extraSPHData *) p->extraData);
        myParticles[myNumParticles].extraData = &mySPHParticles[myNumSPH];
        myNumSPH++;
    }
    if(p->isStar()) {
        if(myNumStar >= nStoreStar) {
            int nTmpStore = (int) ((nStoreStar + 1)*(1.0 + dExtraStore));
            CkError("WARNING: Increasing star particle store to %d\n",
                    nTmpStore);
            CkAssert(nTmpStore > nStoreStar);
            extraStarData *myTmpParticles = new extraStarData[nTmpStore];
            memcpy(myTmpParticles, myStarParticles,
                   (myNumStar)*sizeof(extraStarData));
            for(int i = 1; i <= myNumParticles; i++) {
                // reassign pointers
                if(myParticles[i].isStar()) {
                    int iStar = (extraStarData *)myParticles[i].extraData
                        - myStarParticles;
                    myParticles[i].extraData = myTmpParticles + iStar;
                }
            }
            delete[] myStarParticles;
            myStarParticles = myTmpParticles;
            nStoreStar = nTmpStore;
        }
        myStarParticles[myNumStar] = *((extraStarData *) p->extraData);
        myParticles[myNumParticles].extraData = &myStarParticles[myNumStar];
        myNumStar++;
    }
}
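// Illustrative sketch (not compiled): the grow-and-fix-up pattern used by
// newParticle() above.  Because extraData holds raw pointers into the side
// arrays, each pointer must be re-based by its index, computed against the
// old base *before* the old block is freed.  exampleGrow is a hypothetical
// name.
#if 0
#include <cstring>

static double *exampleGrow(double *arr, int nUsed, int &nCap, double extra) {
    int newCap = (int)((nCap + 1) * (1.0 + extra));
    double *fresh = new double[newCap];
    std::memcpy(fresh, arr, nUsed * sizeof(double));
    // (re-base any outside pointers into arr here, by index)
    delete[] arr;
    nCap = newCap;
    return fresh;
}
#endif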
/**
 * Count added/deleted particles, and compact main particle storage.
 * Extradata storage will be reclaimed on the next domain decompose.
 * This contributes to a "set" reduction, which the main chare will
 * iterate through to assign new iOrders.
 */
void TreePiece::colNParts(const CkCallback &cb)
{
    CountSetPart counts;
    counts.index = thisIndex;
    counts.nAddGas = 0;
    counts.nAddDark = 0;
    counts.nAddStar = 0;
    counts.nDelGas = 0;
    counts.nDelDark = 0;
    counts.nDelStar = 0;
    unsigned int i, j;
    int newNPart = myNumParticles; // the number of particles may change
    for(i = 1, j = 1; i <= myNumParticles; ++i) {
        if(j < i)
            myParticles[j] = myParticles[i];
        GravityParticle *p = &myParticles[i];
        if(p->iOrder == -1) { // A new particle
            j++;
            if (p->isGas())
                counts.nAddGas++;
            else if(p->isStar())
                counts.nAddStar++;
            else if(p->isDark())
                counts.nAddDark++;
            else
                CkAbort("Bad Particle type");
        }
        else if(TYPETest(p, TYPE_DELETED)) {
            newNPart--;
            if (p->isGas())
                counts.nDelGas++;
            else if(p->isStar())
                counts.nDelStar++;
            else if(p->isDark())
                counts.nDelDark++;
            else
                CkAbort("Bad Particle type");
        }
        else {
            j++;
        }
    }
    if(j < i) // move boundary particle if needed
        myParticles[j] = myParticles[i];

    myNumParticles = newNPart;

    contribute(sizeof(counts), &counts, CkReduction::concat, cb);
}

/**
 * Assign iOrders to recently added particles.
 * Also ensure keys are OK.
 */
void TreePiece::newOrder(const NewMaxOrder *nStarts, const int n,
                         const CkCallback &cb)
{
    unsigned int i;
    CkAssert(thisIndex < n);
    int64_t nStartSPH = nStarts[thisIndex].nMaxOrderGas;
    int64_t nStartDark = nStarts[thisIndex].nMaxOrderDark;
    int64_t nStartStar = nStarts[thisIndex].nMaxOrder;
    boundingBox.reset();
    int iNewStars = 0;
    for(i = 1; i <= myNumParticles; ++i) {
        GravityParticle *p = &myParticles[i];
        boundingBox.grow(p->position);
        if(p->iOrder == -1) {
            if (p->isGas())
                p->iOrder = nStartSPH++;
            else if (p->isDark())
                p->iOrder = nStartDark++;
            else {
                /* Also record iOrder in the starLog table. */
                CkAssert(iNewStars < iSeTab.size());
                CmiLock(dm->lockStarLog);
                dm->starLog->seTab[iSeTab[iNewStars]].iOrdStar = nStartStar;
                dm->starLog->nOrdered++;
                CmiUnlock(dm->lockStarLog);
                iNewStars++;
                p->iOrder = nStartStar++;
            }
        }
    }
    iSeTab.clear();
    callback = cb; // called by assignKeys()
    // get the new particles into key order
    contribute(sizeof(OrientedBox<float>), &boundingBox,
               growOrientedBox_float,
               CkCallback(CkIndex_TreePiece::assignKeys(0), pieces));
}

/**
 * Update total particle numbers.
 */
void TreePiece::setNParts(int64_t _nTotalSPH, int64_t _nTotalDark,
                          int64_t _nTotalStar, const CkCallback &cb)
{
    nTotalSPH = _nTotalSPH;
    nTotalDark = _nTotalDark;
    nTotalStar = _nTotalStar;
    nTotalParticles = nTotalSPH + nTotalDark + nTotalStar;
    contribute(cb);
}

/// @param dSoft gravitational softening
/// @param cb callback
void TreePiece::setSoft(const double dSoft, const CkCallback& cb) {
    for(unsigned int i = 1; i <= myNumParticles; ++i) {
#ifdef CHANGESOFT
        myParticles[i].fSoft0 = dSoft;
#endif
        myParticles[i].soft = dSoft;
    }
    contribute(cb);
}

/**
 * \brief Adjust comoving softening to maintain constant physical
 * softening
 */
void TreePiece::physicalSoft(const double dSoftMax, const double dFac,
                             const int bSoftMaxMul, const CkCallback& cb) {
#ifdef CHANGESOFT
    CkAssert(dFac > 0.0);
    if (bSoftMaxMul) { // dSoftMax is a maximum multiplier
        for(unsigned int i = 1; i <= myNumParticles; ++i) {
            myParticles[i].soft = myParticles[i].fSoft0*dFac;
            CkAssert(myParticles[i].soft > 0.0);
        }
    }
    else { // dSoftMax is an absolute limit
        CkAssert(dSoftMax > 0.0);
        for(unsigned int i = 1; i <= myNumParticles; ++i) {
            myParticles[i].soft = myParticles[i].fSoft0*dFac;
            if(myParticles[i].soft > dSoftMax)
                myParticles[i].soft = dSoftMax;
        }
    }
#endif
    contribute(cb);
}

void TreePiece::growMass(int nGrowMass, double dDeltaM, const CkCallback& cb)
{
    for(unsigned int i = 1; i <= myNumParticles; ++i) {
        if(myParticles[i].iOrder < nGrowMass)
            myParticles[i].mass += dDeltaM;
    }
    contribute(cb);
}

/*
 * Gather information for the center of mass calculation.
 * For each particle type the 0th and first mass moments are summed.
 */
void TreePiece::getCOM(const CkCallback& cb, int bLiveViz) {
    int i;
    double com[12]; // structure is m*position and mass for all
                    // particle types
    for(i = 0; i < 12; i++)
        com[i] = 0;
    for(i = 1; i <= myNumParticles; ++i) {
        double m = myParticles[i].mass;
        if ( TYPETest(&(myParticles[i]), TYPE_GAS) ) {
            com[0] += m*myParticles[i].position[0];
            com[1] += m*myParticles[i].position[1];
            com[2] += m*myParticles[i].position[2];
            com[3] += m;
        }
        else if ( TYPETest(&(myParticles[i]), TYPE_DARK) ) {
            com[4] += m*myParticles[i].position[0];
            com[5] += m*myParticles[i].position[1];
            com[6] += m*myParticles[i].position[2];
            com[7] += m;
        }
        else if ( TYPETest(&(myParticles[i]), TYPE_STAR) ) {
            com[8] += m*myParticles[i].position[0];
            com[9] += m*myParticles[i].position[1];
            com[10] += m*myParticles[i].position[2];
            com[11] += m;
        }
    }
    if(bLiveViz) // Use LiveViz array
        lvProxy[thisIndex].ckLocal()->contribute(12*sizeof(double), com,
                                                 CkReduction::sum_double, cb);
    else
        contribute(12*sizeof(double), com, CkReduction::sum_double, cb);
}

/*
 * Gather information for the center of mass calculation for one type of
 * particle.
 */
void TreePiece::getCOMByType(int iType, const CkCallback& cb, int bLiveViz) {
    int i;
    double com[4]; // structure is m*position and mass

    for(i = 0; i < 4; i++)
        com[i] = 0;
    for(i = 1; i <= myNumParticles; ++i) {
        if ( TYPETest(&(myParticles[i]), iType) ) {
            double m = myParticles[i].mass;
            com[0] += m*myParticles[i].position[0];
            com[1] += m*myParticles[i].position[1];
            com[2] += m*myParticles[i].position[2];
            com[3] += m;
        }
    }
    if(bLiveViz) // Use LiveViz array
        lvProxy[thisIndex].ckLocal()->contribute(4*sizeof(double), com,
                                                 CkReduction::sum_double, cb);
    else
        contribute(4*sizeof(double), com, CkReduction::sum_double, cb);
}

/// @brief structure for efficiently reading iOrders.
struct SortStruct {
    int64_t iOrder;
    int iStore;
};

int CompSortStruct(const void *a, const void *b) {
    return (((struct SortStruct *)a)->iOrder
            < ((struct SortStruct *)b)->iOrder ? -1 : 1);
}

/// @brief Read a file of iOrders to set particle type.
///
/// This is used by the photogenic code of dumpframe.
void TreePiece::SetTypeFromFileSweep(int iSetMask, char *file,
                                     struct SortStruct *ss, int nss,
                                     int *pniOrder, int *pnSet) {
    int niOrder = 0, nSet = 0;
    int64_t iOrder, iOrderOld;
    int nRet;
    FILE *fp;
    int iss;

    fp = fopen( file, "r" );
    assert( fp != NULL );

    iss = 0;
    iOrderOld = -1;
    while ( (nRet=fscanf( fp, "%ld\n", &iOrder )) == 1 ) {
        niOrder++;
        assert( iOrder > iOrderOld );
        iOrderOld = iOrder;
        while (ss[iss].iOrder < iOrder) {
            iss++;
            if (iss >= nss) goto DoneSS;
        }
        if (iOrder == ss[iss].iOrder) {
            TYPESet(&(myParticles[1 + ss[iss].iStore]), iSetMask);
            nSet++;
        }
    }
DoneSS:
#if 0
    /* The following should only be done for debugging.  It doubles the
       amount of file reading. */
    /* Finish reading the file to verify consistency across processors */
    while ( (nRet=fscanf( fp, "%ld\n", &iOrder )) == 1 ) {
        niOrder++;
        assert( iOrder > iOrderOld );
        iOrderOld = iOrder;
    }
#endif
    fclose(fp);

    *pniOrder += niOrder;
    *pnSet += nSet;

    return;
}
/*
 * Set particle type by reading in iOrders from a file.
 */
void TreePiece::setTypeFromFile(int iSetMask, char *file, const CkCallback& cb)
{
    struct SortStruct *ss;
    int i, nss;

    int niOrder = 0;
    int nSet = 0;

    nss = myNumParticles;
    ss = (struct SortStruct *) malloc(sizeof(*ss)*nss);
    assert( ss != NULL );

    for(i = 0; i < nss; ++i) {
        ss[i].iOrder = myParticles[i+1].iOrder;
        ss[i].iStore = i;
    }

    qsort( ss, nss, sizeof(*ss), CompSortStruct );

    SetTypeFromFileSweep(iSetMask, file, ss, nss, &niOrder, &nSet);

    free( ss );

    int nSetOut[2];
    nSetOut[0] = niOrder;
    nSetOut[1] = nSet;

    contribute(2*sizeof(int), nSetOut, CkReduction::sum_int, cb);
}

/*
 * Render this processor's portion of the image.
 */
void TreePiece::DumpFrame(InDumpFrame in, const CkCallback& cb,
                          int liveVizDump)
{
    void *Image = dm->Image;
    GravityParticle *p;

    for (int i=1; i<=myNumParticles; ++i) {
        p = &(myParticles[i]);
        switch (in.iRender) {
        case DF_RENDER_POINT:
            dfRenderParticlePoint(&in, Image, p, dm);
            break;
        case DF_RENDER_TSC:
            dfRenderParticleTSC(&in, Image, p, dm);
            break;
        case DF_RENDER_SOLID:
            dfRenderParticleSolid(&in, Image, p, dm);
            break;
        case DF_RENDER_SHINE: /* Not implemented -- just does point */
            dfRenderParticlePoint(&in, Image, p, dm);
            break;
        }
    }

    if(!liveVizDump)
        contribute(cb);
    else {
        // this is the RGB 3-byte/pixel image created from floating point
        // image data in dfFinishFrame - here we just create the pointer to
        // pass in
        unsigned char *gray;

        dfFinishFrame(NULL, 0, 0, &in, Image, true, &gray);

        // final image assembly for liveViz before shipping the image
        // to the client.
        // This calls a reduction which may conflict with a TreePiece
        // reduction.  Hence we use a shadow array "lvProxy" to avoid
        // this conflict.
        liveVizDeposit(savedLiveVizMsg, 0, 0, in.nxPix, in.nyPix, gray,
                       lvProxy[thisIndex].ckLocal(), sum_image_data);

        savedLiveVizMsg = NULL;
        free(gray);
    }
}

/**
 * Overall start of building the tree.
 *
 * For ORB trees, this continues on to TreePiece::startORBTreeBuild.
 */
#ifdef PUSH_GRAVITY
void TreePiece::buildTree(int bucketSize, const CkCallback& cb, bool _merge)
#else
void TreePiece::buildTree(int bucketSize, const CkCallback& cb)
#endif
{
#if COSMO_DEBUG > 1
    char fout[100];
    sprintf(fout,"tree.%d.%d.after",thisIndex,iterationNo);
    ofstream ofs(fout);
    for (int i=1; i<=myNumParticles; ++i)
        ofs << keyBits(myParticles[i].key,KeyBits) << " "
            << myParticles[i].position[0] << " "
            << myParticles[i].position[1] << " "
            << myParticles[i].position[2] << endl;
    ofs.close();
#endif

    maxBucketSize = bucketSize;
    callback = cb;
    myTreeParticles = myNumParticles;

    deleteTree();
    if(bucketReqs != NULL) {
        delete[] bucketReqs;
        bucketReqs = NULL;
    }

#ifdef PUSH_GRAVITY
    // used to indicate whether trees on an SMP node should be
    // merged or not: we do not merge trees when pushing, to
    // increase concurrency
    doMerge = _merge;
#endif

    // decide which logic we are using to divide the particles: Oct or ORB
    switch (useTree) {
    case Binary_Oct:
    case Oct_Oct:
        Key bounds[2];
        if(numTreePieces == 1) { // No need to share boundary information
            contribute(0, NULL, CkReduction::nop,
                       CkCallback(CkIndex_TreePiece::recvdBoundaries(0),
                                  thisProxy));
            return;
        }
        if (myNumParticles > 0) {
#ifdef COSMO_PRINT
            CkPrintf("[%d] Keys: %016llx %016llx\n", thisIndex,
                     myParticles[1].key, myParticles[myNumParticles].key);
#endif
            bounds[0] = myParticles[1].key;
            bounds[1] = myParticles[myNumParticles].key;
        }

        int myPlace;
        if (dm == NULL)
            dm = (DataManager*)CkLocalNodeBranch(dataManagerID);
        myPlace = find(dm->responsibleIndex.begin(),
                       dm->responsibleIndex.end(),
                       thisIndex) - dm->responsibleIndex.begin();
        if (myPlace == dm->responsibleIndex.size()) {
            // outside the range of used TreePieces
            contribute(0, NULL, CkReduction::nop,
                       CkCallback(CkIndex_TreePiece::recvdBoundaries(0),
                                  thisProxy));
            return;
        }

        if (myNumParticles > 0)
    if (myNumParticles > 0) {
      if (myPlace != 0) {
        thisProxy[dm->responsibleIndex[myPlace-1]].recvBoundary(bounds[0], RIGHT);
      }
      if (myPlace != (dm->responsibleIndex.size()-1)) {
        thisProxy[dm->responsibleIndex[myPlace+1]].recvBoundary(bounds[1], LEFT);
      }
    }
    break;
  case Binary_ORB:
    // WARNING: ORB trees do not allow TreePieces to have 0 particles!
    contribute(CkCallback(CkIndex_TreePiece::startORBTreeBuild(0), thisArrayID));
    break;
  }
}

void TreePiece::recvBoundary(SFC::Key key, NborDir dir) {
  if (dir == LEFT) {
    myParticles[0].key = key;
  } else if (dir == RIGHT) {
    myParticles[myNumParticles+1].key = key;
  } else {
    CkAbort("Received nbor msg from someone who hasn't set the direction\n");
  }

  if (nbor_msgs_count_ <= 0) {
    CkAbort("nbor_msgs_count_ <= 0 so it may not be set\n");
  }
  nbor_msgs_count_--;

  // All the messages from my neighbors have been received. Do a reduction to
  // ensure that all the TreePieces have received the boundary information
  if (nbor_msgs_count_ == 0) {
    contribute(0, NULL, CkReduction::nop,
               CkCallback(CkIndex_TreePiece::recvdBoundaries(0), thisProxy));
  }

  // Since this TreePiece doesn't have any particles, forward the boundary
  // either to the right or left neighbor
  if (myNumParticles <= 0) {
    int myPlace;
    if (dm == NULL)
      dm = (DataManager*)CkLocalNodeBranch(dataManagerID);
    myPlace = find(dm->responsibleIndex.begin(), dm->responsibleIndex.end(), thisIndex)
              - dm->responsibleIndex.begin();
    if (myPlace == dm->responsibleIndex.size()) {
      myPlace = -2;
      return;
    }

    if (dir == LEFT) {
      if (myPlace != (dm->responsibleIndex.size()-1)) {
        thisProxy[dm->responsibleIndex[myPlace+1]].recvBoundary(key, LEFT);
      }
    } else if (dir == RIGHT) {
      if (myPlace != 0) {
        thisProxy[dm->responsibleIndex[myPlace-1]].recvBoundary(key, RIGHT);
      }
    }
  }
}

void TreePiece::recvdBoundaries(CkReductionMsg* m) {
  delete m;
  if (nbor_msgs_count_ == 0) {
    startOctTreeBuild(NULL);
    setNumExpectedNeighborMsgs(); // In anticipation of another
                                  // treebuild before domain decomposition.
  }
}
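// A sketch of the counting protocol above (the expected counts are set in
// setNumExpectedNeighborMsgs, which is not shown here): each piece
// decrements nbor_msgs_count_ once per boundary key received and joins the
// reduction when it reaches zero; a piece with no particles additionally
// forwards the key onward, so the splitter information skips over empty
// pieces.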
void TreePiece::quiescence() {
  CkPrintf("[%d] quiescence detected, pending %d total %d\n",
           thisIndex, sLocalGravityState->myNumParticlesPending, numBuckets);
  for (unsigned int i=0; i<numBuckets; ++i) {
    int remaining = sRemoteGravityState->counterArrays[0][i]
                    + sLocalGravityState->counterArrays[0][i];
    if (remaining != 0)
      CkPrintf("[%d] requests for %d remaining l %d r %d\n", thisIndex, i,
               sRemoteGravityState->counterArrays[0][i],
               sLocalGravityState->counterArrays[0][i]);
  }
  CkPrintf("quiescence detected!\n");
  mainChare.niceExit();
}

/*****************ORB**********************/

void TreePiece::startORBTreeBuild(CkReductionMsg* m){
  delete m;

  if (dm == NULL) {
    dm = (DataManager*)CkLocalNodeBranch(dataManagerID);
  }

  if (myNumParticles == 0) {
    // No particle assigned to this TreePiece
#ifdef PUSH_GRAVITY
    if(doMerge){
#endif
      if (verbosity > 3)
        ckerr << "TreePiece " << thisIndex << ": No particles, finished tree build" << endl;
      contribute(sizeof(callback), &callback, CkReduction::random,
                 CkCallback(CkIndex_DataManager::combineLocalTrees((CkReductionMsg*)NULL),
                            CProxy_DataManager(dataManagerID)));
#ifdef PUSH_GRAVITY
    }
    else{
      // if not combining trees, return control to mainchare
      contribute(callback);
    }
#endif
    return;
  }
  myParticles[0].key = thisIndex;
  myParticles[myNumParticles+1].key = thisIndex;

  compFuncPtr[0]= &comp_dim0;
  compFuncPtr[1]= &comp_dim1;
  compFuncPtr[2]= &comp_dim2;

  pTreeNodes = new NodePool();
  root = pTreeNodes->alloc_one(1, numTreePieces>1?Tree::Boundary:Tree::Internal,
                               0, myNumParticles+1, 0);

  if (thisIndex == 0)
    root->firstParticle ++;
  if (thisIndex == (int)numTreePieces-1)
    root->lastParticle --;
  root->particleCount = myNumParticles;
  nodeLookupTable[(Tree::NodeKey)1] = root;

  //root->key = firstPossibleKey;
  root->boundingBox = boundingBox;
  //nodeLookup[root->lookupKey()] = root;
  numBuckets = 0;
  bucketList.clear();

  if(verbosity > 3)
    ckerr << "TreePiece " << thisIndex << ": Starting tree build" << endl;

#if INTERLIST_VER > 0
  root->startBucket=0;
#endif
  // recursively build the tree
  buildORBTree(root, 0);

  delete [] boxes;
  delete [] splitDims;
  //Keys to all the particles have been assigned inside buildORBTree
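  // The loop below replays deferred moment requests.  A minimal sketch of
  // the pattern (names as in this file): requestRemoteMoments() queues a
  // sender in momentRequests[key] whenever the local moments are not ready;
  // once the subtree under `key` is complete, each queued sender receives
  // receiveRemoteMoments() with the now-valid moments and the entry is
  // erased.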
  // check all the pending requests for RemoteMoments
  for (MomentRequestType::iterator iter = momentRequests.begin(); iter != momentRequests.end(); ) {
    NodeKey nodeKey = iter->first;
    GenericTreeNode *node = keyToNode(nodeKey);
    CkVec<int> *l = iter->second;
    CkAssert(node != NULL);
    // we need to increment the iterator before deleting the element,
    // otherwise the iterator loses its validity!
    iter++;
    if (node->getType() == Empty || node->moments.totalMass > 0) {
      for (int i=0; i<l->length(); ++i) {
        CkEntryOptions opts;
        opts.setPriority((unsigned int) -100000000);
        streamingProxy[(*l)[i]].receiveRemoteMoments(nodeKey, node->getType(),
            node->firstParticle, node->particleCount, thisIndex, node->moments,
            node->boundingBox, node->bndBoxBall, node->iParticleTypes, &opts);
      }
      delete l;
      momentRequests.erase(node->getKey());
    }
  }

  if(verbosity > 3)
    ckerr << "TreePiece " << thisIndex << ": Number of buckets: " << numBuckets << endl;
  if(verbosity > 3)
    ckerr << "TreePiece " << thisIndex << ": Finished tree build, resolving boundary nodes" << endl;

  if (numTreePieces == 1) {
    treeBuildComplete();
  }
}

OrientedBox<float> TreePiece::constructBoundingBox(GenericTreeNode* node,int level, int numChild){

  OrientedBox<float> tmpBox;
  if(node->getType()==NonLocal){
    if(numChild==0){
      tmpBox = boxes[level];
      tmpBox.greater_corner[splitDims[level]] = boxes[level+1].lesser_corner[splitDims[level]];
    }
    else{
      tmpBox = boxes[level];
      tmpBox.lesser_corner[splitDims[level]] = boxes[level+1].greater_corner[splitDims[level]];
    }
    return tmpBox;
  }
  else{
    return boxes[level+1];
  }
}

void TreePiece::buildORBTree(GenericTreeNode * node, int level){

  if (level == NodeKeyBits-1) {
    ckerr << thisIndex << ": TreePiece(ORB): This piece of tree has exhausted all the bits in the keys.  Super double-plus ungood!" << endl;
    ckerr << "Left particle: " << (node->firstParticle) << " Right particle: " << (node->lastParticle) << endl;
    ckerr << "Left key : " << keyBits((myParticles[node->firstParticle]).key, KeyBits).c_str() << endl;
    ckerr << "Right key: " << keyBits((myParticles[node->lastParticle]).key, KeyBits).c_str() << endl;
    return;
  }

  CkAssert(node->getType() == Boundary || node->getType() == Internal);

  node->makeOrbChildren(myParticles, myNumParticles, level, chunkRootLevel,
                        compFuncPtr, (domainDecomposition == ORB_space_dec),
                        pTreeNodes);
  node->rungs = 0;

  GenericTreeNode *child;
#if INTERLIST_VER > 0
  int bucketsBeneath = 0;
#endif
  for (unsigned int i=0; i<node->numChildren(); ++i) {
    child = node->getChildren(i);
    CkAssert(child != NULL);

    if(level<chunkRootLevel){
      child->boundingBox = constructBoundingBox(child,level,i);
    }

#if INTERLIST_VER > 0
    child->startBucket=numBuckets;
#endif
    nodeLookupTable[child->getKey()] = child;
    if (child->getType() == NonLocal) {
      // find a remote index for the node
      int first, last;
      bool isShared = nodeOwnership(child->getKey(), first, last);
      if (last < first) {
        // the node is really empty because it falls between two TreePieces
        child->makeEmpty();
        child->remoteIndex = thisIndex;
      }
      else {
        child->remoteIndex = getResponsibleIndex(first, last);
        // if we have a remote child, the node is a Boundary.  Thus count
        // that we have to receive one more message for the NonLocal node
        node->remoteIndex --;
        // request the remote chare to fill this node with the Moments
        CkEntryOptions opts;
        opts.setPriority((unsigned int) -110000000);
        streamingProxy[child->remoteIndex].requestRemoteMoments(child->getKey(), thisIndex, &opts);
      }
    } else if (child->getType() == Internal
               && child->lastParticle - child->firstParticle < maxBucketSize) {
      CkAssert(child->firstParticle != 0 && child->lastParticle != myNumParticles+1);
      child->remoteIndex = thisIndex;
      child->makeBucket(myParticles);
      bucketList.push_back(child);

      //Assign keys to all the particles inside the bucket
      int num = child->lastParticle - child->firstParticle + 1;
      int bits = 0;

      while(num > (1<<bits)){ bits++; }

      Key mask = 1 << (level+1);
      mask = ~mask;
      Key tmpKey = child->getKey() & mask;
      tmpKey = tmpKey << bits;

      for(int i=child->firstParticle;i<=child->lastParticle;i++){
        myParticles[i].key = tmpKey;
        tmpKey++;
      }

#if INTERLIST_VER > 0
      child->startBucket=numBuckets;
#endif
      numBuckets++;
      if (node->getType() != Boundary) {
        node->moments += child->moments;
        node->bndBoxBall.grow(child->bndBoxBall);
        node->iParticleTypes |= child->iParticleTypes;
      }
      if (child->rungs > node->rungs) node->rungs = child->rungs;
    } else if (child->getType() == Empty) {
      child->remoteIndex = thisIndex;
    } else {
      if (child->getType() == Internal) child->remoteIndex = thisIndex;
      // else the index is already 0
      buildORBTree(child, level+1);
      // if we have a Boundary child, we will have to compute its multipole
      // before we can compute the multipole of the current node (and we'll do
      // it in receiveRemoteMoments)
      if (child->getType() == Boundary) node->remoteIndex --;
      if (node->getType() != Boundary) {
        node->moments += child->moments;
        node->bndBoxBall.grow(child->bndBoxBall);
        node->iParticleTypes |= child->iParticleTypes;
      }
      if (child->rungs > node->rungs) node->rungs = child->rungs;
    }
#if INTERLIST_VER > 0
    bucketsBeneath += child->numBucketsBeneath;
#endif
  }

#if INTERLIST_VER > 0
  node->numBucketsBeneath = bucketsBeneath;
#endif

  /* The old version collected Boundary nodes, the new version
     collects NonLocal nodes */

  if (node->getType() == Internal) {
    calculateRadiusFarthestCorner(node->moments, node->boundingBox);
  }
}

/******************************************/

/**
 * Actual treebuild for each treepiece.  Each treepiece begins its
 * treebuild at the global root.  During the treebuild, the
 * nodeLookupTable, a mapping from keys to nodes, is constructed.
 * Also, the bucket list is constructed.  Constructing moments for the
 * boundary nodes requires requesting the moments of nodes on other
 * pieces.  After all the moments are constructed, the treepiece
 * passes its root to the DataManager via DataManager::notifyPresence.
 *
 * After the local treebuild is finished
 * DataManager::combineLocalTrees is called, and the treebuild phase
 * is finished.
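 *
 * A note on the counting convention used by both tree builds (restating
 * what the code does): a Boundary node's remoteIndex is decremented once
 * for every NonLocal or Boundary child whose moments must come from
 * elsewhere, and incremented as each set of moments arrives in
 * receiveRemoteMoments(); when it climbs back to zero the node's own
 * moments can be finalized (see boundaryParentReady()).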
*/ void TreePiece::startOctTreeBuild(CkReductionMsg* m) { delete m; if (dm == NULL) { dm = (DataManager*)CkLocalNodeBranch(dataManagerID); } if (myNumParticles == 0) { #ifdef PUSH_GRAVITY if(doMerge){ #endif // No particle assigned to this TreePiece if (verbosity > 3) ckerr << "TreePiece " << thisIndex << ": No particles, finished tree build" << endl; contribute(sizeof(callback), &callback, CkReduction::random, CkCallback(CkIndex_DataManager::combineLocalTrees((CkReductionMsg*)NULL), CProxy_DataManager(dataManagerID))); #ifdef PUSH_GRAVITY } else{ // if not merging, return control to mainchare contribute(callback); } #endif return; } //CmiLock(dm->__nodelock); // The boundary particles are set to the splitters of the // neighboring pieces, i.e. the nearest particles that are not in // this treepiece. myPlace = find(dm->responsibleIndex.begin(), dm->responsibleIndex.end(), thisIndex) - dm->responsibleIndex.begin(); if(myPlace == 0) myParticles[0].key = firstPossibleKey; if(myPlace == dm->responsibleIndex.size() - 1) myParticles[myNumParticles + 1].key = lastPossibleKey; CkAssert(myParticles[1].key >= myParticles[0].key); CkAssert(myParticles[myNumParticles + 1].key >= myParticles[myNumParticles].key); // create the root of the global tree switch (useTree) { case Binary_Oct: pTreeNodes = new NodePool; root = pTreeNodes->alloc_one(1, numTreePieces>1?Tree::Boundary:Tree::Internal, 0, myNumParticles+1, 0); root->particlePointer = &myParticles[1]; break; case Oct_Oct: //root = new OctTreeNode(1, Tree::Boundary, 0, myNumParticles+1, 0); break; default: CkAbort("We should have never reached here!"); } if (myPlace == 0) root->firstParticle ++; if (myPlace == dm->responsibleIndex.size()-1) root->lastParticle --; root->particleCount = myNumParticles; nodeLookupTable[(Tree::NodeKey)1] = root; root->boundingBox = boundingBox; numBuckets = 0; bucketList.clear(); if(verbosity > 3) ckerr << "TreePiece " << thisIndex << ": Starting tree build" << endl; #if INTERLIST_VER > 0 root->startBucket=0; #endif // recursively build the tree double start; try { #if defined MERGE_REMOTE_REQUESTS LocalTreeTraversal traversal; // 1. Construct top level of your tree first, being careful // not to send out requests for remote moments, and not // descending into treepiece-local portions of the tree // This is called the Remote tree RemoteTreeBuilder remoteTreeBuilder(this,false); traversal.dft(root,&remoteTreeBuilder,0); // 2. Next, merge these top-level trees, causing requests // to be sent out to the owners of the remote moments required // on this PE. peTreeMergerProxy.ckLocalBranch()->mergeNonLocalRequests(root,this); // 3. Then, construct the treepiece-local portions of the tree // and respond to remote requests for local moments // 4. 
  //    Finally, do a node-level merge on PE trees
  // Items 3 and 4 are performed after the PE-level merge of trees has been done

      return;
#elif defined SPLIT_PHASE_TREE_BUILD
      LocalTreeTraversal traversal;
      // Try to construct the non-local part of the tree:
      // this results in early dispatch of remote moment
      // requests
      RemoteTreeBuilder w1(this,true);
      traversal.dft(root,&w1,0);

      // Then construct the local parts of the tree
      LocalTreeBuilder w2(this);
      traversal.dft(root,&w2,0);
#else
      start = CmiWallTimer();
      buildOctTree(root, 0);
      traceUserBracketEvent(tbRecursiveUE,start,CmiWallTimer());
#endif
  }
  catch (std::bad_alloc) {
    CkAbort("Out of memory in treebuild");
  }

  processRemoteRequestsForMoments();

  if(verbosity > 3){
    ckerr << "TreePiece " << thisIndex << ": Number of buckets: " << numBuckets << endl;
    ckerr << "TreePiece " << thisIndex << ": Finished tree build, resolving boundary nodes" << endl;
  }

  //CmiUnlock(dm->__nodelock);

  if (numTreePieces == 1) {
    treeBuildComplete();
  }
}

void TreePiece::sendRequestForNonLocalMoments(GenericTreeNode *pickedNode){
  int first, last;
  bool isShared = nodeOwnership(pickedNode->getKey(), first, last);
  if (last >= first) {
    // Choose a piece from among the owners from which to
    // request moments in such a way that if I am a piece with a
    // higher index, I request from a higher indexed treepiece.
    pickedNode->remoteIndex = getResponsibleIndex(first,last);
    // request the remote chare to fill this node with the Moments
    CkEntryOptions opts;
    opts.setPriority((unsigned int) -110000000);
    treeProxy[pickedNode->remoteIndex].requestRemoteMoments(pickedNode->getKey(), thisIndex, &opts);
  }
}

void TreePiece::processRemoteRequestsForMoments(){
  double start = CmiWallTimer();
  // check all the pending requests for RemoteMoments
  for (MomentRequestType::iterator iter = momentRequests.begin(); iter != momentRequests.end(); ) {
    NodeKey nodeKey = iter->first;
    GenericTreeNode *node = keyToNode(nodeKey);
    CkVec<int> *l = iter->second;
    CkAssert(node != NULL);
    // we need to increment the iterator before deleting the element,
    // otherwise the iterator loses its validity!
    iter++;
    if (node->getType() == Empty || node->moments.totalMass > 0) {
      for (int i=0; i<l->length(); ++i) {
        CkEntryOptions opts;
        opts.setPriority((unsigned int) -100000000);
        streamingProxy[(*l)[i]].receiveRemoteMoments(nodeKey, node->getType(),
            node->firstParticle, node->particleCount, thisIndex, node->moments,
            node->boundingBox, node->bndBoxBall, node->iParticleTypes, &opts);
      }
      delete l;
      momentRequests.erase(node->getKey());
    }
  }
  traceUserBracketEvent(tbFlushRequestsUE,start,CmiWallTimer());
}

void NonEmptyTreePieceCounter::addLocation(CkLocation &loc){
  const int *indexData = loc.getIndex().data();
  TreePiece *tp = treeProxy[indexData[0]].ckLocal();
  int np = tp->getNumParticles();
  if(np > 0) count++;
}

void NonEmptyTreePieceCounter::reset() {
  count = 0;
}

void TreePiece::mergeNonLocalRequestsDone(){
  // 3. Construct the treepiece-local portions of the tree
  MERGE_REMOTE_REQUESTS_VERBOSE("[%d] mergeNonLocalRequestsDone\n", thisIndex);
  LocalTreeTraversal traversal;
  LocalTreeBuilder localTreeBuilder(this);
  traversal.dft(root,&localTreeBuilder,0);

  localTreeBuildComplete = true;

  // at this point, I might have completed building
  // the entire tree, since:
  // (a) I must have finished building the
  //     RemoteTree, otherwise the PETreeMerger
  //     wouldn't have invoked this method on me
  // (b) I might have received all the moments for
  //     which I was a client

  // 4. Respond to remote requests for local moments
  processRemoteRequestsForMoments();

  // If none of the root's children will be updated
  // with remote moments, we are done
  if(root->remoteIndex == thisIndex){
    treeBuildComplete();
  }
}
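// Worked example for nodeOwnership() below (hypothetical keys): with
// dm->boundaryKeys = {0, 8, 16, 24} there are three owner slots.  A node
// whose particle-key range is [9,15] puts locLeft at key 8 and locRight at
// key 16, so firstOwner == lastOwner == 1: the node lies entirely on the
// second piece.  A range of [9,20] instead yields owners 1..2, and the
// caller then picks one of them via getResponsibleIndex().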
/// Determine who are all the owners of this node
/// @return true if the caller is part of the owners, false otherwise
bool TreePiece::nodeOwnership(const Tree::NodeKey nkey, int &firstOwner, int &lastOwner) {

  if(useTree == Binary_ORB){ // Added for ORB Trees
    int keyLevel=0;
    Key tmpKey = Key(nkey);
    while(tmpKey > 1){
      tmpKey >>= 1;
      keyLevel++;
    }
    if(keyLevel >= chunkRootLevel){
      tmpKey = nkey >> (keyLevel-chunkRootLevel);
      tmpKey = tmpKey - (1 << chunkRootLevel);
      firstOwner = tmpKey;
      lastOwner = tmpKey;
    }
    else{
      tmpKey = nkey << (chunkRootLevel - keyLevel);
      tmpKey = tmpKey - (1 << chunkRootLevel);
      firstOwner = tmpKey;

      Key mask = (1 << (chunkRootLevel - keyLevel)) - 1;
      tmpKey = nkey << (chunkRootLevel - keyLevel);
      tmpKey = tmpKey - (1 << chunkRootLevel);
      tmpKey = tmpKey + mask;
      lastOwner = tmpKey;
    }
  }
  else{
    // From the nodekey determine the particle keys that bound this node.
    Key firstKey = Key(nkey);
    Key lastKey = Key(nkey + 1);
    const Key mask = Key(1) << KeyBits;
    while (! (firstKey & mask)) {
      firstKey <<= 1;
      lastKey <<= 1;
    }
    firstKey &= ~mask;
    lastKey &= ~mask;
    lastKey -= 1;

    vector<SFC::Key>::iterator locLeft;
    locLeft = lower_bound(dm->boundaryKeys.begin(), dm->boundaryKeys.end(), firstKey);
    if (locLeft != dm->boundaryKeys.begin()) {
      locLeft--;
    }
    vector<SFC::Key>::iterator locRight;
    locRight = lower_bound(locLeft, dm->boundaryKeys.end(), lastKey);
    if (locRight == dm->boundaryKeys.end()) {
      locRight--;
    }
    firstOwner = (locLeft - dm->boundaryKeys.begin());
    lastOwner = (locRight - dm->boundaryKeys.begin() - 1);

#if COSMO_PRINT > 1
    std::string str = keyBits(nkey,KeyBits);
    CkPrintf("[%d] NO: key=%s, first=%d, last=%d\n",thisIndex,str.c_str(),locLeft-dm->splitters,locRight-dm->splitters);
#endif
  }
  return (myPlace >= firstOwner && myPlace <= lastOwner);
}

/** A recursive algorithm for building my tree.
    Examines successive bits in the particles' keys, looking for splits.
    Each bit is a level of nodes in the tree.  We keep going down until
    we can bucket the particles.
*/
void TreePiece::buildOctTree(GenericTreeNode * node, int level) {

#ifdef TREE_BREADTH_FIRST
  CkQ<GenericTreeNode *> *queue = new CkQ<GenericTreeNode *>(1024);
  CkQ<GenericTreeNode *> *queueNext = new CkQ<GenericTreeNode *>(1024);
  queue->enq(node);

  GenericTreeNode *rootNode = node;
  while (1) {
    node = queue->deq();
    if (node == NULL) {
      node = queueNext->deq();
      CkQ<GenericTreeNode *> *tmp = queue;
      queue = queueNext;
      queueNext = tmp;
      level++;
    }
    if (node == NULL) break;
#endif

  if (level == NodeKeyBits-2) {
    ckerr << thisIndex << ": TreePiece: This piece of tree has exhausted all the bits in the keys.  Super double-plus ungood!" << endl;
    ckerr << "Left particle: " << (node->firstParticle) << " Right particle: " << (node->lastParticle) << endl;
    ckerr << "Left key : " << keyBits((myParticles[node->firstParticle]).key, KeyBits).c_str() << endl;
    ckerr << "Right key: " << keyBits((myParticles[node->lastParticle]).key, KeyBits).c_str() << endl;
    ckerr << "Node type: " << node->getType() << endl;
    ckerr << "myNumParticles: " << myNumParticles << endl;
    CkAbort("Tree is too deep!");
    return;
  }

  CkAssert(node->getType() == Boundary || node->getType() == Internal);

  node->makeOctChildren(myParticles, myNumParticles, level, pTreeNodes);
  // The boundingBox was used above to determine the spatially equal
  // split between the children.  Now reset it so it can be calculated
  // from the particle positions.
  node->boundingBox.reset();
  node->rungs = 0;

  GenericTreeNode *child;
#if INTERLIST_VER > 0
  int bucketsBeneath = 0;
#endif
  for (unsigned int i=0; i<node->numChildren(); ++i) {
    child = node->getChildren(i);
    CkAssert(child != NULL);
#if INTERLIST_VER > 0
    child->startBucket=numBuckets;
#endif
    nodeLookupTable[child->getKey()] = child;
    if (child->getType() == NonLocal) {
      // find a remote index for the node
      int first, last;
      bool isShared = nodeOwnership(child->getKey(), first, last);
      if (last < first) {
        // the node is really empty because it falls between two TreePieces
        child->makeEmpty();
        child->remoteIndex = thisIndex;
      }
      else {
        child->remoteIndex = getResponsibleIndex(first, last);
        // if we have a remote child, the node is a Boundary.  Thus count
        // that we have to receive one more message for the NonLocal node
        node->remoteIndex --;
        // request the remote chare to fill this node with the Moments
        CkEntryOptions opts;
        opts.setPriority((unsigned int) -110000000);
        streamingProxy[child->remoteIndex].requestRemoteMoments(child->getKey(), thisIndex, &opts);
      }
    } else if (child->getType() == Internal
               && (child->lastParticle - child->firstParticle < maxBucketSize
                   || level >= NodeKeyBits-3)) {
      if(level >= NodeKeyBits-3
         && child->lastParticle - child->firstParticle >= maxBucketSize)
        ckerr << "Truncated tree with " << child->lastParticle - child->firstParticle
              << " particle bucket" << endl;

      CkAssert(child->firstParticle != 0 && child->lastParticle != myNumParticles+1);
      child->remoteIndex = thisIndex;
      child->makeBucket(myParticles);
      bucketList.push_back(child);
#if INTERLIST_VER > 0
      child->startBucket=numBuckets;
#endif
      numBuckets++;
      if (node->getType() != Boundary) {
        node->moments += child->moments;
        node->boundingBox.grow(child->boundingBox);
        node->bndBoxBall.grow(child->bndBoxBall);
        node->iParticleTypes |= child->iParticleTypes;
      }
      if (child->rungs > node->rungs) node->rungs = child->rungs;
    } else if (child->getType() == Empty) {
      child->remoteIndex = thisIndex;
    } else {
      if (child->getType() == Internal) child->remoteIndex = thisIndex;
      // else the index is already 0
#ifdef TREE_BREADTH_FIRST
      queueNext->enq(child);
#else
      buildOctTree(child, level+1);
#endif
      // if we have a Boundary child, we will have to compute its multipole
      // before we can compute the multipole of the current node (and we'll do
      // it in receiveRemoteMoments)
      if (child->getType() == Boundary) node->remoteIndex --;
#ifndef TREE_BREADTH_FIRST
      if (node->getType() != Boundary) {
        node->moments += child->moments;
        node->boundingBox.grow(child->boundingBox);
        node->bndBoxBall.grow(child->bndBoxBall);
        node->iParticleTypes |= child->iParticleTypes;
      }
      // for the rung information we can always do this now since it is a
      // local property
      if (child->rungs > node->rungs) node->rungs = child->rungs;
#endif
    }
#if INTERLIST_VER > 0
    bucketsBeneath += child->numBucketsBeneath;
#endif
  }

#if INTERLIST_VER > 0
  node->numBucketsBeneath = bucketsBeneath;
#endif

  /* The old version collected Boundary nodes, the new version
     collects NonLocal nodes */
#ifndef TREE_BREADTH_FIRST
  if (node->getType() == Internal) {
    calculateRadiusFarthestCorner(node->moments, node->boundingBox);
  }
#endif

#ifdef TREE_BREADTH_FIRST
  }
  growBottomUp(rootNode);
#endif
}

#ifdef TREE_BREADTH_FIRST
void TreePiece::growBottomUp(GenericTreeNode *node) {
  GenericTreeNode *child;
  for (unsigned int i=0; i<node->numChildren(); ++i) {
    child = node->getChildren(i);
    if (child->getType() == NonLocal ||
        (child->getType() == Bucket) ||
        child->getType() == Empty) continue;
    growBottomUp(child);
    if (node->getType() != Boundary) {
      node->moments += child->moments;
      node->boundingBox.grow(child->boundingBox);
      node->bndBoxBall.grow(child->bndBoxBall);
      node->iParticleTypes |= child->iParticleTypes;
    }
    if (child->rungs > node->rungs) node->rungs = child->rungs;
  }
  if (node->getType() == Internal) {
    calculateRadiusFarthestCorner(node->moments, node->boundingBox);
  }
}
#endif
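// Sketch of the key-range arithmetic used by the next two functions: a
// node key carries a leading sentinel bit, so shifting firstKey/lastKey
// left until the sentinel reaches bit KeyBits and then masking it off
// recovers the node's particle-key interval.  E.g. the root key 1 expands
// to the whole key space, key 0b10 to its lower half and key 0b11 to its
// upper half (the final lastKey -= 1 makes the upper bound inclusive).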
/// When the node is found to be null, forward it to the neighbor
bool TreePiece::sendFillReqNodeWhenNull(CkCacheRequestMsg<KeyType> *msg) {
  Tree::NodeKey key = msg->key;
  KeyType firstKey = KeyType(key);
  KeyType lastKey = KeyType(key + 1);
  const KeyType mask = KeyType(1) << KeyBits;
  while (! (firstKey & mask)) {
    firstKey <<= 1;
    lastKey <<= 1;
  }
  firstKey &= ~mask;
  lastKey &= ~mask;
  lastKey -= 1;

  // If the firstKey of the requested node is greater than the last particle
  // key, then this node may be with the right neighbor
  if (myParticles[myNumParticles].key < firstKey) {
    streamingProxy[thisIndex+1].fillRequestNode(msg);
    return true;
  }

  // If the lastKey of the requested node is less than the first particle
  // key, then this node may be with the left neighbor
  if (myParticles[1].key > lastKey) {
    streamingProxy[thisIndex-1].fillRequestNode(msg);
    return true;
  }
  return false;
}

/// \brief entry method to obtain the moments of a node
void TreePiece::requestRemoteMoments(const Tree::NodeKey key, int sender) {
  GenericTreeNode *node = keyToNode(key);
  if (node != NULL
      && node->getType() != NonLocalBucket   // If it's a non-local
                                             // bucket, punt to real
                                             // owner, because we need
                                             // particle information
      && (node->getType() == Empty || node->moments.totalMass > 0)) {
    CkEntryOptions opts;
    opts.setPriority((unsigned int) -100000000);
    streamingProxy[sender].receiveRemoteMoments(key, node->getType(),
        node->firstParticle, node->particleCount, thisIndex, node->moments,
        node->boundingBox, node->bndBoxBall, node->iParticleTypes, &opts);
    return;
  }

  // If this node is NULL, check whether it lies outside this TreePiece's
  // range: if the last particle key is less than the node's first key,
  // forward the request to my right neighbor; if the first particle key is
  // greater than the node's last key, forward it to my left neighbor.
  Key firstKey = Key(key);
  Key lastKey = Key(key + 1);
  const Key mask = Key(1) << KeyBits;
  while (! (firstKey & mask)) {
    firstKey <<= 1;
    lastKey <<= 1;
  }
  firstKey &= ~mask;
  lastKey &= ~mask;
  lastKey -= 1;
  if (myParticles[myNumParticles].key < firstKey && thisIndex < numTreePieces-1) {
    streamingProxy[thisIndex+1].requestRemoteMoments(key, sender);
  }
  else if (myParticles[1].key > lastKey && thisIndex > 0) {
    streamingProxy[thisIndex-1].requestRemoteMoments(key, sender);
  }
  else {
    // Save request for when we've calculated the moment.
    CkVec<int> *l = momentRequests[key];
    if (l == NULL) {
      l = new CkVec<int>();
      momentRequests[key] = l;
    }
    l->push_back(sender);
  }
}

/// \brief response from requestRemoteMoments
void TreePiece::receiveRemoteMoments(const Tree::NodeKey key,
                                     Tree::NodeType type,
                                     int firstParticle,
                                     int numParticles,
                                     int remIdx,
                                     const MultipoleMoments& moments,
                                     const OrientedBox<double>& box,
                                     const OrientedBox<double>& boxBall,
                                     const unsigned int iParticleTypes) {
  GenericTreeNode *node = keyToNode(key);
  CkAssert(node != NULL);
  MERGE_REMOTE_REQUESTS_VERBOSE("[%d] receiveRemoteMoments %llu\n",thisIndex,key);
  // assign the incoming moments to the node
  if (type == Empty) node->makeEmpty();
  else {
    if (type == Bucket || type == NonLocalBucket) {
      node->setType(NonLocalBucket);
      node->firstParticle = firstParticle;
      node->lastParticle = firstParticle + numParticles - 1;
    }
    node->particleCount = numParticles;
    node->moments = moments;
    node->boundingBox = box;
    node->bndBoxBall = boxBall;
    node->iParticleTypes = iParticleTypes;
    node->remoteIndex = remIdx;
  }

#ifdef MERGE_REMOTE_REQUESTS
  deliverMomentsToClients(node);
#endif

  // look if we can compute the moments of some ancestors, and eventually send
  // them to a requester
  GenericTreeNode *parent = node->parent;
  if(parent != NULL){
    CkAssert(parent->getType() == Boundary);
    parent->remoteIndex++;
  }

#ifdef MERGE_REMOTE_REQUESTS
  // If we are merging remote requests, since we only build the skeleton
  // top-level tree (i.e. RemoteTree) in the first pass (RemoteTreeBuilder),
  // we might not have computed the moments of internal siblings of this
  // node.  Moreover, we do not decrement the remoteIndex of the parent for
  // these Internal children whose moments have not yet been computed.
  // Therefore, we want to avoid the situation wherein a Boundary parent
  // node's non-Internal child's moments are received in this invocation,
  // but the moments of the parent have not yet been computed.  We can be
  // sure that the Internal children's moments have been computed when the
  // LocalTreeBuilder has finished.
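  // A concrete (hypothetical) instance of the rule above: a Boundary
  // parent with two NonLocal children and one not-yet-built Internal child
  // sits at remoteIndex -2.  Both remote contributions may arrive before
  // the local build finishes; without the check below the parent would be
  // declared ready at zero even though its Internal child's moments were
  // never added.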
  if(!localTreeBuildComplete) return;
#endif

  while (parent != NULL && parent->remoteIndex == 0) {
    GenericTreeNode *parentsParent = boundaryParentReady(parent);
#ifdef MERGE_REMOTE_REQUESTS
    deliverMomentsToClients(parent);
#endif
    parent = parentsParent;
  }

  if (parent == NULL) {
    // if we are here then we are at the root, and thus we have finished
    // getting all moments
    treeBuildComplete();
  }
}

GenericTreeNode *TreePiece::boundaryParentReady(GenericTreeNode *parent){
  // compute the multipole for the parent
  MERGE_REMOTE_REQUESTS_VERBOSE("[%d] boundaryParentReady %llu\n",thisIndex,parent->getKey());
  parent->particleCount = 0;
  parent->remoteIndex = thisIndex; // reset the reference index to ourself
  GenericTreeNode *child;
  for (unsigned int i=0; i<parent->numChildren(); ++i) {
    child = parent->getChildren(i);
    parent->particleCount += child->particleCount;
    accumulateMomentsFromChild(parent,child);
  }
  calculateRadiusFarthestCorner(parent->moments, parent->boundingBox);

  // check if someone has requested this node
  MomentRequestType::iterator iter;
  if ((iter = momentRequests.find(parent->getKey())) != momentRequests.end()) {
    CkVec<int> *l = iter->second;
    for (int i=0; i<l->length(); ++i) {
      CkEntryOptions opts;
      opts.setPriority((unsigned int) -100000000);
      streamingProxy[(*l)[i]].receiveRemoteMoments(parent->getKey(),
          parent->getType(), parent->firstParticle, parent->particleCount,
          thisIndex, parent->moments, parent->boundingBox,
          parent->bndBoxBall, parent->iParticleTypes, &opts);
    }
    delete l;
    momentRequests.erase(parent->getKey());
  }

  // go to the next ancestor
  GenericTreeNode *node = parent;
  parent = node->parent;
  // we just computed the current parent's child's
  // moments.  The current parent must be a Boundary
  if(parent != NULL){
    CkAssert(parent->getType() == Boundary);
    parent->remoteIndex++;
  }
  return parent;
}

void TreePiece::accumulateMomentsFromChild(GenericTreeNode *parent, GenericTreeNode *child){
  parent->moments += child->moments;
  parent->boundingBox.grow(child->boundingBox);
  parent->bndBoxBall.grow(child->bndBoxBall);
  parent->iParticleTypes |= child->iParticleTypes;
}

/// @brief determine if moments of node have been requested.
///
/// This is used by MERGE_REMOTE_REQUESTS to distribute the received
/// moments among Treepieces on a core.
void TreePiece::deliverMomentsToClients(GenericTreeNode *node){
  std::map<NodeKey, NonLocalMomentsClientList>::iterator it;
  it = nonLocalMomentsClients.find(node->getKey());
  if(it == nonLocalMomentsClients.end()) return;
  CkAssert(it->second.targetNode == node);
  deliverMomentsToClients(it);
}

/// @brief send moments to all the pieces that have requested them.
///
/// This is used by MERGE_REMOTE_REQUESTS to distribute the received
/// moments among Treepieces on a core.
void TreePiece::deliverMomentsToClients(const std::map<NodeKey, NonLocalMomentsClientList>::iterator &it){
  NonLocalMomentsClientList &entry = it->second;
  CkVec<NonLocalMomentsClient> &clients = entry.clients;
  GenericTreeNode *node = entry.targetNode;
  CkAssert(node->remoteIndex >= 0);

  for(int i = 0; i < clients.length(); i++){
    MERGE_REMOTE_REQUESTS_VERBOSE("[%d] send %llu (%s) moments to %d\n",
        thisIndex, node->getKey(), typeString(node->getType()),
        clients[i].clientTreePiece->getIndex());
    clients[i].clientTreePiece->receiveRemoteMoments(node->getKey(),
        node->getType(), node->firstParticle, node->particleCount,
        node->remoteIndex, node->moments, node->boundingBox,
        node->bndBoxBall, node->iParticleTypes);
  }
  clients.clear();

  nonLocalMomentsClients.erase(it);
}

void TreePiece::treeBuildComplete(){
#ifdef MERGE_REMOTE_REQUESTS
  MERGE_REMOTE_REQUESTS_VERBOSE("[%d] treeBuildComplete\n", thisIndex);
  // reset
  CkAssert(localTreeBuildComplete);
  localTreeBuildComplete = false;
  CkAssert(nonLocalMomentsClients.empty());
#endif

#ifdef PUSH_GRAVITY
  if(doMerge){
#endif
    dm->notifyPresence(root, this);
    contribute(sizeof(callback), &callback, CkReduction::random,
               CkCallback(CkIndex_DataManager::combineLocalTrees((CkReductionMsg*)NULL),
                          CProxy_DataManager(dataManagerID)));
#ifdef PUSH_GRAVITY
  }
  else{
    // if not merging, return control to mainchare
    contribute(callback);
  }
#endif
}

bool bIsReplica(int reqID)
{
  int offsetcode = reqID >> 22;
  int x = (offsetcode & 0x7) - 3;
  int y = ((offsetcode >> 3) & 0x7) - 3;
  int z = ((offsetcode >> 6) & 0x7) - 3;

  return x || y || z;
}

int decodeReqID(int reqID)
{
  const int offsetmask = 0x1ff << 22;

  return reqID & (~offsetmask);
}

int encodeOffset(int reqID, int x, int y, int z)
{
  // Bit limitations follow
  CkAssert(x > -4 && x < 4);
  CkAssert(y > -4 && y < 4);
  CkAssert(z > -4 && z < 4);

  // Replica in each direction is mapped to 0-7 range
  int offsetcode = (x + 3) | ((y+3) << 3) | ((z+3) << 6);

  // Assume we only have 32 bits to work with (minus sign bit)
  // 4 million buckets is our limitation
  CkAssert(reqID < (1 << 22));
  return reqID | (offsetcode << 22);
}

// Add reqID to encoded offset
int reEncodeOffset(int reqID, int offsetID)
{
  const int offsetmask = 0x1ff << 22;

  return reqID | (offsetmask & offsetID);
}

/**
 * Initialize all particles for gravity force calculation.
 * This includes zeroing out the acceleration and potential.
 */
void TreePiece::initBuckets() {
  int ewaldCondition = (bEwald ? 0 : 1);
  for (unsigned int j=0; j<numBuckets; ++j) {
    GenericTreeNode* node = bucketList[j];

    // node->boundingBox.reset();  // XXXX dangerous should have separate
                                   // Active bounds
    for(int i = node->firstParticle; i <= node->lastParticle; ++i) {
      if (myParticles[i].rung >= activeRung) {
        myParticles[i].treeAcceleration = 0;
        myParticles[i].potential = 0;
        myParticles[i].dtGrav = 0;
        // node->boundingBox.grow(myParticles[i].position);
        if(bComove && !bPeriodic) {
          /*
           * Add gravity from the rest of the
           * Universe.  This adds force and
           * potential from a uniform (negative!)
           * density sphere.
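           *
           * As a quick numerical check (hypothetical values): with
           * dRhoFac = 0.1 and a particle at r = (1,0,0), the lines below
           * give a = 0.1*(1,0,0) and phi = -0.5*0.1*|r|^2 = -0.05.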
           */
          myParticles[i].treeAcceleration = dRhoFac*myParticles[i].position;
          myParticles[i].potential = -0.5*dRhoFac*myParticles[i].position.lengthSquared();
          myParticles[i].dtGrav = dRhoFac;
        }
      }
    }
    bucketReqs[j].finished = ewaldCondition;

    /*#if COSMO_DEBUG > 1
    if(iterationNo==1 || listMigrated==true){
      std::set<Tree::NodeKey> *list = new std::set<Tree::NodeKey>();
      bucketcheckList.push_back(*list);
    }
    #endif*/
  }
#if COSMO_DEBUG > 1 || defined CHANGA_REFACTOR_WALKCHECK || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST
  bucketcheckList.resize(numBuckets);
#endif
}

void TreePiece::startNextBucket() {
  int currentBucket = sLocalGravityState->currentBucket;
  if(currentBucket >= numBuckets)
    return;

#if INTERLIST_VER > 0
  // no need to do the following, because interlist walk and interlist compute
  // are only ever associated with each other, unlike the topdown walk, which
  // may be associated with gravity or prefetch objects
  // sInterListWalk->init(sGravity, this);

  GenericTreeNode *lca;   // Least Common Ancestor
  // check whether we have a valid lca. for the first bucket
  // (currentBucket == 0) the lca must be set to the highest point
  // in the local tree that contains bucket 0 (i.e., root).
  lca = getStartAncestor(currentBucket, prevBucket, root);

  int lcaLevel = lca->getLevel(lca->getKey());

  // set opt
  sGravity->init(lca, activeRung, sLocal);
  // must set lowest node here
#ifdef CHANGA_REFACTOR_MEMCHECK
  CkPrintf("startNextBucket memcheck after init\n");
  CmiMemoryCheck();
#endif

#else
  GenericTreeNode *target = bucketList[currentBucket];

  sTopDown->init(sGravity, this);
  sGravity->init(target, activeRung, sLocal);
#endif

#if INTERLIST_VER > 0
  DoubleWalkState *lstate = (DoubleWalkState *)sLocalGravityState;
  if(!lstate->placedRoots[0]){
    lstate->placedRoots[0] = true;
#endif
    for(int cr = 0; cr < numChunks; cr++){
#ifdef DISABLE_NODE_TREE
      GenericTreeNode *chunkRoot = keyToNode(prefetchRoots[cr]);
#else
      GenericTreeNode *chunkRoot = dm->chunkRootToNode(prefetchRoots[cr]);
#endif
      if(chunkRoot != 0){
        for(int x = -nReplicas; x <= nReplicas; x++) {
          for(int y = -nReplicas; y <= nReplicas; y++) {
            for(int z = -nReplicas; z <= nReplicas; z++) {
#if INTERLIST_VER > 0
              // place chunk root on chklist of lcaLevel:
              OffsetNode on;
              on.node = chunkRoot;
              // value of currentBucket doesn't matter
              on.offsetID = encodeOffset(0, x,y,z);
              lstate->chklists[lcaLevel].enq(on);
#else
              // last -1 arg is the activeWalkIndex
              sTopDown->walk(chunkRoot, sLocalGravityState, -1, encodeOffset(currentBucket, x,y,z), -1);
#endif
            }
          }
        }
      }
    }
#if INTERLIST_VER > 0
#ifdef CHANGA_REFACTOR_MEMCHECK
    CkPrintf("startNextBucket memcheck after enq\n");
    CmiMemoryCheck();
#endif
  }
#endif

#if INTERLIST_VER > 0
  // all nodes to walk on have been enqueued, call walk
  // lca is the node the dft starts from.
  // sInterListStateLocal contains the nodes to be walked on
  // last -1 arg is the activeWalkIndex
  // second last -1 arg is reqID. Because it doesn't make sense in this case,
  // we send the currentBucket number instead (as target)
  // third last -1 arg is chunk
  sInterListWalk->walk(lca, sLocalGravityState, -1, currentBucket, -1);
#ifdef CHANGA_REFACTOR_MEMCHECK
  CkPrintf("startNextBucket memcheck after walk\n");
  CmiMemoryCheck();
#endif
#endif
}

/*inline*/
void TreePiece::finishBucket(int iBucket) {
  BucketGravityRequest *req = &bucketReqs[iBucket];
  int remaining;

  remaining = sRemoteGravityState->counterArrays[0][iBucket]
              + sLocalGravityState->counterArrays[0][iBucket];

  CkAssert(remaining >= 0);
#ifdef COSMO_PRINT
  CkPrintf("[%d] Is finished %d? finished=%d, %d still missing!\n",
           thisIndex, iBucket, req->finished, remaining);
#endif

  // XXX finished means Ewald is done.
  if(req->finished && remaining == 0) {
    sLocalGravityState->myNumParticlesPending -= 1;
#ifdef COSMO_PRINT_BK
    CkPrintf("[%d] Finished bucket %d, %d particles remaining\n",
             thisIndex, iBucket, sLocalGravityState->myNumParticlesPending);
#endif

    if(sLocalGravityState->myNumParticlesPending == 0) {
      if(verbosity>1){
#if COSMO_STATS > 0
        CkPrintf("[%d] TreePiece %d finished with bucket %d , openCriterions:%lld\n",
                 CkMyPe(),thisIndex,iBucket,numOpenCriterionCalls);
#else
        CkPrintf("[%d] TreePiece %d finished with bucket %d\n",CkMyPe(),thisIndex,iBucket);
#endif
      }

#if defined CUDA
      // in cuda version, must wait till particle accels.
      // are copied back from gpu; can markwalkdone only
      // after this, otherwise there is a race condition
      // between the start of the next iteration, wherein
      // the treepiece registers itself afresh with the
      // data manager.  if during this time the particles
      // haven't been copied (and updateParticles hasn't
      // been called), the registeredTreePieces list will
      // not be reset, so that the data manager gets
      // confused.
      dm->transferParticleVarsBack();
      //dm->freeLocalTreeMemory();
#else
      // move on to markwalkdone in non-cuda version
      continueWrapUp();
#endif
    }
  }
}

void TreePiece::continueWrapUp(){
#ifdef CHECK_WALK_COMPLETIONS
  CkPrintf("[%d] markWalkDone TreePiece::continueWrapUp\n", thisIndex);
#endif

#ifdef CACHE_MEM_STATS
  memWithCache = CmiMemoryUsage()/(1024*1024);
#endif
  nNodeCacheEntries = cacheNode.ckLocalBranch()->getCache()->size();
  nPartCacheEntries = cacheGravPart.ckLocalBranch()->getCache()->size();

  markWalkDone();

  if(verbosity > 4){
    ckerr << "TreePiece " << thisIndex << ": My particles are done" << endl;
  }
}

void TreePiece::doAllBuckets(){
#if COSMO_DEBUG > 0
  char fout[100];
  sprintf(fout,"tree.%d.%d",thisIndex,iterationNo);
  ofstream ofs(fout);
  printTree(root,ofs);
  ofs.close();
  report();
#endif

  dummyMsg *msg = new (8*sizeof(int)) dummyMsg;
  *((int *)CkPriorityPtr(msg)) = 2 * numTreePieces * numChunks + thisIndex + 1;
  CkSetQueueing(msg,CK_QUEUEING_IFIFO);
  msg->val=0;

  thisProxy[thisIndex].nextBucket(msg);
#ifdef CUDA_INSTRUMENT_WRS
  ((DoubleWalkState *)sLocalGravityState)->nodeListConstructionTimeStart();
  ((DoubleWalkState *)sLocalGravityState)->partListConstructionTimeStart();
#endif
}

void TreePiece::nextBucket(dummyMsg *msg){
  unsigned int i=0;

  int currentBucket = sLocalGravityState->currentBucket;
  if(verbosity >= 4)
    CkPrintf("[%d] walking bucket %d\n", thisIndex, currentBucket);

  bool useckloop = false;
  int yield_num = _yieldPeriod;
#if INTERLIST_VER > 0
#if !defined(CUDA)
  LoopParData* lpdata;
  int tmpBucketBegin;
  if (bUseCkLoopPar && otherIdlePesAvail()) {
    useckloop = true;
    // This value was chosen to be 2*Nodesize so that we have enough buckets
    // for all the PEs in the node while also giving some extra for load
    // balance.
    yield_num = 2 * CkMyNodeSize();
    lpdata = new LoopParData();
    lpdata->tp = this;
    tmpBucketBegin = currentBucket;
  }
#endif
  sInterListWalk->init(sGravity, this);
#endif
  while(i<yield_num && currentBucket<numBuckets){
    GenericTreeNode *target = bucketList[currentBucket];
    if(target->rungs >= activeRung){
#ifdef CHANGA_REFACTOR_INTERLIST_PRINT_BUCKET_START_FIN
      CkPrintf("[%d] local bucket active %d buckRem: %d + %d \n", thisIndex,
               currentBucket, sRemoteGravityState->counterArrays[0][currentBucket],
               sLocalGravityState->counterArrays[0][currentBucket]);
#endif
      // construct lists
      startNextBucket();
#if INTERLIST_VER > 0
      // do computation
      GenericTreeNode *lowestNode = ((DoubleWalkState *) sLocalGravityState)->lowestNode;
      int startBucket, end;

      getBucketsBeneathBounds(lowestNode, startBucket, end);
      CkAssert(currentBucket >= startBucket);

      if (useckloop) {
        lpdata->lowNodes.insertAtEnd(lowestNode);
        lpdata->bucketids.insertAtEnd(currentBucket);
        CkVec<OffsetNode> cl;
        CkVec<RemotePartInfo> rp;
        CkVec<LocalPartInfo> lp;
        // Populate the list with nodes and particles with which the force is
        // calculated.
        sGravity->fillLists(sLocalGravityState, this, -1, currentBucket, end, cl, rp, lp);
        lpdata->chunkids.insertAtEnd(-1);
        lpdata->clists.insertAtEnd(cl);
        lpdata->rpilists.insertAtEnd(rp);
        lpdata->lpilists.insertAtEnd(lp);
      } else {
        sGravity->stateReady(sLocalGravityState, this, -1, currentBucket, end);
      }
#ifdef CHANGA_REFACTOR_MEMCHECK
      CkPrintf("active: nextBucket memcheck after stateReady\n");
      CmiMemoryCheck();
#endif
      // book-keeping
#if COSMO_PRINT_BK > 1
      CkPrintf("[%d] active local bucket book-keep\n", thisIndex);
#endif
      int numActualBuckets = doBookKeepingForTargetActive(currentBucket, end, -1,
                                                          !useckloop, sLocalGravityState);
#ifdef CHANGA_REFACTOR_MEMCHECK
      CkPrintf("active: nextBucket memcheck after book-keeping (prev: %d, curr: %d, i: %d)\n",
               prevBucket, currentBucket, i);
      CmiMemoryCheck();
#endif
      // current target is active, so it will be
      // ok to mark it as prev active target for next
      // time
      prevBucket = currentBucket;
      currentBucket = end;
      if(end < numBuckets) // finishBucket() could clean up state if
                           // we are at the end
        sLocalGravityState->currentBucket = end;
      i += numActualBuckets;

#else
      sLocalGravityState->counterArrays[0][currentBucket]--;
      finishBucket(currentBucket);

      currentBucket++;
      if(currentBucket < numBuckets) // state could be deleted in this case.
        sLocalGravityState->currentBucket++;
      i++;
#endif
    }
    else{
#if INTERLIST_VER > 0
      // target was not active, so tree under lca has
      // not been processed. look for next active bucket
      // keep moving forward until an active bucket is reached
      // all (inactive) buckets encountered meanwhile are
      // finished
#if COSMO_PRINT_BK > 1
      CkPrintf("[%d] inactive local bucket book-keep\n", thisIndex);
#endif
      currentBucket += doBookKeepingForTargetInactive(-1, !useckloop, sLocalGravityState);
#ifdef CHANGA_REFACTOR_MEMCHECK
      CkPrintf("not active: nextBucket memcheck after book-keeping (prev: %d, curr: %d)\n",
               prevBucket, currentBucket);
      CmiMemoryCheck();
#endif

#else
      while(currentBucket < numBuckets && bucketList[currentBucket]->rungs < activeRung){
        sLocalGravityState->counterArrays[0][currentBucket]--;
        finishBucket(currentBucket);
        currentBucket++;
        if(currentBucket < numBuckets) // state could be deleted in
                                       // this case.
          sLocalGravityState->currentBucket++;
      }
#endif
      // i isn't incremented because we've skipped inactive buckets
    }// end else (target not active)
  }// end while

  if (useckloop) {
    // Use ckloop to parallelize force calculation and this will update the
    // counterArrays as well.
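    // Sketch of the division of labor (under this file's CkLoop setup): the
    // serial walk above only *collected* per-bucket interaction lists into
    // lpdata; executeCkLoopParallelization() hands the index range
    // [0, lpdata->bucketids.length()) to CkLoop_Parallelize, whose workers
    // call stateReadyPar() on each saved list, and its bookkeeping pass
    // afterwards replays the counterArray updates skipped while collecting.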
    executeCkLoopParallelization(lpdata, tmpBucketBegin, yield_num, -1, sLocalGravityState);
    delete lpdata;
  }

  if (currentBucket<numBuckets) {
    thisProxy[thisIndex].nextBucket(msg);
  } else {
#if INTERLIST_VER > 0 && defined CUDA
    // The local walk might have some outstanding computation requests
    // at this point.  Flush these lists to GPU
    DoubleWalkState *ds = (DoubleWalkState *)sLocalGravityState;
    ListCompute *lc = (ListCompute *)sGravity;

    if(lc && ds){
      if(ds->nodeLists.totalNumInteractions > 0){
        lc->sendNodeInteractionsToGpu(ds, this);
        lc->resetCudaNodeState(ds);
      }
      if(ds->particleLists.totalNumInteractions > 0){
        lc->sendPartInteractionsToGpu(ds, this);
        lc->resetCudaPartState(ds);
      }
    }
#endif
    delete msg;
  }
}

void doWorkForCkLoop(int start, int end, void *result, int pnum, void * param) {
  LoopParData* lpdata = (LoopParData *)param;
  TreePiece* tp = lpdata->tp;
  double tstart = CkWallTimer();
  for (int i = start; i <= end; i++) {
    tp->doParallelNextBucketWork(i, lpdata);
  }
  double tend = CkWallTimer();
  *(double *)result = tend - tstart;
}

void TreePiece::doParallelNextBucketWork(int idx, LoopParData* lpdata) {
  GenericTreeNode* lowestNode = lpdata->lowNodes[idx];
  int currentBucket = lpdata->bucketids[idx];
  int chunkNum = lpdata->chunkids[idx];
  int startBucket, endBucket;
  getBucketsBeneathBounds(lowestNode, startBucket, endBucket);
  sGravity->stateReadyPar(this, currentBucket, endBucket,
                          lpdata->clists[idx], lpdata->rpilists[idx], lpdata->lpilists[idx]);
}

/// This function could be replaced by the doAllBuckets() call.
void TreePiece::calculateGravityLocal() {
  doAllBuckets();
}

void TreePiece::calculateEwald(dummyMsg *msg) {
#ifdef SPCUDA
  h_idata = (EwaldData*) malloc(sizeof(EwaldData));
  CkArrayIndex1D myIndex = CkArrayIndex1D(thisIndex);
  EwaldHostMemorySetup(h_idata, myNumParticles, nEwhLoop);
  EwaldGPU();
#else
  bool useckloop = false;
  int yield_num = _yieldPeriod;
  if (bUseCkLoopPar && otherIdlePesAvail()) {
    useckloop = true;
    // This value was chosen to be 3*Nodesize so that we have enough buckets
    // for all the PEs in the node while also giving some extra for load
    // balance.
    yield_num = 3 * CkMyNodeSize();
    calculateEwaldUsingCkLoop(msg, yield_num);
  }
  unsigned int i=0;
  while (i < yield_num && ewaldCurrentBucket < numBuckets) {
    if (!useckloop) {
      BucketEwald(bucketList[ewaldCurrentBucket], nReplicas, fEwCut);
    }
    bucketReqs[ewaldCurrentBucket].finished = 1;
    finishBucket(ewaldCurrentBucket);

    ewaldCurrentBucket++;
    i++;
  }

  if (ewaldCurrentBucket<numBuckets) {
    thisProxy[thisIndex].calculateEwald(msg);
  } else {
    delete msg;
  }
#endif
}

bool TreePiece::otherIdlePesAvail() {
  std::vector<int> idlepes = nodeLBMgrProxy.ckLocalBranch()->getOtherIdlePes();
  if (idlepes.size() > 0.5 * CkMyNodeSize()) {
    return true;
  }
  return false;
}

void TreePiece::callBucketEwald(int id) {
  BucketEwald(bucketList[id], nReplicas, fEwCut);
}

void doCalcEwald(int start, int end, void *result, int pnum, void * param) {
  LoopParData* lpdata = (LoopParData *)param;
  TreePiece* tp = lpdata->tp;
  double tstart = CkWallTimer();
  for (int i = start; i <= end; i++) {
    tp->callBucketEwald(lpdata->bucketids[i]);
  }
  double tend = CkWallTimer();
  *(double *)result = tend - tstart;
}

void TreePiece::calculateEwaldUsingCkLoop(dummyMsg *msg, int yield_num) {
  unsigned int i=0;
  LoopParData* lpdata = new LoopParData();
  lpdata->tp = this;
  int sbucket = ewaldCurrentBucket;
  while (i < yield_num && ewaldCurrentBucket < numBuckets) {
    lpdata->bucketids.insertAtEnd(ewaldCurrentBucket);
    ewaldCurrentBucket++;
    i++;
  }

  int num_chunks = 3 * CkMyNodeSize();
  // CkLoop library limits the number of chunks to be 64.
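  // Example sizing (hypothetical node): with CkMyNodeSize() == 16 this
  // requests 48 chunks, subject to the clamp to 64 below and to the number
  // of buckets actually queued, so each CkLoop task would handle roughly
  // bucketids.length()/num_chunks Ewald buckets.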
if (num_chunks > 64) { num_chunks = 64; } int start = 0; int end = lpdata->bucketids.length(); if (num_chunks > end) { num_chunks = end; } if (num_chunks > 0) { double timebeforeckloop = getObjTime(); double timeforckloop; LBTurnInstrumentOff(); double stime = CkWallTimer(); #if CMK_SMP CkLoop_Parallelize(doCalcEwald, 1, lpdata, num_chunks, start, end-1, 1, &timeforckloop, CKLOOP_DOUBLE_SUM); #else CkAbort("CkLoop usage only in SMP mode\n"); #endif double etime = CkWallTimer() - stime; setObjTime(timebeforeckloop + timeforckloop); LBTurnInstrumentOn(); } ewaldCurrentBucket = sbucket; delete lpdata; } const char *typeString(NodeType type); void TreePiece::calculateGravityRemote(ComputeChunkMsg *msg) { unsigned int i=0; // cache internal tree: start directly asking the CacheManager #ifdef DISABLE_NODE_TREE GenericTreeNode *chunkRoot = keyToNode(prefetchRoots[msg->chunkNum]); #else GenericTreeNode *chunkRoot = dm->chunkRootToNode(prefetchRoots[msg->chunkNum]); #endif #ifdef CHANGA_REFACTOR_MEMCHECK CkPrintf("memcheck right after starting cgr\n"); CmiMemoryCheck(); #endif #ifdef CUDA_INSTRUMENT_WRS ((DoubleWalkState *)sRemoteGravityState)->nodeListConstructionTimeStart(); ((DoubleWalkState *)sRemoteGravityState)->partListConstructionTimeStart(); ((DoubleWalkState *)sInterListStateRemoteResume)->nodeListConstructionTimeStart(); ((DoubleWalkState *)sInterListStateRemoteResume)->partListConstructionTimeStart(); #endif CkAssert(chunkRoot != NULL); bool useckloop = false; int yield_num = _yieldPeriod; #if INTERLIST_VER > 0 #if !defined(CUDA) LoopParData* lpdata; // Keep track of which was the currentBucket so that it can be restored in the // ckloop part. int tmpBucketBegin; if (bUseCkLoopPar && otherIdlePesAvail()) { useckloop = true; // This value was chosen to be 2*Nodesize so that we have enough buckets for // all the PEs in the node and also giving some extra for load balance. 
    yield_num = 2 * CkMyNodeSize();
    lpdata = new LoopParData();
    lpdata->tp = this;
    tmpBucketBegin = sRemoteGravityState->currentBucket;
  }
#endif
  sInterListWalk->init(sGravity, this);
#else
  sTopDown->init(sGravity, this);
#endif

#ifdef CHANGA_REFACTOR_MEMCHECK
  CkPrintf("memcheck right after init\n");
  CmiMemoryCheck();
#endif

  while (i<yield_num && sRemoteGravityState->currentBucket < numBuckets) {
#ifdef CHANGA_REFACTOR_WALKCHECK
    if(thisIndex == CHECK_INDEX && sRemoteGravityState->currentBucket == CHECK_BUCKET){
      CkPrintf("Starting remote walk\n");
    }
#endif

#ifdef CHANGA_REFACTOR_INTERLIST_PRINT_BUCKET_START_FIN
    CkPrintf("[%d] remote bucket active %d buckRem: %d + %d, remChunk: %d\n",
             thisIndex, sRemoteGravityState->currentBucket,
             sRemoteGravityState->counterArrays[0][sRemoteGravityState->currentBucket],
             sLocalGravityState->counterArrays[0][sRemoteGravityState->currentBucket],
             sRemoteGravityState->counterArrays[1][msg->chunkNum]);
#endif
    // Interlist and normal versions both have 'target' nodes
    GenericTreeNode *target = bucketList[sRemoteGravityState->currentBucket];

#if INTERLIST_VER > 0
    GenericTreeNode *lca = 0;
#else
    sGravity->init(target, activeRung, sRemote);
#endif

#ifdef CHANGA_REFACTOR_MEMCHECK
    CkPrintf("memcheck in while loop (i: %d, currentRemote Bucket: %d)\n",
             i, sRemoteGravityState->currentBucket);
    CmiMemoryCheck();
#endif

    if (target->rungs >= activeRung) {
#if INTERLIST_VER > 0
      lca = getStartAncestor(sRemoteGravityState->currentBucket, prevRemoteBucket, root);

      int lcaLevel = lca->getLevel(lca->getKey());
      // set remote opt, etc
      sGravity->init(lca, activeRung, sRemote);

      // need to enqueue the chunkroot replicas only once,
      // before walking the first bucket
      DoubleWalkState *rstate = (DoubleWalkState *)sRemoteGravityState;
      if(!rstate->placedRoots[msg->chunkNum]){
        rstate->placedRoots[msg->chunkNum] = true;
        rstate->level = 0;
        sGravity->initState(rstate);

#if COSMO_PRINT_BK > 0
        CkPrintf("[%d] CGR: placing chunk %d root %ld (type %s) replicas\n",
                 thisIndex, msg->chunkNum, chunkRoot->getKey(), typeString(chunkRoot->getType()));
#endif
#endif

        for(int x = -nReplicas; x <= nReplicas; x++) {
          for(int y = -nReplicas; y <= nReplicas; y++) {
            for(int z = -nReplicas; z <= nReplicas; z++) {
#if CHANGA_REFACTOR_DEBUG > 1
              CkPrintf("[%d]: starting remote walk with chunk=%d, current remote bucket=%d, (%d,%d,%d)\n",
                       thisIndex, msg->chunkNum, sRemoteGravityState->currentBucket, x, y, z);
#endif
#if INTERLIST_VER > 0
              // put chunk root in correct checklist
              // the currentRemote Bucket argument to encodeOffset doesn't really
              // matter; only the offset is of consequence
              OffsetNode on;
              on.node = chunkRoot;
              on.offsetID = encodeOffset(0, x,y,z);
              rstate->chklists[lcaLevel].enq(on);
#else
              sTopDown->walk(chunkRoot, sRemoteGravityState, msg->chunkNum,
                             encodeOffset(sRemoteGravityState->currentBucket,x, y, z),
                             remoteGravityAwi);
#endif
            }
          }
        }
#if INTERLIST_VER > 0
#ifdef CHANGA_REFACTOR_MEMCHECK
        CkPrintf("active: memcheck after enqueuing (i: %d, currentRemote Bucket: %d, lca: %ld)\n",
                 i, sRemoteGravityState->currentBucket, lca->getKey());
        CmiMemoryCheck();
#endif
      }// if !placedRoots

      // now that all the nodes to walk on have been enqueued in the chklist
      // of the lca, we can call the walk function.
      // lca is the node the dft starts from.
      // sRemoteGravityState contains the nodes to be walked on
      // target for walk has already been set
      // the offset doesn't really matter, so send target bucket number
      // (currentBucketRemote), which is needed to construct the reqID
      // in case nodes are missed and walks need to be resumed.
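      // Worked example of the reqID encoding referred to above (values
      // hypothetical): encodeOffset(5, 1, 0, -1) packs offsetcode
      // (1+3) | ((0+3)<<3) | ((-1+3)<<6) = 156 into bits 22-30, giving
      // reqID = 5 | (156<<22); decodeReqID() recovers bucket 5 and
      // bIsReplica() reports a nonzero offset, which is how a resumed walk
      // finds both its target bucket and its periodic image.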
      sInterListWalk->walk(lca, sRemoteGravityState, msg->chunkNum,
                           sRemoteGravityState->currentBucket, interListAwi);
#ifdef CHANGA_REFACTOR_MEMCHECK
      CkPrintf("active: memcheck after walk\n");
      CmiMemoryCheck();
#endif
      // now that the walk has been completed, do some book-keeping:
      // keeping track of the fact that a chunk has been processed by some
      // buckets and that all particles under the lowest node have used this
      // chunk.
      GenericTreeNode *lowestNode = rstate->lowestNode;
      int startBucket, end;

      getBucketsBeneathBounds(lowestNode, startBucket, end);
      CkAssert(sRemoteGravityState->currentBucket >= startBucket);

      // When using ckloop to parallelize force calculation, first the list is
      // populated with nodes and particles with which the force is calculated.
      if (useckloop) {
        lpdata->lowNodes.insertAtEnd(lowestNode);
        lpdata->bucketids.insertAtEnd(sRemoteGravityState->currentBucket);
        CkVec<OffsetNode> cl;
        CkVec<RemotePartInfo> rp;
        CkVec<LocalPartInfo> lp;
        // Populate the remote, local list
        sGravity->fillLists(sRemoteGravityState, this, msg->chunkNum,
                            sRemoteGravityState->currentBucket, end, cl, rp, lp);
        lpdata->chunkids.insertAtEnd(msg->chunkNum);
        lpdata->clists.insertAtEnd(cl);
        lpdata->rpilists.insertAtEnd(rp);
        lpdata->lpilists.insertAtEnd(lp);
      } else {
        sGravity->stateReady(sRemoteGravityState, this, msg->chunkNum,
                             sRemoteGravityState->currentBucket, end);
      }
#ifdef CHANGA_REFACTOR_MEMCHECK
      CkPrintf("active: memcheck after stateReady\n");
      CmiMemoryCheck();
#endif

#if COSMO_PRINT_BK > 1
      CkPrintf("[%d] active remote bucket book-keep current: %d end: %d chunk: %d\n",
               thisIndex, sRemoteGravityState->currentBucket, end, msg->chunkNum);
#endif
      int numActualBuckets = doBookKeepingForTargetActive(
          sRemoteGravityState->currentBucket, end, msg->chunkNum, !useckloop,
          sRemoteGravityState);
#ifdef CHANGA_REFACTOR_MEMCHECK
      CkPrintf("active: memcheck after book-keeping\n");
      CmiMemoryCheck();
#endif

      prevRemoteBucket = sRemoteGravityState->currentBucket;
      sRemoteGravityState->currentBucket = end;
      i += numActualBuckets;

#endif
    }// end if target active
#if INTERLIST_VER > 0
    else{
      doBookKeepingForTargetInactive(msg->chunkNum, !useckloop, sRemoteGravityState);
#ifdef CHANGA_REFACTOR_MEMCHECK
      CkPrintf("inactive: memcheck after book-keeping (prevRemote: %d, current: %d, chunk: %d)\n",
               prevRemoteBucket, sRemoteGravityState->currentBucket, msg->chunkNum);
      CmiMemoryCheck();
#endif
      // i isn't incremented because we've skipped inactive buckets
    }
#endif

#if INTERLIST_VER > 0
#else
    sRemoteGravityState->counterArrays[0][sRemoteGravityState->currentBucket]--;
    finishBucket(sRemoteGravityState->currentBucket);
    sRemoteGravityState->counterArrays[1][msg->chunkNum] --;
    sRemoteGravityState->currentBucket++;
    i++;
#endif
  }// end while i < yieldPeriod and currentRemote Bucket < numBuckets

  if (useckloop) {
    // Now call the ckloop parallelization function, which will execute the
    // force calculation in parallel and update the counterArrays.
    executeCkLoopParallelization(lpdata, tmpBucketBegin, yield_num,
                                 msg->chunkNum, sRemoteGravityState);
    delete lpdata;
  }
  if (sRemoteGravityState->currentBucket < numBuckets) {
    thisProxy[thisIndex].calculateGravityRemote(msg);
#if COSMO_PRINT > 0
    CkPrintf("{%d} sending self-message chunk %d, prio %d\n",
             thisIndex,msg->chunkNum,*(int*)CkPriorityPtr(msg));
#endif
  } else {
    sRemoteGravityState->currentBucket = 0;

#if INTERLIST_VER > 0 && defined CUDA
    // The remote walk might have some outstanding computation requests
    // at this point.
Flush these lists to GPU DoubleWalkState *ds = (DoubleWalkState *)sRemoteGravityState; ListCompute *lc = (ListCompute *)sGravity; if(lc && ds){ if(ds->nodeLists.totalNumInteractions > 0){ lc->sendNodeInteractionsToGpu(ds, this); lc->resetCudaNodeState(ds); } if(ds->particleLists.totalNumInteractions > 0){ lc->sendPartInteractionsToGpu(ds, this); lc->resetCudaPartState(ds); } } #endif int chunkRemaining; chunkRemaining = sRemoteGravityState->counterArrays[1][msg->chunkNum]; #if INTERLIST_VER > 0 prevRemoteBucket = -1; #endif CkAssert(chunkRemaining >= 0); #if COSMO_PRINT_BK > 1 CkPrintf("[%d] cgr chunk: %d remaining Chunk: %d\n", thisIndex, msg->chunkNum, chunkRemaining); #endif if (chunkRemaining == 0) { // we finished completely using this chunk, so we acknowledge the cache // if this is not true it means we had some hard misses #ifdef COSMO_PRINT_BK CkPrintf("[%d] FINISHED CHUNK %d from calculateGravityRemote\n",thisIndex,msg->chunkNum); #endif #ifdef CUDA // The remote-resume walk might have some outstanding computation requests // at this point. Flush these lists to GPU DoubleWalkState *ds = (DoubleWalkState *)sInterListStateRemoteResume; ListCompute *lc = (ListCompute *)sGravity; if(lc && ds){ bool nodeDummy = false; bool partDummy = true; if(ds->nodeLists.totalNumInteractions > 0){ nodeDummy = true; } else if(ds->particleLists.totalNumInteractions > 0){ partDummy = true; } if(nodeDummy && partDummy){ lc->sendNodeInteractionsToGpu(ds, this); lc->sendPartInteractionsToGpu(ds, this, true); lc->resetCudaNodeState(ds); lc->resetCudaPartState(ds); } else if(nodeDummy){ lc->sendNodeInteractionsToGpu(ds, this,true); lc->resetCudaNodeState(ds); } else if(partDummy){ lc->sendPartInteractionsToGpu(ds, this, true); lc->resetCudaPartState(ds); } } #endif cacheGravPart[CkMyPe()].finishedChunk(msg->chunkNum, particleInterRemote[msg->chunkNum]); #ifdef CHECK_WALK_COMPLETIONS CkPrintf("[%d] finishedChunk TreePiece::calculateGravityRemote\n", thisIndex); #endif finishedChunk(msg->chunkNum); } #if COSMO_PRINT > 0 CkPrintf("{%d} resetting message chunk %d, prio %d\n",thisIndex,msg->chunkNum,*(int*)CkPriorityPtr(msg)); #endif delete msg; } } int TreePiece::doBookKeepingForTargetActive(int curbucket, int end, int chunkNum, bool updatestate, State* gravityState) { int numActualBuckets = 0; for(int j = curbucket; j < end; j++){ // counterArrays are not updated yet and finishBucket is not called for // ckloop parallelization at this point. It is done only after the // ckloop execution is completed. if (updatestate) { // for the cuda version, must decrement here // since we add to numAdditionalReqs for a bucket // each time a work request involving it is sent out. 
// the per bucket, per chunk counts are balanced as follows: // initialize in startiteration: +1 // for request sent to gpu: +1 // initial walk completed: -1 (this is where we are in the code currently) // for each request completed: -1 (this happens in cudaCallback) gravityState->counterArrays[0][j]--; #if COSMO_PRINT_BK > 1 CkPrintf("[%d] bucket %d numAddReq: %d,%d\n", thisIndex, j, sRemoteGravityState->counterArrays[0][j], sLocalGravityState->counterArrays[0][j]); #endif #ifdef CUDA if(bucketList[j]->rungs < activeRung){ #endif finishBucket(j); #ifdef CUDA } #endif } if(bucketList[j]->rungs >= activeRung){ numActualBuckets++; } } if (updatestate && chunkNum >= 0) { gravityState->counterArrays[1][chunkNum] -= (end-curbucket); } return numActualBuckets; } int TreePiece::doBookKeepingForTargetInactive(int chunkNum, bool updatestate, State* gravityState) { // target was not active, so tree under lca has // not been processed. look for next active bucket // keep moving forward until an active bucket is reached // all (inactive) buckets encountered meanwhile are // finished int bucketsSkipped = 0; // prevRemoteBucket keeps track of last *active* bucket // if it is -1, the lca turns out to be chunkroot // otherwise, the lca is some other valid node #if COSMO_PRINT_BK > 1 CkPrintf("[%d] inactive remote bucket book-keep chunk: %d\n", thisIndex, chunkNum); #endif while(gravityState->currentBucket < numBuckets && bucketList[gravityState->currentBucket]->rungs < activeRung){ if (updatestate) { gravityState->counterArrays[0][gravityState->currentBucket]--; finishBucket(gravityState->currentBucket); } #if COSMO_PRINT_BK > 1 CkPrintf("[%d] bucket %d numAddReq: %d,%d\n", thisIndex, sRemoteGravityState->currentBucket, sRemoteGravityState->counterArrays[0][sRemoteGravityState->currentBucket], sLocalGravityState->counterArrays[0][sRemoteGravityState->currentBucket]); #endif bucketsSkipped++; gravityState->currentBucket++; } if (updatestate && chunkNum >= 0) { gravityState->counterArrays[1][chunkNum] -= bucketsSkipped; } return bucketsSkipped; } // Execute using ckloop. // lpdata contains the required data // startbucket the bucket storing the value to which currentBucket needs to be // reset // chunkNum is the chunk of the remote walk we are on // gravityState contains the state of the walk void TreePiece::executeCkLoopParallelization(LoopParData *lpdata, int startbucket, int yield_num, int chunkNum, State* gravityState) { // This num_chunks is related to ckloop. int num_chunks = 2 * CkMyNodeSize(); // CkLoop library limits the number of chunks to be 64. 
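// Illustrative note (a sketch of the mechanism, not the CkLoop API itself):
// conceptually, the [start, end-1] index range handed to CkLoop_Parallelize
// below is divided into num_chunks nearly equal contiguous pieces, roughly
//   base = nItems / num_chunks; extra = nItems % num_chunks;
// with the first 'extra' chunks taking one additional item each. CkLoop
// worker threads then run doWorkForCkLoop over their sub-ranges concurrently,
// and the per-chunk timings are combined via CKLOOP_DOUBLE_SUM. The exact
// partitioning policy is internal to CkLoop.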
if (num_chunks > 64) { num_chunks = 64; } int start = 0; int end = lpdata->bucketids.length(); if (num_chunks > end) { num_chunks = end; } if (num_chunks > 0) { double timebeforeckloop = getObjTime(); double timeforckloop; LBTurnInstrumentOff(); CkLoop_Parallelize(doWorkForCkLoop, 1, lpdata, num_chunks, start, end-1, 1, &timeforckloop, CKLOOP_DOUBLE_SUM); setObjTime(timebeforeckloop + timeforckloop); LBTurnInstrumentOn(); } int i = 0; int counter = 0; gravityState->currentBucket = startbucket; // Do bookkeeping to update counterArray state while (i< yield_num && gravityState->currentBucket < numBuckets ) { // Interlist and normal versions both have 'target' nodes GenericTreeNode *target = bucketList[gravityState->currentBucket]; if (target->rungs >= activeRung) { // now that the walk has been completed, do some book-keeping // keeping track of the fact that a chunk has been processed by some buckets // and that all particles under lowest node have used this chunk. GenericTreeNode *lowestNode = lpdata->lowNodes[counter]; counter++; int start, end; getBucketsBeneathBounds(lowestNode, start, end); CkAssert(gravityState->currentBucket >= start); int numActualBuckets = doBookKeepingForTargetActive(gravityState->currentBucket, end, chunkNum, true, gravityState); if (chunkNum >= 0) // Remote walks only prevRemoteBucket = gravityState->currentBucket; gravityState->currentBucket = end; i += numActualBuckets; }// end if target active else{ doBookKeepingForTargetInactive(chunkNum, true, gravityState); } }// end while i < yieldPeriod and currentRemote Bucket < numBuckets } #ifdef CUDA void TreePiece::callFreeRemoteChunkMemory(int chunk){ dm->freeRemoteChunkMemory(chunk); } #endif #if INTERLIST_VER > 0 /// @brief return the largest node which contains current bucket, but /// which does not contain previous bucket. If previous is -1 then /// return the root. GenericTreeNode *TreePiece::getStartAncestor(int current, int previous, GenericTreeNode *chunkroot){ GenericTreeNode *lca = 0; if(previous == -1){ return chunkroot; } GenericTreeNode *target = bucketList[current]; GenericTreeNode *prevTarget = bucketList[previous]; Tree::NodeKey targetKey = target->getKey(); Tree::NodeKey prevTargetKey = prevTarget->getKey(); // get LCA key Tree::NodeKey lcaKey = target->getLongestCommonPrefix(targetKey, prevTargetKey); // get LCA node lca = keyToNode(lcaKey); int whichChild = lca->whichChild(targetKey); return lca->getChildren(whichChild); } #endif // We are done with the node Cache void TreePiece::finishNodeCache(const CkCallback& cb) { int j; for (j = 0; j < numChunks; j++) { cacheNode.ckLocalBranch()->finishedChunk(j, 0); } contribute(cb); } #ifdef PUSH_GRAVITY /* This method is intended to calculate forces in the 'small' timesteps, i.e. when few particles are active. It causes the TreePiece to broadcast its buckets to all other TreePieces. The others compute forces on its buckets and contribute forces a reduction for this TreePiece in particular. When all work has finished, quiescence is detected and we move on in the small step. 
*/ void TreePiece::startPushGravity(int am, double myTheta){ LBTurnInstrumentOn(); iterationNo++; activeRung = am; theta = myTheta; thetaMono = theta*theta*theta*theta; CkAssert(!doMerge); if(!createdSpanningTree){ createdSpanningTree = true; allTreePieceSection = CProxySection_TreePiece::ckNew(thisProxy,0,numTreePieces-1,1); CkMulticastMgr *mgr = CProxy_CkMulticastMgr(ckMulticastGrpId).ckLocalBranch(); allTreePieceSection.ckSectionDelegate(mgr); } BucketMsg *msg = createBucketMsg(); if(msg != NULL) allTreePieceSection.recvPushBuckets(msg); } BucketMsg *TreePiece::createBucketMsg(){ int saveNumActiveParticles = 0; int numActiveParticles = 0; int numActiveBuckets = 0; // First count the number of active particles and buckets for(int i = 0; i < bucketList.size(); i++){ GenericTreeNode *bucket = bucketList[i]; int buckStart = bucket->firstParticle; int buckEnd = bucket->lastParticle; GravityParticle *buckParticles = bucket->particlePointer; saveNumActiveParticles = numActiveParticles; for(int j = 0; j <= buckEnd-buckStart; j++){ if(buckParticles[j].rung >= activeRung){ numActiveParticles++; } } if(numActiveParticles > saveNumActiveParticles) numActiveBuckets++; } if(numActiveParticles == 0) return NULL; // allocate message BucketMsg *msg = new (numActiveBuckets,numActiveParticles) BucketMsg; numActiveParticles = 0; numActiveBuckets = 0; // Copy active particles and buckets into message; change pointer offsets // of bucket particle boundaries to integers for(int i = 0; i < bucketList.size(); i++){ // source bucket GenericTreeNode *sbucket = bucketList[i]; int buckStart = sbucket->firstParticle; int buckEnd = sbucket->lastParticle; GravityParticle *buckParticles = sbucket->particlePointer; saveNumActiveParticles = numActiveParticles; for(int j = 0; j <= buckEnd-buckStart; j++){ if(buckParticles[j].rung >= activeRung){ // copy active particle to bucket msg msg->particles[numActiveParticles] = buckParticles[j]; numActiveParticles++; } } if(numActiveParticles > saveNumActiveParticles){ // copy active bucket to bucket msg msg->buckets[numActiveBuckets] = *sbucket; GenericTreeNode &tbucket = msg->buckets[numActiveBuckets]; // set particle bounds for copied bucket (as integers) tbucket.particlePointer = NULL; tbucket.firstParticle = saveNumActiveParticles; tbucket.lastParticle = numActiveParticles-1; numActiveBuckets++; } } msg->numBuckets = numActiveBuckets; msg->numParticles = numActiveParticles; msg->whichTreePiece = thisIndex; return msg; } void TreePiece::recvPushBuckets(BucketMsg *msg){ GenericTreeNode *foreignBuckets; int numForeignBuckets; int numFields = 4; // make sure there is enough space for foreignParticles foreignParticles.resize(msg->numParticles); foreignParticleAccelerations.resize(numFields*msg->numParticles); // obtain positions of foreignParticles from message unpackBuckets(msg,foreignBuckets,numForeignBuckets); if(myNumParticles > 0){ // If there is a local tree associated with this tree piece, // calculate forces on foreignParticles due to it calculateForces(foreignBuckets,numForeignBuckets); } // update cookie CkGetSectionInfo(cookieJar[msg->whichTreePiece],msg); for(int i = 0; i < msg->numParticles; i++){ foreignParticleAccelerations[numFields*i] = foreignParticles[i].treeAcceleration.x; foreignParticleAccelerations[numFields*i+1] = foreignParticles[i].treeAcceleration.y; foreignParticleAccelerations[numFields*i+2] = foreignParticles[i].treeAcceleration.z; foreignParticleAccelerations[numFields*i+3] = foreignParticles[i].interMass; } // contribute accelerations CkCallback 
cb(CkIndex_TreePiece::recvPushAccelerations(NULL),CkArrayIndex1D(msg->whichTreePiece),thisProxy); CkMulticastMgr *mgr = CProxy_CkMulticastMgr(ckMulticastGrpId).ckLocalBranch(); mgr->contribute(foreignParticleAccelerations.length()*sizeof(double),&foreignParticleAccelerations[0],CkReduction::sum_double,cookieJar[msg->whichTreePiece],cb); delete msg; } void TreePiece::unpackBuckets(BucketMsg *msg, GenericTreeNode *&foreignBuckets, int &numForeignBuckets){ // Copy foreign particle positions, etc. into local buffer for(int i = 0; i < msg->numParticles; i++){ foreignParticles[i] = msg->particles[i]; foreignParticles[i].treeAcceleration.x = 0.0; foreignParticles[i].treeAcceleration.y = 0.0; foreignParticles[i].treeAcceleration.z = 0.0; foreignParticles[i].interMass = 0.0; } // Make buckets point to appropriate positions in local buffer of particles foreignBuckets = msg->buckets; numForeignBuckets = msg->numBuckets; GravityParticle *baseParticlePtr = &foreignParticles[0]; for(int i = 0; i < numForeignBuckets; i++){ GenericTreeNode &bucket = foreignBuckets[i]; bucket.particlePointer = baseParticlePtr+bucket.firstParticle; } } void TreePiece::calculateForces(GenericTreeNode *foreignBuckets, int numForeignBuckets){ TopDownTreeWalk topdown; GravityCompute grav; NullState nullState; PushGravityOpt pushOpt; grav.init(NULL,activeRung,&pushOpt); CkAssert(root != NULL); for(int i = 0; i < numForeignBuckets; i++){ GenericTreeNode &target = foreignBuckets[i]; grav.setComputeEntity(&target); topdown.init(&grav,this); // for each replica for(int x = -nReplicas; x <= nReplicas; x++){ for(int y = -nReplicas; y <= nReplicas; y++){ for(int z = -nReplicas; z <= nReplicas; z++){ // begin walk at root // -1 for chunk and active walk index // bucket number 'i' doesn't serve any purpose, // since this traversal will not generate any remote requests. topdown.walk(root,&nullState,-1,encodeOffset(i,x,y,z),-1); } } } } } void TreePiece::recvPushAccelerations(CkReductionMsg *msg){ double *accelerations = (double *) msg->getData(); int numAccelerations = msg->getSize()/sizeof(double); int j = 0; int numUpdates = 0; int numFields = 4; for(int i = 1; i <= myNumParticles; i++){ if(myParticles[i].rung >= activeRung){ myParticles[i].treeAcceleration.x = accelerations[j]; myParticles[i].treeAcceleration.y = accelerations[j+1]; myParticles[i].treeAcceleration.z = accelerations[j+2]; myParticles[i].interMass = accelerations[j+3]; j += numFields; numUpdates++; double totalMass = myParticles[i].mass+myParticles[i].interMass; if(totalMass != myTotalMass){ CkPrintf("[%d] particle %d interMass %f should be %f partMass %f\n", thisIndex, i, totalMass, myTotalMass, myParticles[i].mass); CkAbort("bad intermass\n"); } } } CkAssert(numUpdates == numAccelerations/numFields); } #endif void TreePiece::findTotalMass(CkCallback &cb){ callback = cb; myTotalMass = 0; for(int i = 1; i <= myNumParticles; i++){ myTotalMass += myParticles[i].mass; } contribute(sizeof(double), &myTotalMass, CkReduction::sum_double, CkCallback(CkIndex_TreePiece::recvTotalMass(NULL),thisProxy)); } void TreePiece::recvTotalMass(CkReductionMsg *msg){ myTotalMass = *((double *)msg->getData()); contribute(callback); } /// This method starts the tree walk and gravity calculation. It /// first registers with the node and particle caches. It initializes /// the particle acceleration by calling initBucket(). It initializes /// treewalk bookkeeping, and starts a prefetch walk which will /// eventually call a remote walk (a walk on non-local nodes). 
/// It then starts the Ewald initialization, and finally starts the local
/// gravity walk.
void TreePiece::startGravity(int am, // the active mask for multistepping
                             double myTheta, // opening criterion
                             const CkCallback& cb) {
  LBTurnInstrumentOn();
  iterationNo++;
  cbGravity = cb;
  activeRung = am;
  theta = myTheta;
  thetaMono = theta*theta*theta*theta;
  int oldNumChunks = numChunks;
  dm->getChunks(numChunks, prefetchRoots);
  CkArrayIndexMax idxMax = CkArrayIndex1D(thisIndex);
  // The following if is necessary to keep nodes containing only TreePieces
  // without particles from getting stuck and crashing...
  if (numChunks == 0 && myNumParticles == 0) numChunks = 1;
  int dummy;
  cacheNode.ckLocalBranch()->cacheSync(numChunks, idxMax, localIndex);
  cacheGravPart.ckLocalBranch()->cacheSync(numChunks, idxMax, dummy);
  nodeLBMgrProxy.ckLocalBranch()->registerTP();
  if (myNumParticles == 0) {
    // No particles assigned to this TreePiece
    for (int i=0; i< numChunks; ++i) {
      cacheGravPart.ckLocalBranch()->finishedChunk(i, 0);
    }
    nodeLBMgrProxy.ckLocalBranch()->finishedTPWork();
    CkCallback cbf = CkCallback(CkIndex_TreePiece::finishWalk(), pieces);
    gravityProxy[thisIndex].ckLocal()->contribute(cbf);
    return;
  }
  // allocate and zero out statistics counters
  if (oldNumChunks != numChunks) {
    delete[] nodeInterRemote;
    delete[] particleInterRemote;
    nodeInterRemote = new u_int64_t[numChunks];
    particleInterRemote = new u_int64_t[numChunks];
  }
  if(nodeInterRemote == NULL) nodeInterRemote = new u_int64_t[numChunks];
  if(particleInterRemote == NULL) particleInterRemote = new u_int64_t[numChunks];
#if COSMO_STATS > 0
  nodesOpenedLocal = 0;
  nodesOpenedRemote = 0;
  numOpenCriterionCalls=0;
#endif
  nodeInterLocal = 0;
  for (int i=0; i<numChunks; ++i) {
    nodeInterRemote[i] = 0;
    particleInterRemote[i] = 0;
  }
  particleInterLocal = 0;
  if(verbosity>1) CkPrintf("Node: %d, TreePiece %d: I have %d buckets\n", CkMyNode(), thisIndex, numBuckets);
  if (bucketReqs==NULL) bucketReqs = new BucketGravityRequest[numBuckets];
  ewaldCurrentBucket = 0;
#if INTERLIST_VER > 0
  prevBucket = -1;
  prevRemoteBucket = -1;
#endif
  initBuckets();
  switch(domainDecomposition){
    case Oct_dec:
    case ORB_dec:
    case ORB_space_dec:
      //Prefetch Roots for Oct
      prefetchReq[0].reset();
      for (unsigned int i=1; i<=myNumParticles; ++i) {
        if (myParticles[i].rung >= activeRung) {
          prefetchReq[0].grow(myParticles[i].position);
        }
      }
      break;
    default:
      //Prefetch Roots for SFC
      prefetchReq[0].reset();
      for (unsigned int i=1; i<=myNumParticles; ++i) {
        // set to first active particle
        if (myParticles[i].rung >= activeRung) {
          prefetchReq[0].grow(myParticles[i].position);
          break;
        }
      }
      prefetchReq[1].reset();
      for (unsigned int i=myNumParticles; i>=1; --i) {
        // set to last active particle
        if (myParticles[i].rung >= activeRung) {
          prefetchReq[1].grow(myParticles[i].position);
          break;
        }
      }
      break;
  }
#if CHANGA_REFACTOR_DEBUG > 0
  CkPrintf("Beginning prefetch\n");
#endif
  sTopDown = new TopDownTreeWalk;
  sLocal = new LocalOpt;
  sRemote = new RemoteOpt;
  if(_prefetch) sPrefetch = new PrefetchCompute;
  else sPrefetch = new DummyPrefetchCompute;
  sPref = new PrefetchOpt;
  State *remoteWalkState;
  State *localWalkState;
  Compute *compute;
  TreeWalk *walk;
#if INTERLIST_VER > 0
  compute = new ListCompute;
  walk = new LocalTargetWalk;
#else
  compute = new GravityCompute;
#endif
  // much of this part is common to interlist and normal algo.
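  // Illustrative note, not executable ChaNGa code: a compact model of the
  // bookkeeping invariant the walk states below maintain. Initially each
  // bucket owes one local walk plus one remote walk per chunk, and each
  // chunk is owed by every bucket:
  //
  //   localWalkState->counterArrays[0][bucket]  = 1
  //   remoteWalkState->counterArrays[0][bucket] = numChunks
  //   remoteWalkState->counterArrays[1][chunk]  = numBuckets
  //
  // Completed walks and satisfied cache requests decrement these counters;
  // finishBucket() fires for a bucket, and finishedChunk() for a chunk, only
  // when the corresponding counter reaches zero.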
  compute->init((void *)0, activeRung, sLocal);
  localWalkState = compute->getNewState(numBuckets);
  compute->init((void *)0, activeRung, sRemote);
  remoteWalkState = compute->getNewState(numBuckets, numChunks);
#if INTERLIST_VER > 0
  // interaction list algo needs a separate state for walk resumption
  sInterListStateRemoteResume = compute->getNewState();
  // remote resume and remote share counters
  // but have separate lists
  sInterListStateRemoteResume->counterArrays[0] = remoteWalkState->counterArrays[0];
  sInterListStateRemoteResume->counterArrays[1] = remoteWalkState->counterArrays[1];
#endif
  // remaining Chunk[]
  for(int i = 0; i < numChunks; i++) {
    remoteWalkState->counterArrays[1][i] = numBuckets;
  }
  for(int i = 0; i < numBuckets; i++){
    remoteWalkState->counterArrays[0][i] = numChunks;
    localWalkState->counterArrays[0][i] = 1;
  }
  sGravity = compute;
  sLocalGravityState = localWalkState;
  sRemoteGravityState = remoteWalkState;
#if INTERLIST_VER > 0
  sInterListWalk = walk;
  // cuda - need number of active buckets this iteration so
  // that we can flush accumulated interactions to gpu when
  // we realize no more computation requests will be received
  // (i.e. all buckets have finished their RNR/Local walks)
#ifdef CUDA
#ifdef CUDA_INSTRUMENT_WRS
  instrumentId = dm->initInstrumentation();
  localNodeListConstructionTime = 0.0;
  remoteNodeListConstructionTime = 0.0;
  remoteResumeNodeListConstructionTime = 0.0;
  localPartListConstructionTime = 0.0;
  remotePartListConstructionTime = 0.0;
  remoteResumePartListConstructionTime = 0.0;
  nLocalNodeReqs = 0;
  nRemoteNodeReqs = 0;
  nRemoteResumeNodeReqs = 0;
  nLocalPartReqs = 0;
  nRemotePartReqs = 0;
  nRemoteResumePartReqs = 0;
#endif
  numActiveBuckets = 0;
  calculateNumActiveParticles();
  for(int i = 0; i < numBuckets; i++){
    if(bucketList[i]->rungs >= activeRung){
      numActiveBuckets++;
    }
  }
  if(numActiveBuckets > 0 && verbosity > 1){
    CkPrintf("[%d] num active buckets %d avg size: %f\n", thisIndex, numActiveBuckets, 1.0*myNumActiveParticles/numActiveBuckets);
  }
  {
    DoubleWalkState *state = (DoubleWalkState *)sRemoteGravityState;
    ((ListCompute *)sGravity)->initCudaState(state, numBuckets, remoteNodesPerReq, remotePartsPerReq, false);
    // no missed nodes/particles
    state->nodes = NULL;
    state->particles = NULL;
    DoubleWalkState *lstate = (DoubleWalkState *)sLocalGravityState;
    ((ListCompute *)sGravity)->initCudaState(lstate, numBuckets, localNodesPerReq, localPartsPerReq, false);
    // ditto
    lstate->nodes = NULL;
    // allocate space for local particles
    // if this is a small phase; we do not transfer
    // all particles owned by this processor in small
    // phases and so refer to particles inside an
    // auxiliary array shipped with computation requests.
    // this auxiliary array is 'particles', below:
    if(largePhase()){
      lstate->particles = NULL;
    }
    else{
      // allocate an amount of space that
      // depends on the rung
      lstate->particles = new CkVec<CompactPartData>(AVG_SOURCE_PARTICLES_PER_ACTIVE*myNumActiveParticles);
      lstate->particles->length() = 0;
      // need to allocate memory for data structure that stores bucket
      // active info (where in the gpu's target particle memory this
      // bucket starts, and its size; strictly speaking, we don't need
      // the size attribute.)
      // XXX - no need to allocate/delete every iteration
      bucketActiveInfo = new BucketActiveInfo[numBuckets];
    }
  }
  {
    DoubleWalkState *state = (DoubleWalkState *)sInterListStateRemoteResume;
    ((ListCompute *)sGravity)->initCudaState(state, numBuckets, remoteResumeNodesPerReq, remoteResumePartsPerReq, true);
    state->nodes = new CkVec<CudaMultipoleMoments>(100000);
    state->nodes->length() = 0;
    state->particles = new CkVec<CompactPartData>(200000);
    state->particles->length() = 0;
    //state->nodeMap.clear();
    state->nodeMap.resize(remoteResumeNodesPerReq);
    state->nodeMap.length() = 0;
    state->partMap.clear();
  }
#endif // CUDA
#if CUDA_STATS
  localNodeInteractions = 0;
  localPartInteractions = 0;
  remoteNodeInteractions = 0;
  remotePartInteractions = 0;
  remoteResumeNodeInteractions = 0;
  remoteResumePartInteractions = 0;
#endif
#endif // INTERLIST_VER
  sLocalGravityState->myNumParticlesPending = numBuckets;
  sRemoteGravityState->numPendingChunks = numChunks;
  // current remote bucket
  sRemoteGravityState->currentBucket = 0;
  sPrefetch->init((void *)0, activeRung, sPref);
  sTopDown->init(sPrefetch, this);
  sPrefetchState = sPrefetch->getNewState(1);
  // instead of prefetchWaiting, we count through state->counters[0]
  sPrefetchState->counterArrays[0][0] = (2*nReplicas + 1)*(2*nReplicas + 1)*(2*nReplicas + 1);
  activeWalks.reserve(maxAwi);
  addActiveWalk(prefetchAwi, sTopDown,sPrefetch,sPref,sPrefetchState);
#ifdef CHECK_WALK_COMPLETIONS
  CkPrintf("[%d] addActiveWalk prefetch (%d)\n", thisIndex, activeWalks.length());
#endif
#if INTERLIST_VER > 0
  addActiveWalk(interListAwi, sInterListWalk, sGravity, sRemote, sInterListStateRemoteResume);
#ifdef CHECK_WALK_COMPLETIONS
  CkPrintf("[%d] addActiveWalk interList (%d)\n", thisIndex, activeWalks.length());
#endif
#else
  addActiveWalk(remoteGravityAwi,sTopDown,sGravity,sRemote, sRemoteGravityState);
#ifdef CHECK_WALK_COMPLETIONS
  CkPrintf("[%d] addActiveWalk remoteGravity (%d)\n", thisIndex, activeWalks.length());
#endif
#endif
  // variable currentBucket masquerades as current chunk
  initiatePrefetch(sPrefetchState->currentBucket);
#if CHANGA_REFACTOR_DEBUG > 0
  CkPrintf("[%d]sending message to commence local gravity calculation\n", thisIndex);
#endif
  if (bEwald) thisProxy[thisIndex].EwaldInit();
#if defined CUDA
  // ask datamanager to serialize local trees
  // prefetch can occur concurrently with this,
  // though calculateGravityLocal can only come
  // afterwards.
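  // Simplified timeline of the CUDA branch below (illustrative only):
  //
  //   initiatePrefetch(chunk)       remote prefetch walk (may overlap)
  //   dm->serializeLocalTree()      flatten the local tree, ship it to GPU
  //   calculateGravityLocal()       resumed by the DataManager only after
  //                                 the local tree is on the device
  //
  // In the non-CUDA branch, commenceCalculateGravityLocal() is messaged
  // immediately instead.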
  dm->serializeLocalTree();
#else
  thisProxy[thisIndex].commenceCalculateGravityLocal();
#endif
#ifdef CHANGA_PRINT_MEMUSAGE
  int piecesPerPe = numTreePieces/CmiNumPes();
  if(thisIndex % piecesPerPe == 0)
    CkPrintf("[%d]: CmiMaxMemoryUsage: %f M\n", CmiMyPe(), (float)CmiMaxMemoryUsage()/(1 << 20));
#endif
}

// Starts the prefetching of the specified chunk; once the
// given chunk has been completely prefetched, the prefetch
// compute invokes startRemoteChunk()
void TreePiece::initiatePrefetch(int chunk){
#ifdef DISABLE_NODE_TREE
  GenericTreeNode *child = keyToNode(prefetchRoots[chunk]);
#else
  GenericTreeNode *child = dm->chunkRootToNode(prefetchRoots[chunk]);
#endif
  CmiAssert(child != NULL);
  int first, last;
  for(int x = -nReplicas; x <= nReplicas; x++) {
    for(int y = -nReplicas; y <= nReplicas; y++) {
      for(int z = -nReplicas; z <= nReplicas; z++) {
        if (child == NULL) {
          nodeOwnership(prefetchRoots[chunk], first, last);
          child = requestNode(dm->responsibleIndex[(first+last)>>1], prefetchRoots[chunk], chunk, encodeOffset(0, x, y, z), prefetchAwi, (void *)0, true);
        }
        if (child != NULL) {
#if CHANGA_REFACTOR_DEBUG > 1
          CkPrintf("[%d] starting prefetch walk with current Prefetch=%d, numPrefetchReq=%d (%d,%d,%d)\n", thisIndex, chunk, numPrefetchReq, x,y,z);
#endif
          sTopDown->walk(child, sPrefetchState, chunk, encodeOffset(0,x,y,z), prefetchAwi);
        }
      }
    }
  }
}

void TreePiece::commenceCalculateGravityLocal(){
#if INTERLIST_VER > 0
  // must set placedRoots to false before starting local comp.
  DoubleWalkState *lstate = (DoubleWalkState *)sLocalGravityState;
  lstate->placedRoots[0] = false;
#endif
  calculateGravityLocal();
}

void TreePiece::startRemoteChunk() {
#if CHANGA_REFACTOR_DEBUG > 0
  CkPrintf("[%d] sending message to commence remote gravity\n", thisIndex);
#endif
  traceUserEvent(prefetchDoneUE);
#ifdef CUDA
  // dm counts until all treepieces have acknowledged prefetch completion;
  // it then flattens the tree on the processor, sends it to the device
  // and sends messages to each of the registered treepieces to continueStartRemoteChunk()
  dm->donePrefetch(sPrefetchState->currentBucket);
#else
  continueStartRemoteChunk(sPrefetchState->currentBucket);
#endif
}

/// @brief Main work of StartRemoteChunk()
/// Schedule a TreePiece::calculateGravityRemote() then start
/// prefetching for the next chunk.
void TreePiece::continueStartRemoteChunk(int chunk){
  // FIXME - can value of chunk be different from current Prefetch?
  ComputeChunkMsg *msg = new (8*sizeof(int)) ComputeChunkMsg(sPrefetchState->currentBucket);
  *(int*)CkPriorityPtr(msg) = numTreePieces * numChunks + thisIndex + 1;
  CkSetQueueing(msg, CK_QUEUEING_IFIFO);
#if INTERLIST_VER > 0
  DoubleWalkState *rstate = (DoubleWalkState *)sRemoteGravityState;
  rstate->placedRoots[msg->chunkNum] = false;
#endif
  thisProxy[thisIndex].calculateGravityRemote(msg);
  // start prefetching next chunk
  if (++sPrefetchState->currentBucket < numChunks) {
    // Nothing needs to be changed for this chunk -
    // the prefetchReqs and their number remain the same.
    // We only need to reassociate the tree walk with the
    // prefetch compute object and the prefetch object with
    // the prefetch opt object
    sTopDown->reassoc(sPrefetch);
    // prefetch walk isn't associated with any particular bucket
    // but the entire treepiece
    // this method invocation does nothing.
indeed, nothing // needs to be done because sPrefetch is always associated with // sPref sPrefetch->reassoc((void *)0,activeRung,sPref); // instead of prefetchWaiting, we count through state->counters[0] //prefetchWaiting = (2*nReplicas + 1)*(2*nReplicas + 1)*(2*nReplicas + 1); sPrefetchState->counterArrays[0][0] = (2*nReplicas + 1)*(2*nReplicas + 1)*(2*nReplicas + 1); // variable currentBucket masquerades as current chunk initiatePrefetch(sPrefetchState->currentBucket); } } // Sets the load of the TreePiece object void TreePiece::setTreePieceLoad(int activeRung) { treePieceActivePartsTmp = numActiveParticles; if (havePhaseData(activeRung)) { treePieceLoadExp = savedPhaseLoad[activeRung]; } else if (havePhaseData(0)) { float ratio = 1.0; if(myNumParticles != 0){ ratio = numActiveParticles/(float)myNumParticles; } treePieceLoadExp = ratio * savedPhaseLoad[0]; } else { treePieceLoadExp = treePieceLoad; } setObjTime(treePieceLoadExp); treePieceLoad = 0; } // jetley - contribute your centroid. AtSync is now called by the load balancer (broadcast) when it has // all centroids. void TreePiece::startlb(CkCallback &cb, int activeRung){ if(verbosity > 1) CkPrintf("[%d] load set to: %g, actual: %g\n", thisIndex, treePieceLoad, getObjTime()); callback = cb; lbActiveRung = activeRung; if(verbosity > 1) CkPrintf("[%d] TreePiece %d calling AtSync()\n",CkMyPe(),thisIndex); unsigned int i; if(activeRung == 0){ numActiveParticles = myNumParticles; } else if(activeRung == PHASE_FEEDBACK) { numActiveParticles = myNumSPH + myNumStar; } else{ for(numActiveParticles = 0, i = 1; i <= myNumParticles; i++) if(myParticles[i].rung >= activeRung) numActiveParticles++; } int64_t active_tp[2]; active_tp[0] = numActiveParticles; active_tp[1] = myNumParticles; contribute(2*sizeof(int64_t), &active_tp, CkReduction::sum_long, CkCallback(CkReductionTarget(TreePiece,getParticleInfoForLB),thisProxy)); } // This is called by startlb to check whether to call the load balancer void TreePiece::getParticleInfoForLB(int64_t active_part, int64_t total_part) { bool doLB = ((float)active_part/total_part > dFracLoadBalance) ? 
true : false;
  // Don't do LB
  if (!doLB) {
    setTreePieceLoad(lbActiveRung);
    prevLARung = lbActiveRung;
    setObjTime(0.0);
    contribute(callback);
    return;
  }
  LDObjHandle myHandle = myRec->getLdHandle();
  TaggedVector3D tv(savedCentroid, myHandle, numActiveParticles, myNumParticles, lbActiveRung, prevLARung);
  tv.tp = thisIndex;
  tv.tag = thisIndex;
  setTreePieceLoad(lbActiveRung);
  if (CkpvAccess(_lb_obj_index) != -1) {
    void *data = getObjUserData(CkpvAccess(_lb_obj_index));
    *(TaggedVector3D *) data = tv;
  }
  thisProxy[thisIndex].doAtSync();
  prevLARung = lbActiveRung;
}

void TreePiece::doAtSync(){
  if(verbosity > 1) CkPrintf("[%d] TreePiece %d calling AtSync() at %g\n",CkMyPe(),thisIndex, CkWallTimer());
  AtSync();
}

void TreePiece::ResumeFromSync(){
  if(verbosity > 1) CkPrintf("[%d] TreePiece %d in ResumeFromSync\n",CkMyPe(),thisIndex);
  contribute(callback);
}

const GenericTreeNode *TreePiece::lookupNode(Tree::NodeKey key){
  return keyToNode(key);
}

const GravityParticle *TreePiece::lookupParticles(int begin) {
  return &myParticles[begin];
}

GenericTreeNode* TreePiece::requestNode(int remoteIndex, Tree::NodeKey key, int chunk, int reqID, int awi, void *source, bool isPrefetch) {
  CkAssert(remoteIndex < (int) numTreePieces);
  CkAssert(chunk < numChunks);
  if(_cache){
#if COSMO_PRINT > 1
    CkPrintf("[%d] b=%d requesting node %s to %d for %s (additional=%d)\n",thisIndex,reqID,keyBits(key,KeyBits).c_str(),remoteIndex,keyBits(bucketList[reqID]->getKey(),KeyBits).c_str(), sRemoteGravityState->counterArrays[0][decodeReqID(reqID)] + sLocalGravityState->counterArrays[0][decodeReqID(reqID)]);
#endif
    CProxyElement_ArrayElement thisElement(thisProxy[thisIndex]);
    CkCacheUserData userData;
    userData.d0 = (((CmiUInt8) awi)<<32)+reqID;
    userData.d1 = (CmiUInt8) source;
    CkCacheRequestorData request(thisElement, &EntryTypeGravityNode::callback, userData);
    CkArrayIndexMax remIdx = CkArrayIndex1D(remoteIndex);
    GenericTreeNode *res = (GenericTreeNode *) cacheNode.ckLocalBranch()->requestData(key,remIdx,chunk,&gravityNodeEntry,request);
#ifdef CHANGA_REFACTOR_INTERLIST_PRINT_BUCKET_START_FIN
    if(source && !res){
      int start, end;
      GenericTreeNode *testSource = (GenericTreeNode *)source;
      getBucketsBeneathBounds(testSource, start, end);
      CkPrintf("[%d] tgt=%d requesting node %ld source = %ld(%d - %d) buckRem = %d + %d, remChunk = %d\n",thisIndex,decodeReqID(reqID), key, testSource->getKey(), start, end, sRemoteGravityState->counterArrays[0][decodeReqID(reqID)], sLocalGravityState->counterArrays[0][decodeReqID(reqID)], sRemoteGravityState->counterArrays[1][chunk]);
      // check whether source contains target
      GenericTreeNode *target = bucketList[decodeReqID(reqID)];
      if(!testSource->contains(target->getKey())){
        CkAbort("source does not contain target\n");
      }
    }
#endif
    return res;
  }
  else{
    CkAbort("Non-cached version is no longer supported; feel free to fix it!");
  }
  return NULL;
}

ExternalGravityParticle *TreePiece::requestParticles(Tree::NodeKey key,int chunk,int remoteIndex,int begin,int end,int reqID, int awi, void *source, bool isPrefetch) {
  if (_cache) {
    CProxyElement_ArrayElement thisElement(thisProxy[thisIndex]);
    CkCacheUserData userData;
    userData.d0 = (((CmiUInt8) awi)<<32)+reqID;
    userData.d1 = (CmiUInt8) source;
    CkCacheRequestorData request(thisElement, &EntryTypeGravityParticle::callback, userData);
    CkArrayIndexMax remIdx = CkArrayIndex1D(remoteIndex);
    CkAssert(key < (((NodeKey) 1) << (NodeKeyBits-1)));
    //
    // Key is shifted to distinguish between nodes and particles
    //
    KeyType ckey = key<<1;
    CacheParticle *p = (CacheParticle *) cacheGravPart.ckLocalBranch()->requestData(ckey,remIdx,chunk,&gravityParticleEntry,request);
    if (p == NULL) {
#ifdef CHANGA_REFACTOR_INTERLIST_PRINT_BUCKET_START_FIN
      if(source){
        int start, end;
        GenericTreeNode *testSource = (GenericTreeNode *)source;
        getBucketsBeneathBounds(testSource, start, end);
        CkPrintf("[%d] tgt=%d requesting particles %ld source = %ld(%d - %d) buckRem = %d + %d, remChunk = %d\n",thisIndex,decodeReqID(reqID), key, testSource->getKey(), start, end, sRemoteGravityState->counterArrays[0][decodeReqID(reqID)], sLocalGravityState->counterArrays[0][decodeReqID(reqID)], sRemoteGravityState->counterArrays[1][chunk]);
        // check whether source contains target
        GenericTreeNode *target = bucketList[decodeReqID(reqID)];
        if(!testSource->contains(target->getKey())){
          CkAbort("source does not contain target\n");
        }
      }
#endif
#if COSMO_PRINT > 1
      CkPrintf("[%d] b=%d requestParticles: additional=%d\n",thisIndex, decodeReqID(reqID), sRemoteGravityState->counterArrays[0][decodeReqID(reqID)] + sLocalGravityState->counterArrays[0][decodeReqID(reqID)]);
#endif
      return NULL;
    }
    return p->part;
  }
  else {
    CkAbort("Non-cached version is no longer supported; feel free to fix it!");
  }
  return NULL;
}

GravityParticle * TreePiece::requestSmoothParticles(Tree::NodeKey key,int chunk,int remoteIndex, int begin,int end,int reqID, int awi, void *source, bool isPrefetch) {
  if (_cache) {
    CProxyElement_ArrayElement thisElement(thisProxy[thisIndex]);
    CkCacheUserData userData;
    userData.d0 = (((CmiUInt8) awi)<<32)+reqID;
    userData.d1 = (CmiUInt8) source;
    CkCacheRequestorData request(thisElement, &EntryTypeSmoothParticle::callback, userData);
    CkArrayIndexMax remIdx = CkArrayIndex1D(remoteIndex);
    KeyType ckey = key<<1;
    CacheSmoothParticle *p = (CacheSmoothParticle *) cacheSmoothPart.ckLocalBranch()->requestData(ckey,remIdx,chunk,&smoothParticleEntry,request);
    if (p == NULL) {
      return NULL;
    }
    return p->partCached;
  }
  else {
    CkAbort("Non-cached version is no longer supported; feel free to fix it!");
  }
  return NULL;
}

#if COSMO_DEBUG > 1 || defined CHANGA_REFACTOR_WALKCHECK || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST
//Recursive routine to combine keys -- Written only for Binary Trees
void TreePiece::combineKeys(Tree::NodeKey key,int bucket){
  Tree::NodeKey mask = Key(1);
  Tree::NodeKey lastBit = key & mask;
  Tree::NodeKey sibKey;
  if(lastBit==mask){
    sibKey = key >> 1;
    sibKey <<= 1;
  }
  else{
    sibKey = key | mask;
  }
  std::multiset<Tree::NodeKey>::iterator iter = (bucketcheckList[bucket]).find(sibKey);
  if(iter==bucketcheckList[bucket].end())
    return;
  else{//Sibling key has been found in the Binary tree
    bucketcheckList[bucket].erase(bucketcheckList[bucket].find(key));
    bucketcheckList[bucket].erase(iter);
    if(bucket == TEST_BUCKET && thisIndex == TEST_TP){
      CkPrintf("[%d] combine(%ld, %ld)\n", thisIndex, key, sibKey);
      CkPrintf("[%d] add %ld\n", thisIndex, key >> 1);
    }
    key >>= 1;
    bucketcheckList[bucket].insert(key);
    combineKeys(key,bucket);
  }
}

void TreePiece::checkWalkCorrectness(){
  Tree::NodeKey endKey = Key(1);
  int count = (2*nReplicas+1) * (2*nReplicas+1) * (2*nReplicas+1);
  CkPrintf("[%d] checking walk correctness...\n",thisIndex);
  bool someWrong = false;
  for(int i=0;i<numBuckets;i++){
    int wrong = 0;
    if(bucketList[i]->rungs < activeRung) continue;
    if(bucketcheckList[i].size()!=count) wrong = 1;
    for (std::multiset<Tree::NodeKey>::iterator iter = bucketcheckList[i].begin(); iter != bucketcheckList[i].end(); iter++) {
      if (*iter != endKey) wrong = 1;
    }
    if (wrong) {
      someWrong = true;
      CkPrintf("Error: [%d] Not all nodes were traversed by bucket %d\n",thisIndex,i);
      for (std::multiset<Tree::NodeKey>::iterator
iter=bucketcheckList[i].begin(); iter != bucketcheckList[i].end(); iter++) { CkPrintf(" [%d] key %ld\n",thisIndex,*iter); } } else { bucketcheckList[i].clear(); } } if(someWrong) CkExit(); } #endif /********************************************************************/ void TreePiece::outputStatistics(const CkCallback& cb) { #if COSMO_STATS > 0 if(verbosity > 1) { u_int64_t nodeInterRemoteTotal = 0; u_int64_t particleInterRemoteTotal = 0; for (int i=0; i 0) mySPHParticles = new extraSPHData[nStoreSPH]; allocateStars(); } for(unsigned int i=1;i<=myNumParticles;i++){ p | myParticles[i]; if(myParticles[i].isGas()) { int iSPH; if(!p.isUnpacking()) { iSPH = (extraSPHData *)myParticles[i].extraData - mySPHParticles; CkAssert(iSPH < myNumSPH); p | iSPH; } else { p | iSPH; myParticles[i].extraData = mySPHParticles + iSPH; CkAssert(iSPH < myNumSPH); } } if(myParticles[i].isStar()) { int iStar; if(!p.isUnpacking()) { iStar = (extraStarData *)myParticles[i].extraData - myStarParticles; CkAssert(iStar < myNumStar); p | iStar; } else { p | iStar; myParticles[i].extraData = myStarParticles + iStar; CkAssert(iStar < myNumStar); } } } for(unsigned int i=0;i 0 p | prevBucket; p | prevRemoteBucket; #endif p | ewaldCurrentBucket; #if COSMO_STATS > 0 p | myNumMACChecks; p | nodesOpenedLocal; p | nodesOpenedRemote; p | numOpenCriterionCalls; p | piecemass; #endif if (p.isUnpacking()) { particleInterRemote = NULL; nodeInterRemote = NULL; switch(domainDecomposition) { case SFC_dec: case SFC_peano_dec: case SFC_peano_dec_3D: case SFC_peano_dec_2D: numPrefetchReq = 2; break; case Oct_dec: case ORB_dec: case ORB_space_dec: numPrefetchReq = 1; break; default: CmiAbort("Pupper has wrong domain decomposition type!\n"); } } p | myPlace; p | bGasCooling; if(p.isUnpacking()){ dm = NULL; #ifndef COOLING_NONE if(!_inrestart) { // not restarting from checkpoint dm = (DataManager*)CkLocalNodeBranch(dataManagerID); if(bGasCooling) CoolData = CoolDerivsInit(dm->Cool); } #endif } if (verbosity > 1) { if (p.isSizing()) { ckout << "TreePiece " << thisIndex << ": Getting PUP'd!"; ckout << " size: " << ((PUP::sizer*)&p)->size(); ckout << endl; } } } void TreePiece::reconstructNodeLookup(GenericTreeNode *node) { nodeLookupTable[node->getKey()] = node; node->particlePointer = &myParticles[node->firstParticle]; if (node->getType() == Bucket) bucketList.push_back(node); GenericTreeNode *child; for (unsigned int i=0; inumChildren(); ++i) { child = node->getChildren(i); if (child != NULL) reconstructNodeLookup(child); } } /** Check that all the particles in the tree are really in their boxes. Because the keys are made of only the first 21 out of 23 bits of the floating point representation, there can be particles that are outside their box by tiny amounts. Whether this is bad is not yet known. 
*/
void TreePiece::checkTree(GenericTreeNode* node) {
  if(node->getType() == Empty) return;
  if(node->getType() == Bucket) {
    for(unsigned int iter = node->firstParticle; iter <= node->lastParticle; ++iter) {
      if(!node->boundingBox.contains(myParticles[iter].position)) {
        ckerr << "Not in the box: Box: " << node->boundingBox << " Position: " << myParticles[iter].position << "\nNode key: " << keyBits(node->getKey(), KeyBits).c_str() << "\nParticle key: " << keyBits(myParticles[iter].key, KeyBits).c_str() << endl;
      }
    }
  }
  else if(node->getType() != NonLocal && node->getType() != NonLocalBucket) {
    GenericTreeNode* childIterator;
    for(unsigned int i = 0; i < node->numChildren(); ++i) {
      childIterator = node->getChildren(i);
      if(childIterator)
        checkTree(childIterator);
    }
  }
}

/// Color a node
string getColor(GenericTreeNode* node) {
  ostringstream oss;
  switch(node->getType()) {
    case Bucket:
    case Internal:
      oss << "black";
      break;
    case NonLocal:
    case NonLocalBucket:
      oss << "red";
      break;
    case Boundary:
      oss << "purple";
      break;
    default:
      oss << "yellow";
  }
  return oss.str();
}

/// Make a label for a node
string makeLabel(GenericTreeNode* node) {
  ostringstream oss;
  oss << keyBits(node->getKey(), NodeKeyBits) << "\\n";
  switch(node->getType()) {
    case Invalid: oss << "Invalid"; break;
    case Bucket: oss << "Bucket"; break;
    case Internal: oss << "Internal"; break;
    case NonLocal: oss << "NonLocal: Chare " << node->remoteIndex; break;
    case NonLocalBucket: oss << "NonLocalBucket: Chare " << node->remoteIndex; break;
    case Empty: oss << "Empty"; break;
    case Boundary: oss << "Boundary: Total N " << node->remoteIndex; break;
    case Top: oss << "Top"; break;
    default: oss << "Unknown NodeType!";
  }
  return oss.str();
}

/// Print a text version of a tree
void TreePiece::printTree(GenericTreeNode* node, ostream& os) {
  if(node == 0) return;
  string nodeID = keyBits(node->getKey(), NodeKeyBits);
  os << nodeID << " ";
  int first, last;
  switch(node->getType()) {
    case Bucket:
      os << "Bucket: Size=" << (node->lastParticle - node->firstParticle + 1) << "(" << node->firstParticle << "-" << node->lastParticle << ")";
      break;
    case Internal:
      os << "Internal: Size=" << (node->lastParticle - node->firstParticle + 1) << "(" << node->firstParticle << "-" << node->lastParticle << ")";
      break;
    case NonLocal:
      nodeOwnership(node->getKey(), first, last);
      os << "NonLocal: Chare=" << node->remoteIndex << ", Owners=" << first << "-" << last;
      break;
    case NonLocalBucket:
      nodeOwnership(node->getKey(), first, last);
      CkAssert(first == last);
      os << "NonLocalBucket: Chare=" << node->remoteIndex << ", Owner=" << first << ", Size=" << node->particleCount << "(" << node->firstParticle << "-" << node->lastParticle << ")";
      break;
    case Boundary:
      nodeOwnership(node->getKey(), first, last);
      os << "Boundary: Totalsize=" << node->particleCount << ", Localsize=" << (node->lastParticle - node->firstParticle) << "(" << node->firstParticle + (node->firstParticle==0?1:0) << "-" << node->lastParticle - (node->lastParticle==myNumParticles+1?1:0) << "), Owners=" << first << "-" << last;
      break;
    case Empty:
      os << "Empty " << node->remoteIndex;
      break;
  }
#ifndef HEXADECAPOLE
  if (node->getType() == Bucket || node->getType() == Internal || node->getType() == Boundary || node->getType() == NonLocal || node->getType() == NonLocalBucket)
    os << " V " << node->moments.soft << " " << node->moments.cm.x << " " << node->moments.cm.y << " " << node->moments.cm.z << " " << node->moments.xx << " " << node->moments.xy << " " << node->moments.xz << " " << node->moments.yy << " " << node->moments.yz << " " << node->moments.zz << " " << node->boundingBox;
#endif
  os << "\n";
  if(node->getType() == NonLocal || node->getType() == NonLocalBucket || node->getType() == Bucket || node->getType() == Empty)
    return;
  GenericTreeNode* childIterator;
  for(unsigned int i = 0; i < node->numChildren(); ++i) {
    childIterator = node->getChildren(i);
    if(childIterator)
      printTree(childIterator, os);
    else {
      os << "\tnode [color=\"green\"]\n";
      os << "\t\"" << nodeID << i << "\" [label=\"None\"]\n";
      os << "\t\"" << nodeID << "\" -> \"" << nodeID << i << "\";\n";
    }
  }
}

/// Print a graphviz version of a tree
void TreePiece::printTreeViz(GenericTreeNode* node, ostream& os) {
  if(node == 0) return;
  string nodeID = keyBits(node->getKey(), NodeKeyBits);
  os << "\tnode [color=\"" << getColor(node) << "\"]\n";
  os << "\t\"" << nodeID << "\" [label=\"" << keyBits(node->getKey(), NodeKeyBits) << "\\n";
  int first, last;
  switch(node->getType()) {
    case Bucket:
      os << "Bucket\\nSize: " << (node->lastParticle - node->firstParticle + 1);
      break;
    case Internal:
      os << "Internal\\nSize: " << (node->lastParticle - node->firstParticle + 1);
      break;
    case NonLocal:
      nodeOwnership(node->getKey(), first, last);
      os << "NonLocal: Chare " << node->remoteIndex << "\\nOwners: " << (last-first+1) << "\\nRemote size: " << (node->lastParticle - node->firstParticle + 1);
      break;
    case NonLocalBucket:
      nodeOwnership(node->getKey(), first, last);
      os << "NonLocalBucket: Chare " << node->remoteIndex << "\\nOwner: " << first << "\\nSize: " << node->particleCount;
      //<< "(" << node->firstParticle << "-" << node->lastParticle << ")";
      break;
    case Boundary:
      os << "Boundary\\nTotalsize: " << node->particleCount << "\\nLocalsize: " << (node->lastParticle - node->firstParticle);
      break;
    case Empty:
      os << "Empty " << node->remoteIndex;
      break;
  }
  os << "\"]\n";
  if(node->parent)
    os << "\t\"" << keyBits(node->parent->getKey(), NodeKeyBits) << "\" -> \"" << nodeID << "\";\n";
  if(node->getType() == NonLocal || node->getType() == NonLocalBucket || node->getType() == Bucket || node->getType() == Empty)
    return;
  GenericTreeNode* childIterator;
  for(unsigned int i = 0; i < node->numChildren(); ++i) {
    childIterator = node->getChildren(i);
    if(childIterator)
      printTreeViz(childIterator, os);
    else {
      os << "\tnode [color=\"green\"]\n";
      os << "\t\"" << nodeID << i << "\" [label=\"None\"]\n";
      os << "\t\"" << nodeID << "\" -> \"" << nodeID << i << "\";\n";
    }
  }
}

/// Write a file containing a graphviz dot graph of my tree
void TreePiece::report() {
  ostringstream outfilename;
  outfilename << "tree." << thisIndex << "." << iterationNo << ".dot";
  ofstream os(outfilename.str().c_str());
  os << "digraph G" << thisIndex << " {\n";
  os << "\tcenter = \"true\"\n";
  os << "\tsize = \"7.5,10\"\n";
  //os << "\tratio = \"fill\"\n";
  //os << "\tfontname = \"Courier\"\n";
  os << "\tnode [style=\"bold\"]\n";
  os << "\tlabel = \"Piece: " << thisIndex << "\\nParticles: " << myNumParticles << "\"\n";
  os << "\tfontname = \"Helvetica\"\n";
  printTreeViz(root, os);
  os << "}" << endl;
  os.close();
}

/// Print a text version of a tree
void printGenericTree(GenericTreeNode* node, ostream& os) {
  if(node == 0) return;
  string nodeID = keyBits(node->getKey(), NodeKeyBits);
  os << nodeID << " ";
  switch(node->getType()) {
    case Bucket:
      os << "Bucket: Size=" << (node->lastParticle - node->firstParticle + 1) << "(" << node->firstParticle << "-" << node->lastParticle << ")";
      break;
    case Internal:
      os << "Internal: Size=" << (node->lastParticle - node->firstParticle + 1) << "(" << node->firstParticle << "-" << node->lastParticle << ")";
      break;
    case NonLocal:
      os << "NonLocal: Chare=" << node->remoteIndex;
      //<< ", Owners=" << first << "-" << last;
      break;
    case NonLocalBucket:
      os << "NonLocalBucket: Chare=" << node->remoteIndex << ", Size=" << node->particleCount;
      //<< "(" << node->firstParticle << "-" << node->lastParticle << ")";
      break;
    case Boundary:
      os << "Boundary: Totalsize=" << node->particleCount << ", Localsize=" << (node->lastParticle - node->firstParticle) << "(" << node->firstParticle << "-" << node->lastParticle;
      break;
    case Empty:
      os << "Empty " << node->remoteIndex;
      break;
  }
#ifndef HEXADECAPOLE
  if (node->getType() == Bucket || node->getType() == Internal || node->getType() == Boundary || node->getType() == NonLocal || node->getType() == NonLocalBucket)
    os << " V " << node->moments.soft << " " << node->moments.cm.x << " " << node->moments.cm.y << " " << node->moments.cm.z << " " << node->moments.xx << " " << node->moments.xy << " " << node->moments.xz << " " << node->moments.yy << " " << node->moments.yz << " " << node->moments.zz;
#endif
  os << "\n";
  if(node->getType() == NonLocal || node->getType() == NonLocalBucket || node->getType() == Bucket || node->getType() == Empty)
    return;
  GenericTreeNode* childIterator;
  for(unsigned int i = 0; i < node->numChildren(); ++i) {
    childIterator = node->getChildren(i);
    if(childIterator)
      printGenericTree(childIterator, os);
    else {
      os << "\tnode [color=\"green\"]\n";
      os << "\t\"" << nodeID << i << "\" [label=\"None\"]\n";
      os << "\t\"" << nodeID << "\" -> \"" << nodeID << i << "\";\n";
    }
  }
}

CkReduction::reducerType TreePieceStatistics::sum;

/*
 * Collect treewalking statistics across all TreePieces
 */
void TreePiece::collectStatistics(CkCallback& cb) {
  LBTurnInstrumentOff();
#if COSMO_DEBUG > 1 || defined CHANGA_REFACTOR_WALKCHECK || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST
  checkWalkCorrectness();
#endif
#if COSMO_STATS > 0
  u_int64_t nodeInterRemoteTotal = 0;
  u_int64_t particleInterRemoteTotal = 0;
  for (int i=0; i<numChunks; ++i) {
    nodeInterRemoteTotal += nodeInterRemote[i];
    particleInterRemoteTotal += particleInterRemote[i];
  }
  TreePieceStatistics tps(nodesOpenedLocal, nodesOpenedRemote, numOpenCriterionCalls, nodeInterLocal, nodeInterRemoteTotal, particleInterLocal, particleInterRemoteTotal, numActiveParticles);
  contribute(sizeof(TreePieceStatistics), &tps, TreePieceStatistics::sum, cb);
#endif
}

void TreePiece::receiveNodeCallback(GenericTreeNode *node, int chunk, int reqID, int awi, void *source) {
  int targetBucket = decodeReqID(reqID);
  Vector3D<double> offset = decodeOffset(reqID);
  TreeWalk *tw;
  Compute *compute;
  State *state;
#ifdef CHANGA_REFACTOR_INTERLIST_PRINT_BUCKET_START_FIN
  if(source){
    int start, end;
    GenericTreeNode *testSource = (GenericTreeNode *)source;
    getBucketsBeneathBounds(testSource, start, end);
    CkPrintf("[%d] tgt=%d recv node %ld source = %ld(%d - %d) buckRem = %d + %d, remChunk = %d\n",thisIndex,decodeReqID(reqID), node->getKey(), testSource->getKey(), start, end, sRemoteGravityState->counterArrays[0][decodeReqID(reqID)], sLocalGravityState->counterArrays[0][decodeReqID(reqID)], sRemoteGravityState->counterArrays[1][chunk]);
    // check whether source contains target
    GenericTreeNode *target = bucketList[decodeReqID(reqID)];
    if(!testSource->contains(target->getKey())){
      CkAbort("source does not contain target\n");
    }
  }
#endif
  // retrieve the activewalk record
  CkAssert(awi < activeWalks.size());
  ActiveWalk &a = activeWalks[awi];
  tw = a.tw;
  compute = a.c;
  state = a.s;
  // reassociate objects with each other
  tw->reassoc(compute);
  compute->reassoc(source, activeRung, a.o);
  // resume walk
  tw->resumeWalk(node, state, chunk, reqID, awi);
  // we need source to update the counters in all buckets
  // underneath the source. note that in the interlist walk,
  // the computeEntity of the compute will likely have changed as the walk continued.
  // however, the resumeWalk function takes care to set it back to 'source'
  // after it is done walking.
  compute->nodeRecvdEvent(this,chunk,state,targetBucket);
}

void TreePiece::receiveParticlesCallback(ExternalGravityParticle *egp, int num, int chunk, int reqID, Tree::NodeKey &remoteBucket, int awi, void *source){
  Compute *c;
  State *state;
  CkAssert(awi < maxAwi);
  // retrieve the activewalk record
  ActiveWalk &a = activeWalks[awi];
  //tw = a.tw;
  c = a.c;
  state = a.s;
#ifdef CHANGA_REFACTOR_INTERLIST_PRINT_BUCKET_START_FIN
  if(source){
    int start, end;
    GenericTreeNode *testSource = (GenericTreeNode *)source;
    getBucketsBeneathBounds(testSource, start, end);
    CkPrintf("[%d] tgt=%d recv particles %ld source = %ld (%d - %d) buckRem = %d + %d, remChunk = %d\n",thisIndex,decodeReqID(reqID), remoteBucket, testSource->getKey(), start, end, sRemoteGravityState->counterArrays[0][decodeReqID(reqID)], sLocalGravityState->counterArrays[0][decodeReqID(reqID)], sRemoteGravityState->counterArrays[1][chunk]);
    // check whether source contains target
    GenericTreeNode *target = bucketList[decodeReqID(reqID)];
    if(!testSource->contains(target->getKey())){
      CkAbort("source does not contain target\n");
    }
  }
#endif
  // Some sanity checks
  if(awi == interListAwi) {
#if INTERLIST_VER > 0
    ListCompute *lc = dynamic_cast<ListCompute *>(c);
    CkAssert(lc != NULL);
#else
    CkAbort("Using ListCompute in non-list version\n");
#endif
  }
  else if(awi == remoteGravityAwi) {
    GravityCompute *gc = dynamic_cast<GravityCompute *>(c);
    CkAssert(gc != NULL);
  }
  else if(awi == prefetchAwi) {
    PrefetchCompute *pc = dynamic_cast<PrefetchCompute *>(c);
    CkAssert(pc != NULL);
  }
  else {
    CkAssert(0);
  }
  c->reassoc(source, activeRung, a.o);
  c->recvdParticles(egp,num,chunk,reqID,state,this, remoteBucket);
}

void TreePiece::receiveParticlesFullCallback(GravityParticle *gp, int num, int chunk, int reqID, Tree::NodeKey &remoteBucket, int awi, void *source){
  Compute *c;
  State *state;
  // retrieve the activewalk record
  ActiveWalk &a = activeWalks[awi];
  c = a.c;
  state = a.s;
  c->reassoc(source, activeRung, a.o);
  c->recvdParticlesFull(gp,num,chunk,reqID,state,this, remoteBucket);
}

void TreePiece::addActiveWalk(int iAwi, TreeWalk *tw, Compute *c, Opt *o, State *s){
  activeWalks.insert(iAwi, ActiveWalk(tw,c,o,s));
}

void TreePiece::freeWalkObjects(){
#ifdef CHECK_WALK_COMPLETIONS
  CkPrintf("[%d] TreePiece::freeWalkObjects\n", thisIndex);
#endif
  if(sTopDown){
    delete sTopDown;
    sTopDown = NULL;
  }
#if INTERLIST_VER > 0
  if(sInterListWalk) {
    delete sInterListWalk;
    sInterListWalk = NULL;
  }
#endif
  if(sGravity){
    sGravity->reassoc(0,0,sRemote);
#if INTERLIST_VER > 0 && defined CUDA
    {
      DoubleWalkState *state = (DoubleWalkState *) sRemoteGravityState;
      DoubleWalkState *rstate = (DoubleWalkState *) sInterListStateRemoteResume;
      delete state->particles;
      delete rstate->nodes;
      delete rstate->particles;
      if(!largePhase()){
        DoubleWalkState *lstate = (DoubleWalkState *) sLocalGravityState;
        delete lstate->particles;
        delete [] bucketActiveInfo;
      }
    }
#endif // remote-no-resume state sGravity->freeState(sRemoteGravityState); sRemoteGravityState = NULL; #if INTERLIST_VER > 0 // remote-resume state // overwrite copies of counters shared with sRemoteGravityState to // avoid double deletion. See startGravity() sInterListStateRemoteResume->counterArrays[0] = NULL; sInterListStateRemoteResume->counterArrays[1] = NULL; sGravity->freeState(sInterListStateRemoteResume); sInterListStateRemoteResume = NULL; #endif // local state sGravity->reassoc(0,0,sLocal); sGravity->freeState(sLocalGravityState); sLocalGravityState = NULL; delete sGravity; sGravity = NULL; } if(sPrefetch) { sPrefetch->freeState(sPrefetchState); delete sPrefetch; delete sPref; delete sLocal; delete sRemote; sPrefetch = NULL; } } void TreePiece::finishedChunk(int chunk){ sRemoteGravityState->numPendingChunks--; if(sRemoteGravityState->numPendingChunks == 0){ #ifdef CHECK_WALK_COMPLETIONS CkPrintf("[%d] finishedChunk %d, calling markWalkDone\n", thisIndex, chunk); #endif markWalkDone(); } } void TreePiece::markWalkDone() { // There are always two walks to wait for: one saying that all // computation for our buckets are done and one saying that all // outstanding cache requests are satisfied. if (++completedActiveWalks == 2) { // At this point this treepiece has completed its walk. However, // there may be outstanding requests by other pieces. We need to // wait for all walks to complete before freeing data structures. #ifdef CHECK_WALK_COMPLETIONS CkPrintf("[%d] inside markWalkDone, completedActiveWalks: %d, activeWalks: %d, contrib finishWalk\n", thisIndex, completedActiveWalks, activeWalks.size()); #endif nodeLBMgrProxy.ckLocalBranch()->finishedTPWork(); CkCallback cb = CkCallback(CkIndex_TreePiece::finishWalk(), pieces); gravityProxy[thisIndex].ckLocal()->contribute(cb); } } void TreePiece::finishWalk() { if(verbosity > 1) CkPrintf("[%d] current load: %g current particles: %d\n", thisIndex, getObjTime(), myNumParticles); completedActiveWalks = 0; freeWalkObjects(); #ifdef CACHE_MEM_STATS memPostCache = CmiMemoryUsage()/(1024*1024); #endif #ifdef CHECK_WALK_COMPLETIONS CkPrintf("[%d] inside finishWalk contrib callback\n", thisIndex); #endif #ifdef CUDA_INSTRUMENT_WRS RequestTimeInfo *rti1 = hapi_queryInstrument(instrumentId, DM_TRANSFER_LOCAL, activeRung); RequestTimeInfo *rti2 = hapi_queryInstrument(instrumentId, DM_TRANSFER_REMOTE_CHUNK, activeRung); RequestTimeInfo *rti3 = hapi_queryInstrument(instrumentId, DM_TRANSFER_BACK, activeRung); RequestTimeInfo *rti4 = hapi_queryInstrument(instrumentId, DM_TRANSFER_FREE_LOCAL, activeRung); RequestTimeInfo *rti5 = hapi_queryInstrument(instrumentId, DM_TRANSFER_FREE_REMOTE_CHUNK, activeRung); RequestTimeInfo *rti6 = hapi_queryInstrument(instrumentId, TP_GRAVITY_LOCAL, activeRung); RequestTimeInfo *rti7 = hapi_queryInstrument(instrumentId, TP_GRAVITY_REMOTE, activeRung); RequestTimeInfo *rti8 = hapi_queryInstrument(instrumentId, TP_GRAVITY_REMOTE_RESUME, activeRung); RequestTimeInfo *rti9 = hapi_queryInstrument(instrumentId,TP_PART_GRAVITY_LOCAL_SMALLPHASE, activeRung); RequestTimeInfo *rti10 = hapi_queryInstrument(instrumentId, TP_PART_GRAVITY_LOCAL, activeRung); RequestTimeInfo *rti11 = hapi_queryInstrument(instrumentId, TP_PART_GRAVITY_REMOTE, activeRung); RequestTimeInfo *rti12 = hapi_queryInstrument(instrumentId, TP_PART_GRAVITY_REMOTE_RESUME, activeRung); RequestTimeInfo *rti13 = hapi_queryInstrument(instrumentId, TOP_EWALD_KERNEL, activeRung); RequestTimeInfo *rti14 = hapi_queryInstrument(instrumentId, BOTTOM_EWALD_KERNEL, 
activeRung); if(rti6 != NULL){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS localnode: (%f,%f,%f) count: %d\n", thisIndex, activeRung, rti6->transferTime/rti6->n, rti6->kernelTime/rti6->n, rti6->cleanupTime/rti6->n, rti6->n); } if(rti7 != NULL){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS remotenode: (%f,%f,%f) count: %d\n", thisIndex, activeRung, rti7->transferTime/rti7->n, rti7->kernelTime/rti7->n, rti7->cleanupTime/rti7->n, rti7->n); } if(rti8 != NULL){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS remoteresumenode: (%f,%f,%f) count: %d\n", thisIndex, activeRung, rti8->transferTime/rti8->n, rti8->kernelTime/rti8->n, rti8->cleanupTime/rti8->n, rti8->n); } if(rti10 != NULL){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS localpart: (%f,%f,%f) count: %d\n", thisIndex, activeRung, rti10->transferTime/rti10->n, rti10->kernelTime/rti10->n, rti10->cleanupTime/rti10->n, rti10->n); } if(rti11 != NULL){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS remotepart: (%f,%f,%f) count: %d\n", thisIndex, activeRung, rti11->transferTime/rti11->n, rti11->kernelTime/rti11->n, rti11->cleanupTime/rti11->n, rti11->n); } if(rti12 != NULL){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS remoteresumepart: (%f,%f,%f) count: %d\n", thisIndex, activeRung, rti12->transferTime/rti12->n, rti12->kernelTime/rti12->n, rti12->cleanupTime/rti12->n, rti12->n); } if(nLocalNodeReqs > 0){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS construction local node reqs: %d, avg: %f\n", thisIndex, activeRung, nLocalNodeReqs, localNodeListConstructionTime/nLocalNodeReqs ); } if(nRemoteNodeReqs > 0){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS construction remote node reqs: %d, avg: %f\n", thisIndex, activeRung, nRemoteNodeReqs, remoteNodeListConstructionTime/nRemoteNodeReqs ); } if(nRemoteResumeNodeReqs > 0){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS construction remote resume node reqs: %d, avg: %f\n", thisIndex, activeRung, nRemoteResumeNodeReqs, remoteResumeNodeListConstructionTime/nRemoteResumeNodeReqs ); } if(nLocalPartReqs > 0){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS construction local part reqs: %d, avg: %f\n", thisIndex, activeRung, nLocalPartReqs, localPartListConstructionTime/nLocalPartReqs ); } if(nRemotePartReqs > 0){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS construction remote part reqs: %d, avg: %f\n", thisIndex, activeRung, nRemotePartReqs, remotePartListConstructionTime/nRemotePartReqs ); } if(nRemoteResumePartReqs > 0){ CkPrintf("[%d] (%d) CUDA_INSTRUMENT_WRS construction remote resume part reqs: %d, avg: %f\n", thisIndex, activeRung, nRemoteResumePartReqs, remoteResumePartListConstructionTime/nRemoteResumePartReqs ); } #endif #ifdef CUDA_STATS CkPrintf("[%d] (%d) CUDA_STATS localnode: %ld\n", thisIndex, activeRung, localNodeInteractions); CkPrintf("[%d] (%d) CUDA_STATS remotenode: %ld\n", thisIndex, activeRung, remoteNodeInteractions); CkPrintf("[%d] (%d) CUDA_STATS remoteresumenode: %ld\n", thisIndex, activeRung, remoteResumeNodeInteractions); CkPrintf("[%d] (%d) CUDA_STATS localpart: %ld\n", thisIndex, activeRung, localPartInteractions); CkPrintf("[%d] (%d) CUDA_STATS remotepart: %ld\n", thisIndex, activeRung, remotePartInteractions); CkPrintf("[%d] (%d) CUDA_STATS remoteresumepart: %ld\n", thisIndex, activeRung, remoteResumePartInteractions); #endif gravityProxy[thisIndex].ckLocal()->contribute(cbGravity); } #if INTERLIST_VER > 0 /// \brief get range of bucket numbers beneath a given TreeNode. 
/// \param source Given TreeNode /// \param start Index of first bucket (returned) /// \param end Index of last bucket (returned) void TreePiece::getBucketsBeneathBounds(GenericTreeNode *&source, int &start, int &end){ start = source->startBucket; end = start+(source->numBucketsBeneath); } // called when missed data is received void TreePiece::updateBucketState(int start, int end, int n, int chunk, State *state){ #if COSMO_PRINT_BK > 1 CkPrintf("[%d] data received book-keep\n", thisIndex); #endif for(int i = start; i < end; i++){ if(bucketList[i]->rungs >= activeRung){ state->counterArrays[0][i] -= n; #if COSMO_PRINT_BK > 1 CkPrintf("[%d] bucket %d numAddReq: %d,%d\n", thisIndex, i, sRemoteGravityState->counterArrays[0][i], sLocalGravityState->counterArrays[0][i]); #endif #if !defined CUDA // updatebucketstart is only called after we have finished // with received nodes/particles (i.e. after stateReady in // either case.) therefore, some work requests must already // have been issued before it was invoked, so there will // be a finishBucket call afterwards to ensure progress. finishBucket(i); #endif } } state->counterArrays[1][chunk] -= n; } // called on a miss void TreePiece::updateUnfinishedBucketState(int start, int end, int n, int chunk, State *state){ #if COSMO_PRINT_BK > 1 CkPrintf("[%d] data missed book-keep (%d)\n", thisIndex, chunk); #endif for(int i = start; i < end; i++){ if(bucketList[i]->rungs >= activeRung){ state->counterArrays[0][i] += n; #if COSMO_PRINT_BK > 1 CkPrintf("[%d] bucket %d numAddReq: %d,%d\n", thisIndex, i, sRemoteGravityState->counterArrays[0][i], sLocalGravityState->counterArrays[0][i]); #endif } } state->counterArrays[1][chunk] += n; } #endif // INTERLIST_VER /* create this chare's portion of the image We're lazy, so use the existing DumpFrame framework */ void TreePiece::liveVizDumpFrameInit(liveVizRequestMsg *msg) { savedLiveVizMsg = msg; if(thisIndex == 0) { ckerr << "Calling liveViz setup on main chare" << endl; mainChare.liveVizImagePrep(msg); } } /* * Utility to turn projections on or off * bOn == True => turn on. */ void TreePiece::setProjections(int bOn) { if(bOn) traceBegin(); else traceEnd(); } /* * Gather memory use statistics recorded during the tree walk */ void TreePiece::memCacheStats(const CkCallback &cb) { int memOut[4]; memOut[0] = memWithCache; memOut[1] = memPostCache; memOut[2] = nNodeCacheEntries; memOut[3] = nPartCacheEntries; contribute(4*sizeof(int), memOut, CkReduction::max_int, cb); } void TreePiece::balanceBeforeInitialForces(CkCallback &cb){ LDObjHandle handle = myRec->getLdHandle(); LBDatabase *lbdb = LBDatabaseObj(); int nlbs = lbdb->getNLoadBalancers(); if(nlbs == 0) { // no load balancers. 
/* Create this chare's portion of the image.
   We're lazy, so we use the existing DumpFrame framework. */
void TreePiece::liveVizDumpFrameInit(liveVizRequestMsg *msg) {
  savedLiveVizMsg = msg;

  if(thisIndex == 0) {
    ckerr << "Calling liveViz setup on main chare" << endl;
    mainChare.liveVizImagePrep(msg);
  }
}

/*
 * Utility to turn projections on or off
 * bOn == True => turn on.
 */
void TreePiece::setProjections(int bOn)
{
    if(bOn)
        traceBegin();
    else
        traceEnd();
}

/*
 * Gather memory use statistics recorded during the tree walk
 */
void TreePiece::memCacheStats(const CkCallback &cb)
{
    int memOut[4];
    memOut[0] = memWithCache;
    memOut[1] = memPostCache;
    memOut[2] = nNodeCacheEntries;
    memOut[3] = nPartCacheEntries;
    contribute(4*sizeof(int), memOut, CkReduction::max_int, cb);
}

void TreePiece::balanceBeforeInitialForces(CkCallback &cb){
  LDObjHandle handle = myRec->getLdHandle();
  LBDatabase *lbdb = LBDatabaseObj();
  int nlbs = lbdb->getNLoadBalancers();

  if(nlbs == 0) { // no load balancers; skip this
    contribute(cb);
    return;
  }

  Vector3D<float> centroid;
  centroid.x = 0.0;
  centroid.y = 0.0;
  centroid.z = 0.0;
  for(unsigned int i = 1; i <= myNumParticles; i++){
    centroid += myParticles[i].position;
  }
  if(myNumParticles > 0){
    centroid /= myNumParticles;
  }

  TaggedVector3D tv(centroid, handle, myNumParticles, myNumParticles, 0, 0);
  tv.tp = thisIndex;
  /*
  CkPrintf("[%d] centroid %f %f %f\n", thisIndex, tv.vec.x, tv.vec.y, tv.vec.z);
  */

  string msname("MultistepLB");
  string orb3dname("Orb3dLB");
  string ms_notoponame("MultistepLB_notopo");
  string msnode_notoponame("MultistepNodeLB_notopo");
  string orb3d_notoponame("Orb3dLB_notopo");
  string msorb_name("MultistepOrbLB");
  string hierarch_name("HierarchOrbLB");

  BaseLB **lbs = lbdb->getLoadBalancers();
  int i;
  if(foundLB == Null){
    for(i = 0; i < nlbs; i++){
      if(msname == string(lbs[i]->lbName())){
        proxy = lbs[i]->getGroupID();
        foundLB = Multistep;
        break;
      }
      else if(orb3dname == string(lbs[i]->lbName())){
        proxy = lbs[i]->getGroupID();
        foundLB = Orb3d;
        break;
      }
      else if(ms_notoponame == string(lbs[i]->lbName())){
        proxy = lbs[i]->getGroupID();
        foundLB = Multistep_notopo;
        break;
      }
      else if(msnode_notoponame == string(lbs[i]->lbName())){
        proxy = lbs[i]->getGroupID();
        foundLB = MultistepNode_notopo;
        break;
      }
      else if(msorb_name == string(lbs[i]->lbName())){
        proxy = lbs[i]->getGroupID();
        foundLB = MultistepOrb;
        break;
      }
      else if(orb3d_notoponame == string(lbs[i]->lbName())){
        proxy = lbs[i]->getGroupID();
        foundLB = Orb3d_notopo;
        break;
      }
      else if(hierarch_name == string(lbs[i]->lbName())) {
        proxy = lbs[i]->getGroupID();
        foundLB = HierarchOrb;
        break;
      }
    }
  }

  if (CkpvAccess(_lb_obj_index) != -1) {
    void *data = getObjUserData(CkpvAccess(_lb_obj_index));
    *(TaggedVector3D *)data = tv;
  }
  thisProxy[thisIndex].doAtSync();

  // this will be called in resumeFromSync()
  callback = cb;
}
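// Editor's sketch (illustration only, not original code): the if/else
// chain in balanceBeforeInitialForces() could be driven by a table.
// "KnownLB" is a hypothetical entry type, and "LBType" stands in for
// whatever enum foundLB actually has:
//
//   struct KnownLB { const char *name; LBType type; };
//   static const KnownLB kKnownLBs[] = {
//     {"MultistepLB",            Multistep},
//     {"Orb3dLB",                Orb3d},
//     {"MultistepLB_notopo",     Multistep_notopo},
//     {"MultistepNodeLB_notopo", MultistepNode_notopo},
//     {"MultistepOrbLB",         MultistepOrb},
//     {"Orb3dLB_notopo",         Orb3d_notopo},
//     {"HierarchOrbLB",          HierarchOrb},
//   };
//   for(int j = 0; j < nlbs && foundLB == Null; j++)
//     for(const KnownLB &k : kKnownLBs)
//       if(string(k.name) == lbs[j]->lbName()){
//         proxy = lbs[j]->getGroupID();
//         foundLB = k.type;
//         break;
//       }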
// Choose a piece from among the owners from which to request moments,
// in such a way that the requests are evenly distributed over the owners.
int TreePiece::getResponsibleIndex(int first, int last){
    int which = first + (thisIndex % (1 + last - first));
    if(verbosity > 3)
        CkPrintf("[tp %d] choosing responsible index %d from %d to %d\n",
                 thisIndex, dm->responsibleIndex[which], first, last);
    return dm->responsibleIndex[which];
}

std::map<NodeKey,NonLocalMomentsClientList>::iterator TreePiece::createTreeBuildMomentsEntry(GenericTreeNode *pickedNode){
  std::pair<std::map<NodeKey,NonLocalMomentsClientList>::iterator,bool> ret;
  ret = nonLocalMomentsClients.insert(
            make_pair(pickedNode->getKey(),
                      NonLocalMomentsClientList(pickedNode)));
  CkAssert(ret.second);
  return ret.first;
}

#ifdef CUDA
void TreePiece::clearMarkedBuckets(CkVec<GenericTreeNode *> &markedBuckets){
  int len = markedBuckets.length();
  for(int i = 0; i < len; i++){
    markedBuckets[i]->bucketArrayIndex = -1;
  }
}

void TreePiece::clearMarkedBucketsAll(){
  for(int i = 0; i < numBuckets; i++){
    bucketList[i]->bucketArrayIndex = -1;
  }
}
#endif

#ifdef REDUCTION_HELPER
ReductionHelper::ReductionHelper(){
}

ReductionHelper::ReductionHelper(CkMigrateMessage *m) : CBase_ReductionHelper(m){
}

void ReductionHelper::pup(PUP::er &p){
  CBase_ReductionHelper::pup(p);
}

void ReductionHelper::countTreePieces(const CkCallback &cb){
  // count the number of tree pieces on this PE
  senseLocalTreePieces();
  // no tree pieces can have called "reduce()" at this point
  numTreePiecesCheckedIn = 0;

  //CkPrintf("ReductionHelper %d counted %d pieces on PE\n", CkMyPe(), myNumTreePieces);
  contribute(cb);
}

void ReductionHelper::reduceBinCounts(int nBins, int64_t *binCounts, const CkCallback &cb){
  numTreePiecesCheckedIn++;
  //CkPrintf("ReductionHelper %d recvd %d/%d contributions\n", CkMyPe(), numTreePiecesCheckedIn, myNumTreePieces);

  if(numTreePiecesCheckedIn == 1){
    // resize bin counts vector
    myBinCounts.resize(nBins);
    // initialize counts to contribution to reduction from first tree piece
    memcpy(&myBinCounts[0], binCounts, nBins*sizeof(int64_t));
  }
  else{
    CkAssert(nBins == (int) myBinCounts.size());
    for(int i = 0; i < nBins; i++){
      myBinCounts[i] += binCounts[i];
    }
  }

  // is it time to contribute to the PE-wide reduction yet?
  if(numTreePiecesCheckedIn == (int) localTreePieces.presentTreePieces.size()){
    //CkPrintf("ReductionHelper %d contributing to PE-level reduction\n", CkMyPe());
    numTreePiecesCheckedIn = 0;
    contribute(sizeof(int64_t)*myBinCounts.size(), &myBinCounts[0], CkReduction::sum_long, cb);
  }
}

void TreePieceCounter::addLocation(CkLocation &loc){
  const int *indexData = loc.getIndex().data();
  TreePiece *tp = treeProxy[indexData[0]].ckLocal();
  presentTreePieces.push_back(tp);
}

void TreePieceCounter::reset() {
  presentTreePieces.resize(0);
}

void ReductionHelper::senseLocalTreePieces(){
  localTreePieces.reset();
  CkLocMgr *mgr = treeProxy.ckLocMgr();
  mgr->iterate(localTreePieces);
}
#endif
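// Editor's note (illustration only): ReductionHelper implements a
// two-stage reduction.  Stage 1 sums the histograms of all TreePieces
// living on this PE into myBinCounts; stage 2 contributes that per-PE
// sum to the ordinary Charm++ reduction, cutting the number of
// reduction contributions from one per TreePiece to one per PE.
// A hedged sketch of the expected calling sequence ("helperProxy" is a
// hypothetical name for the group proxy; the sequencing is inferred
// from the counters above):
//
//   helperProxy.countTreePieces(cbReady);   // once, before a histogram pass
//   // then, from every TreePiece on the PE:
//   helperProxy.ckLocalBranch()->reduceBinCounts(nBins, counts, cb);
//
// The last local contribution is what triggers the PE-level
// contribute() with CkReduction::sum_long.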