https://github.com/N-BodyShop/changa
Raw File
Tip revision: a36cde4621947f1a2d309a9e0b77aed35650e688 authored by Harshitha on 10 May 2014, 01:33:10 UTC
Intra node load balancing using CkLoop
Tip revision: a36cde4
ParallelGravity.h
/** @file ParallelGravity.h
 */

#ifndef PARALLELGRAVITY_H
#define PARALLELGRAVITY_H

#include "config.h"
#include <string>
#include <map>
#include <vector>
#include <algorithm>

#include "pup_stl.h"
#include "ckio.h"

#include "Vector3D.h"
#include "tree_xdr.h"
#include "TipsyFile.h"
#include "SFC.h"
#include "TreeNode.h"
#include "GenericTreeNode.h"
#include "Interval.h"
#include "parameters.h"
#include "param.h"
#include "dumpframe.h"
#include "liveViz.h"

#include "TaggedVector3D.h"

#include "codes.h"
#include "CacheInterface.h"

#ifdef SPCUDA
#include "EwaldCUDA.h"
#endif

#ifdef CUDA
#include "cuda_typedef.h"
#endif

#include "keytype.h"

PUPbytes(InDumpFrame);
PUPbytes(COOL);
PUPbytes(COOLPARAM);

#ifdef HPM_COUNTER
#include <libhpm.h>
#endif

#include <map>

#define MERGE_REMOTE_REQUESTS_VERBOSE /*CkPrintf*/

using namespace std;

using namespace Tree;

enum LBStrategy{
  Null=0,
  Multistep,
  Orb3d,
  Multistep_notopo,
  MultistepNode_notopo,
  Orb3d_notopo,
  MultistepOrb,
  HierarchOrb
};
PUPbytes(LBStrategy);

#ifdef SELECTIVE_TRACING
enum TraceState {
  TraceNormal = 0,
  TraceSkip
};
#endif

enum DomainsDec {
    SFC_dec=0,	// Space Filling Curve with Morton ordering
    Oct_dec=1, 	// Oct tree
    ORB_dec=2,	// Bisect the longest axis, balancing particles
    SFC_peano_dec=3,	// SFC with Peano-Hilbert ordering
    SFC_peano_dec_3D=4, // Joachim Stadel's implementation of P-H ordering
    SFC_peano_dec_2D=5,	// 2D version of Peano-Hilbert ordering
    ORB_space_dec=6		// Bisect space
};

/// tolerance for unequal pieces in SFC based decompositions.
const double ddTolerance = 0.1;

inline void operator|(PUP::er &p,DomainsDec &d) {
  int di;
  if (p.isUnpacking()) {
    p | di;
    d = (DomainsDec)di;
  } else {
    di = (int)d;
    p | di;
  }
}

#include "GravityParticle.h"

class SmoothParams;

#include "InOutput.h"

#include "ParallelGravity.decl.h"

extern CProxy_Main mainChare;
extern int verbosity;
extern bool _cache;
extern int _nocache;
extern int _cacheLineDepth;
extern unsigned int _yieldPeriod;
extern DomainsDec domainDecomposition;
extern double dExtraStore;
extern double dMaxBalance;
extern int bUseCkLoopPar;
extern GenericTrees useTree;
extern CProxy_TreePiece treeProxy;
#ifdef REDUCTION_HELPER
extern CProxy_ReductionHelper reductionHelperProxy;
#endif
extern CProxy_LvArray lvProxy;	    // Proxy for the liveViz array
extern CProxy_LvArray smoothProxy;  // Proxy for smooth reduction
extern CProxy_LvArray gravityProxy; // Proxy for gravity reduction
extern CProxy_TreePiece streamingProxy;
extern CProxy_DataManager dMProxy;
extern CProxy_IntraNodeLBManager nodeLBMgrProxy;
extern unsigned int numTreePieces;
extern unsigned int particlesPerChare;
extern int nIOProcessor;

extern CProxy_DumpFrameData dfDataProxy;
extern CProxy_PETreeMerger peTreeMergerProxy;
extern CProxy_CkCacheManager<KeyType> cacheGravPart;
extern CProxy_CkCacheManager<KeyType> cacheSmoothPart;
extern CProxy_CkCacheManager<KeyType> cacheNode;

/// The group ID of your DataManager.  You must set this!
extern CkGroupID dataManagerID;

extern int boundaryEvaluationUE;
extern int weightBalanceUE;
extern int networkProgressUE;
extern int nodeForceUE;
extern int partForceUE;

extern int tbRecursiveUE;
extern int tbFlushRequestsUE;
extern int prefetchDoneUE;

extern int _prefetch;
extern int _randChunks;
extern int _numChunks;
extern unsigned int bucketSize;
extern int lbcomm_cutoff_msgs;

//jetley
extern int localNodesPerReq;
extern int remoteNodesPerReq;
extern int remoteResumeNodesPerReq;
extern int localPartsPerReq;
extern int remotePartsPerReq;
extern int remoteResumePartsPerReq;

extern double largePhaseThreshold;

extern double theta;
extern double thetaMono;

extern int numInitDecompBins;
extern int octRefineLevel;

class dummyMsg : public CMessage_dummyMsg{
public:
int val;
/*#if INTERLIST_VER > 0
int level;
GenericTreeNode* startNode;
#endif*/
};

class TreePieceStatistics {
  u_int64_t nodesOpenedLocal;
  u_int64_t nodesOpenedRemote;
  u_int64_t nodeInterLocal;
  u_int64_t nodeInterRemote;
  u_int64_t particleInterLocal;
  u_int64_t particleInterRemote;
  u_int64_t openCriterionCalls;
  int nActive;

  TreePieceStatistics() : nodesOpenedLocal(0), nodesOpenedRemote(0),
      nodeInterLocal(0), nodeInterRemote(0), particleInterLocal(0),
      particleInterRemote(0), openCriterionCalls(0), nActive(0) { }

 public:
  TreePieceStatistics(u_int64_t nol, u_int64_t nor, u_int64_t occ, u_int64_t nil, u_int64_t nir,
		      u_int64_t pil, u_int64_t pir, int na) :
      nodesOpenedLocal(nol), nodesOpenedRemote(nor), nodeInterLocal(nil),
      nodeInterRemote(nir), particleInterLocal(pil), particleInterRemote(pir),
      openCriterionCalls(occ), nActive(na) { }

  void printTo(CkOStream &os) {
    os << "  TreePiece: " << nActive << " particles active." << endl;
    os << "  TreePiece: " << nodesOpenedLocal << " local nodes opened, ";
    os << nodesOpenedRemote << " remote" << endl;
    os << "  TreePiece: " << openCriterionCalls << " num of open criterion calls" << endl;
    os << "  TreePiece: " << nodeInterLocal << " local particle-node interactions, ";
    os << nodeInterRemote << " remote" << endl;
    os << "  TreePiece: " << particleInterLocal << " local particle-particle interactions, ";
    os << particleInterRemote << " remote" << endl;
    os << "  TreePiece: "
       << (particleInterLocal + particleInterRemote)/(double) nActive
       << " particles, "
       << (nodeInterLocal + nodeInterRemote)/(double) nActive
       << " nodes per particle" << endl;
  }

  static CkReduction::reducerType sum;

  static CkReductionMsg *sumFn(int nMsg, CkReductionMsg **msgs) {
    TreePieceStatistics ret;
    for (int i=0; i<nMsg; ++i) {
      CkAssert(msgs[i]->getSize() == sizeof(TreePieceStatistics));
      TreePieceStatistics *data = (TreePieceStatistics *)msgs[i]->getData();
      ret.nodesOpenedLocal += data->nodesOpenedLocal;
      ret.nodesOpenedRemote += data->nodesOpenedRemote;
      ret.openCriterionCalls += data->openCriterionCalls;
      ret.nodeInterLocal += data->nodeInterLocal;
      ret.nodeInterRemote += data->nodeInterRemote;
      ret.particleInterLocal += data->particleInterLocal;
      ret.particleInterRemote += data->particleInterRemote;
      ret.nActive += data->nActive;
    }
    return CkReductionMsg::buildNew(sizeof(TreePieceStatistics), &ret);
  }
};

class ComputeChunkMsg : public CMessage_ComputeChunkMsg {
  ComputeChunkMsg() {} // not available
 public:
  int chunkNum;

  ComputeChunkMsg(int i) : chunkNum(i) {
  }
};

/// @brief Message for evaluating splits for the ORB domain decomposition
///
/// This message contains the splitting dimensions and values for an
/// ORB tree.  The size arrays pos and dim are set by arguments to
/// the new() operator, but should be equal to length.
class ORBSplittersMsg : public CMessage_ORBSplittersMsg{
public:
  /// Number of splits
  int length;
  /// Positions of splits
  double *pos;
  /// Dimension of splits
  char *dim;
  /// Callback for reduction of particle counts
  CkCallback cb;

  ORBSplittersMsg(int len, CkCallback callback): length (len), cb(callback) {}

};

/// Message for shuffling particles during domain decomposition
class ParticleShuffleMsg : public CMessage_ParticleShuffleMsg{
public:
    int nloads;
    int n;
    int nSPH;
    int nStar;
    double load;
    double *loads;
    unsigned int *parts_per_phase;
    GravityParticle *particles;
    extraSPHData *pGas;
    extraStarData *pStar;
    ParticleShuffleMsg(int nload, int npart, int nsph, int nstar, double pload): 
      nloads(nload), n(npart), nSPH(nsph), nStar(nstar), load(pload) {}
};

#ifdef PUSH_GRAVITY
#include "ckmulticast.h"

struct BucketMsg : public CkMcastBaseMsg, public CMessage_BucketMsg {
  GenericTreeNode *buckets;
  int numBuckets;
  ExternalGravityParticle *particles;
  int numParticles;
  int whichTreePiece;
};
#endif
    
/// Class to count added and deleted particles
class CountSetPart 
{
 public:
    int index;			/* chare index */
    int nAddGas;
    int nDelGas;
    int nAddDark;
    int nDelDark;
    int nAddStar;
    int nDelStar;

    void pup(PUP::er& p) {
	p | index;
	p | nAddGas;
	p | nDelGas;
	p | nAddDark;
	p | nDelDark;
	p | nAddStar;
	p | nDelStar;
	}
    };

class Compare{ //Defines the comparison operator on the map used in balancer
  int dim;
public:
  Compare() {}
  Compare(int i) : dim(i) {}

  void setDim(int i){ dim = i; }

  bool operator()(GravityParticle& p1, GravityParticle& p2) const {
    return p1.position[dim] < p2.position[dim];
  }
};


/*
 * Multistepping routines
 *
 * Each major timestep can have MAXSUBSTEPS substeps where MAXSUBSTEPS
 * is a large power of 2 that can fit in an integer: 1 << MAXRUNG,
 * with MAXRUNG something like 30.
 * A given particle is on a "Rung" such that it takes 1 << Rung
 * substeps per major timestep.  That is, it's force is updated every
 * 1 << (MAXRUNG - Rung) smallest substeps.
 *
 * Needed routines:
 * DtToRung(): take an ideal timestep, the major timestep and
 * determine a rung.
 */

const int MAXRUNG = 30;
const int MAXSUBSTEPS = 1 << MAXRUNG;
const double MAXSUBSTEPS_INV = 1 / (double)MAXSUBSTEPS;

inline int RungToSubsteps(int iRung) {
  return 1 << (MAXRUNG - iRung);
}

inline int DtToRung(double dDelta, double dTideal) {
  int iSteps = (int) ceil(dDelta/dTideal);

  int iRung = 0;
  iSteps--;
  while(iSteps > 0) {
    iRung++;
    iSteps >>= 1;
  }
  return iRung;
}

inline double RungToDt(double dDelta, int iRung) {
  return dDelta*RungToSubsteps(iRung)*MAXSUBSTEPS_INV;
}

/// @brief slot in MultistepLB to hold feedback phase load information
const int PHASE_FEEDBACK = MAXRUNG + 1;

/// @brief Overall flow control of the simulation.
///
/// As well as controlling the overall flow of the simulation, the
/// constructors are the main entry points into the program.
/// The sequence of tasks is: read the simulation parameters (Main()),
/// read in the initial conditions (setupICs()), calculate the initial
/// forces (initialForces()), then iterate across timesteps and write
/// the final output (doSimulation()).
///
class Main : public CBase_Main {
	CkArgMsg *args;
	std::string basefilename;
        /// Save parameters for output
        OutputParams *pOutput;
	/// globally finished IO
	CkCallback cbIO;
        /// Save file token for CkIO
        Ck::IO::File fIOFile;
	CProxy_Sorter sorter;
	int64_t nTotalParticles;
	int64_t nTotalSPH;
	int64_t nTotalDark;
	int64_t nTotalStar;
	int64_t nMaxOrderGas;  /* Maximum iOrders */
	int64_t nMaxOrderDark;
	int64_t nMaxOrder;
	
	double dTime;		/* Simulation time */
	double dEcosmo;		/* variables for integrating
				   Lazer-Irvine eq. */
	double dUOld;
	double dTimeOld;
	unsigned int printBinaryAcc;
	PRM prm;		/* parameter parsing info */
	Parameters param; /* actual parameters */
	CkVec<double> vdOutTime; // Desired output times
	int iOut;
	/*
	 ** Tracking for frame dumping function
	 */
	int bDumpFrame;
	struct DumpFrameContext **df;
	int bIsRestarting;
	int bChkFirst;		/* alternate between 0 and 1 for checkpoint */
	double dSimStartTime;   // Start time for entire simulation
	int iStop;		/* indicate we're stopping the
				   simulation early */
	int64_t nActiveGrav;
	int64_t nActiveSPH;

#ifdef CUDA
          double localNodesPerReqDouble;
          double remoteNodesPerReqDouble;
          double remoteResumeNodesPerReqDouble;
          double localPartsPerReqDouble;
          double remotePartsPerReqDouble;
          double remoteResumePartsPerReqDouble;
#endif

#ifdef CHECK_TIME_WITHIN_BIGSTEP
       double wallTimeStart;
#endif

#ifdef SELECTIVE_TRACING
       int monitorRung;
       int monitorStart;
       int numTraceIterations;
       int numSkipIterations;
       int numMaxTrace;
       int traceIteration;
       int traceState;
       bool projectionsOn;

       void turnProjectionsOn(int activeRung);
       void turnProjectionsOff();
#endif


public:

	Main(CkArgMsg* m);
	Main(CkMigrateMessage *m);

	void niceExit();
	void setupICs();
	void initialForces();
	void doSimulation();
	void restart();
	void waitForGravity(const CkCallback &cb, double startTime);
        void advanceBigStep(int);
	int adjust(int iKickRung);
	void rungStats();
	void countActive(int activeRung);
	void calcEnergy(double, double, const char *);
	void getStartTime();
	void getOutTimes();
	int bOutTime();
	void writeOutput(int iStep) ;
        void outputBinary(OutputParams& params, int bParaWrite,
                          const CkCallback& cb);
        void cbOpen(Ck::IO::FileReadyMsg *msg);
        void cbIOReady(Ck::IO::SessionReadyMsg *msg);
        void cbIOComplete(CkReductionMsg *msg);
        void cbIOClosed(CkReductionMsg *msg);
        std::string getNCNextOutput(OutputParams& params);
	void updateSoft();
	void growMass(double dTime, double dDelta);
	void initSph();
	void initCooling();
	void initStarLog();
	int ReadASCII(char *extension, int nDataPerLine, double *dDataOut);
        void restartGas();
	void doSph(int activeRung, int bNeedDensity = 1);
	void FormStars(double dTime, double dDelta);
	void StellarFeedback(double dTime, double dDelta);
	int DumpFrameInit(double dTime, double dStep, int bRestart);
	void DumpFrame(double dTime, double dStep);
	int nextMaxRungIncDF(int nextMaxRung);
	void addDelParticles();
	void memoryStats();
	void memoryStatsCache();
	void pup(PUP::er& p);
	void liveVizImagePrep(liveVizRequestMsg *msg);
};

/* IBM brain damage */
#undef hz
typedef struct ewaldTable {
  double hx,hy,hz;
  double hCfac,hSfac;
} EWT;

// jetley
class MissRecord;
class State;

///Remote Cell interaction lists for all tree levels
typedef struct OffsetNodeStruct
{
      GenericTreeNode *node;
      int offsetID;
}OffsetNode;

#if INTERLIST_VER > 0
typedef CkQ<OffsetNode> CheckList;
typedef CkVec<OffsetNode> UndecidedList;
typedef CkVec<UndecidedList> UndecidedLists;
#endif

typedef struct particlesInfoR{
    ExternalGravityParticle* particles;
    int numParticles;
    Vector3D<double> offset;
#if defined CHANGA_REFACTOR_PRINT_INTERACTIONS || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST || defined CUDA
    NodeKey key;
#endif
#if COSMO_DEBUG > 1
    GenericTreeNode *nd;
#endif
} RemotePartInfo;

 ///Local Particle Info structure
typedef struct particlesInfoL{
    GravityParticle* particles;
    int numParticles;
    Vector3D<double> offset;
#if defined CHANGA_REFACTOR_PRINT_INTERACTIONS || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST || defined CUDA
    NodeKey key;
#endif
#if COSMO_DEBUG > 1 || defined CUDA
    GenericTreeNode *nd;
#endif
} LocalPartInfo;

typedef struct LoopParDataStruct {
  CkVec<GenericTreeNode*> lowNodes;
  CkVec<int> bucketids;
  CkVec<int> chunkids;
  CkVec<CkVec<OffsetNode> > clists;
  CkVec<CkVec<RemotePartInfo> > rpilists;
  CkVec<CkVec<LocalPartInfo> > lpilists;
  TreePiece* tp;
} LoopParData;



#ifdef CUDA
struct BucketActiveInfo{
  int start;
  int size;
};
#endif

class SmoothCompute;
#include "Compute.h"

#if INTERLIST_VER > 0 && defined CUDA
template<typename T> class GenericList;
#endif

/// @brief client that has requested a moment.
struct NonLocalMomentsClient {
  TreePiece *clientTreePiece;
  GenericTreeNode *clientNode;

  NonLocalMomentsClient() :
    clientTreePiece(NULL),
    clientNode(NULL)
  {}

  NonLocalMomentsClient(TreePiece *tp, GenericTreeNode *node) : 
    clientTreePiece(tp),
    clientNode(node)
  {}
};

/// @brief List of clients needing a particular moment
struct NonLocalMomentsClientList {
  GenericTreeNode *targetNode;
  CkVec<NonLocalMomentsClient> clients;

  NonLocalMomentsClientList() : 
    targetNode(NULL)
  {}

  NonLocalMomentsClientList(GenericTreeNode *node) :
    targetNode(node)
  {}

  void addClient(const NonLocalMomentsClient &cli){
    clients.push_back(cli);
  }
};

/// Fundamental structure that holds particle and tree data.
class TreePiece : public CBase_TreePiece {
   // jetley
   friend class PrefetchCompute;
   friend class GravityCompute;
   friend class SmoothCompute;
   friend class KNearestSmoothCompute;
   friend class ReSmoothCompute;
   friend class MarkSmoothCompute;
   friend class ListCompute;
   friend class NearNeighborState;
   friend class ReNearNeighborState;
   friend class MarkNeighborState;
   friend class BottomUpTreeWalk;
#if INTERLIST_VER > 0 && defined CUDA
   friend class DataManager;
   template<typename T> friend class GenericList;
#endif

   friend class RemoteTreeBuilder; 
   friend class LocalTreeBuilder; 

   /// @brief Walk for gravity prefetch
   TreeWalk *sTopDown;
   TreeWalk *twSmooth;
#if INTERLIST_VER > 0
   TreeWalk *sInterListWalk;
   // clearable, used for resumed walks
   State *sInterListStateRemoteResume;
#endif
   Compute *sGravity, *sPrefetch;
   SmoothCompute *sSmooth;
   
   Opt *sLocal, *sRemote, *sPref;
   Opt *optSmooth;

   State *sPrefetchState;
   /// Keeps track of the gravity walks over the local tree.
   State *sLocalGravityState, *sRemoteGravityState, *sSmoothState;
   typedef std::map<KeyType, CkVec<int>* > SmPartRequestType;
   // buffer of requests for smoothParticles.
   SmPartRequestType smPartRequests;

   CkVec<ActiveWalk> activeWalks;
   int completedActiveWalks; // XXX this should be part of the gravity
			     // walk state.
   int nCacheAccesses; // keep track of outstanding cache accesses to
		       // know when writebacks complete.  XXX this
		       // should be part of the smooth state
   
   double treePieceLoad; // used to store CPU load data for incoming particles
   double treePieceLoadTmp; // temporary accumulator for above
   double treePieceLoadExp;
   unsigned int treePieceActivePartsTmp;
   std::vector<double> savedPhaseLoad;
   std::vector<unsigned int> savedPhaseParticle;
   std::vector<double> savedPhaseLoadTmp;
   std::vector<unsigned int> savedPhaseParticleTmp;

   int memWithCache, memPostCache;  // store memory usage.
   int nNodeCacheEntries, nPartCacheEntries;  // store memory usage.

#ifdef PUSH_GRAVITY
   bool doMerge;
   bool createdSpanningTree;
   CProxySection_TreePiece allTreePieceSection;
   CkVec<GravityParticle> foreignParticles;
   CkVec<double> foreignParticleAccelerations;

   map<int,CkSectionInfo> cookieJar;
  
   BucketMsg *createBucketMsg();
   void unpackBuckets(BucketMsg *, GenericTreeNode *&foreignBuckets, int &numForeignBuckets);
   void calculateForces(GenericTreeNode *foreignBuckets, int numForeignBuckets);

#endif

 public:

#ifdef PUSH_GRAVITY
  void startPushGravity(int am, double myTheta);
  void recvPushBuckets(BucketMsg *);
  void recvPushAccelerations(CkReductionMsg *);
#endif

#if COSMO_PRINT_BK > 1
  State *getSRemoteGravityState(){ return sRemoteGravityState; }
  State *getSLocalGravityState(){ return sLocalGravityState; }
#endif
  void memCacheStats(const CkCallback &cb);
  void addActiveWalk(int iAwi, TreeWalk *tw, Compute *c, Opt *o, State *s);

  void markWalkDone();
  void finishWalk();
  void markSmoothWalkDone();
  void finishSmoothWalk();

  int getIndex() {
    return thisIndex;
  }

  int getLocalIndex(){
    return localIndex;
  }

  /// @brief accumulate node interaction count for statistics
  void addToNodeInterRemote(int chunk, int howmany){
    nodeInterRemote[chunk] += howmany;
  }

  /// @brief accumulate particle interaction count for statistics
  void addToParticleInterRemote(int chunk, int howmany){
    particleInterRemote[chunk] += howmany;
  }

  /// @brief accumulate node interaction count for statistics
  void addToNodeInterLocal(int howmany){
    nodeInterLocal += howmany;
  }

  /// @brief accumulate particle interaction count for statistics
  void addToParticleInterLocal(int howmany){
    particleInterLocal += howmany;
  }

  /// Start prefetching the specfied chunk; prefetch compute
  /// calls startRemoteChunk() once chunk prefetch is complete
  void initiatePrefetch(int chunk);
  /// Start a new remote computation upon prefetch finished
  void startRemoteChunk();

  int getNumParticles(){
    return myNumParticles;
  }

  GravityParticle *getParticles(){return myParticles;}


#ifdef CUDA
        // this variable holds the number of buckets active at
        // the start of an iteration
        // it is used to ascertain how many buckets still need to 
        // be processed via the stateReady function with regard
        // to their local and remote-no-resume walks. 
        // if all numActiveBuckets
        // have been processed but we still have leftover nodes/particles
        // in the list of interations to the sent to the gpu, we flush
        // the list
        int numActiveBuckets; 
        int myNumActiveParticles;
        BucketActiveInfo *bucketActiveInfo;

        int getNumBuckets(){
        	return numBuckets;
        }

        void callFreeRemoteChunkMemory(int chunk);

        int getActiveRung(){ return activeRung; }
#ifdef CUDA_INSTRUMENT_WRS
        int getInstrumentId(){ return instrumentId; }
#endif
        // returns either all particles or only active particles,
        // depending on fraction of active particles to their
        // total count.
        int getDMNumParticles(){
          if(largePhase()){
            return myNumParticles;
          }
          else{
            return myNumActiveParticles; 
          }
        }

        int getNumActiveParticles(){
          return myNumActiveParticles;
        }

        void calculateNumActiveParticles(){ 
          myNumActiveParticles = 0;
          for(int i = 1; i <= myNumParticles; i++){
            if(myParticles[i].rung >= activeRung){
              myNumActiveParticles++;
            }
          }
        }

        bool largePhase(){
          return (1.0*myNumActiveParticles/myNumParticles) >= largePhaseThreshold;
        }

        void getDMParticles(CompactPartData *fillArray, int &fillIndex){
          if(largePhase()){
            for(int b = 0; b < numBuckets; b++){
              GenericTreeNode *bucket = bucketList[b];
              int buckstart = bucket->firstParticle;
              int buckend = bucket->lastParticle;
              GravityParticle *buckparts = bucket->particlePointer;
              bucket->bucketArrayIndex = fillIndex;
              for(int i = buckstart; i <= buckend; i++){
                fillArray[fillIndex] = buckparts[i-buckstart];
#if defined CUDA_EMU_KERNEL_NODE_PRINTS || defined CUDA_EMU_KERNEL_PART_PRINTS
                fillArray[fillIndex].tp = thisIndex;
                fillArray[fillIndex].id = i;
#endif
                fillIndex++;
              }
            }
          }
          else{
            for(int b = 0; b < numBuckets; b++){
              GenericTreeNode *bucket = bucketList[b];
              if(bucket->rungs < activeRung){
                continue;
              }
              BucketActiveInfo *binfo = &(bucketActiveInfo[b]);
              
              int buckstart = bucket->firstParticle;
              int buckend = bucket->lastParticle;
              GravityParticle *buckparts = bucket->particlePointer;

              binfo->start = fillIndex;
              for(int i = buckstart; i <= buckend; i++){
                if(buckparts[i-buckstart].rung >= activeRung){
                  fillArray[fillIndex] = buckparts[i-buckstart];
#if defined CUDA_EMU_KERNEL_NODE_PRINTS || defined CUDA_EMU_KERNEL_PART_PRINTS
                  fillArray[fillIndex].tp = thisIndex;
                  fillArray[fillIndex].id = i;
#endif
                  fillIndex++;
                }
              }
              binfo->size = fillIndex-binfo->start;
            }
          }
        }

        bool isActive(int partNum){
          return myParticles[partNum].rung >= activeRung;
        }

        void clearMarkedBuckets(CkVec<GenericTreeNode *> &markedBuckets);
        void clearMarkedBucketsAll();

#ifdef CUDA_STATS
        long long localNodeInteractions;
        long long localPartInteractions;
        long long remoteNodeInteractions;
        long long remotePartInteractions;
        long long remoteResumeNodeInteractions;
        long long remoteResumePartInteractions;
#endif

#ifdef CUDA_INSTRUMENT_WRS
        int instrumentId;

        double localNodeListConstructionTime;
        double remoteNodeListConstructionTime;
        double remoteResumeNodeListConstructionTime;
        double localPartListConstructionTime;
        double remotePartListConstructionTime;
        double remoteResumePartListConstructionTime;
        
        int nLocalNodeReqs;
        int nRemoteNodeReqs;
        int nRemoteResumeNodeReqs;
        int nLocalPartReqs;
        int nRemotePartReqs;
        int nRemoteResumePartReqs;

#endif

#endif

        void continueStartRemoteChunk(int chunk);
        void continueWrapUp();

#if INTERLIST_VER > 0
        void getBucketsBeneathBounds(GenericTreeNode *&node, int &start, int &end);
        void updateBucketState(int start, int end, int n, int chunk, State *state);
        void updateUnfinishedBucketState(int start, int end, int n, int chunk, State *state);
#endif

#if defined CHANGA_REFACTOR_WALKCHECK || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST
        void addToBucketChecklist(int bucketIndex, NodeKey k){
          bucketcheckList[bucketIndex].insert(k);
          if(bucketIndex == TEST_BUCKET && thisIndex == TEST_TP)
            CkPrintf("[%d] add %ld\n", thisIndex, k);
        }
#endif

#if INTERLIST_VER > 0
        GenericTreeNode *getStartAncestor(int current, int previous, GenericTreeNode *dflt);
#endif
	/// \brief convert a key to a node using the nodeLookupTable
	inline GenericTreeNode *keyToNode(const Tree::NodeKey k){
          NodeLookupType::iterator iter = nodeLookupTable.find(k);
          if (iter != nodeLookupTable.end()) return iter->second;
          else return NULL;
        }

        GenericTreeNode *getBucket(int i){
          return bucketList[i];
        }

	/// Time read in from input file
	double dStartTime;

private:        
	// liveViz 
	liveVizRequestMsg * savedLiveVizMsg;

        // jetley - proxy for load balancer
        CkGroupID proxy;
        LBStrategy foundLB;
        // jetley - whether proxy is valid or not
        bool proxyValid;
        // jetley - saved first internal node
        Vector3D<float> savedCentroid;
        bool proxySet;
        // jetley - multistep load balancing
        int prevLARung;

	/// @brief Used to inform the mainchare that the requested operation has
	/// globally finished
	CkCallback callback;
	/// gravity globally finished
	CkCallback cbGravity;
	/// smooth globally finished
	CkCallback cbSmooth;
  CkCallback after_dd_callback;
	/// Total number of particles contained in this chare
	unsigned int myNumParticles;
	/// Array with the particles in this chare
	GravityParticle* myParticles;
	/// Actual storage in the above array
	int nStore;

  // Temporary location to hold the particles that have come from outside this
  // TreePiece. This is used in the case where we migrate the particles and
  // detect completion of migration using QD.
	std::vector<GravityParticle> myTmpShuffleParticle;
	std::vector<extraSPHData> myTmpShuffleSphParticle;
	std::vector<extraStarData> myTmpShuffleStarParticle;
  ParticleShuffleMsg* myShuffleMsg;
	/// Number of particles in my tree.  Can be different from
	/// myNumParticles when particles are created.
	int myTreeParticles;
 public:
	/// Total Particles in the simulation
	int64_t nTotalParticles;
	/// Total Gas Particles
	int64_t nTotalSPH;
	/// Total Dark Particles
	int64_t nTotalDark;
	/// Total Star Particles
	int64_t nTotalStar;
 private:
	/// Total gas in this chare
	unsigned int myNumSPH;
	/// Array with SPH particle data
	extraSPHData *mySPHParticles;
	/// Actual storage in the above array
	int nStoreSPH;
	/// Total stars in this chare
	unsigned int myNumStar;
	/// Array with star particle data
	extraStarData *myStarParticles;
	/// Actual storage in the above array
	int nStoreStar;
	/// MaxIOrder for output
	int64_t nMaxOrder;
        /// Start particle for reading
        int64_t nStartRead;
	/// particle count for output
	int myIOParticles;

	/// List of all the node-buckets in this TreePiece
	std::vector<GenericTreeNode *> bucketList;

	/// Array with sorted particles for domain decomposition (ORB)
	std::vector<GravityParticle> mySortedParticles;
	/// Array with sorted SPH data for domain decomposition (ORB)
	std::vector<extraSPHData> mySortedParticlesSPH;
	/// Array with sorted Star data for domain decomposition (ORB)
	std::vector<extraStarData> mySortedParticlesStar;
        /// Array with incoming particles messages for domain decomposition
	std::vector<ParticleShuffleMsg*> incomingParticlesMsg;
        /// How many particles have already arrived during domain decomposition
        int incomingParticlesArrived;
        /// Flag to acknowledge that the current TreePiece has already
        /// contributed to itself, prevents that acceptSortedParticles will
        /// change the TreePiece particles before the one belonging to someone
        /// else have been sent out
        bool incomingParticlesSelf;

	/// holds the total mass of the current TreePiece
	double piecemass;

	/// used to determine which coordinate we are outputting, while printing
	/// accelerations in ASCII format
	int cnt;
	/// used to determine if the x, y, z coordinates should be printed
	/// together while outputting accelerations in ASCII format
	int packed;

          // the index assigned by the CacheManager upon registration
          int localIndex;

	//unsigned int numSplitters;
        //SFC::Key* splitters;
	CProxy_TreePiece pieces;
	/// A proxy to the DataManager.
	//CProxy_DataManager dataManager; // unused...
	/// A local pointer to my DataManager.
	DataManager* dm;

	/// The counts of how many particles belonging to other
	/// TreePieces I currently hold
#ifndef REDUCTION_HELPER
	CkVec<int64_t> myBinCounts;
#endif
	std::vector<int> myBinCountsORB;
	/// My index in the responsibility array.
	int myPlace;
	/// The keys determined by the Sorter that separate me from my
	/// neighbors.
	SFC::Key leftSplitter, rightSplitter;

	std::string basefilename;
	// Bounding box of the entire simulation
	OrientedBox<float> boundingBox;
	unsigned iterationNo;
	/// The root of the global tree, always local to any chare
	GenericTreeNode* root;
	/// pool of memory to hold TreeNodes: makes allocation more efficient.
	NodePool *pTreeNodes;

	typedef std::map<NodeKey, CkVec<int>* >   MomentRequestType;
	/// Keep track of the requests for remote moments not yet satisfied.
	/// Used only during the tree construction.  This is a map
	/// from NodeKey to a vector of treepieces that have requested it.
	MomentRequestType momentRequests;

	/// Opening angle
	//double theta; -- moved as readonly
	/// Opening angle - monopole
	//double thetaMono; -- moved as readonly
        /// The current active mask for force computation in multistepping
        int activeRung;

	/// Periodic Boundary stuff
	int bPeriodic;
	Vector3D<double> fPeriod;
	int nReplicas;
	int bEwald;		/* Perform Ewald */
	double fEwCut;
	double dEwhCut;
	EWT *ewt;
	int nMaxEwhLoop;
	int nEwhLoop;
#ifdef HEXADECAPOLE
	MOMC momcRoot;		/* complete moments of root */
#endif

	int bGasCooling;
#ifndef COOLING_NONE
	clDerivsData *CoolData;
#endif
	/// Setup for writing
	int nSetupWriteStage;
	int64_t nStartWrite;	// Particle number at which this piece starts
				// to write file.

	/// Map between Keys and TreeNodes, used to get a node from a key
	NodeLookupType nodeLookupTable;

	/// Number of nodes still missing before starting the real computation
	//u_int64_t prefetchWaiting;
	/// Array of keys that will be the root of the prefetching chunks
	Tree::NodeKey *prefetchRoots;
	/// Placeholder for particles used for prefetching
	OrientedBox<double> prefetchReq[2];
	unsigned int numPrefetchReq;

	/// number of chunks in which the tree will be chopped for prefetching
	int numChunks;

#if COSMO_STATS > 0
	u_int64_t myNumMACChecks;
	u_int64_t nodesOpenedLocal;
	u_int64_t nodesOpenedRemote;
	u_int64_t numOpenCriterionCalls;
#endif
	/// node interaction count for statistics
	u_int64_t nodeInterLocal;
	/// node interaction count for statistics
	u_int64_t *nodeInterRemote;
	/// particle interaction count for statistics
	u_int64_t particleInterLocal;
	/// particle interaction count for statistics
	u_int64_t *particleInterRemote;

	int nActive;		// number of particles that are active

	/// Size of bucketList, total number of buckets present
	unsigned int numBuckets;
#if INTERLIST_VER > 0
	/// Completed buckets, remote gravity walk
	int prevRemoteBucket;
	/// Completed buckets, local gravity walk
	int prevBucket;
#endif
	/// Used to start the Ewald computation for all buckets, one after the other
	unsigned int ewaldCurrentBucket;
	/// @brief Used as a placeholder while traversing the tree and computing
	/// forces. This should not exist in the cache version since all its
	/// information is duplicate from the correspondent TreeNode (in
	/// bucketList), and myParticles.
	/// @todo Eliminate the usage of this in the cache walk
	BucketGravityRequest *bucketReqs;

	/// Pointer to the instance of the local cache
        //CacheManager *localCache;

	// Entries used for the CacheManager
	EntryTypeGravityParticle gravityParticleEntry;
	EntryTypeSmoothParticle smoothParticleEntry;
	EntryTypeGravityNode gravityNodeEntry;

#if COSMO_DEBUG > 1 || defined CHANGA_REFACTOR_WALKCHECK || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST
  ///A set for each bucket to check the correctness of the treewalk
  typedef std::vector< std::multiset<Tree::NodeKey> > DebugList;
  DebugList bucketcheckList;
#endif

  ///Variables added for ORB decomposition

  ///Used to denote the first call to evaluateParticleCounts within each phase
  bool firstTime;

  ///Temporary variable for the particle counts in each ORB box
  std::vector<int> tempBinCounts;

  ///Particles defining boundaries for ORB boxes
  std::list<GravityParticle *> orbBoundaries;

  ///Expected number of particles for each TreePiece after ORB decomposition
  int myExpectedCount;
  ///Expected number of SPH particles for each TreePiece after ORB
  ///decomposition
  int myExpectedCountSPH;
  ///Expected number of Star particles for each TreePiece after ORB
  ///decomposition
  int myExpectedCountStar;

  ///Level after which the local subtrees of all the TreePieces start
  unsigned int chunkRootLevel;

  ///Keeps track of the bounding boxes and dimensions along which they are splitted, starting from
  ///root to chunkRootLevel
  OrientedBox<float>* boxes;
  char* splitDims;

  ///Phase of ORB decomposition: the number of boxes double in each phase till they are equal to the number
  ///of TreePieces
  int phase;

  double myTotalMass;

 #if INTERLIST_VER > 0


 public:
#endif
  // called when a chunk has been used completely (chunkRemaining[chunk] == 0)
  void finishedChunk(int chunk);

  ///Array of comparison function pointers
  bool (*compFuncPtr[3])(GravityParticle,GravityParticle);
  double tmpTime;
  double totalTime;
 public:

#ifdef SPCUDA
  EwaldData *h_idata;
#endif
  void EwaldGPU(); 
  void EwaldGPUComplete();

#if COSMO_DEBUG > 1 || defined CHANGA_REFACTOR_WALKCHECK || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST
  ///This function checks the correctness of the treewalk
  void checkWalkCorrectness();
  void combineKeys(Tree::NodeKey key,int bucket);
#endif

  /* DEBUGGING */
  void quiescence();
  /*END DEBUGGING */

  NodeLookupType &getNodeLookupTable() {
	  return nodeLookupTable;
  }

  int getNumNodes() {
	  return nodeLookupTable.size();
  }

  /// delete treenodes if allocated
  void deleteTree() {
    if(pTreeNodes != NULL) {
        delete pTreeNodes;
        pTreeNodes = NULL;
        root = NULL;
        nodeLookupTable.clear();
        }
    else {
        if (root != NULL) {
            root->fullyDelete();
            delete root;
            root = NULL;
            nodeLookupTable.clear();
            }
        }
    }

	/// Recursive call to build the subtree with root "node", level
	/// specifies the level at which "node" resides inside the tree
	void buildOctTree(GenericTreeNode* node, int level);
#ifdef TREE_BREADTH_FIRST
	void growBottomUp(GenericTreeNode* node);
#endif

	/// Compute all the moments for the nodes that are NonLocal, so that
	/// during the tree traversal, they contain useful information to decide
	/// whether to open or not.
	void calculateRemoteMoments(GenericTreeNode* node);

	/// Checks that every particle is indeed included in its bucket node
	/// (may not be true due to truncation of the last two bits while
	/// generating the 63 bit keys.
	void checkTree(GenericTreeNode* node);

	/// Given a node, check who is the first owner and the last owner of it.
	/// It assumes that there are splitters, and that there is an ordering
	/// of them across chares. It works for SFC ordering.
	bool nodeOwnership(const Tree::NodeKey nkey, int &firstOwner, int &lastOwner);


	/// Initialize all the buckets for the tree walk
	/// @TODO: Eliminate this redundant copy!
	void initBuckets();
	template <class Tsmooth>
	void initBucketsSmooth(Tsmooth tSmooth);
	void smoothNextBucket();
	void reSmoothNextBucket();
	void markSmoothNextBucket();
	void smoothBucketComputation();
	/** @brief Start the treewalk for the next bucket among those belonging
	 * to me. The buckets are simply ordered in a vector.
	 */
	void startNextBucket();
	/** @brief Start a full step of bucket computation, it sends a message
	 * to trigger nextBucket() which will loop over all the buckets.
	 */
	void doAllBuckets();
	void reconstructNodeLookup(GenericTreeNode *node);
	//void rebuildSFCTree(GenericTreeNode *node,GenericTreeNode *parent,int *);

public:
 TreePiece() : pieces(thisArrayID), root(0), proxyValid(false),
	    proxySet(false), prevLARung (-1), sTopDown(0), sGravity(0),
	  sPrefetch(0), sLocal(0), sRemote(0), sPref(0), sSmooth(0), 
	  treePieceLoad(0.0), treePieceLoadTmp(0.0), treePieceLoadExp(0.0),
    treePieceActivePartsTmp(0) {
	  //CkPrintf("[%d] TreePiece created on proc %d\n",thisIndex, CkMyPe());
	  // ComlibDelegateProxy(&streamingProxy);
	  dm = NULL;
	  foundLB = Null; 
	  iterationNo=0;
	  usesAtSync = true;
	  pTreeNodes = NULL;
	  bucketReqs=NULL;
	  nCacheAccesses = 0;
	  memWithCache = 0;
	  memPostCache = 0;
	  nNodeCacheEntries = 0;
	  nPartCacheEntries = 0;
	  completedActiveWalks = 0;
	  myPlace = -1;
	  nSetupWriteStage = -1;
    //openingDiffCount=0;
    chunkRootLevel=0;
    //splitters = NULL;
#if COSMO_STATS > 0
	  nodesOpenedLocal = 0;
	  nodesOpenedRemote = 0;
	  nodeInterLocal = 0;
	  particleInterLocal = 0;

	  numOpenCriterionCalls=0;

	  piecemass = 0.0;
#endif
	  packed=0;			/* output accelerations in x,y,z
					   packed format */
	  cnt=0;			/* Counter over x,y,z when not packed */
	  particleInterRemote = NULL;
	  nodeInterRemote = NULL;

#if INTERLIST_VER > 0
	  sInterListWalk = NULL;
#endif
#ifdef CUDA
          numActiveBuckets = -1;
#ifdef CUDA_STATS
          localNodeInteractions = 0;
          localPartInteractions = 0;
          remoteNodeInteractions = 0;
          remotePartInteractions = 0;
          remoteResumeNodeInteractions = 0;
          remoteResumePartInteractions = 0;
#endif
#endif

	  tmpTime=0.0;
	  totalTime=0.0;
	  // temporarely set to -1, it will updated after the tree is built
	  numChunks=-1;
	  prefetchRoots = NULL;
	  numPrefetchReq = 0;
	  ewt = NULL;
	  nMaxEwhLoop = 100;

          incomingParticlesMsg.clear();
          incomingParticlesArrived = 0;
          incomingParticlesSelf = false;

          myParticles = NULL;
          mySPHParticles = NULL;
          myStarParticles = NULL;
	  myNumParticles = myNumSPH = myNumStar = 0;
	  nStore = nStoreSPH = nStoreStar = 0;
	  myTreeParticles = -1;
	  orbBoundaries.clear();
	  boxes = NULL;
	  splitDims = NULL;
	  bGasCooling = 0;

#ifdef PUSH_GRAVITY
          createdSpanningTree = false;
#endif

          localTreeBuildComplete = false;
	}

	TreePiece(CkMigrateMessage* m) {
	  treePieceLoadTmp = 0.0;
          // jetley
          proxyValid = false;
          proxySet = false;

	  usesAtSync = true;
	  //localCache = NULL;
	  dm = NULL;
	  bucketReqs = NULL;
	  numChunks=-1;
	  nCacheAccesses = 0;
	  memWithCache = 0;
	  memPostCache = 0;
	  nNodeCacheEntries = 0;
	  nPartCacheEntries = 0;
	  completedActiveWalks = 0;
	  prefetchRoots = NULL;
	  //remaining Chunk = NULL;
          ewt = NULL;
	  root = NULL;
	  pTreeNodes = NULL;

      sTopDown = 0;
	  sGravity = NULL;
	  sPrefetch = NULL;
	  sSmooth = NULL;
#if INTERLIST_VER > 0
	  sInterListWalk = NULL;
#endif

          incomingParticlesMsg.clear();
          incomingParticlesArrived = 0;
          incomingParticlesSelf = false;

	  cnt=0;
	  nodeInterRemote = NULL;
          particleInterRemote = NULL;

	  orbBoundaries.clear();
	  boxes = NULL;
	  splitDims = NULL;
	  myTreeParticles = -1;


          localTreeBuildComplete = false;
	}

        private:
        void freeWalkObjects();
	void allocateStars() {
	    nStoreStar = (int) (myNumStar*(1.0 + dExtraStore));
	    // Stars tend to form out of gas, so make sure there is
	    // enough room.
	    nStoreStar += 12 + (int) (myNumSPH*dExtraStore);
	    myStarParticles = new extraStarData[nStoreStar];
	    }

        public:
	~TreePiece() {
	  if (verbosity>1) ckout <<"Deallocating treepiece "<<thisIndex<<endl;
	  if(nStore > 0) delete[] myParticles;
	  if(nStoreSPH > 0) delete[] mySPHParticles;
	  if(nStoreStar > 0) delete[] myStarParticles;
	  delete[] nodeInterRemote;
	  delete[] particleInterRemote;
	  delete[] bucketReqs;
          delete[] ewt;

	  deleteTree();

	  if(boxes!= NULL ) delete[] boxes;
	  if(splitDims != NULL) delete[] splitDims;

#ifndef COOLING_NONE
	  if(bGasCooling)
	      CoolDerivsFinalize(CoolData);
#endif
          if (verbosity>1) ckout <<"Finished deallocation of treepiece "<<thisIndex<<endl;
	}

	void restart();

	void setPeriodic(int nReplicas, Vector3D<double> fPeriod, int bEwald,
			 double fEwCut, double fEwhCut, int bPeriod);
	void BucketEwald(GenericTreeNode *req, int nReps,double fEwCut);
	void EwaldInit();
	void calculateEwald(dummyMsg *m);
  void calculateEwaldUsingCkLoop(dummyMsg *msg, int yield_num);
  void callBucketEwald(int id);
  void doParallelNextBucketWork(int id, LoopParData* lpdata);
	void initCoolingData(const CkCallback& cb);
	// Scale velocities (needed to convert to canonical momenta for
	// comoving coordinates.)
	void velScale(double dScale, const CkCallback& cb);

	/// @brief Load I.C. from NChilada file
        /// @param dTuFac conversion factor from temperature to
        /// internal energy
        void loadNChilada(const std::string& filename, const double dTuFac,
                          const CkCallback& cb);
        void readFloatBinary(OutputParams& params, int bParaRead,
            const CkCallback& cb);
	/// @brief Load I.C. from Tipsy file
        /// @param filename tipsy file
        /// @param dTuFac conversion factor from temperature to
        /// internal energy
        /// @param bDoublePos Positions are in double precision
        /// @param bDoubleVel Velocities are in double precision
	void loadTipsy(const std::string& filename, const double dTuFac,
                       const bool bDoublePos,
                       const bool bDoubleVel,
		       const CkCallback& cb);

        void readIOrd(const std::string& filename, const CkCallback& cb);
        void readIGasOrd(const std::string& filename, const CkCallback& cb);
        void readESNrate(const std::string& filename, const CkCallback& cb);
        void readOxMassFrac(const std::string& filename, const CkCallback& cb);
        void readFeMassFrac(const std::string& filename, const CkCallback& cb);
        void readMassForm(const std::string& filename, const CkCallback& cb);
        void readCoolOnTime(const std::string& filename, const CkCallback& cb);
        void readCoolArray0(const std::string& filename, const CkCallback& cb);
        void readCoolArray1(const std::string& filename, const CkCallback& cb);
        void readCoolArray2(const std::string& filename, const CkCallback& cb);
        void readCoolArray3(const std::string& filename, const CkCallback& cb);
        void getMaxIOrds(const CkCallback& cb);
        void RestartEnergy(double dTuFac, const CkCallback& cb);
        void findTotalMass(CkCallback &cb);
        void recvTotalMass(CkReductionMsg *msg);

	// Write a Tipsy file
	void writeTipsy(Tipsy::TipsyWriter& w,
			const double dvFac, // scale velocities
			const double duTFac, // convert temperature
                        const bool bDoublePos,
                        const bool bDoubleVel,
			const int bCool);
	// Find position in the file to start writing
	void setupWrite(int iStage, u_int64_t iPrevOffset,
			const std::string& filename, const double dTime,
			const double dvFac, const double duTFac,
                        const bool bDoublePos,
                        const bool bDoubleVel,
			const int bCool, const CkCallback& cb);
	// control parallelism in the write
	void parallelWrite(int iPass, const CkCallback& cb,
			   const std::string& filename, const double dTime,
			   const double dvFac, // scale velocities
			   const double duTFac, // convert temperature
                           const bool bDoublePos,
                           const bool bDoubleVel,
			   const int bCool);
	// serial output
	void serialWrite(u_int64_t iPrevOffset, const std::string& filename,
			 const double dTime,
			 const double dvFac, const double duTFac,
                         const bool bDoublePos,
                         const bool bDoubleVel,
			 const int bCool, const CkCallback& cb);
	// setup for serial output
	void oneNodeWrite(int iIndex,
			       int iOutParticles,
			       int iOutSPH,
			       int iOutStar,
			       GravityParticle *particles, // particles to
						     // write
			       extraSPHData *pGas, // SPH data
			       extraStarData *pStar, // Star data
			       int *piSPH, // SPH data offsets
			       int *piStar, // Star data offsets
			       const u_int64_t iPrevOffset,
			       const std::string& filename,  // output file
			       const double dTime,      // time or expansion
			       const double dvFac,  // velocity conversion
			     const double duTFac, // temperature
						  // conversion
                           const bool bDoublePos,
                           const bool bDoubleVel,
			  const int bCool,
			  const CkCallback &cb);
	// Reorder for output
	void reOrder(int64_t nMaxOrder, CkCallback& cb);
	// move particles around for output
	void ioShuffle(CkReductionMsg *msg);
	void ioAcceptSortedParticles(ParticleShuffleMsg *);
	/** Inform the DataManager of my node that I'm here.
	 The callback will receive a CkReductionMsg containing no data.
	void registerWithDataManager(const CkGroupID& dataManagerID,
				     const CkCallback& cb);
	 */
	// Assign keys after loading tipsy file and finding Bounding box
	void assignKeys(CkReductionMsg* m);
	void evaluateBoundaries(SFC::Key* keys, const int n, int isRefine, const CkCallback& cb);
	void unshuffleParticles(CkReductionMsg* m);
	void acceptSortedParticles(ParticleShuffleMsg *);
  void shuffleAfterQD();
  void unshuffleParticlesWoDD(const CkCallback& cb);
  void acceptSortedParticlesFromOther(ParticleShuffleMsg *);

  /*****ORB Decomposition*******/
  void initORBPieces(const CkCallback& cb);
  void initBeforeORBSend(unsigned int myCount, unsigned int myCountGas,
			 unsigned int myCountStar,
			 const CkCallback& cb, const CkCallback& cback);
  void sendORBParticles();
  void acceptORBParticles(const GravityParticle* particles, const int n);
  void acceptORBParticles(const GravityParticle* particles, const int n,
			  const extraSPHData *pGas, const int nGasIn,
			  const extraStarData *pStar, const int nStarIn);
  void finalizeBoundaries(ORBSplittersMsg *splittersMsg);
  void evaluateParticleCounts(ORBSplittersMsg *splittersMsg);
  /*****************************/

  void kick(int iKickRung, double dDelta[MAXRUNG+1], int bClosing,
	    int bNeedVPred, int bGasIsothermal, double duDelta[MAXRUNG+1],
	    const CkCallback& cb);
  void drift(double dDelta, int bNeedVPred, int bGasIsothermal, double dvDelta,
	     double duDelta, int nGrowMass, bool buildTree,
	     const CkCallback& cb);
  void initAccel(int iKickRung, const CkCallback& cb);
/**
 * Adjust timesteps of active particles.
 * @param iKickRung The rung we are on.
 * @param bEpsAccStep Use sqrt(eps/acc) timestepping
 * @param bGravStep Use sqrt(r^3/GM) timestepping
 * @param bSphStep Use Courant condition
 * @param bViscosityLimitdt Use viscosity in Courant condition
 * @param dEta Factor to use in determing timestep
 * @param dEtaCourant Courant factor to use in determing timestep
 * @param dEtauDot Factor to use in uDot based timestep
 * @param dDelta Base timestep
 * @param dAccFac Acceleration scaling for cosmology
 * @param dCosmoFac Cosmo scaling for Courant
 * @param dhMinOverSoft minimum smoothing parameter.
 * @param bDoGas We are calculating gas forces.
 * @param cb Callback function reduces currrent maximum rung
 */
  void adjust(int iKickRung, int bEpsAccStep, int bGravStep,
	      int bSphStep, int bViscosityLimitdt,
	      double dEta, double dEtaCourant, double dEtauDot,
	      double dDelta, double dAccFac,
	      double dCosmoFac, double dhMinOverSoft,
	      int bDoGas,
	      const CkCallback& cb);
  /**
   * @brief Truncate the highest rung
   * @param iCurrMaxRung new maximum rung.
   * @param cb callback.
   */
  void truncateRung(int iCurrMaxRung, const CkCallback& cb);
  void rungStats(const CkCallback& cb);
  void countActive(int activeRung, const CkCallback& cb);
  void assignDomain(const CkCallback& cb);
  void calcEnergy(const CkCallback& cb);
  /// add new particle
  void newParticle(GravityParticle *p);
  /// Count add/deleted particles, and compact main particle storage.
  void colNParts(const CkCallback &cb);
  /// Assign iOrders to recently added particles.
  void newOrder(int64_t nStartSPH, int64_t nStartDark,
		int64_t nStartStar, const CkCallback &cb);
  
  /// Update total particle numbers
  void setNParts(int64_t _nTotalSPH, int64_t _nTotalDark,
		 int64_t _nTotalStar, const CkCallback &cb);
  /// @brief Set the gravitational softening on all particles
  void setSoft(const double dSoft, const CkCallback &cb);
	void physicalSoft(const double dSoftMax, const double dFac,
			  const int bSoftMaxMul, const CkCallback& cb);
	void growMass(int nGrowMass, double dDeltaM, const CkCallback& cb);
	void InitEnergy(double dTuFac, double z, double dTime,
			const CkCallback& cb);
	void updateuDot(int activeRung, double duDelta[MAXRUNG+1],
			double dStartTime[MAXRUNG+1], int bCool, int bAll,
			int bUpdateState, const CkCallback& cb);
	void ballMax(int activeRung, double dFac, const CkCallback& cb);
	void sphViscosityLimiter(int bOn, int activeRung, const CkCallback& cb);
	void getAdiabaticGasPressure(double gamma, double gammam1,
				     const CkCallback &cb);
	void getCoolingGasPressure(double gamma, double gammam1,
				   const CkCallback &cb);
	void FormStars(Stfm param, double dTime, double dDelta, double dCosmoFac,
		       const CkCallback& cb);
	void flushStarLog(const CkCallback& cb);
	void Feedback(Fdbk &fb, double dTime, double dDelta,
		       const CkCallback& cb);
	void massMetalsEnergyCheck(int bPreDist, const CkCallback& cb);
	void SetTypeFromFileSweep(int iSetMask, char *file,
	   struct SortStruct *ss, int nss, int *pniOrder, int *pnSet);
	void setTypeFromFile(int iSetMask, char *file, const CkCallback& cb);
	void getCOM(const CkCallback& cb, int bLiveViz);
	void getCOMByType(int iType, const CkCallback& cb, int bLiveViz);
	void DumpFrame(InDumpFrame in, const CkCallback& cb, int liveVizDump) ;
	void liveVizDumpFrameInit(liveVizRequestMsg *msg);
	void setProjections(int bOn);

	/// \brief Charm entry point to build the tree (called by Main).
#ifdef PUSH_GRAVITY
	void buildTree(int bucketSize, const CkCallback& cb, bool merge);
#else
	void buildTree(int bucketSize, const CkCallback& cb);
#endif

	/// \brief Real tree build, independent of other TreePieces.
	void startOctTreeBuild(CkReductionMsg* m);

  /********ORB Tree**********/
  //void receiveBoundingBoxes(BoundingBoxes *msg);
  void startORBTreeBuild(CkReductionMsg* m);
  OrientedBox<float> constructBoundingBox(GenericTreeNode *node,int level, int numChild);
  void buildORBTree(GenericTreeNode * node, int level);
  /**************************/

	/// Request the moments for this node.
	void requestRemoteMoments(const Tree::NodeKey key, int sender);
	void receiveRemoteMoments(const Tree::NodeKey key, Tree::NodeType type, int firstParticle, int numParticles, const MultipoleMoments& moments, const OrientedBox<double>& box, const OrientedBox<double>& boxBall, const unsigned int iParticleTypes);

	/// Entry point for the local computation: for each bucket compute the
	/// force that its particles see due to the other particles hosted in
	/// this TreePiece. The opening angle theta has already been passed
	/// through startGravity().  This function just calls doAllBuckets().
	void calculateGravityLocal();
	/// Do some minor preparation for the local walkk then
	/// calculateGravityLocal().
	void commenceCalculateGravityLocal();

	/// Entry point for the remote computation: for each bucket compute the
	/// force that its particles see due to the other particles NOT hosted
	/// by this TreePiece, and belonging to a subset of the global tree
	/// (specified by chunkNum).
	void calculateGravityRemote(ComputeChunkMsg *msg);
  void executeCkLoopParallelization(LoopParData *lpdata, int startbucket,
      int yield_num, int chunkNum, State* gravityState);
  int doBookKeepingForTargetActive(int curbucket, int end, int chunkNum, bool
    updatestate, State* gravityState);
  void doBookKeepingForTargetInactive(int curbucket, int chunkNum,
    bool updatestate, State* gravityState);

	/// As above but for the Smooth operation
	void calculateSmoothLocal();
	void calculateReSmoothLocal();
	void calculateMarkSmoothLocal();
	void nextBucketSmooth(dummyMsg *msg);
	void nextBucketReSmooth(dummyMsg *msg);
	void nextBucketMarkSmooth(dummyMsg *msg);
#if INTERLIST_VER > 0
#if 0
  void calculateForceRemoteBucket(int bucketIndex, int chunk);
  void calculateForceLocalBucket(int bucketIndex);
#endif
#endif

  /// @brief Start a tree based gravity computation.
  /// @param am the active rung for the computation
  /// @param theta the opening angle
  /// @param cb the callback to use after all the computation has finished
  void startGravity(int am, double myTheta, const CkCallback& cb);
  /// Setup utility function for all the smooths.  Initializes caches.
  void setupSmooth();
  /// Start a tree based smooth computation.
  /// @param p parameters, including the type, of the smooth.
  /// @param iLowhFix true if a minimum h/softening is used.
  /// @param nSmooth Number of particles to smooth over.
  /// @param dfBall2OverSoft2 square of minimum ratio of h to softening.
  /// @param cb the callback to use after all the computation has finished
  /// @reference SmoothParams
  void startSmooth(SmoothParams *p, int iLowhFix, int nSmooth,
		   double dfBall2OverSoft2, const CkCallback &cb);
  void startReSmooth(SmoothParams *p, const CkCallback& cb);
  void startMarkSmooth(SmoothParams *p, const CkCallback& cb);

  void finishNodeCache(const CkCallback& cb);
	/// Function called by the CacheManager to send out request for needed
	/// remote data, so that the later computation will hit.
	void prefetch(GenericTreeNode *node, int offsetID);
	void prefetch(ExternalGravityParticle *part);

	/// @brief Retrieve the remote node, goes through the cache if present
        //GenericTreeNode* requestNode(int remoteIndex, Tree::NodeKey lookupKey, int chunk, int reqID, bool isPrefetch=false);

        GenericTreeNode* requestNode(int remoteIndex, Tree::NodeKey lookupKey, int chunk, int reqID, int awi, void *source, bool isPrefetch);
	/// @brief Receive a request for Nodes from a remote processor, copy the
	/// data into it, and send back a message.
	void fillRequestNode(CkCacheRequestMsg<KeyType> *msg);
	/** @brief Receive the node from the cache as following a previous
	 * request which returned NULL, and continue the treewalk of the bucket
	 * which requested it with this new node.
	*/
#if 0
	void receiveNode(GenericTreeNode &node, int chunk, unsigned int reqID);
#endif
	/// @brief Find the key in the KeyTable, and copy the node over the passed pointer
	const GenericTreeNode* lookupNode(Tree::NodeKey key);
	/// Find the particles starting at "begin", and return a pointer to it
	const GravityParticle* lookupParticles(int begin);

	/// @brief Check if we have done with the treewalk on a specific bucket,
	/// and if we have, check also if we are done with all buckets
	void finishBucket(int iBucket);

	/** @brief Routine which does the tree walk on non-local nodes. It is
	 * called back for every incoming node (which are those requested to the
	 * cache during previous treewalks), and continue the treewalk from
	 * where it had been interrupted. It will possibly made other remote
	 * requests.
	 */
	void cachedWalkBucketTree(GenericTreeNode* node, int chunk, int reqID);

#if INTERLIST_VER > 0
	void calculateForcesNode(OffsetNode node, GenericTreeNode *myNode,
				 int level,int chunk);
	void calculateForces(OffsetNode node, GenericTreeNode *myNode,
			     int level,int chunk);
#endif

        ExternalGravityParticle *requestParticles(Tree::NodeKey key,int chunk,int remoteIndex,int begin,int end,int reqID, int awi, void *source, bool isPrefetch=false);
	GravityParticle *requestSmoothParticles(Tree::NodeKey key, int chunk,
				    int remoteIndex, int begin,int end,
				    int reqID, int awi, void *source, bool isPrefetch);
	void fillRequestParticles(CkCacheRequestMsg<KeyType> *msg);
	void fillRequestSmoothParticles(CkCacheRequestMsg<KeyType> *msg);
	void flushSmoothParticles(CkCacheFillMsg<KeyType> *msg);
	void processReqSmoothParticles();

	//void startlb(CkCallback &cb);
	void startlb(CkCallback &cb, int activeRung);
  void populateSavedPhaseData(int phase, double tpload, unsigned int activeparts);
  bool havePhaseData(int phase);
  void savePhaseData(std::vector<double> &loads, std::vector<unsigned int>
    &parts_per_phase, double* shuffleloads, unsigned int *shuffleparts,
    int shufflelen);
	void ResumeFromSync();

	void outputAccelerations(OrientedBox<double> accelerationBox, const std::string& suffix, const CkCallback& cb);
	void outputASCII(OutputParams& params, int bParaWrite,
			 const CkCallback& cb);
	void oneNodeOutVec(OutputParams& params, Vector3D<double>* avOut,
			   int nPart, int iIndex, int bDone,
			   CkCallback& cb) ;
	void oneNodeOutArr(OutputParams& params, double* adOut,
			   int nPart, int iIndex, int bDone,
			   CkCallback& cb) ;
	void oneNodeOutIntArr(OutputParams& params, int *aiOut,
			      int nPart, int iIndex, CkCallback& cb);
        void outputBinaryStart(OutputParams& params, int64_t nStart,
                               const CkCallback& cb);
	void outputBinary(Ck::IO::Session, OutputParams& params);
        void minmaxNCOut(OutputParams& params, const CkCallback& cb);

	void outputStatistics(const CkCallback& cb);
	/// Collect the total statistics from the various chares
	void collectStatistics(CkCallback &cb);

        /** @brief Entry method used to split the processing of all the buckets
         * in small pieces. It calls startNextBucket() _yieldPeriod number of
         * times, and then it returns to the scheduler after enqueuing a message
         * for itself.  After each startNextBucket() call, the state
         * is checked for completed walks (and forces calculated, and
         * finishBucket() is called to clean up.
	 */
        void nextBucket(dummyMsg *m);
  void nextBucketUsingCkLoop(dummyMsg *m);

	void report();
	void printTreeViz(GenericTreeNode* node, std::ostream& os);
	void printTree(GenericTreeNode* node, std::ostream& os);
	void pup(PUP::er& p);

        // jetley
        // need this in TreeWalk
        GenericTreeNode *getRoot() {return root;}
        // need this in Compute
	inline Vector3D<double> decodeOffset(int reqID) {
	    int offsetcode = reqID >> 22;
	    int x = (offsetcode & 0x7) - 3;
	    int y = ((offsetcode >> 3) & 0x7) - 3;
	    int z = ((offsetcode >> 6) & 0x7) - 3;

	    Vector3D<double> offset(x*fPeriod.x, y*fPeriod.y, z*fPeriod.z);

	    return offset;
	    }

        GenericTreeNode *nodeMissed(int reqID, int remoteIndex, Tree::NodeKey &key, int chunk, bool isPrefetch, int awi, void *source);

        ExternalGravityParticle *particlesMissed(Tree::NodeKey &key, int chunk, int remoteIndex, int firstParticle, int lastParticle, int reqID, bool isPrefetch, int awi, void *source);

        void receiveNodeCallback(GenericTreeNode *node, int chunk, int reqID, int awi, void *source);
        void receiveParticlesCallback(ExternalGravityParticle *egp, int num, int chunk, int reqID, Tree::NodeKey &remoteBucket, int awi, void *source);
        void receiveParticlesFullCallback(GravityParticle *egp, int num, int chunk, int reqID, Tree::NodeKey &remoteBucket, int awi, void *source);
        void receiveProxy(CkGroupID _proxy){ proxy = _proxy; proxySet = true; /*CkPrintf("[%d : %d] received proxy\n", CkMyPe(), thisIndex);*/}
        void doAtSync();

        void balanceBeforeInitialForces(CkCallback &cb);
        
        // For merging of remote moment requests
        // before sending messages during tree building
        public:
        void sendRequestForNonLocalMoments(GenericTreeNode *pickedNode);
        void mergeNonLocalRequestsDone();
        //void addTreeBuildMomentsClient(GenericTreeNode *targetNode, TreePiece *client, GenericTreeNode *clientNode);
        std::map<NodeKey,NonLocalMomentsClientList>::iterator createTreeBuildMomentsEntry(GenericTreeNode *pickedNode);


        private:
        // XXX - hashtable instead of map
        std::map<NodeKey,NonLocalMomentsClientList> nonLocalMomentsClients;
        bool localTreeBuildComplete;
        int getResponsibleIndex(int first, int last);
        

        GenericTreeNode *boundaryParentReady(GenericTreeNode *parent);
        void accumulateMomentsFromChild(GenericTreeNode *parent, GenericTreeNode *child);

        void deliverMomentsToClients(GenericTreeNode *);
        void deliverMomentsToClients(const std::map<NodeKey,NonLocalMomentsClientList>::iterator &it);
        void treeBuildComplete();
        void processRemoteRequestsForMoments();
        void sendParticlesDuringDD(bool withqd);
        void mergeAllParticlesAndSaveCentroid();
        bool otherIdlePesAvail();

};

/// @brief Class for shadow arrays to avoid reduction conflicts in
/// overlapping liveViz, SPH and gravity.
class LvArray : public CBase_LvArray {
 public:
    LvArray() {}
    LvArray(CkMigrateMessage* m) {}
    } ;

int decodeReqID(int);
int encodeOffset(int reqID, int x, int y, int z);
bool bIsReplica(int reqID);
void initNodeLock();
void printGenericTree(GenericTreeNode* node, std::ostream& os) ;
//bool compBucket(GenericTreeNode *ln,GenericTreeNode *rn);

#ifdef REDUCTION_HELPER

class TreePieceCounter : public CkLocIterator { 
  public:
    TreePieceCounter() { reset(); }
    void addLocation(CkLocation &loc);
    void reset();

  public:
    CkVec<TreePiece *> presentTreePieces;
};



class ReductionHelper : public CBase_ReductionHelper {
  public:
  ReductionHelper();
  ReductionHelper(CkMigrateMessage *);
  void pup(PUP::er &p);

  void countTreePieces(const CkCallback &cb);
  void reduceBinCounts(int nBins, int64_t *binCounts, const CkCallback &cb);
  void evaluateBoundaries(SFC::Key *keys, const int n, int isRefine, const CkCallback& cb);
  void evaluateBoundaries(const CkBitVector &binsToSplit, const CkCallback& cb);

  private:
  void senseLocalTreePieces();

  private:

  CkVec<int64_t> myBinCounts;
  int numTreePiecesCheckedIn;

  TreePieceCounter localTreePieces;
  std::vector<SFC::Key> splitters;
};
#endif

class NonEmptyTreePieceCounter : public CkLocIterator {            
  public:
    NonEmptyTreePieceCounter() { reset(); }
    void addLocation(CkLocation &loc);
    void reset();

  public:
    int count;
};

#endif //PARALLELGRAVITY_H
back to top