https://github.com/geodynamics/citcoms
Revision bcf06ab870d4cfd4a7c8594146ed51e41b23d5f9 authored by Eh Tan on 09 August 2007, 22:57:28 UTC, committed by Eh Tan on 09 August 2007, 22:57:28 UTC
Two non-dimensional parameters are added under the Solver component: "dissipation_number" and "gruneisen".
The original incompressible solver can still be used by setting "gruneisen=0";
the code treats this as "gruneisen=infinity".
Setting "gruneisen" to a non-zero value switches to the compressible solver.

One can also run the TALA solver for the incompressible case by setting "gruneisen" to
a non-zero value while setting "dissipation_number=0". This is useful when
debugging the compressible solver.
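
As a reading aid (not code from the repository), the parameter logic described
above can be summarized by the small sketch below; solver_choice() and its
return strings are purely illustrative names:

    #include <stdio.h>

    /* Map the two input parameters to the solver behaviour described above. */
    static const char *solver_choice(double dissipation_number, double gruneisen)
    {
        if (gruneisen == 0.0)            /* treated as gruneisen = infinity   */
            return "original incompressible solver";
        if (dissipation_number == 0.0)   /* TALA with Di = 0                  */
            return "compressible (TALA) solver in its incompressible limit";
        return "compressible (TALA) solver";
    }

    int main(void)
    {
        printf("%s\n", solver_choice(0.5, 0.0));   /* gruneisen = 0              */
        printf("%s\n", solver_choice(0.0, 1.3));   /* gruneisen != 0, Di = 0     */
        printf("%s\n", solver_choice(1.0, 1.0));   /* e.g. the Di/gamma = 1 case */
        return 0;
    }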

Two implementations are available: one by Wei Leng (U. Colorado) and one by
Eh Tan (CIG). Leng's version uses the original conjugate gradient method for
the Uzawa iteration and moves the contribution of compressibility to the RHS,
similar to the method of Ita and King, JGR, 1994. Tan's version uses the
bi-conjugate gradient stabilized (BiCGstab) method for the Uzawa iteration,
similar to the method of Tan and Gurnis, JGR, 2007. The two versions agree
very well: in a benchmark case with 33x33x33 nodes per cap, Di/gamma=1.0,
Ra=1.0, and a delta-function load at mid-mantle depth, the peak velocities
differ by only 0.007%. Leng's version is enabled by default; edit the function
solve_Ahat_p_fhat() in lib/Stokes_flow_Incomp.c to switch to Tan's version.
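
The switch mentioned above amounts to editing one call site. A self-contained
sketch of that kind of edit is shown below; uzawa_leng_cg(), uzawa_tan_bicgstab()
and solve_stokes() are hypothetical placeholder names, not the actual routines
in lib/Stokes_flow_Incomp.c:

    #include <stdio.h>

    static void uzawa_leng_cg(void)      { puts("Uzawa iteration: Leng, conjugate gradient"); }
    static void uzawa_tan_bicgstab(void) { puts("Uzawa iteration: Tan, BiCGstab"); }

    static void solve_stokes(void)
    {
        uzawa_leng_cg();             /* default: Leng's version                   */
        /* uzawa_tan_bicgstab(); */  /* enable this call instead for Tan's version */
    }

    int main(void)
    {
        solve_stokes();
        return 0;
    }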

1 parent 91bcb85
Tip revision: bcf06ab ("Finished the compressible Stokes solver for TALA.")
Full_parallel_related.c
/*
 *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *<LicenseText>
 *
 * CitcomS by Louis Moresi, Shijie Zhong, Lijie Han, Eh Tan,
 * Clint Conrad, Michael Gurnis, and Eun-seo Choi.
 * Copyright (C) 1994-2005, California Institute of Technology.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *</LicenseText>
 *
 *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
/* Routines here are for the Intel Paragon with MPI */

#include <mpi.h>
#include <math.h>

#include "element_definitions.h"
#include "global_defs.h"
#include "sphere_communication.h"

#include "parallel_related.h"


static void set_horizontal_communicator(struct All_variables*);
static void set_vertical_communicator(struct All_variables*);

static void exchange_node_d(struct All_variables *, double**, int);
static void exchange_node_f(struct All_variables *, float**, int);


/* ============================================ */
/* ============================================ */

void full_parallel_processor_setup(struct All_variables *E)
  {

  int i,j,k,m,me,temp,pid_surf;
  int cap_id_surf;
  int surf_proc_per_cap, proc_per_cap, total_proc;

  me = E->parallel.me;

  if ( E->parallel.nprocx != E->parallel.nprocy ) {
    if (E->parallel.me==0) fprintf(stderr,"!!!! nprocx must equal nprocy \n");
    parallel_process_termination();
  }

  surf_proc_per_cap = E->parallel.nprocx * E->parallel.nprocy;
  proc_per_cap = surf_proc_per_cap * E->parallel.nprocz;
  total_proc = E->sphere.caps * proc_per_cap;
  E->parallel.total_surf_proc = E->sphere.caps * surf_proc_per_cap;

  if ( total_proc != E->parallel.nproc ) {
    if (E->parallel.me==0) fprintf(stderr,"!!!! # of requested CPUs is incorrect \n");
    parallel_process_termination();
    }

  E->sphere.caps_per_proc = max(1,E->sphere.caps*E->parallel.nprocz/E->parallel.nproc);

  if (E->sphere.caps_per_proc > 1) {
    if (E->parallel.me==0) fprintf(stderr,"!!!! # of caps per proc > 1 is not supported.\n");
    parallel_process_termination();
  }

  /* determine the location of processors in each cap */
  cap_id_surf = me / proc_per_cap;

 /* z-direction first */
  E->parallel.me_loc[3] = (me - cap_id_surf*proc_per_cap) % E->parallel.nprocz;

 /* then the x-direction */
  E->parallel.me_loc[1] = ((me - cap_id_surf*proc_per_cap - E->parallel.me_loc[3])/E->parallel.nprocz) % E->parallel.nprocx;

 /* then the y-direction */
  E->parallel.me_loc[2] = ((((me - cap_id_surf*proc_per_cap - E->parallel.me_loc[3])/E->parallel.nprocz) - E->parallel.me_loc[1])/E->parallel.nprocx) % E->parallel.nprocy;


/*
The numbering of the processors within each cap is as follows (example for an
xyz = 2x2x2 box):
NOTE: this differs from the numbering of the nodes: the nodal numbering has
theta (running N-S) as its first coordinate and fi (running E-W) as its
second, whereas here the first coordinate runs right-left (R-L) and the
second runs front-back (F-B).
 0 = lower  left front corner=[000]  4 = lower  left back corner=[010]
 1 = upper  left front corner=[001]  5 = upper  left back corner=[011]
 2 = lower right front corner=[100]  6 = lower right back corner=[110]
 3 = upper right front corner=[101]  7 = upper right back corner=[111]
[xyz] is x=E->parallel.me_loc[1], y=E->parallel.me_loc[2], z=E->parallel.me_loc[3]
*/
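
/*
 Worked example (added for illustration, using the formulas above): for the
 2x2x2 box, nprocx = nprocy = nprocz = 2 and proc_per_cap = 8.  A processor
 whose rank within its cap is (me - cap_id_surf*proc_per_cap) = 5 gets

   me_loc[3] =  5 % 2                  = 1   (z)
   me_loc[1] = ((5 - 1)/2) % 2         = 0   (x)
   me_loc[2] = (((5 - 1)/2 - 0)/2) % 2 = 1   (y)

 i.e. [xyz] = [011], the "upper left back corner" in the table above.  The
 inverse map used for loc2proc_map below recovers the same rank:
 y*nprocx*nprocz + x*nprocz + z = 1*4 + 0*2 + 1 = 5.
*/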

  /* determine cap id for each cap in a given processor  */
  pid_surf = me/proc_per_cap; /* cap number (0~11) */
  i = cases[E->sphere.caps_per_proc]; /* 1 for more than 12 processors */

  for (j=1;j<=E->sphere.caps_per_proc;j++)  {
    temp = pid_surf*E->sphere.caps_per_proc + j-1; /* cap number (out of 12) */
    E->sphere.capid[j] = incases1[i].links[temp]; /* id (1~12) of the current cap */
    }

  /* determine which caps are linked with each of the 12 caps  */
  /* if each cap is split among several processors, use up to 8 horizontal connections */
  if (surf_proc_per_cap > 1) {
     E->sphere.max_connections = 8;
  }

  /* set up the location-to-processor map */
  E->parallel.loc2proc_map = (int ****) malloc(E->sphere.caps*sizeof(int ***));
  for (m=0;m<E->sphere.caps;m++)  {
    E->parallel.loc2proc_map[m] = (int ***) malloc(E->parallel.nprocx*sizeof(int **));
    for (i=0;i<E->parallel.nprocx;i++) {
      E->parallel.loc2proc_map[m][i] = (int **) malloc(E->parallel.nprocy*sizeof(int *));
      for (j=0;j<E->parallel.nprocy;j++)
	E->parallel.loc2proc_map[m][i][j] = (int *) malloc(E->parallel.nprocz*sizeof(int));
    }
  }

  for (m=0;m<E->sphere.caps;m++)
    for (i=0;i<E->parallel.nprocx;i++)
      for (j=0;j<E->parallel.nprocy;j++)
	for (k=0;k<E->parallel.nprocz;k++) {
	  if (E->sphere.caps_per_proc>1) {
	    temp = cases[E->sphere.caps_per_proc];
	    E->parallel.loc2proc_map[m][i][j][k] = incases2[temp].links[m-1];
	  }
	  else
	    E->parallel.loc2proc_map[m][i][j][k] = m*proc_per_cap
	      + j*E->parallel.nprocx*E->parallel.nprocz
	      + i*E->parallel.nprocz + k;
	}

  if (E->control.verbose) {
    fprintf(E->fp_out,"me=%d loc1=%d loc2=%d loc3=%d\n",me,E->parallel.me_loc[1],E->parallel.me_loc[2],E->parallel.me_loc[3]);
    for (j=1;j<=E->sphere.caps_per_proc;j++) {
      fprintf(E->fp_out,"capid[%d]=%d \n",j,E->sphere.capid[j]);
    }
    for (m=0;m<E->sphere.caps;m++)
      for (j=0;j<E->parallel.nprocy;j++)
	for (i=0;i<E->parallel.nprocx;i++)
	  for (k=0;k<E->parallel.nprocz;k++)
	    fprintf(E->fp_out,"loc2proc_map[cap=%d][x=%d][y=%d][z=%d] = %d\n",
		    m,i,j,k,E->parallel.loc2proc_map[m][i][j][k]);

    fflush(E->fp_out);
  }

  set_vertical_communicator(E);
  set_horizontal_communicator(E);

  E->exchange_node_d = exchange_node_d;
  E->exchange_node_f = exchange_node_f;

  return;
  }



static void set_horizontal_communicator(struct All_variables *E)
{
  MPI_Group world_g, horizon_g;
  int i,j,k,m,n;
  int *processors;

  processors = (int *) malloc((E->parallel.total_surf_proc+1)*sizeof(int));

  k = E->parallel.me_loc[3];
  n = 0;
  for (m=0;m<E->sphere.caps;m++)
    for (i=0;i<E->parallel.nprocx;i++)
      for (j=0;j<E->parallel.nprocy;j++) {
	processors[n] = E->parallel.loc2proc_map[m][i][j][k];
	n++;
      }

  MPI_Comm_group(E->parallel.world, &world_g);
  MPI_Group_incl(world_g, E->parallel.total_surf_proc, processors, &horizon_g);
  MPI_Comm_create(E->parallel.world, horizon_g, &(E->parallel.horizontal_comm));

  if (E->control.verbose) {
    fprintf(E->fp_out,"horizontal group of me=%d loc3=%d\n",E->parallel.me,E->parallel.me_loc[3]);
    for (j=0;j<E->parallel.total_surf_proc;j++) {
      fprintf(E->fp_out,"%d proc=%d\n",j,processors[j]);
    }
    fflush(E->fp_out);
  }

  MPI_Group_free(&horizon_g);
  MPI_Group_free(&world_g);
  free((void *) processors);

  return;
}


static void set_vertical_communicator(struct All_variables *E)
{
  MPI_Group world_g, vertical_g;
  int i,j,k,m;
  int *processors;

  processors = (int *)malloc((E->parallel.nprocz+2)*sizeof(int));

  m = E->sphere.capid[1] - 1;  /* assume 1 cap per proc. */
  i = E->parallel.me_loc[1];
  j = E->parallel.me_loc[2];

  for (k=0;k<E->parallel.nprocz;k++) {
      processors[k] = E->parallel.loc2proc_map[m][i][j][k];
  }

  MPI_Comm_group(E->parallel.world, &world_g);
  MPI_Group_incl(world_g, E->parallel.nprocz, processors, &vertical_g);
  MPI_Comm_create(E->parallel.world, vertical_g, &(E->parallel.vertical_comm));

  if (E->control.verbose) {
    fprintf(E->fp_out,"vertical group of me=%d loc1=%d loc2=%d\n",E->parallel.me,E->parallel.me_loc[1],E->parallel.me_loc[2]);
    for (j=0;j<E->parallel.nprocz;j++) {
      fprintf(E->fp_out,"%d proc=%d\n",j,processors[j]);
    }
    fflush(E->fp_out);
  }

  MPI_Group_free(&vertical_g);
  MPI_Group_free(&world_g);
  free((void *) processors);
}


/* =========================================================================
get element information for each processor.
 ========================================================================= */

void full_parallel_domain_decomp0(struct All_variables *E)
  {

  int i,nox,noz,noy,me;

  me = E->parallel.me;

  E->lmesh.elx = E->mesh.elx/E->parallel.nprocx;
  E->lmesh.elz = E->mesh.elz/E->parallel.nprocz;
  E->lmesh.ely = E->mesh.ely/E->parallel.nprocy;
  E->lmesh.nox = E->lmesh.elx + 1;
  E->lmesh.noz = E->lmesh.elz + 1;
  E->lmesh.noy = E->lmesh.ely + 1;

  E->lmesh.exs = E->parallel.me_loc[1]*E->lmesh.elx;
  E->lmesh.eys = E->parallel.me_loc[2]*E->lmesh.ely;
  E->lmesh.ezs = E->parallel.me_loc[3]*E->lmesh.elz;
  E->lmesh.nxs = E->parallel.me_loc[1]*E->lmesh.elx+1;
  E->lmesh.nys = E->parallel.me_loc[2]*E->lmesh.ely+1;
  E->lmesh.nzs = E->parallel.me_loc[3]*E->lmesh.elz+1;

  E->lmesh.nno = E->lmesh.noz*E->lmesh.nox*E->lmesh.noy;
  E->lmesh.nel = E->lmesh.ely*E->lmesh.elx*E->lmesh.elz;
  E->lmesh.npno = E->lmesh.nel;

  E->lmesh.nsf = E->lmesh.nno/E->lmesh.noz;
  E->lmesh.snel = E->lmesh.elx*E->lmesh.ely;

  for(i=E->mesh.levmax;i>=E->mesh.levmin;i--)   {

     if (E->control.NMULTIGRID||E->control.EMULTIGRID)  {
        nox = E->lmesh.elx/(int)pow(2.0,(double)(E->mesh.levmax-i))+1;
        noy = E->lmesh.ely/(int)pow(2.0,(double)(E->mesh.levmax-i))+1;
        noz = E->lmesh.elz/(int)pow(2.0,(double)(E->mesh.levmax-i))+1;
        E->parallel.redundant[i]=0;
        }
     else
        { noz = E->lmesh.noz;
          noy = E->lmesh.noy;
          nox = E->lmesh.nox;
        }

     E->lmesh.ELX[i] = nox-1;
     E->lmesh.ELY[i] = noy-1;
     E->lmesh.ELZ[i] = noz-1;
     E->lmesh.NOZ[i] = noz;
     E->lmesh.NOY[i] = noy;
     E->lmesh.NOX[i] = nox;
     E->lmesh.NNO[i] = nox * noz * noy;
     E->lmesh.NNOV[i] = E->lmesh.NNO[i];
     E->lmesh.SNEL[i] = E->lmesh.ELX[i]*E->lmesh.ELY[i];

     E->lmesh.NEL[i] = (nox-1) * (noz-1) * (noy-1);
     E->lmesh.NPNO[i] = E->lmesh.NEL[i] ;

     E->lmesh.NEQ[i] = E->mesh.nsd * E->lmesh.NNOV[i] ;

     E->lmesh.EXS[i] = E->parallel.me_loc[1]*E->lmesh.ELX[i];
     E->lmesh.EYS[i] = E->parallel.me_loc[2]*E->lmesh.ELY[i];
     E->lmesh.EZS[i] = E->parallel.me_loc[3]*E->lmesh.ELZ[i];
     E->lmesh.NXS[i] = E->parallel.me_loc[1]*E->lmesh.ELX[i]+1;
     E->lmesh.NYS[i] = E->parallel.me_loc[2]*E->lmesh.ELY[i]+1;
     E->lmesh.NZS[i] = E->parallel.me_loc[3]*E->lmesh.ELZ[i]+1;
     }

/*
fprintf(stderr,"b %d %d %d %d %d %d %d\n",E->parallel.me,E->parallel.me_loc[1],E->parallel.me_loc[2],E->parallel.me_loc[3],E->lmesh.nzs,E->lmesh.nys,E->lmesh.noy);
*/
/* parallel_process_termination();
*/
  return;
  }



/* ============================================
 determine boundary nodes for
 exchanging info across the boundaries
 ============================================ */

void full_parallel_domain_boundary_nodes(E)
  struct All_variables *E;
  {

  void parallel_process_termination();

  int m,i,ii,j,k,l,node,el,lnode;
  int lev,ele,elx,elz,ely,nel,nno,nox,noz,noy;
  FILE *fp;
  char output_file[255];

  for(lev=E->mesh.gridmin;lev<=E->mesh.gridmax;lev++)   {
    for(m=1;m<=E->sphere.caps_per_proc;m++)   {
      nel = E->lmesh.NEL[lev];
      elx = E->lmesh.ELX[lev];
      elz = E->lmesh.ELZ[lev];
      ely = E->lmesh.ELY[lev];
      nox = E->lmesh.NOX[lev];
      noy = E->lmesh.NOY[lev];
      noz = E->lmesh.NOZ[lev];
      nno = E->lmesh.NNO[lev];

/* do the ZOY boundary nodes first */
      lnode = 0;
      ii =1;              /* left */
      for(j=1;j<=noz;j++)
      for(k=1;k<=noy;k++)  {
        node = j + (k-1)*noz*nox;
        E->parallel.NODE[lev][m][++lnode].bound[ii] =  node;
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | OFFSIDE;
        }

      E->parallel.NUM_NNO[lev][m].bound[ii] = lnode;


      lnode = 0;
      ii =2;              /* right */
      for(j=1;j<=noz;j++)
      for(k=1;k<=noy;k++)      {
        node = (nox-1)*noz + j + (k-1)*noz*nox;
        E->parallel.NODE[lev][m][++lnode].bound[ii] =  node;
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | OFFSIDE;
        }

      E->parallel.NUM_NNO[lev][m].bound[ii] = lnode;


/* do XOY boundary nodes */
      ii=5;                           /* bottom */
      lnode=0;
      for(k=1;k<=noy;k++)
      for(i=1;i<=nox;i++)   {
        node = (k-1)*nox*noz + (i-1)*noz + 1;
        E->parallel.NODE[lev][m][++lnode].bound[ii] = node;
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | OFFSIDE;
        }

      E->parallel.NUM_NNO[lev][m].bound[ii] = lnode;

      ii=6;                           /* top  */
      lnode=0;
      for(k=1;k<=noy;k++)
      for(i=1;i<=nox;i++)  {
        node = (k-1)*nox*noz + i*noz;
        E->parallel.NODE[lev][m][++lnode].bound[ii] = node;
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | OFFSIDE;
        }

      E->parallel.NUM_NNO[lev][m].bound[ii] = lnode;


/* do XOZ boundary nodes for 3D */
      ii=3;                           /* front */
      lnode=0;
      for(j=1;j<=noz;j++)
      for(i=1;i<=nox;i++)   {
        node = (i-1)*noz +j;
        E->parallel.NODE[lev][m][++lnode].bound[ii] = node;
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | OFFSIDE;
        }

      E->parallel.NUM_NNO[lev][m].bound[ii] = lnode;

      ii=4;                           /* rear */
      lnode=0;
      for(j=1;j<=noz;j++)
      for(i=1;i<=nox;i++)   {
        node = noz*nox*(noy-1) + (i-1)*noz +j;
        E->parallel.NODE[lev][m][++lnode].bound[ii] = node;
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | OFFSIDE;
        }

      E->parallel.NUM_NNO[lev][m].bound[ii] = lnode;

         /* determine the nodes overlapping between caps or between processors */

    if (E->parallel.me_loc[3]!=E->parallel.nprocz-1 )
      for (lnode=1;lnode<=E->parallel.NUM_NNO[lev][m].bound[6];lnode++) {
        node = E->parallel.NODE[lev][m][lnode].bound[6];
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIP;
        }

    if (E->sphere.capid[m]==12 && E->parallel.me_loc[1]==E->parallel.nprocx-1 && E->parallel.me_loc[2]==E->parallel.nprocy-1) /* back right of cap number 12 */
      for (lnode=1;lnode<=E->parallel.NUM_NNO[lev][m].bound[2];lnode++) {
        node = E->parallel.NODE[lev][m][lnode].bound[2];
        if (node<=nno-noz)  {
           E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIP;
           if (E->parallel.me_loc[3]==E->parallel.nprocz-1 || node%noz!=0)
             E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIPS;
	   }
        }
    else
      for (lnode=1;lnode<=E->parallel.NUM_NNO[lev][m].bound[2];lnode++) {
        node = E->parallel.NODE[lev][m][lnode].bound[2];
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIP;
        if (E->parallel.me_loc[3]==E->parallel.nprocz-1 || node%noz!=0)
          E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIPS;
        }

    if (E->sphere.capid[m]==1 && E->parallel.me_loc[1]==0 && E->parallel.me_loc[2]==0) /* front left of cap number 1 */
      for (lnode=1;lnode<=E->parallel.NUM_NNO[lev][m].bound[3];lnode++) {
        node = E->parallel.NODE[lev][m][lnode].bound[3];
        if (node>noz)  {
           E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIP;
           if (E->parallel.me_loc[3]==E->parallel.nprocz-1 || node%noz!=0)
             E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIPS;
	   }
        }
    else
      for (lnode=1;lnode<=E->parallel.NUM_NNO[lev][m].bound[3];lnode++) {
        node = E->parallel.NODE[lev][m][lnode].bound[3];
        E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIP;
        if (E->parallel.me_loc[3]==E->parallel.nprocz-1 || node%noz!=0)
           E->NODE[lev][m][node] = E->NODE[lev][m][node] | SKIPS;
        }

      }       /* end for m */
    }   /* end for level */

 /* count # of global nodes, ignoring overlapping nodes */
 ii=0;
 for (m=1;m<=E->sphere.caps_per_proc;m++)
    for (node=1;node<=E->lmesh.nno;node++)
      if(E->node[m][node] & SKIPS)
        ++ii;

 MPI_Allreduce(&ii, &node , 1, MPI_INT,MPI_SUM, E->parallel.world);

 E->mesh.nno = E->lmesh.nno*E->parallel.nproc - node - 2*E->mesh.noz;
 E->mesh.neq = E->mesh.nno*3;

if (E->control.verbose) {
 fprintf(E->fp_out,"output_shared_nodes %d \n",E->parallel.me);
 for(lev=E->mesh.gridmax;lev>=E->mesh.gridmin;lev--)
   for (m=1;m<=E->sphere.caps_per_proc;m++)      {
    fprintf(E->fp_out,"lev=%d  me=%d capid=%d m=%d \n",lev,E->parallel.me,E->sphere.capid[m],m);
    for (ii=1;ii<=6;ii++)
      for (i=1;i<=E->parallel.NUM_NNO[lev][m].bound[ii];i++)
        fprintf(E->fp_out,"ii=%d   %d %d \n",ii,i,E->parallel.NODE[lev][m][i].bound[ii]);

    lnode=0;
    for (node=1;node<=E->lmesh.nno;node++)
      if((E->NODE[lev][m][node] & SKIP)) {
        lnode++;
        fprintf(E->fp_out,"skip %d %d \n",lnode,node);
        }
    }
 fflush(E->fp_out);
  }



  return;
  }


/* ============================================
 determine communication routes and boundary IDs for
 exchanging info across the boundaries,
 assuming fault nodes are in the top row of processors
 ============================================ */

static void face_eqn_node_to_pass(struct All_variables *, int, int, int, int);
static void line_eqn_node_to_pass(struct All_variables *, int, int, int, int, int, int);

void full_parallel_communication_routs_v(E)
  struct All_variables *E;
  {

  int m,i,ii,j,k,l,node,el,elt,lnode,jj,doff,target;
  int lev,elx,elz,ely,nno,nox,noz,noy,p,kkk,kk,kf,kkkp;
  int me, nprocx,nprocy,nprocz,nprocxz;
  int tscaps,cap,scap,large,npass,lx,ly,lz,temp,layer;

  const int dims=E->mesh.nsd;

  me = E->parallel.me;
  nprocx = E->parallel.nprocx;
  nprocy = E->parallel.nprocy;
  nprocz = E->parallel.nprocz;
  nprocxz = nprocx * nprocz;
  tscaps = E->parallel.total_surf_proc;
  lx = E->parallel.me_loc[1];
  ly = E->parallel.me_loc[2];
  lz = E->parallel.me_loc[3];

        /* determine the communications in horizontal direction        */
  for(lev=E->mesh.gridmax;lev>=E->mesh.gridmin;lev--)       {
    nox = E->lmesh.NOX[lev];
    noz = E->lmesh.NOZ[lev];
    noy = E->lmesh.NOY[lev];

    for(m=1;m<=E->sphere.caps_per_proc;m++)    {
      cap = E->sphere.capid[m] - 1;  /* which cap I am in (0~11) */

      /* left */
      npass = ii = 1;
      if (lx != 0)
	target = E->parallel.loc2proc_map[cap][lx-1][ly][lz];
      else
	if ( cap%3 != 0) {
	  temp = (cap+2) % 12;
	  target = E->parallel.loc2proc_map[temp][nprocx-1][ly][lz];
	}
	else {
	  temp = (cap+3) % 12;
	  target = E->parallel.loc2proc_map[temp][ly][0][lz];
	}

      E->parallel.PROCESSOR[lev][m].pass[npass] = target;
      face_eqn_node_to_pass(E,lev,m,npass,ii);

      /* right */
      npass = ii = 2;
      if (lx != nprocx-1)
	target = E->parallel.loc2proc_map[cap][lx+1][ly][lz];
      else
	if ( cap%3 != 2) {
	  temp = (12+cap-2) % 12;
	  target = E->parallel.loc2proc_map[temp][0][ly][lz];
	}
	else {
	  temp = (12+cap-3) % 12;
	  target = E->parallel.loc2proc_map[temp][ly][nprocy-1][lz];
	}
      E->parallel.PROCESSOR[lev][m].pass[npass] = target;
      face_eqn_node_to_pass(E,lev,m,npass,ii);

      /* front */
      npass = ii = 3;
      if (ly != 0)
	target = E->parallel.loc2proc_map[cap][lx][ly-1][lz];
      else
	if ( cap%3 != 0) {
	  temp = cap-1;
	  target = E->parallel.loc2proc_map[temp][lx][nprocy-1][lz];
	}
	else {
	  temp = (12+cap-3) % 12;
	  target = E->parallel.loc2proc_map[temp][0][lx][lz];
	}

      E->parallel.PROCESSOR[lev][m].pass[npass] = target;
      face_eqn_node_to_pass(E,lev,m,npass,ii);

      /* back */
      npass = ii = 4;
      if (ly != nprocy-1)
	target = E->parallel.loc2proc_map[cap][lx][ly+1][lz];
      else
	if ( cap%3 != 2) {
	  temp = cap+1;
	  target = E->parallel.loc2proc_map[temp][lx][0][lz];
	}
	else {
	  temp = (cap+3) % 12;
	  target = E->parallel.loc2proc_map[temp][nprocx-1][lx][lz];
	}

      E->parallel.PROCESSOR[lev][m].pass[npass] = target;
      face_eqn_node_to_pass(E,lev,m,npass,ii);

      /* do lines parallel to Z */

	/* front-left line */
	if (!( (cap%3==1) && (lx==0) && (ly==0) )) {
	  npass ++;
	  if ((cap%3==0) && (lx==0) && (ly==0)) {
	    temp = (cap+6) % 12;
	    target = E->parallel.loc2proc_map[temp][lx][ly][lz];
	  }
	  else if ((cap%3==0) && (lx==0))
	    target = E->parallel.PROCESSOR[lev][m].pass[1] - nprocz;
	  else if ((cap%3==0) && (ly==0))
	    target = E->parallel.PROCESSOR[lev][m].pass[3] - nprocxz;
	  else
	    target = E->parallel.PROCESSOR[lev][m].pass[1] - nprocxz;

	  E->parallel.PROCESSOR[lev][m].pass[npass] = target;
	  line_eqn_node_to_pass(E,lev,m,npass,noz,1,1);
	}

	/* back-right line */
	if (!( (cap%3==1) && (lx==nprocx-1) && (ly==nprocy-1) )) {
	  npass ++;
	  if ((cap%3==2) && (lx==nprocx-1) && (ly==nprocy-1)) {
	    temp = (cap+6) % 12;
	    target = E->parallel.loc2proc_map[temp][lx][ly][lz];
	  }
	  else if ((cap%3==2) && (lx==nprocx-1))
	    target = E->parallel.PROCESSOR[lev][m].pass[2] + nprocz;
	  else if ((cap%3==2) && (ly==nprocy-1))
	    target = E->parallel.PROCESSOR[lev][m].pass[4] + nprocxz;
	  else
	    target = E->parallel.PROCESSOR[lev][m].pass[2] + nprocxz;

	  E->parallel.PROCESSOR[lev][m].pass[npass] = target;
	  line_eqn_node_to_pass(E,lev,m,npass,noz,(noy*nox-1)*noz+1,1);
	}

	/* back-left line */
	if (!( (cap%3==2 || cap%3==0) && (lx==0) && (ly==nprocy-1) )) {
	  npass ++;
	  if ((cap%3==2) && (ly==nprocy-1))
	    target = E->parallel.PROCESSOR[lev][m].pass[4] - nprocxz;
	  else if ((cap%3==0) && (lx==0))
	    target = E->parallel.PROCESSOR[lev][m].pass[1] + nprocz;
	  else
	    target = E->parallel.PROCESSOR[lev][m].pass[1] + nprocxz;

	  E->parallel.PROCESSOR[lev][m].pass[npass] = target;
	  line_eqn_node_to_pass(E,lev,m,npass,noz,(noy-1)*nox*noz+1,1);
	}

	/* front-right line */
	if (!( (cap%3==2 || cap%3==0) && (lx==nprocx-1) && (ly==0) )) {
	  npass ++;
	  if ((cap%3==2) && (lx==nprocx-1))
	    target = E->parallel.PROCESSOR[lev][m].pass[2] - nprocz;
	  else if ((cap%3==0) && (ly==0))
	    target = E->parallel.PROCESSOR[lev][m].pass[3] + nprocxz;
	  else
	    target = E->parallel.PROCESSOR[lev][m].pass[2] - nprocxz;

	  E->parallel.PROCESSOR[lev][m].pass[npass] = target;
	  line_eqn_node_to_pass(E,lev,m,npass,noz,(nox-1)*noz+1,1);
	}


      E->parallel.TNUM_PASS[lev][m] = npass;

    }   /* end for m  */
  }   /* end for lev  */

  /* determine the communications in vertical direction        */
  for(lev=E->mesh.gridmax;lev>=E->mesh.gridmin;lev--)       {
    kkk = 0;
    for(ii=5;ii<=6;ii++)       {    /* do top & bottom */
      E->parallel.NUM_PASSz[lev].bound[ii] = 1;
      if(lz==0 && ii==5)
	E->parallel.NUM_PASSz[lev].bound[ii] = 0;
      else if(lz==nprocz-1 && ii==6)
	E->parallel.NUM_PASSz[lev].bound[ii] = 0;

      for (p=1;p<=E->parallel.NUM_PASSz[lev].bound[ii];p++)  {
	kkk ++;
	/* determine the pass ID for ii-th boundary and p-th pass */
	kkkp = kkk + E->sphere.max_connections;

	E->parallel.NUM_NODEz[lev].pass[kkk] = 0;
	E->parallel.NUM_NEQz[lev].pass[kkk] = 0;

	for(m=1;m<=E->sphere.caps_per_proc;m++)    {
	  cap = E->sphere.capid[m] - 1;  /* which cap I am in (0~11) */
	  E->parallel.PROCESSORz[lev].pass[kkk] =
	    E->parallel.loc2proc_map[cap][lx][ly][lz+((ii==5)?-1:1)];

	  jj=0;  kk=0;
	  for (k=1;k<=E->parallel.NUM_NNO[lev][m].bound[ii];k++)   {
	    node = E->parallel.NODE[lev][m][k].bound[ii];
	    E->parallel.EXCHANGE_NODE[lev][m][++kk].pass[kkkp] = node;
	    for(doff=1;doff<=dims;doff++)
	      E->parallel.EXCHANGE_ID[lev][m][++jj].pass[kkkp] =
		E->ID[lev][m][node].doff[doff];
	  }
	  E->parallel.NUM_NODE[lev][m].pass[kkkp] = kk;
	  E->parallel.NUM_NEQ[lev][m].pass[kkkp] = jj;
	  E->parallel.NUM_NODEz[lev].pass[kkk] += kk;
	  E->parallel.NUM_NEQz[lev].pass[kkk] += jj;
	}

      }   /* end for loop p */
    }     /* end for ii */

    E->parallel.TNUM_PASSz[lev] = kkk;
  }        /* end for level */



  if(E->control.verbose) {
    for(lev=E->mesh.gridmax;lev>=E->mesh.gridmin;lev--) {
      fprintf(E->fp_out,"output_communication route surface for lev=%d \n",lev);
      for (m=1;m<=E->sphere.caps_per_proc;m++)  {
	fprintf(E->fp_out,"  me= %d cap=%d pass  %d \n",E->parallel.me,E->sphere.capid[m],E->parallel.TNUM_PASS[lev][m]);
	for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {
	  fprintf(E->fp_out,"proc %d and pass  %d to proc %d with %d eqn and %d node\n",E->parallel.me,k,E->parallel.PROCESSOR[lev][m].pass[k],E->parallel.NUM_NEQ[lev][m].pass[k],E->parallel.NUM_NODE[lev][m].pass[k]);
/* 	  fprintf(E->fp_out,"Eqn:\n");  */
/* 	  for (ii=1;ii<=E->parallel.NUM_NEQ[lev][m].pass[k];ii++)  */
/* 	    fprintf(E->fp_out,"%d %d\n",ii,E->parallel.EXCHANGE_ID[lev][m][ii].pass[k]);  */
/* 	  fprintf(E->fp_out,"Node:\n");  */
/* 	  for (ii=1;ii<=E->parallel.NUM_NODE[lev][m].pass[k];ii++)  */
/* 	    fprintf(E->fp_out,"%d %d\n",ii,E->parallel.EXCHANGE_NODE[lev][m][ii].pass[k]);  */
	}
      }

      fprintf(E->fp_out,"output_communication route vertical \n");
      fprintf(E->fp_out," me= %d pass  %d \n",E->parallel.me,E->parallel.TNUM_PASSz[lev]);
      for (k=1;k<=E->parallel.TNUM_PASSz[lev];k++)   {
	kkkp = k + E->sphere.max_connections;
	fprintf(E->fp_out,"proc %d and pass  %d to proc %d\n",E->parallel.me,k,E->parallel.PROCESSORz[lev].pass[k]);
	for (m=1;m<=E->sphere.caps_per_proc;m++)  {
	  fprintf(E->fp_out,"cap=%d eqn=%d node=%d\n",E->sphere.capid[m],E->parallel.NUM_NEQ[lev][m].pass[kkkp],E->parallel.NUM_NODE[lev][m].pass[kkkp]);
/* 	  for (ii=1;ii<=E->parallel.NUM_NEQ[lev][m].pass[kkkp];ii++) */
/* 	    fprintf(E->fp_out,"%d %d\n",ii,E->parallel.EXCHANGE_ID[lev][m][ii].pass[kkkp]); */
/* 	  for (ii=1;ii<=E->parallel.NUM_NODE[lev][m].pass[kkkp];ii++) */
/* 	    fprintf(E->fp_out,"%d %d\n",ii,E->parallel.EXCHANGE_NODE[lev][m][ii].pass[kkkp]); */
	}
      }
    }
    fflush(E->fp_out);
  }

  return;
  }


/* ============================================
 determine communication routes for
 exchanging info across the boundaries on the surfaces,
 assuming fault nodes are in the top row of processors
 ============================================ */

void full_parallel_communication_routs_s(E)
  struct All_variables *E;
  {

  int i,ii,j,k,l,node,el,elt,lnode,jj,doff;
  int lev,nno,nox,noz,noy,kkk,kk,kf;
  int me,m, nprocz;
  void parallel_process_termination();

  const int dims=E->mesh.nsd;

  /* This function is needed only for get_CBF_topo(), */
  /* which is obsolete.             Tan2 Feb. 24 2002 */
  return;


/*   me = E->parallel.me; */
/*   nprocz = E->parallel.nprocz; */

        /* determine the communications in horizontal direction        */
/*   for(lev=E->mesh.gridmax;lev>=E->mesh.gridmin;lev--)       { */
/*     nox = E->lmesh.NOX[lev]; */
/*     noz = E->lmesh.NOZ[lev]; */
/*     noy = E->lmesh.NOY[lev]; */

/*     for(m=1;m<=E->sphere.caps_per_proc;m++)    { */
/*       j = E->sphere.capid[m]; */

/*       for (kkk=1;kkk<=E->parallel.TNUM_PASS[lev][m];kkk++) { */
/*         if (kkk<=4) { */  /* first 4 communications are for planes */
/*           ii = kkk; */
/*           E->parallel.NUM_sNODE[lev][m].pass[kkk] =  */
/*                            E->parallel.NUM_NNO[lev][m].bound[ii]/noz; */

/*           for (k=1;k<=E->parallel.NUM_sNODE[lev][m].pass[kkk];k++)   { */
/*             lnode = k; */
/*             node = (E->parallel.NODE[lev][m][lnode].bound[ii]-1)/noz + 1; */
/*             E->parallel.EXCHANGE_sNODE[lev][m][k].pass[kkk] = node; */
/*             }  */ /* end for node k */
/*           }    */   /* end for first 4 communications */

/*         else  {    */      /* the last FOUR communications are for lines */
/*           E->parallel.NUM_sNODE[lev][m].pass[kkk]=1; */
/*           for (k=1;k<=E->parallel.NUM_sNODE[lev][m].pass[kkk];k++)   { */
/* 	    if (E->parallel.nprocx*E->parallel.nprocy > 1) { */ /* 4 or more horiz. proc*/
/* 	      switch(kkk) { */
/* 	      case 5: */
/* 		ii = 1; */
/* 		lnode = k; */
/* 		break; */
/* 	      case 6: */
/* 		ii = 2; */
/* 		lnode = k + (noy-1)*noz*noy; */
/* 		break; */
/* 	      case 7: */
/* 		ii = 1; */
/* 		lnode = k + (noy-1)*noz*noy; */
/* 		break; */
/* 	      case 8: */
/* 		ii = 2; */
/* 		lnode = k; */
/* 		break; */
/* 	      } */
/* 	    } */
/* 	    else { */ /* 1 or fewer horiz. processors per cap */
/* 	      ii = kkk-4; */
/* 	      if (j%3==2)  */       /* for the middle caps */
/* 		lnode = k + ((ii==1)?1:0)*(noy-1)*noz*noy; */
/* 	      else    */            /* for the caps linked to poles */
/* 		lnode = k + ((ii==1)?0:1)*(noy-1)*noz*noy; */
/* 	    } */



/*             node = (E->parallel.NODE[lev][m][lnode].bound[ii]-1)/noz + 1; */
/*             E->parallel.EXCHANGE_sNODE[lev][m][k].pass[kkk] = node; */
/*             } */  /* end for node k */
/*           } */  /* end for the last FOUR communications */

/*         }   */ /* end for kkk  */
/*       }    *//* end for m  */

/*     }   */ /* end for lev  */

/*   return; */
  }



/* ================================================ */
/* ================================================ */

static void face_eqn_node_to_pass(E,lev,m,npass,bd)
  struct All_variables *E;
  int lev,m,npass,bd;
{
  int jj,kk,node,doff;
  const int dims=E->mesh.nsd;

  E->parallel.NUM_NODE[lev][m].pass[npass] = E->parallel.NUM_NNO[lev][m].bound[bd];

  jj = 0;
  for (kk=1;kk<=E->parallel.NUM_NODE[lev][m].pass[npass];kk++)   {
    node = E->parallel.NODE[lev][m][kk].bound[bd];
    E->parallel.EXCHANGE_NODE[lev][m][kk].pass[npass] = node;
    for(doff=1;doff<=dims;doff++)
      E->parallel.EXCHANGE_ID[lev][m][++jj].pass[npass] = E->ID[lev][m][node].doff[doff];
  }

  E->parallel.NUM_NEQ[lev][m].pass[npass] = jj;

  return;
}

/* ================================================ */
/* ================================================ */

static void line_eqn_node_to_pass(E,lev,m,npass,num_node,offset,stride)
  struct All_variables *E;
  int lev,m,npass,num_node,offset,stride;
{
  int jj,kk,node,doff;
  const int dims=E->mesh.nsd;

  E->parallel.NUM_NODE[lev][m].pass[npass] = num_node;

  jj=0;
  for (kk=1;kk<=E->parallel.NUM_NODE[lev][m].pass[npass];kk++)   {
    node = (kk-1)*stride + offset;
    E->parallel.EXCHANGE_NODE[lev][m][kk].pass[npass] = node;
    for(doff=1;doff<=dims;doff++)
      E->parallel.EXCHANGE_ID[lev][m][++jj].pass[npass] = E->ID[lev][m][node].doff[doff];
  }

  E->parallel.NUM_NEQ[lev][m].pass[npass] = jj;

  return;
}

/* ================================================
WARNING: BUGS AHEAD

   for (m=1;m<=E->sphere.caps_per_proc;m++)    {
     for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)  {

       sizeofk = (1+E->parallel.NUM_NEQ[lev][m].pass[k])*sizeof(double);
       S[k]=(double *)malloc( sizeofk );
       R[k]=(double *)malloc( sizeofk );
       }
      }

This piece of code contains a bug: the arrays S and R are reallocated for
each m, so the buffers allocated for all but the last m are leaked.

In this version of CitcomS, sphere.caps_per_proc is always equal to one,
so the bug won't manifest itself. But in other versions of CitcomS, it will.

by Tan2 7/21, 2003
================================================ */
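
/* For comparison: exchange_node_d() further below allocates and frees its S[]
   and R[] buffers with a cumulative counter kk (one slot per (m,k) pair), so
   the pointers are never overwritten across m.  A leak-free variant of the
   allocation quoted above would follow the same pattern, e.g. (illustrative
   sketch only):

     kk = 0;
     for (m=1;m<=E->sphere.caps_per_proc;m++)
       for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)  {
         ++kk;
         sizeofk = (1+E->parallel.NUM_NEQ[lev][m].pass[k])*sizeof(double);
         S[kk] = (double *) malloc( sizeofk );
         R[kk] = (double *) malloc( sizeofk );
       }
*/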

void full_exchange_id_d(E, U, lev)
 struct All_variables *E;
 double **U;
 int lev;
 {

 int ii,j,jj,m,k,kk,t_cap,idb,msginfo[8];
 double *S[73],*R[73], *RV, *SV;
 int mid_recv, sizeofk;

 MPI_Status status[100];
 MPI_Status status1;
 MPI_Request request[100];

 for (m=1;m<=E->sphere.caps_per_proc;m++)    {
   for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)  {
     sizeofk = (1+E->parallel.NUM_NEQ[lev][m].pass[k])*sizeof(double);
     S[k]=(double *)malloc( sizeofk );
     R[k]=(double *)malloc( sizeofk );
   }
 }

 sizeofk = 0;
 for (k=1;k<=E->parallel.TNUM_PASSz[lev];k++)  {
   kk = (1+E->parallel.NUM_NEQz[lev].pass[k])*sizeof(double);
   sizeofk = max(sizeofk, kk);
 }
 RV=(double *)malloc( sizeofk );
 SV=(double *)malloc( sizeofk );

  idb=0;
  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)     {

      for (j=1;j<=E->parallel.NUM_NEQ[lev][m].pass[k];j++) {
        S[k][j-1] = U[m][ E->parallel.EXCHANGE_ID[lev][m][j].pass[k] ];
	}

      if (E->parallel.PROCESSOR[lev][m].pass[k] != E->parallel.me &&
	  E->parallel.PROCESSOR[lev][m].pass[k] != -1) {
	  idb ++;
          MPI_Isend(S[k], E->parallel.NUM_NEQ[lev][m].pass[k], MPI_DOUBLE,
		    E->parallel.PROCESSOR[lev][m].pass[k], 1,
		    E->parallel.world, &request[idb-1]);
      }
    }           /* for k */
  }     /* for m */         /* finish sending */

  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {

      if (E->parallel.PROCESSOR[lev][m].pass[k] != E->parallel.me &&
	  E->parallel.PROCESSOR[lev][m].pass[k] != -1) {
         idb++;
	 MPI_Irecv(R[k],E->parallel.NUM_NEQ[lev][m].pass[k], MPI_DOUBLE,
		   E->parallel.PROCESSOR[lev][m].pass[k], 1,
		   E->parallel.world, &request[idb-1]);
      }
      else {
	for (j=1;j<=E->parallel.NUM_NEQ[lev][m].pass[k];j++)
           U[m][ E->parallel.EXCHANGE_ID[lev][m][j].pass[k] ] += S[k][j-1];
      }
    }      /* for k */
  }     /* for m */         /* finish receiving */

  MPI_Waitall(idb,request,status);

  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {

      if (E->parallel.PROCESSOR[lev][m].pass[k] != E->parallel.me &&
	  E->parallel.PROCESSOR[lev][m].pass[k] != -1) {
	for (j=1;j<=E->parallel.NUM_NEQ[lev][m].pass[k];j++)
	  U[m][ E->parallel.EXCHANGE_ID[lev][m][j].pass[k] ] += R[k][j-1];
      }
    }
  }

  /* for vertical direction  */

  for (k=1;k<=E->parallel.TNUM_PASSz[lev];k++)  {
    jj = 0;
    kk = k + E->sphere.max_connections;

    for(m=1;m<=E->sphere.caps_per_proc;m++)
      for (j=1;j<=E->parallel.NUM_NEQ[lev][m].pass[kk];j++)
        SV[jj++] = U[m][ E->parallel.EXCHANGE_ID[lev][m][j].pass[kk] ];

    MPI_Sendrecv(SV, E->parallel.NUM_NEQz[lev].pass[k], MPI_DOUBLE,
		 E->parallel.PROCESSORz[lev].pass[k], 1,
                 RV, E->parallel.NUM_NEQz[lev].pass[k], MPI_DOUBLE,
		 E->parallel.PROCESSORz[lev].pass[k], 1,
		 E->parallel.world, &status1);

    jj = 0;
    for(m=1;m<=E->sphere.caps_per_proc;m++)
      for (j=1;j<=E->parallel.NUM_NEQ[lev][m].pass[kk];j++)
        U[m][ E->parallel.EXCHANGE_ID[lev][m][j].pass[kk] ] += RV[jj++];
  }

 for (m=1;m<=E->sphere.caps_per_proc;m++)    {
   for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)  {
     free((void*) S[k]);
     free((void*) R[k]);
   }
 }

 free((void*) SV);
 free((void*) RV);

 return;
 }


/* ================================================ */
/* ================================================ */
static void exchange_node_d(E, U, lev)
 struct All_variables *E;
 double **U;
 int lev;
 {

 int ii,j,jj,m,k,kk,t_cap,idb,msginfo[8];
 double *S[73],*R[73], *RV, *SV;
 int mid_recv, sizeofk;

 MPI_Status status[100];
 MPI_Status status1;
 MPI_Request request[100];

 kk=0;
 for (m=1;m<=E->sphere.caps_per_proc;m++)    {
   for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)  {
     ++kk;
     sizeofk = (1+E->parallel.NUM_NODE[lev][m].pass[k])*sizeof(double);
     S[kk]=(double *)malloc( sizeofk );
     R[kk]=(double *)malloc( sizeofk );
   }
 }

 idb= 0;
 for (k=1;k<=E->parallel.TNUM_PASSz[lev];k++)  {
   sizeofk = (1+E->parallel.NUM_NODEz[lev].pass[k])*sizeof(double);
   idb = max(idb,sizeofk);
 }

 RV=(double *)malloc( idb );
 SV=(double *)malloc( idb );

  idb=0;
  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)     {
      kk=k;

      for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[k];j++)
        S[kk][j-1] = U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[k] ];

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me) {
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {
         idb ++;
        MPI_Isend(S[kk],E->parallel.NUM_NODE[lev][m].pass[k],MPI_DOUBLE,
             E->parallel.PROCESSOR[lev][m].pass[k],1,E->parallel.world,&request[idb-1]);
	}
         }
      }           /* for k */
    }     /* for m */         /* finish sending */

  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {
      kk=k;

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me)  {
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {
         idb++;
         MPI_Irecv(R[kk],E->parallel.NUM_NODE[lev][m].pass[k],MPI_DOUBLE,
         E->parallel.PROCESSOR[lev][m].pass[k],1,E->parallel.world,&request[idb-1]);
         }
      }

      else   {
	kk=k;
         for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[k];j++)
           U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[k] ] += S[kk][j-1];
         }
      }      /* for k */
    }     /* for m */         /* finish receiving */

  MPI_Waitall(idb,request,status);

  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {
      kk=k;

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me)
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {
        for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[k];j++)
           U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[k] ] += R[kk][j-1];
      }
    }
    }

                /* for vertical direction  */

  for (k=1;k<=E->parallel.TNUM_PASSz[lev];k++)  {
    jj = 0;
    kk = k + E->sphere.max_connections;

    for(m=1;m<=E->sphere.caps_per_proc;m++)
      for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[kk];j++)
        SV[jj++] = U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[kk] ];

    MPI_Sendrecv(SV,E->parallel.NUM_NODEz[lev].pass[k],MPI_DOUBLE,
             E->parallel.PROCESSORz[lev].pass[k],1,
                 RV,E->parallel.NUM_NODEz[lev].pass[k],MPI_DOUBLE,
             E->parallel.PROCESSORz[lev].pass[k],1,E->parallel.world,&status1);

    jj = 0;
    for(m=1;m<=E->sphere.caps_per_proc;m++)
      for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[kk];j++)
        U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[kk] ] += RV[jj++];
    }

  kk = 0;
 for (m=1;m<=E->sphere.caps_per_proc;m++)    {
   for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)  {
     kk++;
     free((void*) S[kk]);
     free((void*) R[kk]);
   }
 }

 free((void*) SV);
 free((void*) RV);

 return;
}

/* ================================================ */
/* ================================================ */

static void exchange_node_f(E, U, lev)
 struct All_variables *E;
 float **U;
 int lev;
 {

 int ii,j,jj,m,k,kk,t_cap,idb,msginfo[8];

 float *S[73],*R[73], *RV, *SV;
 int mid_recv, sizeofk;

 MPI_Status status[100];
 MPI_Status status1;
 MPI_Request request[100];

 kk=0;
 for (m=1;m<=E->sphere.caps_per_proc;m++)    {
   for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)  {
     ++kk;
     sizeofk = (1+E->parallel.NUM_NODE[lev][m].pass[k])*sizeof(float);
     S[kk]=(float *)malloc( sizeofk );
     R[kk]=(float *)malloc( sizeofk );
   }
 }

 idb= 0;
 for (k=1;k<=E->parallel.TNUM_PASSz[lev];k++)  {
   sizeofk = (1+E->parallel.NUM_NODEz[lev].pass[k])*sizeof(float);
   idb = max(idb,sizeofk);
 }

 RV=(float *)malloc( idb );
 SV=(float *)malloc( idb );

  idb=0;
  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)     {
      kk=k;

      for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[k];j++)
        S[kk][j-1] = U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[k] ];

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me) {
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {
         idb ++;
        MPI_Isend(S[kk],E->parallel.NUM_NODE[lev][m].pass[k],MPI_FLOAT,
             E->parallel.PROCESSOR[lev][m].pass[k],1,E->parallel.world,&request[idb-1]);
	}
         }
      }           /* for k */
    }     /* for m */         /* finish sending */

  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {
      kk=k;

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me)  {
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {
         idb++;
         MPI_Irecv(R[kk],E->parallel.NUM_NODE[lev][m].pass[k],MPI_FLOAT,
         E->parallel.PROCESSOR[lev][m].pass[k],1,E->parallel.world,&request[idb-1]);
         }
      }

      else   {
	kk=k;
         for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[k];j++)
           U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[k] ] += S[kk][j-1];
         }
      }      /* for k */
    }     /* for m */         /* finish receiving */

  MPI_Waitall(idb,request,status);

  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {
      kk=k;

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me)
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {
        for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[k];j++)
           U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[k] ] += R[kk][j-1];
      }
    }
    }

                /* for vertical direction  */

  for (k=1;k<=E->parallel.TNUM_PASSz[lev];k++)  {
    jj = 0;
    kk = k + E->sphere.max_connections;

    for(m=1;m<=E->sphere.caps_per_proc;m++)
      for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[kk];j++)
        SV[jj++] = U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[kk] ];

    MPI_Sendrecv(SV,E->parallel.NUM_NODEz[lev].pass[k],MPI_FLOAT,
             E->parallel.PROCESSORz[lev].pass[k],1,
                 RV,E->parallel.NUM_NODEz[lev].pass[k],MPI_FLOAT,
             E->parallel.PROCESSORz[lev].pass[k],1,E->parallel.world,&status1);

    jj = 0;
    for(m=1;m<=E->sphere.caps_per_proc;m++)
      for (j=1;j<=E->parallel.NUM_NODE[lev][m].pass[kk];j++)
        U[m][ E->parallel.EXCHANGE_NODE[lev][m][j].pass[kk] ] += RV[jj++];
    }

  kk = 0;
 for (m=1;m<=E->sphere.caps_per_proc;m++)    {
   for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)  {
     kk++;
     free((void*) S[kk]);
     free((void*) R[kk]);
   }
 }

 free((void*) SV);
 free((void*) RV);

 return;
 }
/* ================================================ */
/* ================================================ */

static void exchange_snode_f(E, U1, U2, lev)
 struct All_variables *E;
 float **U1,**U2;
 int lev;
 {

 int ii,j,k,m,kk,t_cap,idb,msginfo[8];
 float *S[73],*R[73];
 int mid_recv, sizeofk;

 MPI_Status status[100];
 MPI_Status status1;
 MPI_Request request[100];

   kk=0;
   for (m=1;m<=E->sphere.caps_per_proc;m++)    {
     for (k=1;k<=E->parallel.TNUM_PASS[E->mesh.levmax][m];k++)  {
       ++kk;
       sizeofk = (1+2*E->parallel.NUM_sNODE[E->mesh.levmax][m].pass[k])*sizeof(float);
       S[kk]=(float *)malloc( sizeofk );
       R[kk]=(float *)malloc( sizeofk );
       }
     }

  idb=0;
  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)     {
      kk=k;

      for (j=1;j<=E->parallel.NUM_sNODE[lev][m].pass[k];j++)  {
        S[kk][j-1] = U1[m][ E->parallel.EXCHANGE_sNODE[lev][m][j].pass[k] ];
        S[kk][j-1+E->parallel.NUM_sNODE[lev][m].pass[k]]
                   = U2[m][ E->parallel.EXCHANGE_sNODE[lev][m][j].pass[k] ];
        }

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me) {
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {
         idb ++;
         MPI_Isend(S[kk],2*E->parallel.NUM_sNODE[lev][m].pass[k],MPI_FLOAT,
             E->parallel.PROCESSOR[lev][m].pass[k],1,E->parallel.world,&request[idb-1]);
         }
      }
      }           /* for k */
    }     /* for m */         /* finish sending */

  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {
      kk=k;

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me)  {
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {

         idb ++;
         MPI_Irecv(R[kk],2*E->parallel.NUM_sNODE[lev][m].pass[k],MPI_FLOAT,
           E->parallel.PROCESSOR[lev][m].pass[k],1,E->parallel.world,&request[idb-1]);
         }
      }

      else   {
	kk=k;
         for (j=1;j<=E->parallel.NUM_sNODE[lev][m].pass[k];j++)     {
           U1[m][ E->parallel.EXCHANGE_sNODE[lev][m][j].pass[k] ] += S[kk][j-1];
           U2[m][ E->parallel.EXCHANGE_sNODE[lev][m][j].pass[k] ] +=
                               S[kk][j-1+E->parallel.NUM_sNODE[lev][m].pass[k]];
           }
         }
      }      /* for k */
    }     /* for m */         /* finish receiving */

  MPI_Waitall(idb,request,status);

  for (m=1;m<=E->sphere.caps_per_proc;m++)   {
    for (k=1;k<=E->parallel.TNUM_PASS[lev][m];k++)   {
      kk=k;

      if (E->parallel.PROCESSOR[lev][m].pass[k]!=E->parallel.me)
	if (E->parallel.PROCESSOR[lev][m].pass[k]!=-1) {
        for (j=1;j<=E->parallel.NUM_sNODE[lev][m].pass[k];j++)    {
           U1[m][ E->parallel.EXCHANGE_sNODE[lev][m][j].pass[k] ] += R[kk][j-1];
           U2[m][ E->parallel.EXCHANGE_sNODE[lev][m][j].pass[k] ] +=
                              R[kk][j-1+E->parallel.NUM_sNODE[lev][m].pass[k]];
           }
	}
      }
  }

  kk=0;
  for (m=1;m<=E->sphere.caps_per_proc;m++)    {
    for (k=1;k<=E->parallel.TNUM_PASS[E->mesh.levmax][m];k++)  {
      ++kk;
      free((void*) S[kk]);
      free((void*) R[kk]);
    }
  }

 return;
 }

