https://gitlab.inria.fr/pm2/pm2
mpi_bench_generic.h (tip revision 28348cbb, authored by Alexandre Denis on 11 July 2022: "nmad: some more error-checking on remote queues")
/*
 * MadMPI benchmark
 * Copyright (C) 2015-2020 (see AUTHORS file)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */


#include <mpi.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include "mpi_sync_clocks.h"
#include "mpi_bench_config.h"

#ifndef MPI_BENCH_GENERIC_H
#define MPI_BENCH_GENERIC_H

#ifdef HAVE_HWLOC
#include <hwloc.h>
#endif /* HAVE_HWLOC */

#define TAG 0

#define MIN_DEFAULT         0
#define MAX_DEFAULT        (128 * 1024 * 1024)
#define MULT_DEFAULT        1.4
#define INCR_DEFAULT        0
#define LOOPS_DEFAULT_PARAM 50
#define LOOPS_DEFAULT       1000
#define PARAM_DEFAULT       -1

#define LOOPS_CALIBRATE     10000
#define USEC_CALIBRATE_BARRIER (1000 * 1000 * 5)

#define LOOPS_TIMEOUT_SECONDS 3
#define LOOPS_MAX_DATA        ((uint64_t)(512 * 1024 * 1024))

/* ********************************************************* */

/** parameters for a benchmark */
struct mpi_bench_param_s
{
  size_t start_len;
  size_t end_len;
  double multiplier;
  size_t increment;
  long long iterations;
  int    param;        /**< fixed parameter; -1 to use bounds */
};

/** bounds for parameterized benchmarks */
struct mpi_bench_param_bounds_s
{
  int min, max;
  double mult;
  int incr;
};

enum mpi_bench_rtt_e
  {
    MPI_BENCH_RTT_HALF   = 0, /**< display half roundtrip (assumed to be the one-way latency) */
    MPI_BENCH_RTT_FULL   = 1, /**< display full roundtrip (either directly meaningful, or to be post-processed) */
    MPI_BENCH_RTT_SUBLAT = 2, /**< display roundtrip minus ack latency */
    MPI_BENCH_RTT_COLLECTIVE = 3, /**< display max roundtrip time across all nodes (using sync clocks) */
    _MPI_BENCH_RTT_LAST
  };

struct mpi_bench_s
{
  const char*label;
  const char*name;
  const enum mpi_bench_rtt_e rtt; /**< whether to output round-trip time or half-rtt (one-way latency) */
  const int threads; /**< whether we need MPI_THREAD_MULTIPLE */
  const int collective; /**< whether the operation is collective (display results only on node 0) */
  void (*server)(void*buf, size_t len);
  void (*client)(void*buf, size_t len);
  void (*init)(void*buf, size_t len); /**< called before a round with a given set of param+size */
  void (*finalize)(void);      /**< called at the end of a round for a given param+size */
  void (*setparam)(int param); /**< set a new param */
  void (*endparam)(void);      /**< called at the end of a round for a given param */
  const char*param_label;      /**< label of parameter */
  const struct mpi_bench_param_bounds_s*(*getparams)(void);
};

void mpi_bench_init(int*argc, char***argv, int threads);
void mpi_bench_run(const struct mpi_bench_s*mpi_bench, const struct mpi_bench_param_s*params);
void mpi_bench_finalize(void);
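
/* Usage sketch (illustrative only, not an actual MadMPI benchmark): a
 * benchmark module fills a struct mpi_bench_s with its callbacks and a
 * driver calls mpi_bench_init(), mpi_bench_run() and mpi_bench_finalize().
 * The names my_server, my_client, my_bench and the main() body below are
 * assumptions for illustration; real benchmarks live in their own .c files.
 *
 *   static void my_server(void*buf, size_t len)
 *   {
 *     MPI_Recv(buf, (int)len, MPI_CHAR, mpi_bench_common.peer, TAG,
 *              mpi_bench_common.comm, MPI_STATUS_IGNORE);
 *     MPI_Send(buf, (int)len, MPI_CHAR, mpi_bench_common.peer, TAG,
 *              mpi_bench_common.comm);
 *   }
 *   static void my_client(void*buf, size_t len)
 *   {
 *     MPI_Send(buf, (int)len, MPI_CHAR, mpi_bench_common.peer, TAG,
 *              mpi_bench_common.comm);
 *     MPI_Recv(buf, (int)len, MPI_CHAR, mpi_bench_common.peer, TAG,
 *              mpi_bench_common.comm, MPI_STATUS_IGNORE);
 *   }
 *   static const struct mpi_bench_s my_bench =
 *     {
 *       .label = "my_pingpong", .name = "illustrative ping-pong",
 *       .rtt = MPI_BENCH_RTT_HALF, .threads = 0, .collective = 0,
 *       .server = &my_server, .client = &my_client
 *     };
 *   int main(int argc, char**argv)
 *   {
 *     const struct mpi_bench_param_s params =
 *       { .start_len = MIN_DEFAULT,    .end_len   = MAX_DEFAULT,
 *         .multiplier = MULT_DEFAULT,  .increment = INCR_DEFAULT,
 *         .iterations = LOOPS_DEFAULT, .param     = PARAM_DEFAULT };
 *     mpi_bench_init(&argc, &argv, my_bench.threads);
 *     mpi_bench_run(&my_bench, &params);
 *     mpi_bench_finalize();
 *     return 0;
 *   }
 */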


/* ********************************************************* */

/** common variables shared between init, main, and individual benchmarks */
struct mpi_bench_common_s
{
  int self, peer, size;
  int is_server;
  MPI_Comm comm;
};

extern struct mpi_bench_common_s mpi_bench_common;

/* ** Compute ********************************************** */

#define MIN_COMPUTE 0
#define MAX_COMPUTE 20000
#define MULT_COMPUTE 1.4

#define MIN_COMPUTE_COLL 0
#define MAX_COMPUTE_COLL 500000
#define MULT_COMPUTE_COLL 2

/** dummy global accumulator so the compiler cannot optimize the busy loop away */
static volatile double r = 1.0;

/** dummy computation of a given time */
static void mpi_bench_do_compute(int usec) __attribute__((unused));
static void mpi_bench_do_compute(int usec)
{
  sync_clocks_generic_tick_t t1, t2;
  double delay = 0.0;
  sync_clocks_generic_get_tick(t1);
  while(delay < usec)
    {
      int k;
      for(k = 0; k < 10; k++)
	{
	  r = (r * 1.1) + 2.213890 - k;
	}
      sync_clocks_generic_get_tick(t2);
      delay = sync_clocks_generic_ticks2delay(&t1, &t2);
    }
}
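
/* Sketch of a parameterized benchmark driven by the compute bounds above
 * (illustrative; my_getparams, my_setparam and my_client are hypothetical
 * names): getparams() advertises the parameter range, setparam() records the
 * current compute time, and the client overlaps that computation with an
 * outstanding send.
 *
 *   static int compute_usec = 0;
 *   static const struct mpi_bench_param_bounds_s compute_bounds =
 *     { .min = MIN_COMPUTE, .max = MAX_COMPUTE, .mult = MULT_COMPUTE, .incr = 1 };
 *   static const struct mpi_bench_param_bounds_s*my_getparams(void)
 *   {
 *     return &compute_bounds;
 *   }
 *   static void my_setparam(int param)
 *   {
 *     compute_usec = param;
 *   }
 *   static void my_client(void*buf, size_t len)
 *   {
 *     MPI_Request req;
 *     MPI_Isend(buf, (int)len, MPI_CHAR, mpi_bench_common.peer, TAG,
 *               mpi_bench_common.comm, &req);
 *     mpi_bench_do_compute(compute_usec); // busy-wait for compute_usec microseconds
 *     MPI_Wait(&req, MPI_STATUS_IGNORE);
 *   }
 */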

/** computation on variable-size vector */
static void mpi_bench_compute_vector(void*buf, size_t len) __attribute__((unused));
static void mpi_bench_compute_vector(void*buf, size_t len)
{
  unsigned char*m = buf;
  size_t i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for(i = 0; i < len; i++)
    {
      double v = (double)m[i];
      v = sqrt(v * v + 1.0);
      m[i] = (unsigned char)v;
    }
}

/* ** non-contiguous datatype ****************************** */

/** default blocksize for non-contiguous datatype */
#define MPI_BENCH_NONCONTIG_BLOCKSIZE 32

static void*noncontig_buf = NULL;
static size_t noncontig_bufsize = 0;
static MPI_Datatype noncontig_dtype = MPI_DATATYPE_NULL;

/** build a non-contiguous vector datatype (len/blocksize blocks of blocksize
 * chars with a stride of 2*blocksize) and an accordingly oversized buffer */
static inline void mpi_bench_noncontig_type_init(int blocksize, size_t len)
{
  noncontig_bufsize = len * 2 + blocksize;
  noncontig_buf = malloc(noncontig_bufsize);
  memset(noncontig_buf, 0, noncontig_bufsize);
  MPI_Type_vector(len / blocksize, blocksize, 2 * blocksize, MPI_CHAR, &noncontig_dtype);
  MPI_Type_commit(&noncontig_dtype);
}

static inline void mpi_bench_noncontig_type_destroy(void)
{
  if(noncontig_dtype != MPI_DATATYPE_NULL)
    {
      MPI_Type_free(&noncontig_dtype);
    }
  free(noncontig_buf);
  noncontig_buf = NULL;
  noncontig_bufsize = 0;
}
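
/* Usage sketch for the non-contiguous datatype (illustrative): the vector
 * type describes len/blocksize blocks of blocksize chars, each block
 * separated by a stride of 2*blocksize chars, which is why the buffer is
 * allocated with 2*len + blocksize bytes. Sending one instance of the type
 * transfers len bytes scattered across the buffer.
 *
 *   mpi_bench_noncontig_type_init(MPI_BENCH_NONCONTIG_BLOCKSIZE, len);
 *   MPI_Send(noncontig_buf, 1, noncontig_dtype, mpi_bench_common.peer, TAG,
 *            mpi_bench_common.comm);
 *   mpi_bench_noncontig_type_destroy();
 */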

/* ** Threads ********************************************** */

#define THREADS_MAX 512
#define THREADS_DEFAULT 16

/** Get the max number of threads to use */
int mpi_bench_get_threads(void);

/* ** ACKs ************************************************* */

void mpi_bench_ack_send(void);
void mpi_bench_ack_recv(void);
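
/* Sketch of how the ack helpers might pair with MPI_BENCH_RTT_SUBLAT
 * (illustrative; the exact accounting is done by the framework): the client
 * sends data and waits for an acknowledgement, the server receives and acks,
 * and the reported time is the roundtrip minus the ack latency.
 *
 *   static void my_server(void*buf, size_t len)
 *   {
 *     MPI_Recv(buf, (int)len, MPI_CHAR, mpi_bench_common.peer, TAG,
 *              mpi_bench_common.comm, MPI_STATUS_IGNORE);
 *     mpi_bench_ack_send();
 *   }
 *   static void my_client(void*buf, size_t len)
 *   {
 *     MPI_Send(buf, (int)len, MPI_CHAR, mpi_bench_common.peer, TAG,
 *              mpi_bench_common.comm);
 *     mpi_bench_ack_recv();
 *   }
 */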

#endif /* MPI_BENCH_GENERIC_H */