Content - 58590e90fb07cf5cf511f181ae0d3015e0dbba89 - a9a8ae3/fast_divmod.h

fast_divmod.h
//
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include <cstdio>
#include <cassert>
#include <cmath>

namespace Microsoft { namespace MSR { namespace CNTK {

namespace detail {

__host__ __device__ __forceinline__
int mulhi(const int M, const int n) {
#ifdef __CUDA_ARCH__
  return __mulhi(M, n);
#else
  return (((unsigned long long)((long long)M * (long long)n)) >> 32);
#endif
}

}

// Based on code from Chapter 10 of "Hacker's Delight, 2nd ed."
class fast_divmod {
 public:
  fast_divmod(int d = 1) : d_(d), a_(0) {
    find_magic_numbers();
  }
  __host__ __device__
  fast_divmod(const fast_divmod& other) : d_(other.d_), a_(other.a_), s_(other.s_), M_(other.M_) {};
  __host__ __device__
  int div(int n) {
    // get high 32 bits of M * n
    int q = detail::mulhi(M_, n);
  
    // deal with add / subs if needed
    q += a_ * n;

    // shift if necessary
    if (s_ >= 0) {
      q >>= s_;
      q += ((unsigned int)q >> 31);
    }

    return q;
  }
  __host__ __device__
  void divmod(int n, int& q, int& r) {
    // handle special cases
    if (d_ == 1) {
      q = n;
      r = 0;
    } else if (d_ == -1) {
      q = -n;
      r = 0;
    } else {
      // general case
      q = div(n);
      r = n - q*d_;
    }
  }
  // Needed for overflow checks in FixedArray / FixedMatrix
  bool operator!=(const fast_divmod& other) {
    return (d_ != other.d_) || (M_ != other.M_) || (s_ != other.s_) || (a_ != other.a_);
  }

 public:
  int d_, M_, s_, a_;

 private:
  // Based on code from Hacker's delight 2.ed
  // Chapter 10, figure 10-1
  void find_magic_numbers() {
    // special case for d = 1, -1
    if (d_ == 1) {
      M_ = 0;
      s_ = 0;
      a_ = 1;
      return;
    } else if (d_ == -1) {
      M_ = 0;
      s_ = -1;
      a_ = -1;
      return;
    }
    // general case
    const unsigned two31 = 0x80000000;
    unsigned abs_d = (d_ == 0) ? 1 : abs(d_);
    unsigned t = two31 + ((unsigned)d_ >> 31); // t = 2^31 + (d < 0) ? 1 : 0
    unsigned abs_nc = t - 1 - (t % abs_d); // |n_c| = t - 1 - rem(t, |d|)
    int p = 31;
    unsigned q1 = two31 / abs_nc;      // Init q_1 = 2^31 / |n_c|
    unsigned r1 = two31 - q1 * abs_nc; // Init r_1 = rem(q_1, |n_c|)
    unsigned q2 = two31 / abs_d;       // Init q_2 = 2^31 / |d|
    unsigned r2 = two31 - q2 * abs_d;  // Init r_2 = rem(q_2, |d|)

    unsigned delta;
    // iterate p until
    // 2^p < n_c * (d - rem(2^p, d)) is satisfied
    do {
      ++p;
      q1 *= 2;
      r1 *= 2;

      if (r1 >= abs_nc) {
        q1 += 1;
        r1 -= abs_nc;
      }
      q2 *= 2;
      r2 *= 2;

      if (r2 >= abs_d) {
        q2 += 1;
        r2 -= abs_d;
      }
      delta = abs_d - r2;
    } while (q1 < delta ||
             (q1 == delta && r1 == 0));

    // store magic numbers
    M_ = q2 + 1;
    if (d_ < 0) M_ = -M_;
    s_ = p - 32;

    // generate sentinel for correct adds / subs
    // "generate the add if d > 0 and M < 0"
    if ((d_ > 0) && (M_ < 0)) a_ = 1;
    //  "generate the sub if d < 0 and M > 0"
    else if ((d_ < 0) && (M_ > 0)) a_ = -1;
    // Otherwise no add / sub needed
    else a_ = 0;
  }
};

}}}  // End namespaces