Content - a6d2a63cf8640f6d3d33bf4e080c5158fe92e37c - 7b9445f/powerlawCommon.cc

powerlawCommon.cc
#include "powerlawCommon.h"

#include <gsl/gsl_statistics_double.h>

#include "random.h"

/**
 * @author: W.M. Otte (wim@invivonmr.uu.nl); Image Sciences Institute, UMC
 * Utrecht, NL.
 * @date: 19-11-2009
 *
 * Function implementations of powerlaw scaling parameter estimation.
 *
 * ***************************************************************************
 * Method: "Power-law distributions in empirical data", Clauset et al, 2009
 * http://www.santafe.edu/~aaronc/powerlaws/
 * ***************************************************************************
 */
namespace plfit {
/**
 *
 */

void Powerlaw::SingleFit(const VectorType& V, VectorType& results, bool nosmall,
                         bool finite, float startValue, float incrementValue,
                         float endValue) {
  // determine mle-type (discrete or continuous) ...
  bool discrete = IsDiscrete(V);

  if (discrete) {
    // std::cout << "*** INFO ***: Discrete maximum likelihood estimation." <<
    // std::endl;
    MleInt(V, nosmall, finite, startValue, incrementValue, endValue, results);

  } else {
    // std::cout << "*** INFO ***: Continuous maximum likelihood estimation." <<
    // std::endl;
    MleReal(V, nosmall, finite, results);
  }
}

/**
 *
 */

void Powerlaw::BootstrapFit(const VectorType& V, VectorType& results,
                            bool nosmall, bool finiteSize, ValueType startValue,
                            ValueType incrementValue, ValueType endValue,
                            unsigned int bootstrapIterations, bool verbose) {
  // determine mle-type (discrete or continuous) ...
  bool discrete = IsDiscrete(V);

  Bootstrap(V, nosmall, finiteSize, startValue, incrementValue, endValue,
            discrete, bootstrapIterations, results, verbose);
}

/**
 * Return true of all vector elements are integers.
 */

bool Powerlaw::IsDiscrete(const VectorType& V) {
  typename VectorType::const_iterator new_end =
      std::find_if(V.begin(), V.end(), floating_point<ValueType>());
  return (V.end() == new_end);
}

/**
 * Remove duplicated values from vector and return unique values vector.
 */

void Powerlaw::Unique(const VectorType& V, VectorType& results) {
  VectorType tmp;

  std::copy(V.begin(), V.end(), std::back_inserter(tmp));

  // first sort...
  std::sort(tmp.begin(), tmp.end());

  // determine end point of unique vector elements...
  typename VectorType::iterator new_end = std::unique(tmp.begin(), tmp.end());

  // iterator...
  typename VectorType::iterator it = tmp.begin();

  // copy all unique elements in return vector...
  while (it != new_end) {
    results.push_back(*it);
    it++;
  }
}

/**
 * Remove last element of given vector.
 */

void Powerlaw::RemoveLastElement(VectorType& V) {
  V.erase(V.end() - 1, V.end());
}

/**
 * Return sorted inport vector.
 */

void Powerlaw::Sort(const VectorType& V, VectorType& W) {
  std::copy(V.begin(), V.end(), std::back_inserter(W));
  std::sort(W.begin(), W.end());
}

/**
 *
 */

void Powerlaw::KeepHigherOrEqual(VectorType& V, ValueType x) {
  typename VectorType::iterator new_end = std::remove_if(
      V.begin(), V.end(),
      std::bind(std::less<ValueType>(), std::placeholders::_1, x));

  V.erase(new_end, V.end());
}

/**
 *
 */

void Powerlaw::KeepLowerOrEqual(VectorType& V, ValueType x) {
  typename VectorType::iterator new_end = std::remove_if(
      V.begin(), V.end(),
      std::bind(std::greater_equal<ValueType>(), std::placeholders::_1, x));

  V.erase(new_end, V.end());
}

/**
 * Return incremental vector.
 */

void Powerlaw::GetIncrementVector(ValueType start, ValueType increment,
                                  ValueType end, VectorType& V) {
  int n = (end - start) / increment;

  if (n <= 0) {
    std::cerr << "*** WARNING ***: Increment vector is set to size: 0!"
              << std::endl;
  }

  for (; start <= end; start += increment) {
    V.push_back(start);
  }
}

/**
 * Return standard deviation (sqrt variance).
 */

ValueType Powerlaw::GetSD(const std::vector<ValueType>& V) {
  return gsl_stats_sd(V.data(), 1, V.size());
}

/**
 * Return cumulative sum of input vector.
 */

void Powerlaw::CumulativeSum(const VectorType& V, VectorType& W) {
  std::copy(V.begin(), V.end(), std::back_inserter(W));

  for (unsigned int i = 1; i < V.size(); i++) {
    W[i] += W[i - 1];
  }
}

/**
 * Return uniform random values vector from inputs, with similar size.
 */

void Powerlaw::GetRandomValue(const VectorType& inputs, VectorType& results) {
  for (unsigned int i = 0; i < inputs.size(); i++) {
    results.push_back(inputs[randint((int)inputs.size())]);
  }
}

/**
 * Discrete Maximum likelihood estimation.
 */

void Powerlaw::MleInt(const VectorType& x, bool nosmall, bool finiteSize,
                      ValueType startValue, ValueType increment,
                      ValueType endValue, VectorType& results) {
  VectorType vec;
  GetIncrementVector(startValue, increment, endValue, vec);
  bool finiteSizeMessagePrinted = false;

  VectorType zvec(vec.size());
  std::transform(vec.begin(), vec.end(), zvec.begin(), zeta());

  VectorType xmins;
  Unique(x, xmins);

  RemoveLastElement(xmins);

  // first and second column of data matrix...
  VectorType dat1(xmins.size());
  VectorType dat2(xmins.size());

  VectorType sorted_x;

  Sort(x, sorted_x);

  ValueType Y = 0;
  ValueType xmax = *(std::max_element(sorted_x.begin(), sorted_x.end()));

  for (unsigned int xm = 0; xm < xmins.size(); xm++) {
    ValueType xmin = xmins[xm];

    VectorType z(sorted_x);

    KeepHigherOrEqual(z, xmin);

    ValueType n = (ValueType)z.size();

    // fill L with -Inf
    VectorType L(vec.size(), -std::numeric_limits<ValueType>::infinity());

    // use copy of z, because z is used again later...
    VectorType tmp(z);
    std::transform(z.begin(), z.end(), tmp.begin(), log<ValueType>());
    ValueType slogz =
        std::accumulate(tmp.begin(), tmp.end(), static_cast<ValueType>(0));

    // xminvec = (1:xmin-1) ...
    VectorType xminvec_root(xmin - 1);
    for (int i = 0; i < xminvec_root.size(); ++i) xminvec_root[i] = 1 + i;

    for (unsigned int k = 0; k < vec.size(); k++) {
      VectorType xminvec(xminvec_root);

      ValueType exp = vec[k];

      // xminvec.^-vec(k)...
      std::transform(
          xminvec.begin(), xminvec.end(), xminvec.begin(),
          std::bind(power<ValueType>(), std::placeholders::_1, -exp));

      // sum( xminvec.^-vec(k))...
      ValueType sum = std::accumulate(xminvec.begin(), xminvec.end(),
                                      static_cast<ValueType>(0));

      // log-likelihood
      L[k] = -exp * slogz - n * std::log(zvec[k] - sum);
    }

    typename VectorType::iterator max_it = std::max_element(L.begin(), L.end());

    Y = *(max_it);
    unsigned int I = (unsigned int)std::distance(L.begin(), max_it);

    //  compute KS statistic

    ValueType exp = vec[I];

    // xmin:xmax ...
    VectorType xmin_xmax(xmax - (xmin - 1));
    for (int i = 0; i < xmin_xmax.size(); ++i) xmin_xmax[i] = xmin + i;

    // first_part = ( ( ( xmin:xmax ).^-exp ) ) ...
    VectorType first_part(xmin_xmax.size());

    std::transform(xmin_xmax.begin(), xmin_xmax.end(), first_part.begin(),
                   std::bind(power<ValueType>(), std::placeholders::_1, -exp));

    // second_part = (zvec(I) - sum( ( 1:xmin-1 ).^-exp ) ) ...

    // 1:xmin - 1 ...
    VectorType pp(xmin);
    for (int i = 0; i < pp.size(); ++i) pp[i] = 1 + i;
    RemoveLastElement(pp);

    // ( 1:xmin - 1 ) .^ - exp ...
    std::transform(pp.begin(), pp.end(), pp.begin(),
                   std::bind(power<ValueType>(), std::placeholders::_1, -exp));

    // sum( ( 1:xmin -1 ) .^ - exp ) ...
    ValueType sum_sp =
        std::accumulate(pp.begin(), pp.end(), static_cast<ValueType>(0));

    // zvec[I] - sum( ( 1:xmin -1 ) .^ - exp ) ...
    ValueType second_part = zvec[I] - sum_sp;

    // first_part / second_part ...
    std::transform(first_part.begin(), first_part.end(), first_part.begin(),
                   std::bind(std::divides<ValueType>(), std::placeholders::_1,
                             second_part));

    // fit = cumsum( first_part /. second_part ); ==
    // fit = cumsum((((xmin:xmax).^-vec(I)))./ (zvec(I) -
    // sum((1:xmin-1).^-vec(I)))) ...
    VectorType fit;
    CumulativeSum(first_part, fit);

    // normalized histogram ...
    Histogram hist(z, xmin, xmax, (xmax - xmin), true);
    VectorType histogram = hist.getHistogram();

    //  cdi = cumsum(hist(z, xmin:xmax)./n) ...
    VectorType cdi;
    CumulativeSum(histogram, cdi);

    std::transform(fit.begin(), fit.end(), cdi.begin(), fit.begin(),
                   std::minus<ValueType>());
    std::transform(fit.begin(), fit.end(), fit.begin(), abs<ValueType>());

    // dat(xm,:) = [max(abs( fit - cdi )) vec(I)] ...
    dat1[xm] = *(std::max_element(fit.begin(), fit.end()));
    dat2[xm] = vec[I];
  }

  // select the index for the minimum value of D
  // [ D, I ] = min( dat( :, 1 ) ) ...
  typename VectorType::iterator min_it =
      std::min_element(dat1.begin(), dat1.end());

  unsigned int I = (unsigned int)std::distance(dat1.begin(), min_it);
  ValueType xmin = xmins[I];

  // z = x(x>=xmin);
  // n = length(z);
  VectorType z(x);
  KeepHigherOrEqual(z, xmin);
  ValueType n = (ValueType)z.size();

  // alpha = dat( I, 2 ) ...
  ValueType alpha = dat2[I];

  // finite-size correction
  if (finiteSize) {
    alpha =
        alpha * (static_cast<ValueType>(n) - 1) / static_cast<ValueType>(n) +
        1 / static_cast<ValueType>(n);
  }

  if (!finiteSize && (n < 50) && (!finiteSizeMessagePrinted)) {
    std::cout << "*** WARNING ***: finite-size bias may be present!"
              << std::endl;
    finiteSizeMessagePrinted = true;
  }

  // L = -alpha * sum( log( z ) ) - n * log( zvec( find( vec <= alpha, 1 ,
  // 'last' ) ) - sum((1:xmin-1).^-alpha));

  // 1:xmin - 1 ...
  VectorType pp(xmin);
  for (int i = 0; i < pp.size(); ++i) pp[i] = 1 + i;
  RemoveLastElement(pp);
  // ( 1:xmin - 1 ) .^ - alpha ...
  std::transform(pp.begin(), pp.end(), pp.begin(),
                 std::bind(power<ValueType>(), std::placeholders::_1, -alpha));
  // sum( ( 1:xmin -1 ) .^ - alpha ) ...
  ValueType sum_third_part =
      std::accumulate(pp.begin(), pp.end(), static_cast<ValueType>(0));

  std::reverse(vec.begin(), vec.end());
  typename VectorType::iterator it_last = std::find_if(
      vec.begin(), vec.end(),
      std::bind(std::less_equal<ValueType>(), std::placeholders::_1, alpha));
  unsigned int index = (unsigned int)std::distance(vec.begin(), it_last) + 1;
  // n * log( zvec( find( vec <= alpha, 1 , 'last' ) ) -
  // sum((1:xmin-1).^-alpha)) ...
  ValueType sum_second_part =
      n * std::log(zvec[vec.size() - index] - sum_third_part);

  // -alpha * sum( log( z ) ) ...
  std::transform(z.begin(), z.end(), z.begin(), log<ValueType>());
  ValueType sum_first_part =
      -alpha * std::accumulate(z.begin(), z.end(), static_cast<ValueType>(0));

  results.push_back(alpha);
  results.push_back(xmin);
  results.push_back(sum_first_part - sum_second_part);
}

/**
 * Continues Maximum likelihood estimation.
 */

void Powerlaw::MleReal(const VectorType& x, bool nosmall, bool finiteSize,
                       VectorType& results) {
  VectorType xmins;
  Unique(x, xmins);
  bool finiteSizeMessagePrinted = false;

  RemoveLastElement(xmins);

  VectorType dat(xmins.size(), 0);

  VectorType sorted_x;

  Sort(x, sorted_x);

  for (unsigned int xm = 0; xm < xmins.size(); xm++) {
    VectorType z(sorted_x);

    ValueType xmin = xmins[xm];

    KeepHigherOrEqual(z, xmin);
    VectorType tmp(z);  // backup for later in fuction...

    ValueType n = (ValueType)z.size();

    // estimate alpha using direct MLE
    std::transform(
        z.begin(), z.end(), z.begin(),
        std::bind(log_div<ValueType>(), std::placeholders::_1, xmin));

    ValueType a =
        static_cast<ValueType>(n) /
        std::accumulate(z.begin(), z.end(), static_cast<ValueType>(0));

    if (nosmall) {
      if ((static_cast<ValueType>(a) - 1) / sqrt(static_cast<ValueType>(n)) >
          0.1) {
        dat.erase(dat.begin() + xm, dat.end());
        xm = (unsigned int)xmins.size() + 1;
        break;
      }
    }

    // compute KS statistic
    VectorType cx(n);
    for (int i = 0; i < cx.size(); ++i) cx[i] = i;
    std::transform(
        cx.begin(), cx.end(), cx.begin(),
        std::bind(std::divides<ValueType>(), std::placeholders::_1, n));

    // cf = xmin / z ...
    VectorType cf(tmp.size(), xmin);
    std::transform(cf.begin(), cf.end(), tmp.begin(), cf.begin(),
                   std::divides<ValueType>());

    // cf = 1 - ( xmin / z ) ^ a ...
    std::transform(
        cf.begin(), cf.end(), cf.begin(),
        std::bind(power_minus_one<ValueType>(), std::placeholders::_1, a));

    // max( abs( cf - cx ) ) ...
    std::transform(cf.begin(), cf.end(), cx.begin(), cf.begin(),
                   std::minus<ValueType>());
    dat[xm] = *(std::max_element(cf.begin(), cf.end()));
  }

  // D = min(dat) ...
  ValueType D = *(std::min_element(dat.begin(), dat.end()));

  // xmin  = xmins( find( dat <= D, 1, 'first' ) ) ...
  typename VectorType::iterator new_end = std::find_if(
      dat.begin(), dat.end(),
      std::bind(std::less_equal<ValueType>(), std::placeholders::_1, D));

  ValueType xmin = xmins[std::distance(dat.begin(), new_end)];

  // z = x( x >= xmin ) ...
  VectorType z(x);
  KeepHigherOrEqual(z, xmin);

  unsigned int n = (unsigned int)z.size();

  // alpha = 1 + n ./ sum( log(z./xmin) ) ...
  VectorType tmp(z.size(), xmin);
  std::transform(z.begin(), z.end(), tmp.begin(), z.begin(),
                 log_div<ValueType>());
  ValueType sum =
      std::accumulate(z.begin(), z.end(), static_cast<ValueType>(0));
  ValueType alpha = 1.0 + z.size() / sum;

  // finite-size correction
  if (finiteSize) {
    alpha =
        alpha * (static_cast<ValueType>(n) - 1) / static_cast<ValueType>(n) +
        1 / static_cast<ValueType>(n);
  }

  if (!finiteSize && (n < 50) && (!finiteSizeMessagePrinted)) {
    std::cout << "*** WARNING ***: finite-size bias may be present!"
              << std::endl;
    finiteSizeMessagePrinted = true;
  }

  // log-likelihood: L = n*log((alpha-1)/xmin) - alpha.*sum(log(z./xmin));
  ValueType L = n * std::log((alpha - 1) / xmin) - alpha * sum;

  results.push_back(alpha);
  results.push_back(xmin);
  results.push_back(L);
}

/**
 * Run bootstrapping of mle n-times and return Vector with indices:
 *
 * 0: average alpha
 * 1: average xmin
 * 2: average L
 *
 * 3: sd alpha
 * 4: sd xmin
 * 5: sd L
 *
 * Empty VectorType is returned when data is corrupt.
 */

void Powerlaw::Bootstrap(const VectorType& inputs, bool nosmall,
                         bool finiteSize, ValueType startValue,
                         ValueType increment, ValueType endValue, bool discrete,
                         unsigned int n, VectorType& results, bool verbose) {
  VectorType all_alpha;
  VectorType all_xmin;
  VectorType all_L;

  if (verbose)
    std::cout << "*** INFO ***: boostrapping done (%): " << std::endl;

  unsigned int successfulBootstraps = 0;

  for (unsigned int i = 0; i < n; i++) {
    if (verbose) {
      std::cout << (static_cast<ValueType>(i) / static_cast<ValueType>(n)) *
                       100;
      std::cout.flush();
      std::cout << '\r';
    }

    VectorType random_inputs;
    GetRandomValue(inputs, random_inputs);

    VectorType run;

    Mle(random_inputs, nosmall, finiteSize, startValue, increment, endValue,
        discrete, run);

    if (!run.empty()) {
      all_alpha.push_back(run[0]);
      all_xmin.push_back(run[1]);
      all_L.push_back(run[2]);
      successfulBootstraps++;
    }
  }

  if (successfulBootstraps != n) {
    ValueType p = (static_cast<ValueType>(successfulBootstraps) /
                   static_cast<ValueType>(n)) *
                  100;
    std::cerr << "*** WARNING ***: bootstrapping only ran partially "
                 "-> ("
              << p << " %)." << std::endl;
  }

  if (!all_alpha.empty()) {
    ValueType average_alpha =
        std::accumulate(all_alpha.begin(), all_alpha.end(),
                        static_cast<ValueType>(0)) /
        all_alpha.size();
    ValueType average_xmin = std::accumulate(all_xmin.begin(), all_xmin.end(),
                                             static_cast<ValueType>(0)) /
                             all_xmin.size();
    ;
    ValueType average_L =
        std::accumulate(all_L.begin(), all_L.end(), static_cast<ValueType>(0)) /
        all_L.size();
    ;

    ValueType sd_alpha = GetSD(all_alpha);
    ValueType sd_xmin = GetSD(all_xmin);
    ValueType sd_L = GetSD(all_L);

    results.push_back(average_alpha);
    results.push_back(average_xmin);
    results.push_back(average_L);

    results.push_back(sd_alpha);
    results.push_back(sd_xmin);
    results.push_back(sd_L);
  }
}

/**
 * Maximum likelihood estimation.
 *
 * If input is not correct an empty VectorType will be returned!
 */

void Powerlaw::Mle(const VectorType& inputs, bool nosmall, bool finiteSize,
                   ValueType startValue, ValueType increment,
                   ValueType endValue, bool discrete, VectorType& results) {
  // check if at all inputs are not identical ...
  if (std::max_element(inputs.begin(), inputs.end()) ==
      std::min_element(inputs.begin(), inputs.end())) {
    return;
  }

  if (discrete) {
    if (startValue > 1.0)
      MleInt(inputs, nosmall, finiteSize, startValue, increment, endValue,
             results);

    else
      std::cerr << "*** ERROR ***: start-value should be higher than 1.0!"
                << std::endl;

  } else
    MleReal(inputs, nosmall, finiteSize, results);
}
}  // namespace plfit