https://github.com/hadley/dplyr
Raw File
Tip revision: 98b8a0f5de25e238ac97514da24ec228610c8701 authored by Lionel Henry on 19 January 2021, 09:23:23 UTC
Merge pull request #5686 from lionel-/fix-warning-overhead
Tip revision: 98b8a0f
group_by.cpp
#include "dplyr.h"
#include <vector>

// support for expand_groups()
class ExpanderCollecter;

struct ExpanderResult {
  ExpanderResult(R_xlen_t start_, R_xlen_t end_, R_xlen_t index_) :
  start(start_),
  end(end_),
  index(index_)
  {}

  R_xlen_t start;
  R_xlen_t end;
  R_xlen_t index;

  inline R_xlen_t size() const {
    return end - start;
  }
};

class Expander {
public:
  virtual ~Expander() {};
  virtual R_xlen_t size() const = 0;
  virtual ExpanderResult collect(ExpanderCollecter& results, int depth) const = 0;
};

class ExpanderCollecter {
public:
  ExpanderCollecter(int nvars_, SEXP new_indices_, int new_size_, SEXP new_rows_, SEXP old_rows_) :
  nvars(nvars_),
  old_rows(old_rows_),
  new_size(new_size_),
  new_indices(new_indices_),
  new_rows(new_rows_),
  leaf_index(0),
  vec_new_indices(nvars)
  {
    Rf_classgets(new_rows, dplyr::vectors::classes_vctrs_list_of);
    Rf_setAttrib(new_rows, dplyr::symbols::ptype, dplyr::vectors::empty_int_vector);

    for (int i = 0; i < nvars; i++) {
      SEXP new_indices_i = Rf_allocVector(INTSXP, new_size);
      SET_VECTOR_ELT(new_indices, i, new_indices_i);
      vec_new_indices[i] = INTEGER(new_indices_i);
    }
  }

  ExpanderResult collect_leaf(R_xlen_t start, R_xlen_t end, R_xlen_t index) {
    if (start == end) {
      SET_VECTOR_ELT(new_rows, leaf_index++, dplyr::vectors::empty_int_vector);
    } else {
      SET_VECTOR_ELT(new_rows, leaf_index++, VECTOR_ELT(old_rows, start));
    }

    return ExpanderResult(leaf_index - 1, leaf_index, index);
  }

  ExpanderResult collect_node(int depth, R_xlen_t index, const std::vector<Expander*>& expanders) {
    int n = expanders.size();
    if (n == 0) {
      return ExpanderResult(NA_INTEGER, NA_INTEGER, index);
    }

    R_xlen_t nr = 0;

    ExpanderResult first = expanders[0]->collect(*this, depth + 1);
    R_xlen_t start = first.start;
    R_xlen_t end = first.end;
    fill_indices(depth, start, end, first.index);

    nr += first.size();

    for (R_xlen_t i = 1; i < n; i++) {
      ExpanderResult exp_i = expanders[i]->collect(*this, depth + 1);
      fill_indices(depth, exp_i.start, exp_i.end, exp_i.index);

      nr += exp_i.size();
      end = exp_i.end;
    }

    return ExpanderResult(start, end, index);
  }

private:
  int nvars;
  SEXP old_rows;
  R_xlen_t new_size;
  SEXP new_indices;
  SEXP new_rows;
  int leaf_index;

  std::vector<int*> vec_new_indices;

  void fill_indices(int depth, R_xlen_t start, R_xlen_t end, R_xlen_t index) {
    std::fill(vec_new_indices[depth] + start, vec_new_indices[depth] + end, index);
  }

  ExpanderCollecter(const ExpanderCollecter&);
};


Expander* expander(const std::vector<SEXP>& data, const std::vector<int*>& positions, int depth, R_xlen_t index, R_xlen_t start, R_xlen_t end);

inline R_xlen_t expanders_size(const std::vector<Expander*> expanders) {
  R_xlen_t n = 0;
  for (size_t i = 0; i < expanders.size(); i++) {
    n += expanders[i]->size();
  }
  return n;
}

class FactorExpander : public Expander {
public:
  FactorExpander(const std::vector<SEXP>& data_, const std::vector<int*>& positions_, int depth_, R_xlen_t index_, R_xlen_t start_, R_xlen_t end_) :
  data(data_),
  positions(positions_),
  index(index_),
  start(start_),
  end(end_)
  {
    SEXP fac = data[depth_];
    SEXP levels = PROTECT(Rf_getAttrib(fac, dplyr::symbols::levels));
    R_xlen_t n_levels = XLENGTH(levels);
    UNPROTECT(1);

    expanders.resize(n_levels);

    int* fac_pos = positions[depth_];

    // for each level, setup an expander for `depth + 1`
    R_xlen_t j = start;
    for (R_xlen_t i = 0; i < n_levels; i++) {
      R_xlen_t start_i = j;
      while (j < end && fac_pos[j] == i + 1) j++;
      expanders[i] = expander(data, positions, depth_ + 1, i + 1, start_i, j);
    }

    // implicit NA
    if (j < end) {
      expanders.push_back(expander(data, positions, depth_ + 1, NA_INTEGER, j, end));
    }
  }
  ~FactorExpander() {
    for (int i = expanders.size() - 1; i >= 0; i--) delete expanders[i];
  }

  virtual R_xlen_t size() const {
    return expanders_size(expanders);
  }

  ExpanderResult collect(ExpanderCollecter& results, int depth) const {
    return results.collect_node(depth, index, expanders);
  }

private:
  const std::vector<SEXP>& data;
  const std::vector<int*>& positions;
  R_xlen_t index;
  R_xlen_t start;
  R_xlen_t end;

  std::vector<Expander*> expanders;
};

class VectorExpander : public Expander {
public:
  VectorExpander(const std::vector<SEXP>& data_, const std::vector<int*>& positions_, int depth_, R_xlen_t index_, R_xlen_t start, R_xlen_t end) :
  index(index_)
  {
    // edge case no data, we need a fake expander with NA index
    if (start == end) {
      expanders.push_back(expander(data_, positions_, depth_ + 1, NA_INTEGER, start, end));
    } else {
      int* vec_pos = positions_[depth_];

      for (R_xlen_t j = start; j < end;) {
        R_xlen_t current = vec_pos[j];
        R_xlen_t start_idx = j;

        ++j;
        for (; j < end && vec_pos[j] == current; ++j);
        expanders.push_back(expander(data_, positions_, depth_ + 1, current, start_idx, j));
      }
    }

  }
  ~VectorExpander() {
    for (int i = expanders.size() - 1; i >= 0; i--) delete expanders[i];
  }

  virtual R_xlen_t size() const {
    return expanders_size(expanders);
  }

  ExpanderResult collect(ExpanderCollecter& results, int depth) const {
    return results.collect_node(depth, index, expanders);
  }

private:
  int index;
  std::vector<Expander*> expanders;
};

class LeafExpander : public Expander {
public:
  LeafExpander(const std::vector<SEXP>& data_, const std::vector<int*>& positions_, int depth_, int index_, int start_, int end_) :
  index(index_),
  start(start_),
  end(end_)
  {}

  ~LeafExpander() {}

  virtual R_xlen_t size() const {
    return 1;
  }

  ExpanderResult collect(ExpanderCollecter& results, int depth) const {
    return results.collect_leaf(start, end, index);
  }

private:
  R_xlen_t index;
  R_xlen_t start;
  R_xlen_t end;
};

Expander* expander(const std::vector<SEXP>& data, const std::vector<int*>& positions, int depth, R_xlen_t index, R_xlen_t start, R_xlen_t end) {
  if (depth == (int)positions.size()) {
    return new LeafExpander(data, positions, depth, index, start, end);
  } else if (Rf_isFactor(data[depth])) {
    return new FactorExpander(data, positions, depth, index, start, end);
  } else {
    return new VectorExpander(data, positions, depth, index, start, end);
  }
}

SEXP dplyr_expand_groups(SEXP old_groups, SEXP positions, SEXP s_nr) {
  int nr = INTEGER(s_nr)[0];
  R_xlen_t nvars = XLENGTH(old_groups) - 1;

  SEXP old_rows = VECTOR_ELT(old_groups, nvars);
  std::vector<SEXP> vec_data(nvars);
  std::vector<int*> vec_positions(nvars);
  for (R_xlen_t i = 0; i < nvars; i++) {
    vec_data[i] = VECTOR_ELT(old_groups, i);
    vec_positions[i] = INTEGER(VECTOR_ELT(positions, i));
  }

  Expander* exp = expander(vec_data, vec_positions, 0, NA_INTEGER, 0, nr);
  SEXP new_indices = PROTECT(Rf_allocVector(VECSXP, nvars));
  SEXP new_rows = PROTECT(Rf_allocVector(VECSXP, exp->size()));
  ExpanderCollecter results(nvars, new_indices, exp->size(), new_rows, old_rows);
  exp->collect(results, 0);

  SEXP out = PROTECT(Rf_allocVector(VECSXP, 2));
  SET_VECTOR_ELT(out, 0, new_indices);
  SET_VECTOR_ELT(out, 1, new_rows);
  delete exp;

  Rf_namesgets(out, dplyr::vectors::names_expanded);

  UNPROTECT(3);
  return out;
}

SEXP dplyr_validate_grouped_df(SEXP df, SEXP s_check_bounds) {
  if (!Rf_inherits(df, "grouped_df")) {
    return Rf_mkString("not a `grouped_df` object.");
  }

  SEXP groups = PROTECT(Rf_getAttrib(df, dplyr::symbols::groups));

  if (!Rf_inherits(groups, "data.frame") || XLENGTH(groups) < 1) {
    SEXP out = Rf_mkString("The `groups` attribute is not a data frame with its last column called `.rows`.");
    UNPROTECT(1);
    return out;
  }

  SEXP groups_names = PROTECT(Rf_getAttrib(groups, R_NamesSymbol));
  if (Rf_isNull(groups_names) || TYPEOF(groups_names) != STRSXP || ::strcmp(CHAR(STRING_ELT(groups_names, XLENGTH(groups_names) - 1)), ".rows")) {
    SEXP out = Rf_mkString("The `groups` attribute is not a data frame with its last column called `.rows`.");
    UNPROTECT(2);
    return out;
  }

  SEXP dot_rows = VECTOR_ELT(groups, XLENGTH(groups) - 1);
  if (TYPEOF(dot_rows) != VECSXP) {
    SEXP out = Rf_mkString("The `groups` attribute is not a data frame with its last column called `.rows`.");
    UNPROTECT(2);
    return out;
  }
  const SEXP* p_dot_rows = VECTOR_PTR_RO(dot_rows);

  if (LOGICAL(s_check_bounds)[0]) {
    R_xlen_t nr = XLENGTH(dot_rows);
    for (R_xlen_t i = 0; i < nr; i++) {
      SEXP rows_i = p_dot_rows[i];
      if (TYPEOF(rows_i) != INTSXP) {
        SEXP out = Rf_mkString("`.rows` column is not a list of one-based integer vectors.");
        UNPROTECT(2);
        return out;
      }
    }

    R_xlen_t nr_df = vctrs::short_vec_size(df);
    for (R_xlen_t i = 0; i < nr; i++) {
      SEXP rows_i = p_dot_rows[i];
      R_xlen_t n_i = XLENGTH(rows_i);
      int* p_rows_i = INTEGER(rows_i);
      for (R_xlen_t j = 0; j < n_i; j++, ++p_rows_i) {
        if (*p_rows_i < 1 || *p_rows_i > nr_df) {
          SEXP out = Rf_mkString("out of bounds indices.");
          UNPROTECT(2);
          return out;
        }
      }
    }

  }

  UNPROTECT(2);
  return R_NilValue;
}
back to top