https://github.com/hadley/dplyr
Raw File
Tip revision: 16647fc0e6fa2d2905a11efe08fbca2db0ec4df1 authored by Romain Francois on 22 July 2020, 12:05:00 UTC
Release summary
Tip revision: 16647fc
bind.r
#' Efficiently bind multiple data frames by row and column
#'
#' This is an efficient implementation of the common pattern of
#' `do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many
#' data frames into one.
#'
#' The output of `bind_rows()` will contain a column if that column
#' appears in any of the inputs.
#'
#' @param ... Data frames to combine.
#'
#'   Each argument can either be a data frame, a list that could be a data
#'   frame, or a list of data frames.
#'
#'   When row-binding, columns are matched by name, and any missing
#'   columns will be filled with NA.
#'
#'   When column-binding, rows are matched by position, so all data
#'   frames must have the same number of rows. To match by value, not
#'   position, see [mutate-joins].
#' @param .id Data frame identifier.
#'
#'   When `.id` is supplied, a new column of identifiers is
#'   created to link each row to its original data frame. The labels
#'   are taken from the named arguments to `bind_rows()`. When a
#'   list of data frames is supplied, the labels are taken from the
#'   names of the list. If no names are found a numeric sequence is
#'   used instead.
#' @return `bind_rows()` and `bind_cols()` return the same type as
#'   the first input, either a data frame, `tbl_df`, or `grouped_df`.
#' @examples
#' one <- starwars[1:4, ]
#' two <- starwars[9:12, ]
#'
#' # You can supply data frames as arguments:
#' bind_rows(one, two)
#'
#' # The contents of lists are spliced automatically:
#' bind_rows(list(one, two))
#' bind_rows(split(starwars, starwars$homeworld))
#' bind_rows(list(one, two), list(two, one))
#'
#'
#' # In addition to data frames, you can supply vectors. In the rows
#' # direction, the vectors represent rows and should have inner
#' # names:
#' bind_rows(
#'   c(a = 1, b = 2),
#'   c(a = 3, b = 4)
#' )
#'
#' # You can mix vectors and data frames:
#' bind_rows(
#'   c(a = 1, b = 2),
#'   tibble(a = 3:4, b = 5:6),
#'   c(a = 7, b = 8)
#' )
#'
#'
#' # When you supply a column name with the `.id` argument, a new
#' # column is created to link each row to its original data frame
#' bind_rows(list(one, two), .id = "id")
#' bind_rows(list(a = one, b = two), .id = "id")
#' bind_rows("group 1" = one, "group 2" = two, .id = "groups")
#'
#' # Columns don't need to match when row-binding
#' bind_rows(tibble(x = 1:3), tibble(y = 1:4))
#' \dontrun{
#' # Rows do need to match when column-binding
#' bind_cols(tibble(x = 1:3), tibble(y = 1:2))
#'
#' # even with 0 columns
#' bind_cols(tibble(x = 1:3), tibble())
#' }
#'
#' bind_cols(one, two)
#' bind_cols(list(one, two))
#' @name bind
NULL

#' @export
#' @rdname bind
bind_rows <- function(..., .id = NULL) {
  dots <- list2(...)

  # bind_rows() has weird legacy squashing behaviour
  is_flattenable <- function(x) vec_is_list(x) && !is_named(x)
  if (length(dots) == 1 && is_bare_list(dots[[1]])) {
    dots <- dots[[1]]
  }
  dots <- flatten_if(dots, is_flattenable)
  dots <- discard(dots, is.null)

  if (is_named(dots) && !all(map_lgl(dots, dataframe_ish))) {
    # This is hit by map_dfr() so we can't easily deprecate
    return(as_tibble(dots))
  }

  for (i in seq_along(dots)) {
    .x <- dots[[i]]
    if (!is.data.frame(.x) && !vec_is(.x)) {
      abort(glue("Argument {i} must be a data frame or a named atomic vector."))
    }

    if (is.null(names(.x))) {
      abort(glue("Argument {i} must have names."))
    }
  }

  if (!is_null(.id)) {
    if (!is_string(.id)) {
      bad_args(".id", "must be a scalar string, ",
        "not {friendly_type_of(.id)} of length {length(.id)}."
      )
    }
    if (!is_named(dots)) {
      names(dots) <- seq_along(dots)
    }
  }

  if (!length(dots)) {
    return(tibble())
  }

  first <- dots[[1L]]
  dots <- map(dots, function(.x) {
    if (vec_is_list(.x)) {
      .x <- new_data_frame(as.list(.x))
    }
    .x
  })

  if (is.null(.id)) {
    names(dots) <- NULL
  }
  out <- vec_rbind(!!!dots, .names_to = .id)
  if (length(dots)) {
    if (is.data.frame(first)) {
      out <- dplyr_reconstruct(out, first)
    } else {
      out <- as_tibble(out)
    }
  }
  out
}

#' @export
#' @rdname bind
bind_cols <- function(...) {
  dots <- list2(...)

  dots <- squash_if(dots, vec_is_list)
  dots <- discard(dots, is.null)

  # Strip names off of data frame components so that vec_cbind() unpacks them
  is_data_frame <- map_lgl(dots, is.data.frame)
  names(dots)[is_data_frame] <- ""

  out <- vec_cbind(!!!dots)
  if (!any(map_lgl(dots, is.data.frame))) {
    out <- as_tibble(out)
  }
  if (length(dots) && is.data.frame(first <- dots[[1L]])) {
    out <- dplyr_reconstruct(out, first)
  }
  out
}

# helpers -----------------------------------------------------------------

dataframe_ish <- function(.x) {
  is.data.frame(.x) || (vec_is(.x) && is_named(.x))
}
back to top