## https://github.com/cran/stringi
## Raw File
## Tip revision: 29ff8be9fae3ad316384d836756ecf06e3a9da8c authored by Marek Gagolewski on 28 June 2015, 00:00:00 UTC
## version 0.5-5
## Tip revision: 29ff8be
## compare.R
## This file is part of the 'stringi' package for R.
## Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#' @title
#' Compare Strings with or without Collation
#'
#' @description
#' These functions may be used to determine if two strings
#' are equal, canonically equivalent (this is performed in a much more clever
#' fashion than when testing for equality),
#' or to check whether they appear in
#' a specific lexicographic order.
#'
#'
#' @details
#' All the functions listed here are vectorized over \code{e1} and \code{e2}.
#'
#' \code{stri_cmp_eq} tests whether two corresponding strings
#' consist of exactly the same code points, while \code{stri_cmp_neq} allows
#' one to check whether there is any difference between them. These are
#' locale-independent operations: for natural language text processing,
#' in which the notion of canonical equivalence is more valid, this might
#' not be exactly what you are looking for, see Examples.
#' By the way, note that \pkg{stringi} always silently removes UTF-8
#' BOMs from input strings, so e.g. \code{stri_cmp_eq} does not take
#' BOMs into account while comparing strings.
#'
#' On the other hand, \code{stri_cmp_equiv} tests for
#' canonical equivalence of two strings and is locale-dependent.
#' Additionally, the \pkg{ICU}'s Collator may be tuned up so that
#' e.g. the comparison is case-insensitive.
#' To test whether two strings are not canonically equivalent,
#' call \code{stri_cmp_nequiv}.
#'
#' What is more, \code{stri_cmp_le} tests whether
#' the elements in the first vector are less than or equal to
#' the corresponding elements in the second vector,
#' \code{stri_cmp_ge} tests whether they are greater or equal,
#' \code{stri_cmp_lt} if less, and \code{stri_cmp_gt} if greater,
#' see also e.g. \code{\link{\%s<\%}}.
#'
#' Finally, \code{stri_compare} is an alias to \code{stri_cmp}. They both
#' perform exactly the same locale-dependent operation.
#' Both functions provide a C library's \code{strcmp()} look-and-feel,
#' see Value for details.
#'
#'
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#' Please note that different locale settings may lead to different results
#' (see the examples below).
#'
#'
#' @param e1,e2 character vectors or objects coercible to character vectors
#' @param opts_collator a named list with \pkg{ICU} Collator's options
#' as generated with \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options.
#' @param ... additional settings for \code{opts_collator}
#'
#' @return The \code{stri_cmp} and \code{stri_compare} functions
#' return an integer vector with comparison results of corresponding
#' pairs of elements in \code{e1} and \code{e2}:
#' \code{-1} if \code{e1[...] < e2[...]},
#' \code{0} if they are canonically equivalent, and \code{1} if greater.
#'
#' The other functions return a logical vector that indicates
#' whether a given relation holds between two corresponding elements
#' in \code{e1} and \code{e2}.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{http://userguide.icu-project.org/collation}
#'
#' @examples
#' # in Polish ch < h:
#' stri_cmp_lt("hladny", "chladny", locale="pl_PL")
#'
#' # in Slovak ch > h:
#' stri_cmp_lt("hladny", "chladny", locale="sk_SK")
#'
#' # < or > (depends on locale):
#' stri_cmp("hladny", "chladny")
#'
#' # ignore case differences:
#' stri_cmp_equiv("hladny", "HLADNY", strength=2)
#'
#' # also ignore diacritical differences:
#' stri_cmp_equiv("hladn\u00FD", "hladny", strength=1, locale="sk_SK")
#'
#' # non-Unicode-normalized vs normalized string:
#' stri_cmp_equiv(stri_trans_nfkd("\u0105"), "\u0105")
#'
#' # note the difference:
#' stri_cmp_eq(stri_trans_nfkd("\u0105"), "\u0105")
#'
#' # ligatures:
#' stri_cmp_equiv("\ufb00", "ff", strength=2)
#'
#' # phonebook collation
#' stri_cmp_equiv("G\u00e4rtner", "Gaertner", locale="de_DE@@collation=phonebook", strength=1L)
#' stri_cmp_equiv("G\u00e4rtner", "Gaertner", locale="de_DE", strength=1L)
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_compare
stri_compare <- function(e1, e2, ..., opts_collator=NULL) {
   # Collator settings supplied through `...` are concatenated with
   # `opts_collator` and handed over to stri_opts_collator(); its result
   # replaces `opts_collator`. Without them, `opts_collator` is used as given.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # The comparison itself is carried out by the compiled routine.
   .Call(C_stri_cmp_integer, e1, e2, opts_collator)
}


#' @export
#' @rdname stri_compare
stri_cmp <- stri_compare  # alias: bound to the very same function object as stri_compare


#' @export
#' @rdname stri_compare
stri_cmp_eq <- function(e1, e2) {
   # Comparison is delegated to the compiled code point-based routine;
   # no collator options are involved here.
   # The trailing 0L is a mode flag read by the C routine; stri_cmp_neq
   # passes 1L instead -- presumably a "negate the result" switch
   # (TODO confirm against the C source).
   mode_flag <- 0L
   .Call(C_stri_cmp_codepoints, e1, e2, mode_flag)
}


#' @export
#' @rdname stri_compare
stri_cmp_neq <- function(e1, e2) {
   # Same compiled code point-based routine as used by stri_cmp_eq, but
   # called with mode flag 1L rather than 0L -- presumably requesting the
   # negated result (TODO confirm against the C source).
   mode_flag <- 1L
   .Call(C_stri_cmp_codepoints, e1, e2, mode_flag)
}


#' @export
#' @rdname stri_compare
stri_cmp_equiv <- function(e1, e2, ..., opts_collator=NULL) {
   # Merge settings given via `...` with `opts_collator` and let
   # stri_opts_collator() build the option list that is passed on.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # Integer pair selecting the relation tested by the C routine;
   # presumably (comparison result to match, negate flag) -- TODO confirm.
   relation <- c(0L, 0L)
   .Call(C_stri_cmp_logical, e1, e2, opts_collator, relation)
}


#' @export
#' @rdname stri_compare
stri_cmp_nequiv <- function(e1, e2, ..., opts_collator=NULL) {
   # Merge settings given via `...` with `opts_collator` and let
   # stri_opts_collator() build the option list that is passed on.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # Same first selector as stri_cmp_equiv, second element set to 1L;
   # presumably (comparison result to match, negate flag) -- TODO confirm.
   relation <- c(0L, 1L)
   .Call(C_stri_cmp_logical, e1, e2, opts_collator, relation)
}

#' @export
#' @rdname stri_compare
stri_cmp_lt <- function(e1, e2, ..., opts_collator=NULL) {
   # Merge settings given via `...` with `opts_collator` and let
   # stri_opts_collator() build the option list that is passed on.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # Integer pair selecting the relation tested by the C routine;
   # presumably (comparison result to match, negate flag) -- TODO confirm.
   relation <- c(-1L, 0L)
   .Call(C_stri_cmp_logical, e1, e2, opts_collator, relation)
}


#' @export
#' @rdname stri_compare
stri_cmp_gt <- function(e1, e2, ..., opts_collator=NULL) {
   # Merge settings given via `...` with `opts_collator` and let
   # stri_opts_collator() build the option list that is passed on.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # Integer pair selecting the relation tested by the C routine;
   # presumably (comparison result to match, negate flag) -- TODO confirm.
   relation <- c(1L, 0L)
   .Call(C_stri_cmp_logical, e1, e2, opts_collator, relation)
}

#' @export
#' @rdname stri_compare
stri_cmp_le <- function(e1, e2, ..., opts_collator=NULL) {
   # Merge settings given via `...` with `opts_collator` and let
   # stri_opts_collator() build the option list that is passed on.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # Same first selector as stri_cmp_gt, second element set to 1L;
   # presumably "not greater than", i.e. a negated match -- TODO confirm.
   relation <- c(1L, 1L)
   .Call(C_stri_cmp_logical, e1, e2, opts_collator, relation)
}


#' @export
#' @rdname stri_compare
stri_cmp_ge <- function(e1, e2, ..., opts_collator=NULL) {
   # Merge settings given via `...` with `opts_collator` and let
   # stri_opts_collator() build the option list that is passed on.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # Same first selector as stri_cmp_lt, second element set to 1L;
   # presumably "not less than", i.e. a negated match -- TODO confirm.
   relation <- c(-1L, 1L)
   .Call(C_stri_cmp_logical, e1, e2, opts_collator, relation)
}


#' @title
#' Compare Strings with or without Collation
#'
#' @description
#' Relational operators for comparing corresponding strings in
#' two character vectors, with a typical \R look-and-feel.
#'
#' @details
#' These functions call \code{\link{stri_cmp_le}} or its
#' friends, using default collator options.
#' Thus, they are vectorized over \code{e1} and \code{e2}.
#'
#' \code{\%stri==\%} tests for canonical equivalence of strings
#' (see \code{\link{stri_cmp_equiv}}) and is a locale-dependent operation.
#' On the other hand, \code{\%stri===\%} performs a locale-independent,
#' code point-based comparison.
#'
#'
#' @param e1,e2 character vectors or objects coercible to character vectors
#'
#' @return All the functions return a logical vector
#' indicating the result of a pairwise comparison.
#' As usual, the elements of shorter vectors are recycled if necessary.
#'
#'
#' @examples
#' "a" %stri<% "b"
#' c("a", "b", "c") %stri>=% "b"
#'
#' @usage
#' e1 \%s<\% e2
#'
#' @family locale_sensitive
#' @rdname oper_comparison
#' @export
`%s<%` <- function(e1, e2) {
   # Infix form of stri_cmp_lt(); no collator settings are passed on,
   # so that function's defaults apply.
   stri_cmp_lt(e1=e1, e2=e2)
}


#' @usage
#' e1 \%s<=\% e2
#' @rdname oper_comparison
#' @export
`%s<=%` <- function(e1, e2) {
   # Infix form of stri_cmp_le(); no collator settings are passed on,
   # so that function's defaults apply.
   stri_cmp_le(e1=e1, e2=e2)
}


#' @usage
#' e1 \%s>\% e2
#' @rdname oper_comparison
#' @export
`%s>%` <- function(e1, e2) {
   # Infix form of stri_cmp_gt(); no collator settings are passed on,
   # so that function's defaults apply.
   stri_cmp_gt(e1=e1, e2=e2)
}


#' @usage
#' e1 \%s>=\% e2
#' @rdname oper_comparison
#' @export
`%s>=%` <- function(e1, e2) {
   # Infix form of stri_cmp_ge(); no collator settings are passed on,
   # so that function's defaults apply.
   stri_cmp_ge(e1=e1, e2=e2)
}


#' @usage
#' e1 \%s==\% e2
#' @rdname oper_comparison
#' @export
`%s==%` <- function(e1, e2) {
   # Infix form of stri_cmp_equiv(); no collator settings are passed on,
   # so that function's defaults apply.
   stri_cmp_equiv(e1=e1, e2=e2)
}


#' @usage
#' e1 \%s!=\% e2
#' @rdname oper_comparison
#' @export
`%s!=%` <- function(e1, e2) {
   # Infix form of stri_cmp_nequiv(); no collator settings are passed on,
   # so that function's defaults apply.
   stri_cmp_nequiv(e1=e1, e2=e2)
}


#' @usage
#' e1 \%s===\% e2
#' @rdname oper_comparison
#' @export
`%s===%` <- function(e1, e2) {
   # Infix form of stri_cmp_eq(), the code point-based comparison.
   stri_cmp_eq(e1=e1, e2=e2)
}


#' @usage
#' e1 \%s!==\% e2
#' @rdname oper_comparison
#' @export
`%s!==%` <- function(e1, e2) {
   # Infix form of stri_cmp_neq(), the code point-based comparison.
   stri_cmp_neq(e1=e1, e2=e2)
}


#' @usage
#' e1 \%stri<\% e2
#' @rdname oper_comparison
#' @export
`%stri<%` <- function(e1, e2) {
   # Long-named twin of the `%s<%` operator: forwards to stri_cmp_lt()
   # without passing any collator settings.
   stri_cmp_lt(e1=e1, e2=e2)
}


#' @usage
#' e1 \%stri<=\% e2
#' @rdname oper_comparison
#' @export
`%stri<=%` <- function(e1, e2) {
   # Long-named twin of the `%s<=%` operator: forwards to stri_cmp_le()
   # without passing any collator settings.
   stri_cmp_le(e1=e1, e2=e2)
}


#' @usage
#' e1 \%stri>\% e2
#' @rdname oper_comparison
#' @export
`%stri>%` <- function(e1, e2) {
   # Long-named twin of the `%s>%` operator: forwards to stri_cmp_gt()
   # without passing any collator settings.
   stri_cmp_gt(e1=e1, e2=e2)
}


#' @usage
#' e1 \%stri>=\% e2
#' @rdname oper_comparison
#' @export
`%stri>=%` <- function(e1, e2) {
   # Long-named twin of the `%s>=%` operator: forwards to stri_cmp_ge()
   # without passing any collator settings.
   stri_cmp_ge(e1=e1, e2=e2)
}


#' @usage
#' e1 \%stri==\% e2
#' @rdname oper_comparison
#' @export
`%stri==%` <- function(e1, e2) {
   # Long-named twin of the `%s==%` operator: forwards to stri_cmp_equiv()
   # without passing any collator settings.
   stri_cmp_equiv(e1=e1, e2=e2)
}


#' @usage
#' e1 \%stri!=\% e2
#' @rdname oper_comparison
#' @export
`%stri!=%` <- function(e1, e2) {
   # Long-named twin of the `%s!=%` operator: forwards to stri_cmp_nequiv()
   # without passing any collator settings.
   stri_cmp_nequiv(e1=e1, e2=e2)
}


#' @usage
#' e1 \%stri===\% e2
#' @rdname oper_comparison
#' @export
`%stri===%` <- function(e1, e2) {
   # Long-named twin of the `%s===%` operator: forwards to stri_cmp_eq(),
   # the code point-based comparison.
   stri_cmp_eq(e1=e1, e2=e2)
}


#' @usage
#' e1 \%stri!==\% e2
#' @rdname oper_comparison
#' @export
`%stri!==%` <- function(e1, e2) {
   # Long-named twin of the `%s!==%` operator: forwards to stri_cmp_neq(),
   # the code point-based comparison.
   stri_cmp_neq(e1=e1, e2=e2)
}


#' @title
#' Ordering Permutation and Sorting
#'
#'
#' @description
#' \code{stri_order} determines a permutation which rearranges
#' strings into an ascending or descending order.
#' \code{stri_sort} sorts the vector according to a lexicographic order.
#'
#'
#' @details
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' These functions use a stable sort algorithm (\pkg{STL}'s stable_sort),
#' which performs up to \eqn{N*log^2(N)} element comparisons,
#' where \eqn{N} is the length of \code{str}.
#'
#' Interestingly, our benchmarks indicate that \code{stri_order}
#' is most often faster than \R's \code{order}.
#'
#' @param str a character vector
#' @param decreasing a single logical value; should the sort order
#'    be nondecreasing (\code{FALSE}, default)
#'    or nonincreasing (\code{TRUE})?
#' @param na_last a single logical value; controls the treatment of \code{NA}s
#'    in \code{str}. If \code{TRUE}, then missing values in \code{str} are put
#'    at the end; if \code{FALSE}, they are put at the beginning;
#'    if \code{NA}, then they are removed from the output.
#' @param opts_collator a named list with \pkg{ICU} Collator's options
#' as generated with \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return For \code{stri_order}, an integer vector that gives the sort order
#' is returned.
#'
#' For \code{stri_sort}, you get a sorted version of \code{str},
#' i.e. a character vector.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{http://userguide.icu-project.org/collation}
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_order
#'
#' @examples
#' stri_sort(c("hladny", "chladny"), locale="pl_PL")
#'
#' stri_sort(c("hladny", "chladny"), locale="sk_SK")
stri_order <- function(str, decreasing=FALSE, na_last=TRUE, ..., opts_collator=NULL) {
   # Merge settings given via `...` with `opts_collator` and let
   # stri_opts_collator() build the option list that is passed on.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # stri_order and stri_sort share one compiled routine; the last argument
   # distinguishes them (1L here, 2L in stri_sort) -- presumably it selects
   # the kind of output produced (TODO confirm against the C source).
   output_kind <- 1L
   .Call(C_stri_order_or_sort, str, decreasing, na_last, opts_collator, output_kind)
}


#' @export
#' @rdname stri_order
stri_sort <- function(str, decreasing=FALSE, na_last=NA, ..., opts_collator=NULL) {
   # Merge settings given via `...` with `opts_collator` and let
   # stri_opts_collator() build the option list that is passed on.
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # stri_order and stri_sort share one compiled routine; the last argument
   # distinguishes them (2L here, 1L in stri_order) -- presumably it selects
   # the kind of output produced (TODO confirm against the C source).
   output_kind <- 2L
   .Call(C_stri_order_or_sort, str, decreasing, na_last, opts_collator, output_kind)
}


#' @title Extract Unique Elements
#'
#' @description
#' This function returns a character vector like \code{str},
#' but with duplicate elements removed.
#'
#' @details
#' As usual in \pkg{stringi}, no attributes are copied.
#' Unlike \code{\link{unique}}, this function
#' tests for canonical equivalence of strings (and not
#' whether the strings are just bytewise equal). Such an operation
#' is locale-dependent. Hence, \code{stri_unique} is significantly
#' slower (but much better suited for natural language processing)
#' than its base R counterpart.
#'
#' See also \code{\link{stri_duplicated}} for indicating non-unique elements.
#'
#' @param str a character vector
#' @param opts_collator a named list with \pkg{ICU} Collator's options
#' as generated with \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return Returns a character vector.
#'
#' @examples
#' # normalized and non-Unicode-normalized version of the same code point:
#' stri_unique(c("\u0105", stri_trans_nfkd("\u0105")))
#' unique(c("\u0105", stri_trans_nfkd("\u0105")))
#'
#' stri_unique(c("gro\u00df", "GROSS", "Gro\u00df", "Gross"), strength=1)
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{http://userguide.icu-project.org/collation}
#'
#' @family locale_sensitive
#' @export
stri_unique <- function(str, ..., opts_collator=NULL) {
   # Collator settings supplied through `...` are concatenated with
   # `opts_collator` and turned into an option list by stri_opts_collator().
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # Duplicate removal is performed by the compiled routine.
   .Call(C_stri_unique, str, opts_collator)
}


#' @title
#' Determine Duplicated Elements
#'
#' @description
#' \code{stri_duplicated()} determines which strings in a character vector
#' are duplicates of other elements.
#'
#' \code{stri_duplicated_any()} determines if there are any duplicated
#' strings in a character vector.
#'
#' @details
#' Missing values are regarded as equal.
#'
#' Unlike \code{\link{duplicated}} and \code{\link{anyDuplicated}},
#' these functions test for canonical equivalence of strings
#' (and not whether the strings are just bytewise equal).
#' Such operations are locale-dependent.
#' Hence, \code{stri_duplicated} and \code{stri_duplicated_any}
#' are significantly slower (but much better suited for natural language
#' processing) than their base R counterparts.
#'
#' See also \code{\link{stri_unique}} for extracting unique elements.
#'
#' @param str a character vector
#' @param fromLast a single logical value
#'    indicating whether duplication should be considered from the
#'    reverse side
#' @param opts_collator a named list with \pkg{ICU} Collator's options
#' as generated with \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return
#' \code{stri_duplicated()} returns a logical vector of the same length
#' as \code{str}. Each of its elements indicates whether a canonically
#' equivalent string was already found in \code{str}.
#'
#' \code{stri_duplicated_any()} returns a single non-negative integer.
#' Value of 0 indicates that all the elements in \code{str} are unique.
#' Otherwise, it gives the index of the first non-unique element.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{http://userguide.icu-project.org/collation}
#'
#' @examples
#' # In the following examples, we have 3 duplicated values,
#' # "a" - 2 times, NA - 1 time
#' stri_duplicated(c("a", "b", "a", NA, "a", NA))
#' stri_duplicated(c("a", "b", "a", NA, "a", NA), fromLast=TRUE)
#' stri_duplicated_any(c("a", "b", "a", NA, "a", NA))
#'
#' # compare the results:
#' stri_duplicated(c("\u0105", stri_trans_nfkd("\u0105")))
#' duplicated(c("\u0105", stri_trans_nfkd("\u0105")))
#'
#' stri_duplicated(c("gro\u00df", "GROSS", "Gro\u00df", "Gross"), strength=1)
#' duplicated(c("gro\u00df", "GROSS", "Gro\u00df", "Gross"))
#'
#' @rdname stri_duplicated
#' @family locale_sensitive
#' @export
stri_duplicated <- function(str, fromLast=FALSE, ..., opts_collator=NULL) {
   # Collator settings supplied through `...` are concatenated with
   # `opts_collator` and turned into an option list by stri_opts_collator().
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # Detection of duplicates is performed by the compiled routine.
   .Call(C_stri_duplicated, str, fromLast, opts_collator)
}


#' @rdname stri_duplicated
#' @family locale_sensitive
#' @export
stri_duplicated_any <- function(str, fromLast=FALSE, ..., opts_collator=NULL) {
   # Collator settings supplied through `...` are concatenated with
   # `opts_collator` and turned into an option list by stri_opts_collator().
   if (!missing(...)) {
      collator_args <- as.list(c(opts_collator, ...))
      opts_collator <- do.call(stri_opts_collator, collator_args)
   }

   # The search for a duplicate is performed by the compiled routine.
   .Call(C_stri_duplicated_any, str, fromLast, opts_collator)
}
## back to top