diff --git a/CHANGELOG b/CHANGELOG index c55a998..8715cf4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,8 @@ +2.1.0 - Thomas J. Leeper + +- Added `unf.array()` method. (#18) +- Added `sort` argument to `unf()` to optionally not sort column UNF hashes. (#18) + 2.0.5 - Thomas J. Leeper - Exported new function `unf_equal()` to better comply with R CMD check. diff --git a/DESCRIPTION b/DESCRIPTION index 6de4d3b..c95254e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: UNF -Version: 2.0.6 +Version: 2.1.0 Title: Tools for Creating Universal Numeric Fingerprints for Data -Date: 2016-08-09 +Date: 2017-05-08 Authors@R: c(person("Thomas", "Leeper", role = c("aut","cre"), email = "thosjleeper@gmail.com"), person("Micah", "Altman", role = c("aut"))) @@ -23,4 +23,4 @@ License: GPL-2 URL: https://github.com/leeper/UNF BugReports: https://github.com/leeper/UNF/issues VignetteBuilder: knitr -RoxygenNote: 5.0.1 +RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index d142bff..af0f27f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ S3method(as.unfvector,ts) S3method(print,UNF) S3method(print,UNFtest) S3method(print,unfvector) +S3method(unf,array) S3method(unf,data.frame) S3method(unf,default) S3method(unf,list) diff --git a/R/unf.R b/R/unf.R index c0f17cb..4c4e206 100644 --- a/R/unf.R +++ b/R/unf.R @@ -2,6 +2,7 @@ #' @description UNF is a cryptographic hash or signature that can be used to uniquely identify (a version of) a dataset, or a subset thereof. #' @param x For \code{unf}, a vector, matrix, dataframe, or list; for \code{unf3}, \code{unf4}, \code{unf5}, a vector. If \code{x} is a dataframe or list with one variable or one vector element, respectively, \code{unf} returns the UNF for the single vector (which is consistent with the Dataverse implementation but ambiguous in the UNF standard). For algorithm versions < 5, all non-numeric vectors are treated as character. #' @param version Version of the UNF algorithm. 
Allowed values are 3, 4, 4.1, 5, and 6. Always use the same version of the algorithm to check a UNF. Default for \code{unf} is 6 and default for \code{unf4} is 4 (but can also be set to 4.1, which is identical except for using SHA256 instead of MD5).
+#' @param sort A logical indicating whether to sort the columns/variables of a matrix or data frame. The default is \code{TRUE}. If \code{FALSE}, column order is respected when calculating the final UNF hash. This can be useful for distinguishing two matrices from one another.
 #' @param digits The number of significant digits for rounding for numeric values. Default is 7L. Must be between 1 and 15.
 #' @param characters The number of characters for truncation. Default is 128L. Must be greater than 1.
 #' @param truncation The number of bits to truncate the UNF signature to. Default is 128L. Must be one of: 128,192,196,256.
@@ -125,32 +126,38 @@ unf.default <- function(x, version = 6, ...) {
 }
 
 #' @export
-unf.data.frame <- function(x, version = 6, ...){
+unf.data.frame <- function(x, version = 6, sort = TRUE, ...){
     if (length(x) == 1) {
         return(unf(x[[1]], version = version, ...))
     }
-    locale <- Sys.getlocale(category="LC_COLLATE")
-    Sys.setlocale(category="LC_COLLATE", "C")
-    on.exit(Sys.setlocale(category="LC_COLLATE", locale))
+    if (isTRUE(sort)) {
+        sort_fun <- base::sort # `sort` is the logical argument here, so name the function explicitly
+        locale <- Sys.getlocale(category="LC_COLLATE")
+        Sys.setlocale(category="LC_COLLATE", "C") # collate in the C locale while the hashes are sorted
+        on.exit(Sys.setlocale(category="LC_COLLATE", locale), add = TRUE) # put the previous LC_COLLATE back
+    } else {
+        sort_fun <- function(x) x # identity: leave the column hashes in column order
+    }
     if (version == 3) {
         vars <- sapply(x, function(i) unf3(i, ...)$unf)
-        out <- unf3(sort(vars), ...)
+        out <- unf3(sort_fun(vars), ...)
     } else if (version == 4) {
         vars <- sapply(x, function(i) unf4(i, ...)$unf)
-        out <- unf4(sort(vars), ...)
+        out <- unf4(sort_fun(vars), ...)
     } else if (version == 4.1) {
         vars <- sapply(x, function(i) unf4(i, version = 4.1, ...)$unf)
-        out <- unf4(sort(vars), version = 4.1, ...)
+        out <- unf4(sort_fun(vars), version = 4.1, ...)
     } else if (version == 5) {
         vars <- sapply(x, function(i) unf5(i, ...)$unf)
-        out <- unf5(sort(vars), ...)
+        out <- unf5(sort_fun(vars), ...)
     } else if (version == 6) {
         vars <- sapply(x, function(i) unf6(i, ...)$unf)
-        out <- unf6(sort(vars), ...)
+        out <- unf6(sort_fun(vars), ...)
     } else {
         stop("Unrecognized UNF version: must be 3, 4, 4.1, 5, or 6.")
     }
     out$variables <- vars
+    attr(out, "sort") <- sort
     return(out)
 }
 
@@ -160,8 +167,13 @@ unf.list <- function(x, version = 6, ...) {
 }
 
 #' @export
-unf.matrix <- function(x, version = 6, ...) {
-    unf(as.data.frame(x), version = version, ...)
+unf.matrix <- function(x, version = 6, sort = TRUE, ...) {
+    unf(as.data.frame(x), version = version, sort = sort, ...)
+}
+
+#' @export
+unf.array <- function(x, version = 6, sort = TRUE, ...) {
+    unf(as.data.frame(x), version = version, sort = sort, ...)
 }
 
 #' @export
diff --git a/man/UNF-package.Rd b/man/UNF-package.Rd
index 8935218..f745e6c 100644
--- a/man/UNF-package.Rd
+++ b/man/UNF-package.Rd
@@ -2,8 +2,8 @@
 % Please edit documentation in R/UNF-package.R
 \docType{package}
 \name{UNF-package}
-\alias{UNF}
 \alias{UNF-package}
+\alias{UNF}
 \title{Tools for creating universal numeric fingerprints for data}
 \description{
 Computes a univeral numeric fingerprint of data objects.
@@ -21,11 +21,10 @@ A UNF differs from an ordinary file checksum in several important ways:
 \item \emph{UNFs are strongly tamper resistant.} Any accidental or intentional changes to data values will change the resulting UNF. Most file checksums and descriptive statistics detect only certain types of changes.
 }
 }
-\author{
-Thomas J. Leeper and Micah Altman.
-}
 \seealso{
 \code{\link{unf}} \code{\link{\%unf\%}}
 }
+\author{
+Thomas J. Leeper and Micah Altman.
+} \keyword{package} - diff --git a/man/as.unfvector.Rd b/man/as.unfvector.Rd index 9e09cf3..20278bb 100644 --- a/man/as.unfvector.Rd +++ b/man/as.unfvector.Rd @@ -20,10 +20,9 @@ Standardize a vector according to UNF specifications \details{ The UNF specifications describes how to coerce all R data types to a standardized character representation. This S3 method exposes that coercion functionality. } -\author{ -Thomas J. Leeper (\email{thosjleeper@gmail.com}) -} \seealso{ \code{\link{unf}}, \code{\link{unf6}}, \code{\link{\%unf\%}} } - +\author{ +Thomas J. Leeper (\email{thosjleeper@gmail.com}) +} diff --git a/man/equal.Rd b/man/equal.Rd index d7423c4..4ba37ca 100644 --- a/man/equal.Rd +++ b/man/equal.Rd @@ -35,11 +35,10 @@ unf_equal(a, b, digits = 3) unf(a) \%unf\% "UNF6:aKW4lAFNBH8vfrnrDbQZjg==" -} -\author{ -Thomas J. Leeper } \seealso{ \code{\link{unf}} } - +\author{ +Thomas J. Leeper +} diff --git a/man/signifz.Rd b/man/signifz.Rd index 2a274c4..5ceef76 100644 --- a/man/signifz.Rd +++ b/man/signifz.Rd @@ -23,11 +23,10 @@ Rounding toward zero assures that \code{signifz(signifz(x,digits=m),digits=n)} = signif(pi,digits=5) signifz(pi,digits=5) -} -\author{ -Micah Altman } \seealso{ \code{\link[base]{signif}}, \code{\link{unf}} } - +\author{ +Micah Altman +} diff --git a/man/unf.Rd b/man/unf.Rd index 7b4129a..58d4c45 100644 --- a/man/unf.Rd +++ b/man/unf.Rd @@ -58,6 +58,8 @@ unf6(x, digits = 7L, characters = 128L, truncation = 128L, \item{complex_as_character}{A logical indicating whether to format raw vectors as character. If \code{TRUE}, UNF should match Dataverse UNFv5 implementation. If \code{FALSE}, complex numbers are formatted as \code{A,iB}.} +\item{sort}{A logical indicating whether to sort the columns/variables of a matrix or data frame. The default is \code{TRUE}. If \code{FALSE}, column order is respected when calculating the final UNF hash. 
This can be useful for distinguishing two matrices from one another.} + \item{\ldots}{Additional arguments passed to specific algorithm functions. Ignored.} } \value{ @@ -152,4 +154,3 @@ Data Citation Synthesis Group. 2013. Declaration of Data Citation Principles [DR \seealso{ \code{\link{\%unf\%}} } -