Updates for writing Unicode datatype
grimbough committed Sep 27, 2023
1 parent 3d1c80e commit c42af70
Showing 13 changed files with 78 additions and 20 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -7,4 +7,5 @@ vignettes/Rarr_cache*
^codecov\.yml$
^\.github$
README.Rmd
README_cache*
inst/rmd/imgs*
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
@@ -2,7 +2,7 @@ on:
push:
pull_request:
branches:
- master
- devel

name: R-CMD-check

@@ -21,14 +21,14 @@ jobs:
config:
- { os: windows-2022, bioc-version: 'devel'}
- { os: macOS-latest, bioc-version: 'devel'}
- { os: ubuntu-20.04, bioc-version: 'devel'}
- { os: ubuntu-22.04, bioc-version: 'devel'}

steps:
- name: Configure git
run: |
git config --global core.autocrlf false
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Setup R and Bioconductor
uses: grimbough/bioc-actions/setup-bioc@v1
2 changes: 2 additions & 0 deletions NEWS.md
@@ -8,6 +8,8 @@
* Corrected issue where fixed length string datatypes would be written with
null terminators, resulting in strings that were one byte longer than the
dtype value written in the `.zarray` metadata. Also backported to Rarr 1.0.3
* Added support for reading and writing the fixed length Unicode datatype, and
for reading the variable length UTF-8 datatype.

# Rarr 0.99.9

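
For orientation, a minimal sketch of how the new fixed length Unicode support looks from the user-facing API, mirroring the calls added to inst/tinytest/test_string.R further down; the path and example strings here are illustrative only:

library(Rarr)

## create an empty 12 x 12 array of fixed length Unicode strings ("<U20")
path <- tempfile()
create_empty_zarr_array(path, dim = c(12, 12), chunk_dim = c(6, 6),
                        data_type = "<U", nchar = 20)

## write some (possibly non-ASCII) strings and read them back
x <- array("", dim = c(12, 12))
x[1, ] <- c("Hello", "Bonjour", "Hola", "Hallo", "Hej", "Ciao",
            "Szia", "Ahoj", "Salut", "Hei", "Hoi", "Ola")
update_zarr_array(zarr_array_path = path, x, index = list(1:12, 1:12))
identical(read_zarr_array(path), x)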
39 changes: 27 additions & 12 deletions R/write_data.R
@@ -7,7 +7,7 @@
}

## if data type was supplied directly, always use that
if (!data_type %in% c("<i4", "<f8", "|S")) {
if (!data_type %in% c("<i4", "<f8", "|S", "<U")) {
data_type <- switch(data_type,
"integer" = "<i4",
"double" = "<f8",
@@ -26,15 +26,16 @@
"<i4" = 0L,
"<f8" = 0,
"|S" = "",
"<U" = "",
NULL
)
}

if (data_type == "|S") {
if (data_type %in% c("|S", "<U", ">U")) {
if (missing(nchar) || nchar < 1) {
stop("The 'nchar' argument must be provided and be a positive integer")
}
data_type <- paste0("|S", as.integer(nchar))
data_type <- paste0(data_type, as.integer(nchar))
}

return(list(data_type = data_type, fill_value = fill_value))
@@ -273,18 +274,23 @@ update_zarr_array <- function(zarr_array_path, x, index) {
metadata <- read_array_metadata(zarr_array_path)

data_type <- switch(storage.mode(x),
"integer" = "<i4",
"double" = "<f8",
"character" = "|S",
"integer" = "<i",
"double" = "<f",
"character" = c("|S", "<U", ">U"),
NULL
)
if (data_type != metadata$dtype) {
if (!substr(metadata$dtype, 1,2) %in% data_type) {
stop("New data is not of the same type as the existing array.")
}

zarr_dim <- unlist(metadata$shape)
chunk_dim <- unlist(metadata$chunks)

## convert strings to Unicode if required
if(grepl("<U|>U", x = metadata$dtype, fixed = FALSE)) {
x <- .unicode_to_int(input = x, typestr = metadata$dtype)
}

## coerce x to the same shape as the zarr to be updated
x <- array(x, dim = vapply(index, length, integer(1)))

@@ -349,7 +355,8 @@ update_zarr_array <- function(zarr_array_path, x, index) {
input_chunk = chunk_in_mem,
chunk_path = chunk_path,
compressor = metadata$compressor,
data_type_size = .parse_datatype(metadata$dtype)$nbytes
data_type_size = .parse_datatype(metadata$dtype)$nbytes,
is_base64 = (.parse_datatype(metadata$dtype)$base_type == "unicode")
)

}
@@ -366,16 +373,21 @@ update_zarr_array <- function(zarr_array_path, x, index) {
#' @param data_type_size An integer giving the size of the original datatype.
#' This is passed to the blosc algorithm, which seems to need it to achieve
#' any compression.
#' @param is_base64 When dealing with Py_unicode strings we convert them to
#' base64 strings for storage in our intermediate R arrays. This argument
#' indicates if base64 is in use, because the conversion to raw in .as_raw
#' should be done differently for base64 strings vs other types.
#'
#' @returns Returns `TRUE` if writing is successful. Mostly called for the
#' side-effect of writing the compressed chunk to disk.
#'
#' @keywords Internal
.compress_and_write_chunk <- function(input_chunk, chunk_path,
compressor = use_zlib(),
data_type_size) {
data_type_size, is_base64 = FALSE) {
## the compression tools need a raw vector
raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size)
raw_chunk <- .as_raw(as.vector(input_chunk), nchar = data_type_size,
is_base64 = is_base64)

if(is.null(compressor)) {
compressed_chunk <- raw_chunk
Expand Down Expand Up @@ -411,10 +423,13 @@ update_zarr_array <- function(zarr_array_path, x, index) {

}

.as_raw <- function(d, nchar) {
.as_raw <- function(d, nchar, is_base64) {
## we need to create fixed length strings either via padding or trimming
if(is.character(d)) {
raw_list <- iconv(d, toRaw = TRUE)
if(is_base64)
raw_list <- lapply(d, jsonlite::base64_dec)
else
raw_list <- iconv(d, toRaw = TRUE)
unlist(
lapply(raw_list, FUN = function(x, nchar) {
if(!is.null(x))
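
The new is_base64 plumbing above describes a round trip: a fixed length Unicode string is converted to UCS-4 bytes and base64-encoded for storage in the intermediate R character array, and .as_raw() later decodes that base64 back into the raw bytes that get compressed. A minimal sketch of that round trip, assuming a little-endian "<U5" dtype (5 characters * 4 bytes = 20 bytes); the string and variable names are illustrative:

library(jsonlite)

s <- "héllo"                                         # 5 characters
ucs4 <- iconv(s, to = "UCS-4LE", toRaw = TRUE)[[1]]   # 4 bytes per character
b64 <- base64_enc(ucs4)                               # what the intermediate R array holds
bytes <- base64_dec(b64)                              # what .as_raw() recovers before padding
length(bytes)                                         # 20, matching the "<U5" byte width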
23 changes: 23 additions & 0 deletions R/write_utils.R
@@ -0,0 +1,23 @@

.unicode_to_int <- function(input, typestr) {

data_type <- .parse_datatype(typestr)

nchar <- as.integer(data_type$nbytes / 4L)

to <- ifelse(data_type$endian == "little", "UCS-4LE", "UCS-4BE")
raw_list <- iconv(input, to = to, toRaw = TRUE)

base64_strings <- vapply(raw_list,
function(x) {
if(length(x) < nchar * 4) {
x <- c(x, as.raw(rep(0, (nchar * 4) - length(x))))
}
jsonlite::base64_enc(x)
},
FUN.VALUE = character(1),
USE.NAMES = FALSE)

return(base64_strings)

}
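
As a quick illustration of what this new helper produces (a sketch, assuming the package internals are accessible via Rarr::: after installing this version; the input strings are arbitrary): each element comes back as a base64 string whose decoded length equals the fixed byte width implied by the dtype, here 20 * 4 = 80 bytes for "<U20".

encoded <- Rarr:::.unicode_to_int(c("hej", "hello"), typestr = "<U20")
## both elements decode to 80 NUL-padded UCS-4 bytes
vapply(encoded, function(e) length(jsonlite::base64_dec(e)), integer(1))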
4 changes: 2 additions & 2 deletions README.Rmd
@@ -82,7 +82,7 @@ datatype support. It will be updated as progress is made.
|`timedelta` | &#x274C; / &#x274C; | |
|`datetime` | &#x274C; / &#x274C; | |
|`string` | &#x2714; / &#x2714; | |
|`Unicode` | &#x274C; / &#x274C; | |
|`Unicode` | &#x2714; / &#x2714; | |
|`void *` | &#x274C; / &#x274C; | |
| Structured data types | &#x274C; / &#x274C; | |

@@ -95,7 +95,7 @@ datatype support. It will be updated as progress is made.
|`blosc` | &#x2714; / &#x2714; | Only `lz4` compression level 5 is enabled for writing. |
|`LZMA ` | &#x2714; / &#x2714; | |
|`LZ4` | &#x2714; / &#x2714; | |
|`Zstd` | &#x274C; / &#x274C; | Algorithm is available via blosc for writing, but can't currently be access through the R interface |
|`Zstd` | &#x274C; / &#x274C; | Algorithm is available via blosc for writing, but can't currently be accessed through the R interface |

Please open an [issue](https://github.com/grimbough/Rarr/issues) if support for a required compression tool is missing.

2 changes: 1 addition & 1 deletion inst/extdata/zarr_examples/column-first/Unicode.zarr/.zarray
@@ -5,7 +5,7 @@
],
"compressor": {
"id": "zlib",
"level": 1
"level": 6
},
"dtype": "<U20",
"fill_value": "",
Binary file modified inst/extdata/zarr_examples/column-first/Unicode.zarr/0.0
Binary file modified inst/extdata/zarr_examples/column-first/Unicode.zarr/0.1
Binary file modified inst/extdata/zarr_examples/column-first/Unicode.zarr/1.0
2 changes: 1 addition & 1 deletion inst/scripts/create_test_data.py
@@ -208,7 +208,7 @@
z = zarr.open('/data/column-first/Unicode.zarr', mode='w', shape=(12, 12),
chunks=(6, 6), order="F", fill_value = "",
dtype='U20',
compressor = zarr.Zlib())
compressor = zarr.Zlib(level = 6))
z[:,0] = greetings
z[0,:] = greetings

11 changes: 11 additions & 0 deletions inst/tinytest/test_string.R
@@ -71,3 +71,14 @@ expect_silent(
)
expect_equal(res[,1], greetings)
expect_equal(res[1,], greetings)


## writing & reading unicode
path <- tempfile()
create_empty_zarr_array(path, dim = c(12,12), chunk_dim = c(6,6),
data_type = "<U", nchar = 20)
x <- array("", dim = c(12,12))
x[1,] <- greetings
x[,1] <- greetings
update_zarr_array(zarr_array_path = path, x, index = list(1:12, 1:12))
expect_identical(read_zarr_array(path), x)
8 changes: 7 additions & 1 deletion man/dot-compress_and_write_chunk.Rd

Some generated files are not rendered by default.
