diff --git a/CHANGELOG.md b/CHANGELOG.md index bb853f33..d22c8078 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `array_subset::ArrayStoreBytesError`, `store_bytes`, and `store_bytes_unchecked` - Added `parallel_chunks` option to `Array`, enabled by default. Lets `store_array_subset` and `retrieve_array_subset` (and their variants) encode/decode chunks in parallel - Added experimental `zfp` codec implementation behind `zfp` feature flag (disabled by default) + - Added experimental `bitround` codec implementation behind `bitround` feature flag (disabled by default) ### Changed - **Breaking**: `array::data_type::DataType` is now marked `#[non_exhaustive]` diff --git a/Cargo.toml b/Cargo.toml index 5e3265e4..a587bca5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,11 +13,12 @@ exclude = [".dockerignore", ".github", ".editorconfig", "Dockerfile", "coverage. [features] default = ["transpose", "blosc", "gzip", "sharding", "crc32c", "zstd", "http", "zip", "ndarray"] # Codecs -transpose = ["dep:ndarray"] +bitround = [] blosc = ["dep:blosc-sys"] +crc32c = ["dep:crc32fast"] gzip = ["dep:flate2"] sharding = [] -crc32c = ["dep:crc32fast"] +transpose = ["dep:ndarray"] zfp = ["dep:zfp-sys"] zstd = ["dep:zstd"] # Stores diff --git a/src/array/codec.rs b/src/array/codec.rs index f34e2f60..8c61739d 100644 --- a/src/array/codec.rs +++ b/src/array/codec.rs @@ -14,11 +14,17 @@ pub mod array_to_array; pub mod array_to_bytes; pub mod bytes_to_bytes; +// Array to array +#[cfg(feature = "bitround")] +pub use array_to_array::bitround::{ + BitroundCodec, BitroundCodecConfiguration, BitroundCodecConfigurationV1, +}; #[cfg(feature = "transpose")] pub use array_to_array::transpose::{ TransposeCodec, TransposeCodecConfiguration, TransposeCodecConfigurationV1, }; +// Array to bytes #[cfg(feature = "sharding")] pub use array_to_bytes::sharding::{ ShardingCodec, ShardingCodecConfiguration, ShardingCodecConfigurationV1, @@ -29,8 +35,8 @@ pub use array_to_bytes::{ bytes::{BytesCodec, BytesCodecConfiguration, BytesCodecConfigurationV1}, codec_chain::CodecChain, }; -// pub use array_to_bytes::zfp::{ZfpCodec, ZfpCodecConfiguration, ZfpCodecConfigurationV1}; +// Bytes to bytes #[cfg(feature = "blosc")] pub use bytes_to_bytes::blosc::{BloscCodec, BloscCodecConfiguration, BloscCodecConfigurationV1}; #[cfg(feature = "crc32c")] diff --git a/src/array/codec/array_to_array.rs b/src/array/codec/array_to_array.rs index 89379d5e..1cfafe6e 100644 --- a/src/array/codec/array_to_array.rs +++ b/src/array/codec/array_to_array.rs @@ -1,4 +1,6 @@ //! Array to array codecs. +#[cfg(feature = "bitround")] +pub mod bitround; #[cfg(feature = "transpose")] pub mod transpose; diff --git a/src/array/codec/array_to_array/bitround.rs b/src/array/codec/array_to_array/bitround.rs new file mode 100644 index 00000000..b9eee263 --- /dev/null +++ b/src/array/codec/array_to_array/bitround.rs @@ -0,0 +1,217 @@ +//! The bitround array to array codec. +//! +//! Rounds the mantissa of floating point data types to the specified number of bits. +//! +//! This codec requires the `bitround` feature, which is disabled by default. +//! +//! The current implementation does not write its metadata to the array metadata, so the array can be imported by tools which do not presently support this codec. +//! This functionality will be changed when the bitround codec is in the zarr specification and supported by multiple implementations. +//! +//! See [`BitroundCodecConfigurationV1`] for example `JSON` metadata. +//! + +mod bitround_codec; +mod bitround_configuration; +mod bitround_partial_decoder; + +pub use bitround_codec::BitroundCodec; +pub use bitround_configuration::{BitroundCodecConfiguration, BitroundCodecConfigurationV1}; + +use crate::{ + array::{ + codec::{Codec, CodecError, CodecPlugin}, + DataType, + }, + metadata::Metadata, + plugin::PluginCreateError, +}; + +const IDENTIFIER: &str = "bitround"; + +// Register the codec. +inventory::submit! { + CodecPlugin::new(IDENTIFIER, is_name_bitround, create_codec_bitround) +} + +fn is_name_bitround(name: &str) -> bool { + name.eq(IDENTIFIER) +} + +fn create_codec_bitround(metadata: &Metadata) -> Result { + let configuration: BitroundCodecConfiguration = metadata.to_configuration()?; + let codec = Box::new(BitroundCodec::new_with_configuration(&configuration)); + Ok(Codec::ArrayToArray(codec)) +} + +fn round_bits16(mut input: u16, keepbits: u32) -> u16 { + let maxbits = 10; + if keepbits >= maxbits { + input + } else { + let maskbits = maxbits - keepbits; + let all_set = u16::MAX; + let mask = (all_set >> maskbits) << maskbits; + let half_quantum1 = (1 << (maskbits - 1)) - 1; + input += ((input >> maskbits) & 1) + half_quantum1; + input &= mask; + input + } +} + +fn round_bits32(mut input: u32, keepbits: u32) -> u32 { + let maxbits = 23; + if keepbits >= maxbits { + input + } else { + let maskbits = maxbits - keepbits; + let all_set = u32::MAX; + let mask = (all_set >> maskbits) << maskbits; + let half_quantum1 = (1 << (maskbits - 1)) - 1; + input += ((input >> maskbits) & 1) + half_quantum1; + input &= mask; + input + } +} + +fn round_bits64(mut input: u64, keepbits: u32) -> u64 { + let maxbits = 52; + if keepbits >= maxbits { + input + } else { + let maskbits = maxbits - keepbits; + let all_set = u64::MAX; + let mask = (all_set >> maskbits) << maskbits; + let half_quantum1 = (1 << (maskbits - 1)) - 1; + input += ((input >> maskbits) & 1) + half_quantum1; + input &= mask; + input + } +} + +fn round_bytes(bytes: &mut [u8], data_type: &DataType, keepbits: u32) -> Result<(), CodecError> { + match data_type { + DataType::Float16 | DataType::BFloat16 => { + let round = |chunk: &mut [u8]| { + let element = u16::from_ne_bytes(chunk.try_into().unwrap()); + let element = u16::to_ne_bytes(round_bits16(element, keepbits)); + chunk.copy_from_slice(&element); + }; + bytes.chunks_exact_mut(2).for_each(round); + Ok(()) + } + DataType::Float32 | DataType::Complex64 => { + let round = |chunk: &mut [u8]| { + let element = u32::from_ne_bytes(chunk.try_into().unwrap()); + let element = u32::to_ne_bytes(round_bits32(element, keepbits)); + chunk.copy_from_slice(&element); + }; + bytes.chunks_exact_mut(4).for_each(round); + Ok(()) + } + DataType::Float64 | DataType::Complex128 => { + let round = |chunk: &mut [u8]| { + let element = u64::from_ne_bytes(chunk.try_into().unwrap()); + let element = u64::to_ne_bytes(round_bits64(element, keepbits)); + chunk.copy_from_slice(&element); + }; + bytes.chunks_exact_mut(8).for_each(round); + Ok(()) + } + _ => Err(CodecError::UnsupportedDataType( + data_type.clone(), + IDENTIFIER.to_string(), + )), + } +} + +#[cfg(test)] +mod tests { + use array_representation::ArrayRepresentation; + use itertools::Itertools; + + use crate::{ + array::{ + array_representation, + codec::{ + ArrayCodecTraits, ArrayToArrayCodecTraits, ArrayToBytesCodecTraits, BytesCodec, + }, + DataType, + }, + array_subset::ArraySubset, + }; + + use super::*; + + #[test] + fn codec_bitround_float() { + // 1 sign bit, 8 exponent, 3 mantissa + const JSON: &'static str = r#"{ "keepbits": 3 }"#; + let array_representation = + ArrayRepresentation::new(vec![4], DataType::Float32, 0.0f32.into()).unwrap(); + let elements: Vec = vec![ + // | + 0.0, + // 1.23456789 -> 001111111001|11100000011001010010 + // 1.25 -> 001111111010 + 1.23456789, + // -8.3587192 -> 110000010000|01011011110101010000 + // -8.0 -> 110000010000 + -8.3587192834, + // 98765.43210-> 010001111100|00001110011010110111 + // 98304.0 -> 010001111100 + 98765.43210, + ]; + let bytes = safe_transmute::transmute_to_bytes(&elements).to_vec(); + + let codec_configuration: BitroundCodecConfiguration = serde_json::from_str(JSON).unwrap(); + let codec = BitroundCodec::new_with_configuration(&codec_configuration); + + let encoded = codec.encode(bytes.clone(), &array_representation).unwrap(); + let decoded = codec + .decode(encoded.clone(), &array_representation) + .unwrap(); + let decoded_elements = safe_transmute::transmute_many_permissive::(&decoded) + .unwrap() + .to_vec(); + assert_eq!(decoded_elements, &[0.0f32, 1.25f32, -8.0f32, 98304.0f32]); + } + + #[test] + fn codec_bitround_partial_decode() { + const JSON: &'static str = r#"{ "keepbits": 2 }"#; + let codec_configuration: BitroundCodecConfiguration = serde_json::from_str(JSON).unwrap(); + let codec = BitroundCodec::new_with_configuration(&codec_configuration); + + let elements: Vec = (0..32).map(|i| i as f32).collect(); + let bytes = safe_transmute::transmute_to_bytes(&elements).to_vec(); + let array_representation = ArrayRepresentation::new( + vec![elements.len().try_into().unwrap()], + DataType::Float32, + 0.0f32.into(), + ) + .unwrap(); + + let encoded = codec.encode(bytes.clone(), &array_representation).unwrap(); + let decoded_regions = [ + ArraySubset::new_with_start_shape(vec![3], vec![2]).unwrap(), + ArraySubset::new_with_start_shape(vec![17], vec![4]).unwrap(), + ]; + let input_handle = Box::new(std::io::Cursor::new(encoded)); + let bytes_codec = BytesCodec::default(); + let input_handle = bytes_codec.partial_decoder(input_handle); + let partial_decoder = codec.partial_decoder(input_handle); + let decoded_partial_chunk = partial_decoder + .partial_decode(&array_representation, &decoded_regions) + .unwrap(); + let decoded_partial_chunk = decoded_partial_chunk + .iter() + .map(|bytes| { + safe_transmute::transmute_many_permissive::(&bytes) + .unwrap() + .to_vec() + }) + .collect_vec(); + let answer: &[Vec] = &[vec![3.0, 4.0], vec![16.0, 16.0, 20.0, 20.0]]; + assert_eq!(answer, decoded_partial_chunk); + } +} diff --git a/src/array/codec/array_to_array/bitround/bitround_codec.rs b/src/array/codec/array_to_array/bitround/bitround_codec.rs new file mode 100644 index 00000000..487145cf --- /dev/null +++ b/src/array/codec/array_to_array/bitround/bitround_codec.rs @@ -0,0 +1,107 @@ +use crate::{ + array::{ + codec::{ + ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayToArrayCodecTraits, CodecError, + CodecTraits, + }, + ArrayRepresentation, DataType, + }, + metadata::Metadata, +}; + +use super::{bitround_partial_decoder, round_bytes, BitroundCodecConfiguration, IDENTIFIER}; + +/// A `bitround` codec implementation. +#[derive(Clone, Debug, Default)] +pub struct BitroundCodec { + keepbits: u32, +} + +impl BitroundCodec { + /// Create a new bitround codec. + /// + /// `keepbits` is the number of bits to round to in the floating point mantissa. + #[must_use] + pub fn new(keepbits: u32) -> Self { + Self { keepbits } + } + + /// Create a new bitround codec from a configuration. + #[must_use] + pub fn new_with_configuration(configuration: &BitroundCodecConfiguration) -> Self { + let BitroundCodecConfiguration::V1(configuration) = configuration; + Self { + keepbits: configuration.keepbits, + } + } +} + +impl CodecTraits for BitroundCodec { + fn create_metadata(&self) -> Option { + // FIXME: Output the metadata when the bitround codec is in the zarr specification and supported by multiple implementations. + // let configuration = BitroundCodecConfigurationV1 { + // keepbits: self.keepbits, + // }; + // Some(Metadata::new_with_serializable_configuration(IDENTIFIER, &configuration).unwrap()) + None + } + + fn partial_decoder_should_cache_input(&self) -> bool { + false + } + + fn partial_decoder_decodes_all(&self) -> bool { + false + } +} + +impl ArrayCodecTraits for BitroundCodec { + fn encode( + &self, + mut decoded_value: Vec, + decoded_representation: &ArrayRepresentation, + ) -> Result, CodecError> { + round_bytes( + &mut decoded_value, + decoded_representation.data_type(), + self.keepbits, + )?; + Ok(decoded_value) + } + + fn decode( + &self, + encoded_value: Vec, + _decoded_representation: &ArrayRepresentation, + ) -> Result, CodecError> { + Ok(encoded_value) + } +} + +impl ArrayToArrayCodecTraits for BitroundCodec { + fn partial_decoder<'a>( + &'a self, + input_handle: Box, + ) -> Box { + Box::new(bitround_partial_decoder::BitroundPartialDecoder::new( + input_handle, + self.keepbits, + )) + } + + fn compute_encoded_size( + &self, + decoded_representation: &ArrayRepresentation, + ) -> Result { + let data_type = decoded_representation.data_type(); + match data_type { + DataType::Float16 | DataType::BFloat16 | DataType::Float32 | DataType::Float64 => { + Ok(decoded_representation.clone()) + } + _ => Err(CodecError::UnsupportedDataType( + data_type.clone(), + IDENTIFIER.to_string(), + )), + } + } +} diff --git a/src/array/codec/array_to_array/bitround/bitround_configuration.rs b/src/array/codec/array_to_array/bitround/bitround_configuration.rs new file mode 100644 index 00000000..ffdae4bc --- /dev/null +++ b/src/array/codec/array_to_array/bitround/bitround_configuration.rs @@ -0,0 +1,58 @@ +use derive_more::{Display, From}; +use serde::{Deserialize, Serialize}; + +/// A wrapper to handle various versions of `bitround` codec configuration parameters. +#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Debug, Display, From)] +#[serde(untagged)] +pub enum BitroundCodecConfiguration { + /// Version 1.0. + V1(BitroundCodecConfigurationV1), +} + +/// Bitround codec configuration parameters (version 1.0). +/// +/// ### Example: Keep 10 bits of the mantissa +/// ```rust +/// # let JSON = r#" +/// { +/// "keepbits": 10 +/// } +/// # "#; +/// # let configuration: zarrs::array::codec::BitroundCodecConfigurationV1 = serde_json::from_str(JSON).unwrap(); +/// ``` +#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Debug, Display)] +#[serde(deny_unknown_fields)] +pub struct BitroundCodecConfigurationV1 { + /// The number of mantissa bits to keep for a floating point data type. + pub keepbits: u32, +} + +#[cfg(test)] +mod tests { + use crate::metadata::Metadata; + + use super::*; + + #[test] + fn codec_bitround_metadata() { + serde_json::from_str::( + r#"{ + "name": "bitround", + "configuration": { + "keepbits": 10 + } + }"#, + ) + .unwrap(); + } + + #[test] + fn codec_bitround_config() { + serde_json::from_str::( + r#"{ + "keepbits": 10 + }"#, + ) + .unwrap(); + } +} diff --git a/src/array/codec/array_to_array/bitround/bitround_partial_decoder.rs b/src/array/codec/array_to_array/bitround/bitround_partial_decoder.rs new file mode 100644 index 00000000..f87d42cd --- /dev/null +++ b/src/array/codec/array_to_array/bitround/bitround_partial_decoder.rs @@ -0,0 +1,43 @@ +use crate::{ + array::{ + codec::{ArrayPartialDecoderTraits, CodecError}, + ArrayRepresentation, + }, + array_subset::ArraySubset, +}; + +use super::round_bytes; + +/// The partial decoder for the Bitround codec. +pub struct BitroundPartialDecoder<'a> { + input_handle: Box, + keepbits: u32, +} + +impl<'a> BitroundPartialDecoder<'a> { + /// Create a new partial decoder for the Bitround codec. + pub fn new(input_handle: Box, keepbits: u32) -> Self { + Self { + input_handle, + keepbits, + } + } +} + +impl ArrayPartialDecoderTraits for BitroundPartialDecoder<'_> { + fn partial_decode( + &self, + decoded_representation: &ArrayRepresentation, + array_subsets: &[ArraySubset], + ) -> Result>, CodecError> { + let mut bytes = self + .input_handle + .partial_decode(decoded_representation, array_subsets)?; + + for bytes in &mut bytes { + round_bytes(bytes, decoded_representation.data_type(), self.keepbits)?; + } + + Ok(bytes) + } +} diff --git a/src/lib.rs b/src/lib.rs index 5c2d38c5..735fec22 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,7 +23,7 @@ //! - [x] Chunk grids: [`regular`](crate::array::chunk_grid::RegularChunkGrid), [`rectangular`](crate::array::chunk_grid::RectangularChunkGrid) ([draft](https://github.com/orgs/zarr-developers/discussions/52)) //! - [x] Chunk key encoding: [`default`](crate::array::chunk_key_encoding::DefaultChunkKeyEncoding), [`v2`](crate::array::chunk_key_encoding::V2ChunkKeyEncoding) //! - [x] Codecs: -//! - array->array: [`transpose`](crate::array::codec::array_to_array::transpose), +//! - array->array: [`transpose`](crate::array::codec::array_to_array::transpose), [`bitround`](crate::array::codec::array_to_array::bitround) (experimental) //! - array->bytes: [`bytes`](crate::array::codec::array_to_bytes::bytes) [(spec issue)](https://github.com/zarr-developers/zarr-specs/pull/263), [`sharding`](crate::array::codec::array_to_bytes::sharding), [`zfp`](crate::array::codec::array_to_bytes::zfp) (experimental) //! - bytes->bytes: [`blosc`](crate::array::codec::bytes_to_bytes::blosc), [`gzip`](crate::array::codec::bytes_to_bytes::gzip), [`zstd`](crate::array::codec::bytes_to_bytes::zstd) [(spec issue)](https://github.com/zarr-developers/zarr-specs/pull/256), [`crc32c checksum`](crate::array::codec::bytes_to_bytes::crc32c) //! - [x] Storage transformers: [`usage_log`](crate::storage::storage_transformer::UsageLogStorageTransformer), [`performance_metrics`](crate::storage::storage_transformer::PerformanceMetricsStorageTransformer) @@ -33,8 +33,9 @@ //! - Codecs: `blosc`, `gzip`, `transpose`, `zstd`, `sharding`, `crc32c`. //! - Stores: `http`, `zip`. //! - `ndarray`: adds [`ndarray`] utility functions to [`Array`](crate::array::Array). +//! //! The following features are disabled by default: -//! - Codecs: `zfp` +//! - Codecs: `bitround`, `zfp` //! //! ## Examples //! Examples can be run with `cargo run --example EXAMPLE_NAME`