From d19039a6f3d205991c8138f78f441e804ae7db40 Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Wed, 3 Jan 2024 14:56:03 +0100 Subject: [PATCH] Clarify NaN in comparisons, clarify character encoding issues --- CHANGELOG.md | 8 ++++++-- eq.json | 9 ++++++++- gt.json | 9 ++++++++- gte.json | 9 ++++++++- lt.json | 9 ++++++++- lte.json | 9 ++++++++- meta/implementation.md | 22 ++++++++++++++++++---- neq.json | 9 ++++++++- proposals/is_infinite.json | 4 ++-- text_begins.json | 2 +- text_contains.json | 2 +- text_ends.json | 2 +- 12 files changed, 77 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4811eee..f4ec2834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Clarified for various mathematical functions the defined input and output ranges. Mention that `NaN` is returned outside of the defined input range where possible. -- Clarified for various processes the handling of no-data values and null, see also the [implementation guide](meta/implementation.md). [#480](https://github.com/Open-EO/openeo-processes/issues/480) +- Clarified for various mathematical functions the defined input and output ranges. + Mention that `NaN` is returned outside of the defined input range where possible. +- Clarified for several comparison processes how `NaN` values have to be handled. +- Clarified for various processes the handling of no-data values and `null`, see also the [implementation guide](meta/implementation.md#no-data-value). [#480](https://github.com/Open-EO/openeo-processes/issues/480) +- Added a [section about character encodings to the implementation guide](meta/implementation.md#character-encoding). + Removed any character encoding related wording from the process specifications themselves. - Added a uniqueness contraint to various array-typed parameters (e.g.
lists of dimension names or labels) - `array_interpolate_linear`: Apply interpolation to NaN and no-data values. - `clip`: Throw an exception if min > max. [#472](https://github.com/Open-EO/openeo-processes/issues/472) diff --git a/eq.json b/eq.json index b1b23385..73ffaaa4 100644 --- a/eq.json +++ b/eq.json @@ -1,7 +1,7 @@ { "id": "eq", "summary": "Equal to comparison", - "description": "Compares whether `x` is strictly equal to `y`.\n\n**Remarks:**\n\n* Data types MUST be checked strictly. For example, a string with the content *1* is not equal to the number *1*. Nevertheless, an integer *1* is equal to a floating-point number *1.0* as `integer` is a sub-type of `number`.\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* Temporal strings are normal strings. To compare temporal strings as dates/times, use ``date_difference()``.", + "description": "Compares whether `x` is strictly equal to `y`.\n\n**Remarks:**\n\n* Data types MUST be checked strictly. For example, a string with the content *1* is not equal to the number *1*. Nevertheless, an integer *1* is equal to a floating-point number *1.0* as `integer` is a sub-type of `number`.\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* The comparison of `NaN` (not a number) follows [IEEE Standard 754](https://ieeexplore.ieee.org/document/8766229).\n* Temporal strings are normal strings. 
To compare temporal strings as dates/times, use ``date_difference()``.", "categories": [ "texts", "comparison" @@ -146,5 +146,12 @@ }, "returns": false } + ], + "links": [ + { + "rel": "about", + "href": "https://ieeexplore.ieee.org/document/4610935", + "title": "IEEE Standard 754-2008 for Floating-Point Arithmetic" + } ] } diff --git a/gt.json b/gt.json index 542f618c..dbb33296 100644 --- a/gt.json +++ b/gt.json @@ -1,7 +1,7 @@ { "id": "gt", "summary": "Greater than comparison", - "description": "Compares whether `x` is strictly greater than `y`.\n\n**Remarks:**\n\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* If any operand is not a `number`, the process returns `false`.\n* Temporal strings are normal strings. To compare temporal strings as dates/times, use ``date_difference()``.", + "description": "Compares whether `x` is strictly greater than `y`.\n\n**Remarks:**\n\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* The comparison of `NaN` (not a number) follows [IEEE Standard 754](https://ieeexplore.ieee.org/document/8766229).\n* If any operand is not the data type `number`, the process returns `false`.\n* Temporal strings are normal strings. 
To compare temporal strings as dates/times, use ``date_difference()``.", "categories": [ "comparison" ], @@ -90,5 +90,12 @@ }, "returns": false } + ], + "links": [ + { + "rel": "about", + "href": "https://ieeexplore.ieee.org/document/4610935", + "title": "IEEE Standard 754-2008 for Floating-Point Arithmetic" + } ] } diff --git a/gte.json b/gte.json index 712b6b9c..3c054ba8 100644 --- a/gte.json +++ b/gte.json @@ -1,7 +1,7 @@ { "id": "gte", "summary": "Greater than or equal to comparison", - "description": "Compares whether `x` is greater than or equal to `y`.\n\n**Remarks:**\n\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* If the operands are not equal (see process ``eq()``) and any of them is not a `number`, the process returns `false`.\n* Temporal strings are normal strings. To compare temporal strings as dates/times, use ``date_difference()``.", + "description": "Compares whether `x` is greater than or equal to `y`.\n\n**Remarks:**\n\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* The comparison of `NaN` (not a number) follows [IEEE Standard 754](https://ieeexplore.ieee.org/document/8766229).\n* If the operands are not equal (see process ``eq()``) and any of them is not the data type `number`, the process returns `false`.\n* Temporal strings are normal strings. 
To compare temporal strings as dates/times, use ``date_difference()``.", "categories": [ "comparison" ], @@ -84,6 +84,13 @@ "returns": false } ], + "links": [ + { + "rel": "about", + "href": "https://ieeexplore.ieee.org/document/4610935", + "title": "IEEE Standard 754-2008 for Floating-Point Arithmetic" + } + ], "process_graph": { "eq": { "process_id": "eq", diff --git a/lt.json b/lt.json index b7e35bf4..bcb167b1 100644 --- a/lt.json +++ b/lt.json @@ -1,7 +1,7 @@ { "id": "lt", "summary": "Less than comparison", - "description": "Compares whether `x` is strictly less than `y`.\n\n**Remarks:**\n\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* If any operand is not a `number`, the process returns `false`.\n* Temporal strings are normal strings. To compare temporal strings as dates/times, use ``date_difference()``.", + "description": "Compares whether `x` is strictly less than `y`.\n\n**Remarks:**\n\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* The comparison of `NaN` (not a number) follows [IEEE Standard 754](https://ieeexplore.ieee.org/document/8766229).\n* If any operand is not the data type `number`, the process returns `false`.\n* Temporal strings are normal strings. 
To compare temporal strings as dates/times, use ``date_difference()``.", "categories": [ "comparison" ], @@ -90,5 +90,12 @@ }, "returns": false } + ], + "links": [ + { + "rel": "about", + "href": "https://ieeexplore.ieee.org/document/4610935", + "title": "IEEE Standard 754-2008 for Floating-Point Arithmetic" + } ] } diff --git a/lte.json b/lte.json index 5ab05126..0968dfa1 100644 --- a/lte.json +++ b/lte.json @@ -1,7 +1,7 @@ { "id": "lte", "summary": "Less than or equal to comparison", - "description": "Compares whether `x` is less than or equal to `y`.\n\n**Remarks:**\n\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* If the operands are not equal (see process ``eq()``) and any of them is not a `number`, the process returns `false`.\n* Temporal strings are normal strings. To compare temporal strings as dates/times, use ``date_difference()``.", + "description": "Compares whether `x` is less than or equal to `y`.\n\n**Remarks:**\n\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* The comparison of `NaN` (not a number) follows [IEEE Standard 754](https://ieeexplore.ieee.org/document/8766229).\n* If the operands are not equal (see process ``eq()``) and any of them is not the data type `number`, the process returns `false`.\n* Temporal strings are normal strings. 
To compare temporal strings as dates/times, use ``date_difference()``.", "categories": [ "comparison" ], @@ -84,6 +84,13 @@ "returns": false } ], + "links": [ + { + "rel": "about", + "href": "https://ieeexplore.ieee.org/document/4610935", + "title": "IEEE Standard 754-2008 for Floating-Point Arithmetic" + } + ], "process_graph": { "eq": { "process_id": "eq", diff --git a/meta/implementation.md b/meta/implementation.md index fafe6ea0..0f7e75ca 100644 --- a/meta/implementation.md +++ b/meta/implementation.md @@ -4,16 +4,16 @@ This file is meant to provide some additional implementation details for back-en ## No-data value -A data cube shall always keep reference of the applicable no-data values. -The no-data value can be chosen by the back-end implementation, e.g. depending on the data type of the data. +A data cube shall always keep reference of the applicable no-data value(s). +The no-data values can be chosen by the back-end implementation, e.g. depending on the data type of the data. No-data values should be exposed for each pre-defined Collection in its metadata. For all data generated through openEO (e.g. through synchronous or batch jobs), the metadata and/or data shall expose the no-data values. -The openEO process specifications generally use `null` as a generic value to express no-data values. +The openEO process specifications generally use `null` as a generic value to express no-data values. This is primarily meant for the JSON encoding, this means: 1. in the process specification (data type `null` in the schema), and -2. in the process graph (if the no-data value exposed through the metadata can't be used in JSON). +2. in the process graph (if the no-data value exposed through the metadata can't be used in JSON, e.g. `NaN`). Back-ends may or may not use `null` as a no-data value internally. @@ -23,12 +23,26 @@ no-data values in openEO and `NaN` (IEEE 754) sometimes differs. **Array processes:** Some array processes (e.g.
`array_find` or `any`) use `null` as a return value. In the context of data cube operations (e.g. in `reduce_dimension`), `null` values returned by the array processes shall be replaced with the no-data value of the data cube. +As the processes may be used outside of data cubes where the no-data values are undefined, +most processes fall back to `null` in this case (reflected through the mention of "(or `null`)" in the process description). ## Optimizations for conditions (e.g. `if`) None of the openEO processes per se is "special" and thus all are treated the same way by default. Nevertheless, there are some cases where a special treatment can make a huge difference. +## Character encoding + +String-related processes previously mentioned that strings have to be "encoded in UTF-8 by default". +This was removed and we clarify the behavior here: + +For data transfer through the API, the character encoding of strings is specified using HTTP headers. +This means all strings provided in the process graph have the same encoding as specified in the HTTP headers. +Back-ends can internally use any character encoding and as such may need to convert the character encoding +upon receipt of the process graph. +It is recommended to use a [Unicode](https://en.wikipedia.org/wiki/Unicode) character encoding such as UTF-8. +In case of doubt, clients and servers should assume UTF-8 as the character encoding. + ### Branching behavior The `if` process (and any process that is working on some kind of condition) are usually diff --git a/neq.json b/neq.json index 1b115b3b..383d6b8b 100644 --- a/neq.json +++ b/neq.json @@ -1,7 +1,7 @@ { "id": "neq", "summary": "Not equal to comparison", - "description": "Compares whether `x` is **not** strictly equal to `y`.\n\n**Remarks:**\n\n* Data types MUST be checked strictly. For example, a string with the content *1* is not equal to the number *1*.
Nevertheless, an integer *1* is equal to a floating-point number *1.0* as `integer` is a sub-type of `number`.\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* Strings are expected to be encoded in UTF-8 by default.\n* Temporal strings are normal strings. To compare temporal strings as dates/times, use ``date_difference()``.", + "description": "Compares whether `x` is **not** strictly equal to `y`.\n\n**Remarks:**\n\n* Data types MUST be checked strictly. For example, a string with the content *1* is not equal to the number *1*. Nevertheless, an integer *1* is equal to a floating-point number *1.0* as `integer` is a sub-type of `number`.\n* If any operand is a no-data value, the result will be the no-data value (or `null`).\n* The comparison of `NaN` (not a number) follows [IEEE Standard 754](https://ieeexplore.ieee.org/document/8766229).\n* Temporal strings are normal strings. To compare temporal strings as dates/times, use ``date_difference()``.", "categories": [ "texts", "comparison" @@ -147,6 +147,13 @@ "returns": true } ], + "links": [ + { + "rel": "about", + "href": "https://ieeexplore.ieee.org/document/4610935", + "title": "IEEE Standard 754-2008 for Floating-Point Arithmetic" + } + ], "process_graph": { "eq": { "process_id": "eq", diff --git a/proposals/is_infinite.json b/proposals/is_infinite.json index b6a5acca..da0d165f 100644 --- a/proposals/is_infinite.json +++ b/proposals/is_infinite.json @@ -1,7 +1,7 @@ { "id": "is_infinite", "summary": "Value is an infinite number", - "description": "Checks whether the specified value `x` is an infinite number. The definition of infinite numbers follows the [IEEE Standard 754](https://ieeexplore.ieee.org/document/4610935). 
The special numerical value `NaN` (not a number) as defined by the [IEEE Standard 754](https://ieeexplore.ieee.org/document/4610935) is not an infinite number and must return `false`.", + "description": "Checks whether the specified value `x` is an infinite number. The definition of infinite numbers (and `NaN`) follows the [IEEE Standard 754](https://ieeexplore.ieee.org/document/4610935). `NaN` (not a number) is not an infinite number and must return `false`.", "categories": [ "comparison" ], @@ -28,4 +28,4 @@ "title": "IEEE Standard 754-2008 for Floating-Point Arithmetic" } ] -} \ No newline at end of file +} diff --git a/text_begins.json b/text_begins.json index f422999f..8fc39624 100644 --- a/text_begins.json +++ b/text_begins.json @@ -1,7 +1,7 @@ { "id": "text_begins", "summary": "Text begins with another text", - "description": "Checks whether the text (also known as *string*) specified for `data` contains the text specified for `pattern` at the beginning. Both are expected to be encoded in UTF-8 by default. No-data values are passed through and therefore get propagated.", + "description": "Checks whether the text (also known as *string*) specified for `data` contains the text specified for `pattern` at the beginning. No-data values are passed through and therefore get propagated.", "categories": [ "texts", "comparison" diff --git a/text_contains.json b/text_contains.json index d0046131..a41a3d95 100644 --- a/text_contains.json +++ b/text_contains.json @@ -1,7 +1,7 @@ { "id": "text_contains", "summary": "Text contains another text", - "description": "Checks whether the text (also known as *string*) specified for `data` contains the text specified for `pattern`. Both are expected to be encoded in UTF-8 by default. No-data values are passed through and therefore get propagated.", + "description": "Checks whether the text (also known as *string*) specified for `data` contains the text specified for `pattern`. 
No-data values are passed through and therefore get propagated.", "categories": [ "texts", "comparison" diff --git a/text_ends.json b/text_ends.json index 7bd2238c..70a04855 100644 --- a/text_ends.json +++ b/text_ends.json @@ -1,7 +1,7 @@ { "id": "text_ends", "summary": "Text ends with another text", - "description": "Checks whether the text (also known as *string*) specified for `data` contains the text specified for `pattern` at the end. Both are expected to be encoded in UTF-8 by default. No-data values are passed through and therefore get propagated.", + "description": "Checks whether the text (also known as *string*) specified for `data` contains the text specified for `pattern` at the end. No-data values are passed through and therefore get propagated.", "categories": [ "texts", "comparison"