Merge pull request #44 from trossi/fix-ascii

Fix parsing and unparsing ascii files with escaped characters and NA strings
vnmabus · Oct 7, 2024 · d65ec51 · d65ec51
2 parents 5166258 + 918ea23
commit d65ec51
Show file tree

Hide file tree

Showing 8 changed files with 101 additions and 22 deletions.
diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py
@@ -60,15 +60,20 @@ def _parse_array_values(
         return array
 
     def parse_string(self, length: int) -> bytes:
-        # Non-ascii characters in strings are written using octal byte codes,
+        # Read the ascii string
+        s = self._readline()
+
+        # R always escapes question marks ('?'), so they appear as r'\?'.
+        # Unescape those first.
+        s = s.replace(r"\?", "?")
+
+        # Non-ascii characters and space are written using octal byte codes,
         # for example, a string 'aä' (2 chars) in UTF-8 is written as an ascii
         # string r'a\303\244' (9 chars). We want to transform this to a byte
         # string b'a\303\244' (3 bytes) corresponding to the byte
         # representation of the original UTF-8 string.
         # Let's use this string as an example to go through the code below
 
-        # Read the ascii string
-        s = self._readline()
         # Now s = r'a\303\244' (9 chars)
 
         # Convert characters to bytes (all characters are ascii)

diff --git a/rdata/tests/data/test_ascii_ascii_chars.rds b/rdata/tests/data/test_ascii_ascii_chars.rds
@@ -0,0 +1,11 @@
+A
+3
+263168
+197888
+5
+UTF-8
+16
+1
+262153
+102
+0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\'()*+,-./:;<=>\?@[\\]^_`{|}~\040\t\n\r\v\f\r\n
diff --git a/rdata/tests/data/test_ascii_chars.rds b/rdata/tests/data/test_ascii_chars.rds
diff --git a/rdata/tests/data/test_ascii_na_string.rds b/rdata/tests/data/test_ascii_na_string.rds
@@ -0,0 +1,10 @@
+A
+3
+263168
+197888
+5
+UTF-8
+16
+1
+9
+-1
diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py
@@ -102,6 +102,13 @@ def test_na_string(self) -> None:
             "test_na_string": [None],
         })
 
+    def test_ascii_na_string(self) -> None:
+        """Test that the NA string is parsed correctly from an ascii file."""
+        # File created in R with
+        # saveRDS(as.character(NA), file="test_ascii_na_string.rds", ascii=TRUE, compress=FALSE)  # noqa: E501
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_na_string.rds")
+        # The NA string is expected to come back as None.
+        np.testing.assert_equal(data, [None])
+
     def test_complex(self) -> None:
         """Test that complex numbers can be parsed."""
         data = rdata.read_rda(TESTDATA_PATH / "test_complex.rda")
@@ -708,6 +715,20 @@ def test_ascii(self) -> None:
                 np.testing.assert_equal(ma.get_fill_value(),
                                         ref_ma.get_fill_value())
 
+    def test_ascii_characters(self) -> None:
+        """Test reading string with all ascii printable characters."""
+        # File created in R with
+        # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_chars.rds")  # noqa: E501,ERA001
+        # Note: this file was saved without ascii=TRUE, unlike
+        # test_ascii_ascii_chars.rds.
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds")
+        # The parsed value is attached as the assertion message for debugging.
+        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data  # noqa: E501
+
+    def test_ascii_ascii_characters(self) -> None:
+        """Test reading all ascii printable characters from an ascii file."""
+        # File created in R with
+        # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_ascii_chars.rds", ascii=TRUE, compress=FALSE)  # noqa: E501,ERA001
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds")
+        # The parsed value is attached as the assertion message for debugging.
+        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data  # noqa: E501
+
     def test_nan_inf(self) -> None:
         """Test reading nan and inf."""
         data = rdata.read_rds(TESTDATA_PATH / "test_nan_inf.rds")

diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py
@@ -3,18 +3,49 @@
 from __future__ import annotations
 
 import string
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
 import numpy as np
 
 from ._unparser import Unparser
 
 if TYPE_CHECKING:
     import io
+    from typing import Any, Final
 
     import numpy.typing as npt
 
 
+def build_byte_to_str_map() -> tuple[str, ...]:
+    """Build byte-to-string mapping for string conversion.
+
+    The returned tuple has 256 entries and is indexed by byte value; each
+    entry is the ascii text that represents that byte in the output.
+    """
+
+    def escape(b: bytes) -> str:
+        r"""Escape string, e.g., b'\n' -> r'\n'."""
+        return b.decode("latin1").encode("unicode_escape").decode("ascii")
+
+    # Default for every byte: three-digit octal escape, e.g., 0xc3 -> r'\303'
+    byte_to_str = [rf"\{byte:03o}" for byte in range(256)]
+
+    # Printable ascii bytes use their unicode_escape form instead: most are
+    # written as themselves, while backslash and common whitespace become
+    # backslash escapes such as r'\\' and r'\n'
+    for byte in string.printable.encode("ascii"):
+        # Note: indexing bytestring yields ints
+        assert isinstance(byte, int)
+        byte_to_str[byte] = escape(bytes([byte]))
+
+    # Overrides for characters whose unicode_escape form is not the wanted one:
+    # quotes and question mark get a backslash, space is written as its octal
+    # code, and vertical tab / form feed use r'\v' / r'\f' rather than the
+    # hex escapes (r'\x0b' / r'\x0c') that unicode_escape produces
+    byte_to_str[b'"'[0]] = r'\"'
+    byte_to_str[b"'"[0]] = r"\'"
+    byte_to_str[b"?"[0]] = r"\?"
+    byte_to_str[b" "[0]] = r"\040"
+    byte_to_str[b"\v"[0]] = r"\v"
+    byte_to_str[b"\f"[0]] = r"\f"
+
+    # Return an immutable copy of the table
+    return tuple(byte_to_str)
+
+
+# Lookup table indexed by byte value, built once when the module is imported.
+BYTE_TO_STR: Final = build_byte_to_str_map()
+
+
 class UnparserASCII(Unparser):
     """Unparser for files in ASCII format."""
 
@@ -68,16 +99,14 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
 
             self._add_line(line)
 
-    def unparse_string(self, value: bytes) -> None:
-        """Unparse a string."""
-        self.unparse_int(len(value))
-
+    def _unparse_string_characters(self, value: bytes) -> None:
+        """Unparse characters of a string in escaped ascii representation."""
         # Ideally we could do here the reverse of parsing,
-        # i.e., value = value.decode('latin1').encode('unicode_escape').decode('ascii')
+        # i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
         # This would produce byte representation in hex such as '\xc3\xa4',
         # but we need to have the equivalent octal representation '\303\244'.
-        # So, we do somewhat manual conversion instead:
-        s = "".join(chr(byte) if chr(byte) in string.printable else rf"\{byte:03o}"
-                    for byte in value)
+        # In addition, some ascii characters need to be escaped.
+
+        # Convert string byte-by-byte
+        output = "".join(BYTE_TO_STR[byte] for byte in value)
 
-        self._add_line(s)
+        self._add_line(output)
diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py
@@ -73,9 +73,17 @@ def unparse_array(self, array: npt.NDArray[Any]) -> None:
     def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
         """Unparse the values of an array."""
 
-    @abc.abstractmethod
-    def unparse_string(self, value: bytes) -> None:
+    def unparse_string(self, value: bytes | None) -> None:
         """Unparse a string."""
+        # A None value is written as length -1 with no character data
+        if value is None:
+            self.unparse_int(-1)
+            return
+        # Otherwise write the byte length, then let the concrete unparser
+        # write the characters in its own format
+        self.unparse_int(len(value))
+        self._unparse_string_characters(value)
+
+    @abc.abstractmethod
+    def _unparse_string_characters(self, value: bytes) -> None:
+        """Unparse characters of a string (not None).
+
+        Called by unparse_string after the string length has been unparsed.
+        """
 
     def unparse_r_data(self, r_data: RData) -> None:
         """Unparse an RData object."""

diff --git a/rdata/unparser/_xdr.py b/rdata/unparser/_xdr.py
@@ -56,10 +56,5 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
         data = array.data if array.flags["C_CONTIGUOUS"] else array.tobytes()
         self.file.write(data)
 
-    def unparse_string(self, value: bytes) -> None:
-        """Unparse a string."""
-        if value is None:
-            self.unparse_int(-1)
-        else:
-            self.unparse_int(len(value))
-            self.file.write(value)
+    def _unparse_string_characters(self, value: bytes) -> None:
+        """Write the bytes of the string to the file as they are."""
+        self.file.write(value)