Skip to content

Commit

Permalink
Merge pull request #44 from trossi/fix-ascii
Browse files Browse the repository at this point in the history
Fix parsing and unparsing ascii files with escaped characters and NA strings
  • Loading branch information
vnmabus authored Oct 7, 2024
2 parents 5166258 + 918ea23 commit d65ec51
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 22 deletions.
11 changes: 8 additions & 3 deletions rdata/parser/_ascii.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,20 @@ def _parse_array_values(
return array

def parse_string(self, length: int) -> bytes:
# Non-ascii characters in strings are written using octal byte codes,
# Read the ascii string
s = self._readline()

# R escapes question marks ('?'), so they always appear as r'\?'.
# Let's start unescaping those.
s = s.replace(r"\?", "?")

# Non-ascii characters and space are written using octal byte codes,
# for example, a string 'aä' (2 chars) in UTF-8 is written as an ascii
# string r'a\303\244' (9 chars). We want to transform this to a byte
# string b'a\303\244' (3 bytes) corresponding to the byte
# representation of the original UTF-8 string.
# Let's use this string as an example to go through the code below

# Read the ascii string
s = self._readline()
# Now s = r'a\303\244' (9 chars)

# Convert characters to bytes (all characters are ascii)
Expand Down
11 changes: 11 additions & 0 deletions rdata/tests/data/test_ascii_ascii_chars.rds
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
A
3
263168
197888
5
UTF-8
16
1
262153
102
0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\'()*+,-./:;<=>\?@[\\]^_`{|}~\040\t\n\r\v\f\r\n
Binary file added rdata/tests/data/test_ascii_chars.rds
Binary file not shown.
10 changes: 10 additions & 0 deletions rdata/tests/data/test_ascii_na_string.rds
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
A
3
263168
197888
5
UTF-8
16
1
9
-1
21 changes: 21 additions & 0 deletions rdata/tests/test_rdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ def test_na_string(self) -> None:
"test_na_string": [None],
})

def test_ascii_na_string(self) -> None:
    """Check that an NA string in an ascii-format file is read as None."""
    # File created in R with
    # saveRDS(as.character(NA), file="test_ascii_na_string.rds", ascii=TRUE, compress=FALSE) # noqa: E501
    path = TESTDATA_PATH / "test_ascii_na_string.rds"
    parsed = rdata.read_rds(path)
    np.testing.assert_equal(parsed, [None])

def test_complex(self) -> None:
"""Test that complex numbers can be parsed."""
data = rdata.read_rda(TESTDATA_PATH / "test_complex.rda")
Expand Down Expand Up @@ -708,6 +715,20 @@ def test_ascii(self) -> None:
np.testing.assert_equal(ma.get_fill_value(),
ref_ma.get_fill_value())

def test_ascii_characters(self) -> None:
    """Check that all printable ascii characters are read from a binary file."""
    # File created in R with
    # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_chars.rds") # noqa: E501,ERA001
    expected = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n"  # noqa: E501
    parsed = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds")
    assert parsed == expected, parsed

def test_ascii_ascii_characters(self) -> None:
    """Check that all printable ascii characters are read from an ascii file."""
    # File created in R with
    # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_ascii_chars.rds", ascii=TRUE, compress=FALSE) # noqa: E501,ERA001
    expected = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n"  # noqa: E501
    parsed = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds")
    assert parsed == expected, parsed

def test_nan_inf(self) -> None:
"""Test reading nan and inf."""
data = rdata.read_rds(TESTDATA_PATH / "test_nan_inf.rds")
Expand Down
49 changes: 39 additions & 10 deletions rdata/unparser/_ascii.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,49 @@
from __future__ import annotations

import string
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING

import numpy as np

from ._unparser import Unparser

if TYPE_CHECKING:
import io
from typing import Any, Final

import numpy.typing as npt


def build_byte_to_str_map() -> tuple[str, ...]:
    r"""Build the table that maps each byte value to its ascii-file text.

    The returned tuple has 256 entries, indexed by byte value:

    * a fixed set of characters gets an explicit escape
      (``"``, ``'``, ``?``, space, vertical tab and form feed);
    * the remaining ``string.printable`` characters use Python's
      ``unicode_escape`` form (e.g. newline -> ``\n``, backslash -> ``\\``);
    * every other byte is written as a three-digit octal code
      (e.g. 0xC3 -> ``\303``).
    """
    # Explicit escapes; these take precedence over the generic rules below.
    special_cases = {
        ord('"'): r'\"',
        ord("'"): r"\'",
        ord("?"): r"\?",
        ord(" "): r"\040",
        ord("\v"): r"\v",
        ord("\f"): r"\f",
    }

    # Iterating a bytestring yields ints, so this is a set of byte values.
    printable_codes = frozenset(string.printable.encode("ascii"))

    def render(code: int) -> str:
        """Return the text written to the file for a single byte value."""
        explicit = special_cases.get(code)
        if explicit is not None:
            return explicit
        if code in printable_codes:
            # Same result as decoding the byte as latin1 and escaping it.
            return chr(code).encode("unicode_escape").decode("ascii")
        return rf"\{code:03o}"

    return tuple(render(code) for code in range(256))


# Lookup table built once at import time: BYTE_TO_STR[b] is the text
# written to an ascii-format file for the byte value b (0-255).
BYTE_TO_STR: Final = build_byte_to_str_map()


class UnparserASCII(Unparser):
"""Unparser for files in ASCII format."""

Expand Down Expand Up @@ -68,16 +99,14 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:

self._add_line(line)

def unparse_string(self, value: bytes) -> None:
"""Unparse a string."""
self.unparse_int(len(value))

def _unparse_string_characters(self, value: bytes) -> None:
# Ideally we could do here the reverse of parsing,
# i.e., value = value.decode('latin1').encode('unicode_escape').decode('ascii')
# i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
# This would produce byte representation in hex such as '\xc3\xa4',
# but we need to have the equivalent octal representation '\303\244'.
# So, we do somewhat manual conversion instead:
s = "".join(chr(byte) if chr(byte) in string.printable else rf"\{byte:03o}"
for byte in value)
# In addition, some ascii characters need to be escaped.

# Convert string byte-by-byte
output = "".join(BYTE_TO_STR[byte] for byte in value)

self._add_line(s)
self._add_line(output)
12 changes: 10 additions & 2 deletions rdata/unparser/_unparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,17 @@ def unparse_array(self, array: npt.NDArray[Any]) -> None:
def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
"""Unparse the values of an array."""

@abc.abstractmethod
def unparse_string(self, value: bytes) -> None:
def unparse_string(self, value: bytes | None) -> None:
"""Unparse a string."""
if value is None:
self.unparse_int(-1)
return
self.unparse_int(len(value))
self._unparse_string_characters(value)

@abc.abstractmethod
def _unparse_string_characters(self, value: bytes) -> None:
    """Unparse characters of a string (not None).

    Called by ``unparse_string`` after the string length has been
    written; the ``None`` (NA) case is handled there, so ``value`` is
    always a bytes object here. Subclasses implement the format-specific
    output.
    """

def unparse_r_data(self, r_data: RData) -> None:
"""Unparse an RData object."""
Expand Down
9 changes: 2 additions & 7 deletions rdata/unparser/_xdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,5 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
data = array.data if array.flags["C_CONTIGUOUS"] else array.tobytes()
self.file.write(data)

def unparse_string(self, value: bytes) -> None:
"""Unparse a string."""
if value is None:
self.unparse_int(-1)
else:
self.unparse_int(len(value))
self.file.write(value)
def _unparse_string_characters(self, value: bytes) -> None:
    """Write the bytes of the string to the output file unchanged."""
    self.file.write(value)

0 comments on commit d65ec51

Please sign in to comment.