Use constant byte-to-string mapping for conversion

vnmabus · Oct 2, 2024 · 918ea23
1 parent 2d8c1f4
commit 918ea23
Showing 1 changed file with 35 additions and 40 deletions.
diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py
@@ -3,18 +3,49 @@
 from __future__ import annotations
 
 import string
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
 import numpy as np
 
 from ._unparser import Unparser
 
 if TYPE_CHECKING:
     import io
+    from typing import Any, Final
 
     import numpy.typing as npt
 
 
+def build_byte_to_str_map() -> tuple[str, ...]:
+    """Return a 256-entry tuple mapping each byte to its escaped string."""
+
+    def escape(b: bytes) -> str:
+        r"""Escape bytes via unicode_escape, e.g., b'\n' -> r'\n'."""
+        return b.decode("latin1").encode("unicode_escape").decode("ascii")
+
+    # Start with the 3-digit octal escape for every byte, e.g. 228 -> \344
+    byte_to_str = [rf"\{byte:03o}" for byte in range(256)]
+
+    # Printable ascii bytes take Python's own escaped form instead
+    for byte in string.printable.encode("ascii"):
+        # Note: iterating over a bytestring yields ints
+        assert isinstance(byte, int)
+        byte_to_str[byte] = escape(bytes([byte]))
+
+    # Override the entries that must differ from Python's unicode_escape
+    byte_to_str[b'"'[0]] = r'\"'
+    byte_to_str[b"'"[0]] = r"\'"
+    byte_to_str[b"?"[0]] = r"\?"
+    byte_to_str[b" "[0]] = r"\040"
+    byte_to_str[b"\v"[0]] = r"\v"
+    byte_to_str[b"\f"[0]] = r"\f"
+
+    return tuple(byte_to_str)
+
+
+BYTE_TO_STR: Final = build_byte_to_str_map()  # indexed by byte value
+
+
 class UnparserASCII(Unparser):
     """Unparser for files in ASCII format."""
 
@@ -73,45 +104,9 @@ def _unparse_string_characters(self, value: bytes) -> None:
         # i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
         # This would produce byte representation in hex such as '\xc3\xa4',
         # but we need to have the equivalent octal presentation '\303\244'.
-        # So, we need to do somewhat manual conversion instead.
-
-        # List of ascii characters that are written directly;
-        # this is all printable ascii except
-        # - ' '  that Python writes as ' ',    but R as '\040'
-        # - '\v' that Python writes as '\x0b', but R as '\v'
-        # - '\f' that Python writes as '\x0c', but R as '\f'
-        write_raw = string.printable.replace(" ", "")\
-                                    .replace("\v", "")\
-                                    .replace("\f", "")
-
-        def escape(b: bytes) -> str:
-            r"""Escape string, e.g., b'\n' -> r'\\n'."""
-            return b.decode("latin1").encode("unicode_escape").decode("ascii")
-
-        # Go though the string byte-by-byte as we need to
-        # convert every non-ascii character separately
-        output = ""
-        ascii_buffer = b""
-        for byte in value:
-            if chr(byte) in write_raw:
-                # Collect ascii characters to substring buffer
-                ascii_buffer += bytes([byte])
-            else:
-                # Encountered a non-ascii character!
-                # Escape and add the ascii buffer
-                output += escape(ascii_buffer)
-                ascii_buffer = b""
-                # Add '\v' or '\f' or non-ascii character in octal presentation
-                if chr(byte) == "\v":
-                    output += r"\v"
-                elif chr(byte) == "\f":
-                    output += r"\f"
-                else:
-                    output += rf"\{byte:03o}"
-        # Escape and add the remaining ascii buffer
-        output += escape(ascii_buffer)
+        # In addition, some ascii characters need to be escaped.
 
-        # Escape some more characters like R does
-        output = output.replace('"', r'\"').replace("'", r"\'").replace("?", r"\?")
+        # Convert string byte-by-byte
+        output = "".join(BYTE_TO_STR[byte] for byte in value)
 
         self._add_line(output)