Skip to content

Commit

Permalink
Use constant byte-to-string mapping for conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
trossi committed Oct 2, 2024
1 parent 2d8c1f4 commit 918ea23
Showing 1 changed file with 35 additions and 40 deletions.
75 changes: 35 additions & 40 deletions rdata/unparser/_ascii.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,49 @@
from __future__ import annotations

import string
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING

import numpy as np

from ._unparser import Unparser

if TYPE_CHECKING:
import io
from typing import Any, Final

import numpy.typing as npt


def build_byte_to_str_map() -> tuple[str, ...]:
    r"""Build the byte-to-string mapping used for string conversion.

    Returns:
        A tuple of 256 strings; the entry at index ``b`` is the text
        written for byte value ``b``:

        - bytes outside ``string.printable`` become a three-digit octal
          escape (e.g. byte 0xC3 -> ``\303``);
        - printable ASCII bytes use Python's ``unicode_escape`` form
          (e.g. ``a`` -> ``a``, newline -> ``\n``, backslash -> ``\\``);
        - a few characters are overridden to follow R's conventions
          (``"``, ``'``, ``?``, space, vertical tab and form feed).
    """

    def escape(b: bytes) -> str:
        r"""Escape string, e.g., b'\n' -> r'\n'."""
        return b.decode("latin1").encode("unicode_escape").decode("ascii")

    # Default for every byte value: three-digit octal code
    byte_to_str = [rf"\{byte:03o}" for byte in range(256)]

    # Printable ascii characters use Python's own escaping.
    # Note: iterating a bytestring yields ints, so they index the list directly
    for byte in string.printable.encode("ascii"):
        byte_to_str[byte] = escape(bytes([byte]))

    # Special characters where Python's escaping is not what is needed:
    # - quotes and '?' get an extra backslash
    # - ' ' is written by Python as ' ', but must be '\040'
    # - '\v' is written by Python as '\x0b', but must be '\v'
    # - '\f' is written by Python as '\x0c', but must be '\f'
    special_characters = {
        '"': r'\"',
        "'": r"\'",
        "?": r"\?",
        " ": r"\040",
        "\v": r"\v",
        "\f": r"\f",
    }
    for char, text in special_characters.items():
        byte_to_str[ord(char)] = text

    return tuple(byte_to_str)


# Constant lookup table, built once when the module is imported:
# BYTE_TO_STR[b] is the string written for byte value b (0-255).
BYTE_TO_STR: Final = build_byte_to_str_map()


class UnparserASCII(Unparser):
"""Unparser for files in ASCII format."""

Expand Down Expand Up @@ -73,45 +104,9 @@ def _unparse_string_characters(self, value: bytes) -> None:
# i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
# This would produce byte representation in hex such as '\xc3\xa4',
# but we need to have the equivalent octal representation '\303\244'.
# So, we need to do somewhat manual conversion instead.

# List of ascii characters that are written directly;
# this is all printable ascii except
# - ' ' that Python writes as ' ', but R as '\040'
# - '\v' that Python writes as '\x0b', but R as '\v'
# - '\f' that Python writes as '\x0c', but R as '\f'
write_raw = string.printable.replace(" ", "")\
.replace("\v", "")\
.replace("\f", "")

def escape(b: bytes) -> str:
r"""Escape string, e.g., b'\n' -> r'\\n'."""
return b.decode("latin1").encode("unicode_escape").decode("ascii")

# Go through the string byte-by-byte as we need to
# convert every non-ascii character separately
output = ""
ascii_buffer = b""
for byte in value:
if chr(byte) in write_raw:
# Collect ascii characters to substring buffer
ascii_buffer += bytes([byte])
else:
# Encountered a non-ascii character!
# Escape and add the ascii buffer
output += escape(ascii_buffer)
ascii_buffer = b""
# Add '\v' or '\f' or non-ascii character in octal presentation
if chr(byte) == "\v":
output += r"\v"
elif chr(byte) == "\f":
output += r"\f"
else:
output += rf"\{byte:03o}"
# Escape and add the remaining ascii buffer
output += escape(ascii_buffer)
# In addition, some ascii characters need to be escaped.

# Escape some more characters like R does
output = output.replace('"', r'\"').replace("'", r"\'").replace("?", r"\?")
# Convert string byte-by-byte
output = "".join(BYTE_TO_STR[byte] for byte in value)

self._add_line(output)

0 comments on commit 918ea23

Please sign in to comment.