
Refactor USE_RS_TOKENIZER flag
georgesittas committed Nov 26, 2024
1 parent 2049d63 commit c24af91
Showing 3 changed files with 13 additions and 22 deletions.
sqlglot/dialects/bigquery.py (3 changes: 2 additions & 1 deletion)
@@ -356,7 +356,6 @@ class BigQuery(Dialect):
     FORCE_EARLY_ALIAS_REF_EXPANSION = True
     EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = True
     PRESERVE_ORIGINAL_NAMES = True
-    USE_RS_JSONPATH_TOKENIZER = False
 
     # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
     NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE
@@ -487,6 +486,8 @@ class Tokenizer(tokens.Tokenizer):
         KEYWORDS.pop("/*+")
 
     class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
+        USE_RS_TOKENIZER = False
+
         def _scan(self, until: t.Optional[t.Callable] = None, skip_spaces: bool = True) -> None:
             return super()._scan(until=until, skip_spaces=False)
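BigQuery opts out here because its JSONPathTokenizer overrides the Python-level _scan, which the Rust tokenizer would bypass. Under that same assumption (sqlglot at or after this commit), a dialect that needs specialized JSON path tokenization now sets the class-level flag on its nested JSONPathTokenizer instead of a dialect-level USE_RS_JSONPATH_TOKENIZER attribute; a minimal sketch, with MyDialect as an illustrative name rather than part of the commit:

    from sqlglot import jsonpath
    from sqlglot.dialects.dialect import Dialect

    class MyDialect(Dialect):
        class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
            # The Rust tokenizer cannot see Python-level overrides such as a
            # custom _scan, so force the pure-Python code path for this class.
            USE_RS_TOKENIZER = False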
sqlglot/dialects/dialect.py (11 changes: 1 addition & 10 deletions)
@@ -411,13 +411,6 @@ class Dialect(metaclass=_Dialect):
     is cast to x's type to match it instead.
     """
 
-    USE_RS_JSONPATH_TOKENIZER: t.Optional[bool] = None
-    """
-    Whether the JSONPathTokenizer should use the Rust implementation (True by default) or always
-    enforce the SQLGlot/Python version. This is required in case a dialect requires specialized
-    tokenization, in which case it must override the py-based Tokenizer functions
-    """
-
     REGEXP_EXTRACT_DEFAULT_GROUP = 0
     """The default value for the capturing group."""
@@ -964,9 +957,7 @@ def tokenizer(self) -> Tokenizer:
 
     @property
    def jsonpath_tokenizer(self) -> JSONPathTokenizer:
-        return self.jsonpath_tokenizer_class(
-            dialect=self, use_rs_tokenizer=self.USE_RS_JSONPATH_TOKENIZER
-        )
+        return self.jsonpath_tokenizer_class(dialect=self)
 
     def parser(self, **opts) -> Parser:
         return self.parser_class(dialect=self, **opts)
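With the use_rs_tokenizer constructor argument gone, callers simply ask a dialect for its JSON path tokenizer and the class-level flag alone decides which implementation runs. A short usage sketch under that assumption; the "bigquery" lookup and the sample path are only illustrative:

    from sqlglot.dialects.dialect import Dialect

    dialect = Dialect.get_or_raise("bigquery")

    # jsonpath_tokenizer now just instantiates jsonpath_tokenizer_class(dialect=self).
    # BigQuery's JSONPathTokenizer sets USE_RS_TOKENIZER = False, so this call
    # runs the pure-Python implementation.
    path_tokens = dialect.jsonpath_tokenizer.tokenize("$.a[0].b")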
sqlglot/tokens.py (21 changes: 10 additions & 11 deletions)
@@ -525,7 +525,9 @@ def _quotes_to_format(
             if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
         )
 
-        if USE_RS_TOKENIZER:
+        klass.USE_RS_TOKENIZER = klass.USE_RS_TOKENIZER and USE_RS_TOKENIZER
+
+        if klass.USE_RS_TOKENIZER:
             settings = RsTokenizerSettings(
                 white_space={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.WHITE_SPACE.items()},
                 single_tokens={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.SINGLE_TOKENS.items()},
@@ -638,8 +640,12 @@ class Tokenizer(metaclass=_Tokenizer):
     # Whether string escape characters function as such when placed within raw strings
     STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
 
+    # Whether nested comments like /* one /* two */ */ are supported
     NESTED_COMMENTS = True
 
+    # Whether the rust version of this tokenizer is supported
+    USE_RS_TOKENIZER = True
+
     HINT_START = "/*+"
 
     TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}
@@ -984,21 +990,14 @@ class Tokenizer(metaclass=_Tokenizer):
         "_peek",
         "_prev_token_line",
         "_rs_dialect_settings",
-        "_use_rs_tokenizer",
     )
 
-    def __init__(
-        self, dialect: DialectType = None, use_rs_tokenizer: t.Optional[bool] = None
-    ) -> None:
+    def __init__(self, dialect: DialectType = None) -> None:
         from sqlglot.dialects import Dialect
 
         self.dialect = Dialect.get_or_raise(dialect)
 
-        self._use_rs_tokenizer = (
-            use_rs_tokenizer if use_rs_tokenizer is not None else USE_RS_TOKENIZER
-        )
-
-        if self._use_rs_tokenizer:
+        if self.USE_RS_TOKENIZER:
             self._rs_dialect_settings = RsTokenizerDialectSettings(
                 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
                 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
@@ -1023,7 +1022,7 @@ def reset(self) -> None:
 
     def tokenize(self, sql: str) -> t.List[Token]:
         """Returns a list of tokens corresponding to the SQL string `sql`."""
-        if self._use_rs_tokenizer:
+        if self.USE_RS_TOKENIZER:
             return self.tokenize_rs(sql)
 
         self.reset()
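The metaclass now folds the module-level USE_RS_TOKENIZER (True only when sqlglotrs imported successfully) into the class-level flag, so a tokenizer takes the Rust path only when both agree. A standalone sketch of the resulting behaviour, assuming sqlglot at or after this commit; PythonOnlyTokenizer is an illustrative name, not something the commit adds:

    from sqlglot import tokens

    class PythonOnlyTokenizer(tokens.Tokenizer):
        # ANDed with the module-level flag by the metaclass, so this subclass
        # always tokenizes in pure Python even when sqlglotrs is installed.
        USE_RS_TOKENIZER = False

    rust_backed = tokens.Tokenizer()     # Rust path if sqlglotrs is available
    python_only = PythonOnlyTokenizer()  # always the Python path

    # Both back ends are expected to produce equivalent tokens.
    assert [t.token_type for t in rust_backed.tokenize("SELECT 1")] == [
        t.token_type for t in python_only.tokenize("SELECT 1")
    ]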
