diff --git a/coconut/_pyparsing.py b/coconut/_pyparsing.py index 170c3e5c..5a62c5ef 100644 --- a/coconut/_pyparsing.py +++ b/coconut/_pyparsing.py @@ -20,7 +20,6 @@ from coconut.root import * # NOQA import os -import re import sys import traceback from warnings import warn @@ -146,6 +145,7 @@ # ----------------------------------------------------------------------------------------------------------------------- if MODERN_PYPARSING: + ParserElement.leaveWhitespace = ParserElement.leave_whitespace SUPPORTS_PACKRAT_CONTEXT = False elif CPYPARSING: @@ -290,22 +290,6 @@ def enableIncremental(*args, **kwargs): all_parse_elements = None -# ----------------------------------------------------------------------------------------------------------------------- -# MISSING OBJECTS: -# ----------------------------------------------------------------------------------------------------------------------- - -python_quoted_string = getattr(_pyparsing, "python_quoted_string", None) -if python_quoted_string is None: - python_quoted_string = _pyparsing.Combine( - # multiline strings must come first - (_pyparsing.Regex(r'"""(?:[^"\\]|""(?!")|"(?!"")|\\.)*', flags=re.MULTILINE) + '"""').setName("multiline double quoted string") - | (_pyparsing.Regex(r"'''(?:[^'\\]|''(?!')|'(?!'')|\\.)*", flags=re.MULTILINE) + "'''").setName("multiline single quoted string") - | (_pyparsing.Regex(r'"(?:[^"\n\r\\]|(?:\\")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"').setName("double quoted string") - | (_pyparsing.Regex(r"'(?:[^'\n\r\\]|(?:\\')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'").setName("single quoted string") - ).setName("Python quoted string") - _pyparsing.python_quoted_string = python_quoted_string - - # ----------------------------------------------------------------------------------------------------------------------- # FAST REPRS: # ----------------------------------------------------------------------------------------------------------------------- diff --git a/coconut/compiler/compiler.py b/coconut/compiler/compiler.py index 24e746a0..9b555b5f 100644 --- a/coconut/compiler/compiler.py +++ b/coconut/compiler/compiler.py @@ -187,6 +187,7 @@ manage, sub_all, ComputationNode, + StartOfStrGrammar, ) from coconut.compiler.header import ( minify_header, @@ -1305,7 +1306,7 @@ def streamline(self, grammars, inputstring=None, force=False, inner=False): input_len = 0 if inputstring is None else len(inputstring) if force or (streamline_grammar_for_len is not None and input_len > streamline_grammar_for_len): start_time = get_clock_time() - prep_grammar(grammar, streamline=True) + prep_grammar(grammar, for_scan=False, streamline=True) logger.log_lambda( lambda: "Streamlined {grammar} in {time} seconds{info}.".format( grammar=get_name(grammar), @@ -1502,7 +1503,7 @@ def str_proc(self, inputstring, **kwargs): hold["exprs"][-1] += c elif hold["paren_level"] > 0: raise self.make_err(CoconutSyntaxError, "imbalanced parentheses in format string expression", inputstring, i, reformat=False) - elif match_in(self.end_f_str_expr, remaining_text): + elif does_parse(self.end_f_str_expr, remaining_text): hold["in_expr"] = False hold["str_parts"].append(c) else: @@ -2128,11 +2129,11 @@ def tre_return_handle(loc, tokens): type_ignore=self.type_ignore_comment(), ) self.tre_func_name <<= base_keyword(func_name).suppress() - return attach( - self.tre_return, + return StartOfStrGrammar(attach( + self.tre_return_base, tre_return_handle, greedy=True, - ) + )) def detect_is_gen(self, raw_lines): """Determine if the given function code is for a generator.""" diff --git a/coconut/compiler/grammar.py b/coconut/compiler/grammar.py index c25e309c..5e17c24f 100644 --- a/coconut/compiler/grammar.py +++ b/coconut/compiler/grammar.py @@ -40,7 +40,6 @@ Optional, ParserElement, StringEnd, - StringStart, Word, ZeroOrMore, hexnums, @@ -48,7 +47,6 @@ originalTextFor, nestedExpr, FollowedBy, - python_quoted_string, restOfLine, ) @@ -119,6 +117,7 @@ using_fast_grammar_methods, disambiguate_literal, any_of, + StartOfStrGrammar, ) @@ -924,7 +923,6 @@ class Grammar(object): # rparen handles simple stmts ending parenthesized stmt lambdas end_simple_stmt_item = FollowedBy(newline | semicolon | rparen) - start_marker = StringStart() moduledoc_marker = condense(ZeroOrMore(lineitem) - Optional(moduledoc_item)) end_marker = StringEnd() indent = Literal(openindent) @@ -2669,19 +2667,19 @@ class Grammar(object): line = newline | stmt file_input = condense(moduledoc_marker - ZeroOrMore(line)) - raw_file_parser = start_marker - file_input - end_marker + raw_file_parser = StartOfStrGrammar(file_input - end_marker) line_by_line_file_parser = ( - start_marker - moduledoc_marker - stores_loc_item, - start_marker - line - stores_loc_item, + StartOfStrGrammar(moduledoc_marker - stores_loc_item), + StartOfStrGrammar(line - stores_loc_item), ) file_parser = line_by_line_file_parser if USE_LINE_BY_LINE else raw_file_parser single_input = condense(Optional(line) - ZeroOrMore(newline)) eval_input = condense(testlist - ZeroOrMore(newline)) - single_parser = start_marker - single_input - end_marker - eval_parser = start_marker - eval_input - end_marker - some_eval_parser = start_marker + eval_input + single_parser = StartOfStrGrammar(single_input - end_marker) + eval_parser = StartOfStrGrammar(eval_input - end_marker) + some_eval_parser = StartOfStrGrammar(eval_input) parens = originalTextFor(nestedExpr("(", ")", ignoreExpr=None)) brackets = originalTextFor(nestedExpr("[", "]", ignoreExpr=None)) @@ -2699,15 +2697,16 @@ class Grammar(object): ) ) unsafe_xonsh_parser, _impl_call_ref = disable_inside( - single_parser, + single_input - end_marker, unsafe_impl_call_ref, ) impl_call_ref <<= _impl_call_ref - xonsh_parser, _anything_stmt, _xonsh_command = disable_outside( + _xonsh_parser, _anything_stmt, _xonsh_command = disable_outside( unsafe_xonsh_parser, unsafe_anything_stmt, unsafe_xonsh_command, ) + xonsh_parser = StartOfStrGrammar(_xonsh_parser) anything_stmt <<= _anything_stmt xonsh_command <<= _xonsh_command @@ -2731,7 +2730,7 @@ class Grammar(object): noqa_regex = compile_regex(r"\b[Nn][Oo][Qq][Aa]\b") - just_non_none_atom = start_marker + ~keyword("None") + known_atom + end_marker + just_non_none_atom = StartOfStrGrammar(~keyword("None") + known_atom + end_marker) original_function_call_tokens = ( lparen.suppress() + rparen.suppress() @@ -2741,9 +2740,8 @@ class Grammar(object): ) tre_func_name = Forward() - tre_return = ( - start_marker - + keyword("return").suppress() + tre_return_base = ( + keyword("return").suppress() + maybeparens( lparen, tre_func_name + original_function_call_tokens, @@ -2751,9 +2749,8 @@ class Grammar(object): ) + end_marker ) - tco_return = attach( - start_marker - + keyword("return").suppress() + tco_return = StartOfStrGrammar(attach( + keyword("return").suppress() + maybeparens( lparen, disallow_keywords(untcoable_funcs, with_suffix="(") @@ -2778,7 +2775,7 @@ class Grammar(object): tco_return_handle, # this is the root in what it's used for, so might as well evaluate greedily greedy=True, - ) + )) rest_of_lambda = Forward() lambdas = keyword("lambda") - rest_of_lambda - colon @@ -2818,9 +2815,8 @@ class Grammar(object): )) ) - split_func = ( - start_marker - - keyword("def").suppress() + split_func = StartOfStrGrammar( + keyword("def").suppress() - unsafe_dotted_name - Optional(brackets).suppress() - lparen.suppress() @@ -2834,13 +2830,13 @@ class Grammar(object): | ~indent + ~dedent + any_char + keyword("for") + unsafe_name + keyword("in") ) - just_a_string = start_marker + string_atom + end_marker + just_a_string = StartOfStrGrammar(string_atom + end_marker) end_of_line = end_marker | Literal("\n") | pound unsafe_equals = Literal("=") - parse_err_msg = start_marker + ( + parse_err_msg = StartOfStrGrammar( # should be in order of most likely to actually be the source of the error first fixto( ZeroOrMore(~questionmark + ~Literal("\n") + any_char) @@ -2859,22 +2855,31 @@ class Grammar(object): start_f_str_regex = compile_regex(r"\br?fr?$") start_f_str_regex_len = 4 - end_f_str_expr = combine(start_marker + (rbrace | colon | bang)) + end_f_str_expr = StartOfStrGrammar(combine(rbrace | colon | bang).leaveWhitespace()) + + python_quoted_string = regex_item( + # multiline strings must come first + r'"""(?:[^"\\]|\n|""(?!")|"(?!"")|\\.)*"""' + r"|'''(?:[^'\\]|\n|''(?!')|'(?!'')|\\.)*'''" + r'|"(?:[^"\n\r\\]|(?:\\")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*"' + r"|'(?:[^'\n\r\\]|(?:\\')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*'" + ) - string_start = start_marker + python_quoted_string + string_start = StartOfStrGrammar(python_quoted_string) - no_unquoted_newlines = start_marker + ZeroOrMore(python_quoted_string | ~Literal("\n") + any_char) + end_marker + no_unquoted_newlines = StartOfStrGrammar( + ZeroOrMore(python_quoted_string | ~Literal("\n") + any_char) + + end_marker + ) - operator_stmt = ( - start_marker - + keyword("operator").suppress() + operator_stmt = StartOfStrGrammar( + keyword("operator").suppress() + restOfLine ) unsafe_import_from_name = condense(ZeroOrMore(unsafe_dot) + unsafe_dotted_name | OneOrMore(unsafe_dot)) - from_import_operator = ( - start_marker - + keyword("from").suppress() + from_import_operator = StartOfStrGrammar( + keyword("from").suppress() + unsafe_import_from_name + keyword("import").suppress() + keyword("operator").suppress() diff --git a/coconut/compiler/util.py b/coconut/compiler/util.py index 813b76fd..1f8f1297 100644 --- a/coconut/compiler/util.py +++ b/coconut/compiler/util.py @@ -72,6 +72,7 @@ ParserElement, MatchFirst, And, + StringStart, _trim_arity, _ParseResultsWithOffset, all_parse_elements, @@ -610,8 +611,31 @@ def parsing_context(inner_parse=None): yield -def prep_grammar(grammar, streamline=False): +class StartOfStrGrammar(object): + """A container object that denotes grammars that should always be parsed at the start of the string.""" + __slots__ = ("grammar",) + start_marker = StringStart() + + def __init__(self, grammar): + self.grammar = grammar + + def with_start_marker(self): + """Get the grammar with the start marker.""" + internal_assert(not CPYPARSING, "StartOfStrGrammar.with_start_marker() should only be necessary without cPyparsing") + return self.start_marker + self.grammar + + @property + def name(self): + return get_name(self.grammar) + + +def prep_grammar(grammar, for_scan, streamline=False): """Prepare a grammar item to be used as the root of a parse.""" + if isinstance(grammar, StartOfStrGrammar): + if for_scan: + grammar = grammar.with_start_marker() + else: + grammar = grammar.grammar grammar = trace(grammar) if streamline: grammar.streamlined = False @@ -624,7 +648,7 @@ def prep_grammar(grammar, streamline=False): def parse(grammar, text, inner=None, eval_parse_tree=True): """Parse text using grammar.""" with parsing_context(inner): - result = prep_grammar(grammar).parseString(text) + result = prep_grammar(grammar, for_scan=False).parseString(text) if eval_parse_tree: result = unpack(result) return result @@ -645,8 +669,12 @@ def does_parse(grammar, text, inner=None): def all_matches(grammar, text, inner=None, eval_parse_tree=True): """Find all matches for grammar in text.""" + kwargs = {} + if CPYPARSING and isinstance(grammar, StartOfStrGrammar): + grammar = grammar.grammar + kwargs["maxStartLoc"] = 0 with parsing_context(inner): - for tokens, start, stop in prep_grammar(grammar).scanString(text): + for tokens, start, stop in prep_grammar(grammar, for_scan=True).scanString(text, **kwargs): if eval_parse_tree: tokens = unpack(tokens) yield tokens, start, stop @@ -668,8 +696,12 @@ def match_in(grammar, text, inner=None): def transform(grammar, text, inner=None): """Transform text by replacing matches to grammar.""" + kwargs = {} + if CPYPARSING and isinstance(grammar, StartOfStrGrammar): + grammar = grammar.grammar + kwargs["maxStartLoc"] = 0 with parsing_context(inner): - result = prep_grammar(add_action(grammar, unpack)).transformString(text) + result = prep_grammar(add_action(grammar, unpack), for_scan=True).transformString(text, **kwargs) if result == text: result = None return result diff --git a/coconut/constants.py b/coconut/constants.py index 2553e530..0ac8ea87 100644 --- a/coconut/constants.py +++ b/coconut/constants.py @@ -1019,7 +1019,7 @@ def get_path_env_var(env_var, default): # min versions are inclusive unpinned_min_versions = { - "cPyparsing": (2, 4, 7, 2, 3, 2), + "cPyparsing": (2, 4, 7, 2, 3, 3), ("pre-commit", "py3"): (3,), ("psutil", "py>=27"): (5,), "jupyter": (1, 0),