From 501bc448e79e9dacb81d89c51269c622b9ba501d Mon Sep 17 00:00:00 2001 From: "R. Bernstein" Date: Wed, 11 Dec 2024 22:27:53 -0500 Subject: [PATCH] Revert "String tokenization fix (#108)" This reverts commit e26f5bc3f02600c2cec237b77c97af62186d95f1. --- mathics_scanner/feed.py | 54 +++++++++------------------- mathics_scanner/prescanner.py | 11 ------ mathics_scanner/tokeniser.py | 51 +++++--------------------- test/test_string_tokens.py | 67 ----------------------------------- test/test_tokeniser.py | 27 +++++++------- 5 files changed, 39 insertions(+), 171 deletions(-) delete mode 100644 test/test_string_tokens.py diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py index b31438c..b23bc31 100644 --- a/mathics_scanner/feed.py +++ b/mathics_scanner/feed.py @@ -20,68 +20,46 @@ def __init__(self, filename: str): :param filename: A string that describes the source of the feeder, i.e. the filename that is being feed. """ - - # A message is a list that starts out with a "symbol_name", like "Part", - # a message tag, like "partw", and a list of argument to be used in - # creating a message in list of messages. - self.messages: List[list] = [] - + self.messages: List[str] = [] self.lineno = 0 self.filename = filename @abstractmethod - def feed(self) -> str: + def feed(self): """ Consume and return next line of code. Each line should be followed by a newline character. Returns '' after all lines are consumed. """ - ... + + return "" @abstractmethod def empty(self) -> bool: """ Return True once all lines have been consumed. """ - ... - - def message(self, symbol_name: str, tag: str, *args) -> None: - """ - - A Generic routine for appending a message to the ``self.messages`` message - queue. - - ``symbol_name`` is usually the string symbol name of the built-in function that - is recording the error. "Syntax" error is the exception to this rule. - ``tag`` specifies a class of errors that this error belongs to. - - ``*args`` are the specific message arguments. Usually (but not here) - the arguments are used to fill out a template specified by ``tag`` - - For example, consider this message displayed: - - Part::partw: Part {10} of abcde does not exist. - - "Part" is the symbol_name, "partw" is the tag and args is: - (,)>, ) + return True + def message(self, sym: str, tag: str, *args) -> None: + """ + Append a generic message of type ``sym`` to the message queue. """ - if symbol_name == "Syntax": - message = self.syntax_message(symbol_name, tag, *args) + if sym == "Syntax": + message = self.syntax_message(sym, tag, *args) else: - message = [symbol_name, tag] + list(args) - + message = [sym, tag] + list(args) self.messages.append(message) - def syntax_message(self, symbol_name: str, tag: str, *args) -> list: + def syntax_message(self, sym: str, tag: str, *args) -> list: """ - Append a "Syntax" error message to the message queue. + Append a message concerning syntax errors to the message queue. """ if len(args) > 3: raise ValueError("Too many args.") - message = [symbol_name, tag] + message = [sym, tag] for i in range(3): if i < len(args): message.append(f'"{args[i]}"') @@ -115,7 +93,7 @@ def __init__(self, lines, filename=""): else: self.lines = lines - def feed(self) -> str: + def feed(self): if self.lineno < len(self.lines): result = self.lines[self.lineno] self.lineno += 1 @@ -140,7 +118,7 @@ def __init__(self, code: str, filename=""): self.code = code self._empty = False - def feed(self) -> str: + def feed(self): if self._empty: return "" self._empty = True diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py index c470a87..6099151 100644 --- a/mathics_scanner/prescanner.py +++ b/mathics_scanner/prescanner.py @@ -98,7 +98,6 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None: self.feeder.message("Syntax", "sntoct2") elif last == 3: self.feeder.message("Syntax", "sntoct1") - raise ScanError() elif last == 4: self.feeder.message("Syntax", "snthex") else: @@ -152,16 +151,6 @@ def try_parse_named_character(start_shift: int): # Stay in same line fragment, but advance the cursor position. self.pos = i + 1 - # FIXME: - # The following code is boneheadedly wrong because - # the surrounding lexical context determines whether - # an escape sequences should be valid or not. - # For example, inside a comment, there is no such thing - # as an invalid escape sequence. And this cause \050 which is - # a valid escape sequence, parenthesis, to get treated like - # a grouping symbol inside of a string. - # ... - # # In the following loop, we look for and replace escape # sequences. The current character under consideration is at # self.code[self.pos]. When an escape sequence is found at diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 11638e4..442b434 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -427,7 +427,7 @@ def sntx_message(self, pos: Optional[int] = None): self.feeder.message("Syntax", "sntxf", pre, post) # TODO: Convert this to __next__ in the future. - def next(self) -> Token: + def next(self) -> "Token": "Returns the next token." self._skip_blank() if self.pos >= len(self.code): @@ -505,7 +505,7 @@ def _skip_blank(self): else: break - def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token: + def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token": """ Pick out the text in ``match``, convert that into a ``Token``, and return that. @@ -517,15 +517,15 @@ def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token: self._change_token_scanning_mode(mode) return Token(tag, text, match.start(0)) - def t_Filename(self, match: re.Match) -> Token: + def t_Filename(self, match: re.Match) -> "Token": "Scan for ``Filename`` token and return that" return self._token_mode(match, "Filename", "expr") - def t_Get(self, match: re.Match) -> Token: + def t_Get(self, match: re.Match) -> "Token": "Scan for a ``Get`` token from ``match`` and return that token" return self._token_mode(match, "Get", "filename") - def t_Number(self, match: re.Match) -> Token: + def t_Number(self, match: re.Match) -> "Token": "Break out from ``match`` the next token which is expected to be a Number" text = match.group(0) pos = match.end(0) @@ -537,15 +537,15 @@ def t_Number(self, match: re.Match) -> Token: self.pos = pos return Token("Number", text, match.start(0)) - def t_Put(self, match: re.Match) -> Token: + def t_Put(self, match: re.Match) -> "Token": "Scan for a ``Put`` token and return that" return self._token_mode(match, "Put", "filename") - def t_PutAppend(self, match: re.Match) -> Token: + def t_PutAppend(self, match: re.Match) -> "Token": "Scan for a ``PutAppend`` token and return that" return self._token_mode(match, "PutAppend", "filename") - def t_String(self, match: re.Match) -> Token: + def t_String(self, match: re.Match) -> "Token": "Break out from self.code the next token which is expected to be a String" start, end = self.pos, None self.pos += 1 # skip opening '"' @@ -559,46 +559,13 @@ def t_String(self, match: re.Match) -> Token: else: break char = self.code[self.pos] - - # FIXME: This is wrong. If the previous - # character was \ then we don't break. if char == '"': self.pos += 1 end = self.pos break if char == "\\": - if self.pos + 1 == len(self.code): - # We have a \ at the end of a line. - self.incomplete() - newlines.append(self.pos) - - # Code below is in pre-scanner. We might decide - # later to move that code here. - # elif self.code[self.pos + 1] in "01234567": - # # See if we have an octal number. - # try_parse_base(1, 4, 8) - - else: - # newlines (\n), tabs (\t) and double backslash - # "\\" have the backslash preserved. But for other - # characters, the backslash is removed. - if self.code[self.pos + 1] not in ( - "b", # word boundary? - "f", # form-feed? - "n", # newline - "r", # carrage return - "t", # tab - "\\", # Backslash - '"', # FIXME - Remove. Mathics3 code has bugs that rely - # on this - ): - self.feeder.message( - "Syntax", "stresc", self.code[self.pos : self.pos + 2] - ) - raise ScanError() - - self.pos += 2 + self.pos += 2 else: self.pos += 1 diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py deleted file mode 100644 index 092905a..0000000 --- a/test/test_string_tokens.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Tests translation from text characters to the token: String -""" - -import pytest - -from mathics_scanner.errors import IncompleteSyntaxError, ScanError -from mathics_scanner.feed import SingleLineFeeder -from mathics_scanner.tokeniser import Token, Tokeniser - - -def check_string(source_text, expected_text: str): - token = single_token(source_text) - assert token is not None - assert token.tag == "String" - assert token.text == expected_text - - -def incomplete_error(s: str, failure_msg: str): - with pytest.raises(IncompleteSyntaxError) as excinfo: - get_tokens(s) - - assert excinfo, failure_msg - - -def scan_error(s: str, failure_msg: str): - with pytest.raises(ScanError) as excinfo: - get_tokens(s) - - assert excinfo, failure_msg - - -def single_token(source_text) -> Token: - tokens = get_tokens(source_text) - assert len(tokens) == 1 - token = tokens[0] - return token - - -def get_tokens(source_text: str): - tokeniser = Tokeniser(SingleLineFeeder(source_text)) - tokens = [] - while True: - token = tokeniser.next() - if token.tag == "END": - break - else: - tokens.append(token) - return tokens - - -def test_string(): - for escape_string in ("\b", "\f", "\n", "\r", "\t"): - check_string(f'"a{escape_string}"', f'"a{escape_string}"') - - # Broken: - # "a\050", "a\051" "a\052" - # Prescanning eagerly replaces the escape sequences with - # symbols "(", ")", or "*" respectively and this messes up parsing - # somehow. - check_string(r'"abc"', r'"abc"') - check_string(r'"abc(*def*)"', r'"abc(*def*)"') - check_string(r'"a\"b\\c"', r'"a\"b\\c"') - incomplete_error(r'"abc', "String does not have terminating quote") - incomplete_error(r'"\"', "Unterminated escape sequence") - scan_error(r'"a\X"', '"X" is not a valid escape character') diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index d05df9d..bb0d25d 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -3,14 +3,13 @@ Tests translation from strings to sequences of tokens. """ +import pytest import random import sys -import pytest - -from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError +from mathics_scanner.tokeniser import Tokeniser, Token, is_symbol_name +from mathics_scanner.errors import ScanError, IncompleteSyntaxError, InvalidSyntaxError from mathics_scanner.feed import SingleLineFeeder -from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name def check_number(code): @@ -23,6 +22,11 @@ def check_symbol(code): assert token, Token("Symbol", code, 0) +def check_string(code): + token = single_token(code) + assert token, Token("String", code, 0) + + def incomplete_error(string): with pytest.raises(IncompleteSyntaxError): tokens(string) @@ -180,15 +184,12 @@ def test_precision(): check_number("1.5`10") -# String tests (with many more than those -# below are now in test_string_token.py -# -# def test_string(): -# check_string(r'"abc"') -# incomplete_error(r'"abc') -# check_string(r'"abc(*def*)"') -# check_string(r'"a\"b\\c"') -# incomplete_error(r'"\"') +def test_string(): + check_string(r'"abc"') + incomplete_error(r'"abc') + check_string(r'"abc(*def*)"') + check_string(r'"a\"b\\c"') + incomplete_error(r'"\"') def test_set():