From 4eb5499f2d908312794ec1910d7b7f0374550415 Mon Sep 17 00:00:00 2001
From: rocky
Date: Wed, 11 Dec 2024 06:37:51 -0500
Subject: [PATCH 1/3] Handle backslash better...

Recognize CTRL-characters \n, \t, \r, etc.

feed.py: lint and fix types

Split out the string-token tests from test_tokeniser.py, which is
rather large.
---
 mathics_scanner/feed.py      | 52 ++++++++++++++++++++++----------
 mathics_scanner/tokeniser.py | 58 ++++++++++++++++++++++++++----------
 test/test_string_tokens.py   | 53 ++++++++++++++++++++++++++++++++
 test/test_tokeniser.py       | 27 ++++++++---------
 4 files changed, 145 insertions(+), 45 deletions(-)
 create mode 100644 test/test_string_tokens.py

diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py
index b23bc31..8cc0d8f 100644
--- a/mathics_scanner/feed.py
+++ b/mathics_scanner/feed.py
@@ -20,46 +20,66 @@ def __init__(self, filename: str):
         :param filename: A string that describes the source of the feeder,
             i.e. the filename that is being fed.
         """
-        self.messages: List[str] = []
+
+        # A message is a list that starts out with a "symbol_name", like "Part",
+        # a message tag, like "partw", and a list of arguments to be used in
+        # creating a message in the list of messages.
+        self.messages: List[list] = []
+
        self.lineno = 0
        self.filename = filename

     @abstractmethod
-    def feed(self):
+    def feed(self) -> str:
         """
         Consume and return next line of code. Each line should be followed
         by a newline character. Returns '' after all lines are consumed.
         """
-
-        return ""
+        ...

     @abstractmethod
     def empty(self) -> bool:
         """
         Return True once all lines have been consumed.
         """
+        ...
-
-        return True
-
-    def message(self, sym: str, tag: str, *args) -> None:
+    def message(self, symbol_name: str, tag: str, *args) -> None:
         """
-        Append a generic message of type ``sym`` to the message queue.
+
+        A Generic message appending routine. the ``self.messages`` message queue.
+
+        ``symbol_name`` is usually the string symbol name of the built-in function that
+        is recording the error. "Syntax" error is the exception to this rule.
+
+        ``tag`` specifies a class of errors that this error belongs to.
+
+        ``*args`` are the specific message arguments. Usually (but not here)
+        the arguments are used to fill out a template specified by ``tag``.
+
+        For example, consider this message displayed:
+
+           Part::partw: Part {10} of abcde does not exist.
+
+        "Part" is the symbol_name, "partw" is the tag, and args holds the
+        remaining values used to fill out the message template.
         """
-        if sym == "Syntax":
-            message = self.syntax_message(sym, tag, *args)
+        if symbol_name == "Syntax":
+            message = self.syntax_message(symbol_name, tag, *args)
         else:
-            message = [sym, tag] + list(args)
+            message = [symbol_name, tag] + list(args)
+
         self.messages.append(message)

-    def syntax_message(self, sym: str, tag: str, *args) -> list:
+    def syntax_message(self, symbol_name: str, tag: str, *args) -> list:
         """
-        Append a message concerning syntax errors to the message queue.
+        Append a syntax-message error message to the message queue.
""" if len(args) > 3: raise ValueError("Too many args.") - message = [sym, tag] + message = [symbol_name, tag] for i in range(3): if i < len(args): message.append(f'"{args[i]}"') @@ -93,7 +113,7 @@ def __init__(self, lines, filename=""): else: self.lines = lines - def feed(self): + def feed(self) -> str: if self.lineno < len(self.lines): result = self.lines[self.lineno] self.lineno += 1 @@ -118,7 +138,7 @@ def __init__(self, code: str, filename=""): self.code = code self._empty = False - def feed(self): + def feed(self) -> str: if self._empty: return "" self._empty = True diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 442b434..5532e1a 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -414,20 +414,22 @@ def incomplete(self): self.prescanner.incomplete() self.code += self.prescanner.replace_escape_sequences() - def sntx_message(self, pos: Optional[int] = None): + def sntx_message(self, pos: Optional[int] = None, tag: Optional[str] = None): """ Send a "syntx{b,f} error message to the input-reading feeder. """ if pos is None: pos = self.pos pre, post = self.code[:pos], self.code[pos:].rstrip("\n") + if tag is None: + tag = "sntxb" if pos == 0 else "sntxf" if pos == 0: - self.feeder.message("Syntax", "sntxb", post) + self.feeder.message("Syntax", tag, post) else: - self.feeder.message("Syntax", "sntxf", pre, post) + self.feeder.message("Syntax", tag, pre, post) # TODO: Convert this to __next__ in the future. - def next(self) -> "Token": + def next(self) -> Token: "Returns the next token." self._skip_blank() if self.pos >= len(self.code): @@ -505,7 +507,7 @@ def _skip_blank(self): else: break - def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token": + def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token: """ Pick out the text in ``match``, convert that into a ``Token``, and return that. 
@@ -517,15 +519,15 @@ def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token":
         self._change_token_scanning_mode(mode)
         return Token(tag, text, match.start(0))

-    def t_Filename(self, match: re.Match) -> "Token":
+    def t_Filename(self, match: re.Match) -> Token:
         "Scan for ``Filename`` token and return that"
         return self._token_mode(match, "Filename", "expr")

-    def t_Get(self, match: re.Match) -> "Token":
+    def t_Get(self, match: re.Match) -> Token:
         "Scan for a ``Get`` token from ``match`` and return that token"
         return self._token_mode(match, "Get", "filename")

-    def t_Number(self, match: re.Match) -> "Token":
+    def t_Number(self, match: re.Match) -> Token:
         "Break out from ``match`` the next token which is expected to be a Number"
         text = match.group(0)
         pos = match.end(0)
@@ -537,39 +539,65 @@ def t_Number(self, match: re.Match) -> "Token":
             self.pos = pos
         return Token("Number", text, match.start(0))

-    def t_Put(self, match: re.Match) -> "Token":
+    def t_Put(self, match: re.Match) -> Token:
         "Scan for a ``Put`` token and return that"
         return self._token_mode(match, "Put", "filename")

-    def t_PutAppend(self, match: re.Match) -> "Token":
+    def t_PutAppend(self, match: re.Match) -> Token:
         "Scan for a ``PutAppend`` token and return that"
         return self._token_mode(match, "PutAppend", "filename")

-    def t_String(self, match: re.Match) -> "Token":
+    def t_String(self, match: re.Match) -> Token:
         "Break out from self.code the next token which is expected to be a String"
         start, end = self.pos, None
         self.pos += 1  # skip opening '"'
-        newlines = []
+        skipped_chars = []
         while True:
             if self.pos >= len(self.code):
                 if end is None:
                     # reached end while still inside string
                     self.incomplete()
-                    newlines.append(self.pos)
+                    skipped_chars.append(self.pos)
                 else:
                     break
             char = self.code[self.pos]
+
+            # FIXME: This is wrong. If the previous
+            # character was \ then we don't break.
             if char == '"':
                 self.pos += 1
                 end = self.pos
                 break
             if char == "\\":
-                self.pos += 2
+                if self.pos + 1 == len(self.code):
+                    # We have a \ at the end of a line.
+                    self.incomplete()
+                    skipped_chars.append(self.pos)
+                else:
+                    # newlines (\n), tabs (\t) and double backslash
+                    # "\\" have the backslash preserved. But for other
+                    # characters, the backslash is removed.
+                    if self.code[self.pos + 1] not in (
+                        "b",  # bell?
+                        "f",  # form feed
+                        "n",  # newline
+                        "r",  # carriage return
+                        "t",  # tab
+                        "\\",  # Backslash
+                        '"',  # FIXME - Remove. Mathics3 code has bugs that rely
+                        # on this
+                    ):
+                        self.feeder.message(
+                            "Syntax", "stresc", self.code[self.pos : self.pos + 2]
+                        )
+                        return Token("String", "", start)
+
+                self.pos += 2
             else:
                 self.pos += 1

-        indices = [start] + newlines + [end]
+        indices = [start] + skipped_chars + [end]
         result = "".join(
             self.code[indices[i] : indices[i + 1]] for i in range(len(indices) - 1)
         )
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
new file mode 100644
index 0000000..4dc757c
--- /dev/null
+++ b/test/test_string_tokens.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+"""
+Tests translation from text characters to the token: String
+"""
+
+import pytest
+
+from mathics_scanner.errors import IncompleteSyntaxError
+from mathics_scanner.feed import SingleLineFeeder
+from mathics_scanner.tokeniser import Token, Tokeniser
+
+
+def check_string(source_text, expected_text: str):
+    token = single_token(source_text)
+    assert token is not None
+    assert token.tag == "String"
+    assert token.text == expected_text
+
+
+def incomplete_error(s: str):
+    with pytest.raises(IncompleteSyntaxError):
+        get_tokens(s)
+
+
+def single_token(source_text) -> Token:
+    tokens = get_tokens(source_text)
+    assert len(tokens) == 1
+    token = tokens[0]
+    return token
+
+
+def get_tokens(source_text: str):
+    tokeniser = Tokeniser(SingleLineFeeder(source_text))
+    tokens = []
+    while True:
+        token = tokeniser.next()
+        if token.tag == "END":
+            break
+        else:
+            tokens.append(token)
+    return tokens
+
+
+def test_string():
+    for ctrl_char in ("\b", "\f", "\n", "\r", "\t"):
+        check_string(f'"a{ctrl_char}"', f'"a{ctrl_char}"')
+
+    incomplete_error(r'"a\X"')
+    check_string(r'"abc"', r'"abc"')
+    incomplete_error(r'"abc')
+    check_string(r'"abc(*def*)"', r'"abc(*def*)"')
+    check_string(r'"a\"b\\c"', r'"a\"b\\c"')
+    incomplete_error(r'"\"')
diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py
index bb0d25d..d05df9d 100644
--- a/test/test_tokeniser.py
+++ b/test/test_tokeniser.py
@@ -3,13 +3,14 @@
 Tests translation from strings to sequences of tokens.
""" -import pytest import random import sys -from mathics_scanner.tokeniser import Tokeniser, Token, is_symbol_name -from mathics_scanner.errors import ScanError, IncompleteSyntaxError, InvalidSyntaxError +import pytest + +from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError from mathics_scanner.feed import SingleLineFeeder +from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name def check_number(code): @@ -22,11 +23,6 @@ def check_symbol(code): assert token, Token("Symbol", code, 0) -def check_string(code): - token = single_token(code) - assert token, Token("String", code, 0) - - def incomplete_error(string): with pytest.raises(IncompleteSyntaxError): tokens(string) @@ -184,12 +180,15 @@ def test_precision(): check_number("1.5`10") -def test_string(): - check_string(r'"abc"') - incomplete_error(r'"abc') - check_string(r'"abc(*def*)"') - check_string(r'"a\"b\\c"') - incomplete_error(r'"\"') +# String tests (with many more than those +# below are now in test_string_token.py +# +# def test_string(): +# check_string(r'"abc"') +# incomplete_error(r'"abc') +# check_string(r'"abc(*def*)"') +# check_string(r'"a\"b\\c"') +# incomplete_error(r'"\"') def test_set(): From ab2239376901d308da1998d1525b78eaadcb9d5d Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 11 Dec 2024 13:14:25 -0500 Subject: [PATCH 2/3] Add more test and tests for new stuff --- mathics_scanner/prescanner.py | 11 +++++++++++ mathics_scanner/tokeniser.py | 9 ++++++++- test/test_string_tokens.py | 26 ++++++++++++++++++++------ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py index 6099151..c470a87 100644 --- a/mathics_scanner/prescanner.py +++ b/mathics_scanner/prescanner.py @@ -98,6 +98,7 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None: self.feeder.message("Syntax", "sntoct2") elif last == 3: self.feeder.message("Syntax", "sntoct1") + raise ScanError() elif last == 4: self.feeder.message("Syntax", "snthex") else: @@ -151,6 +152,16 @@ def try_parse_named_character(start_shift: int): # Stay in same line fragment, but advance the cursor position. self.pos = i + 1 + # FIXME: + # The following code is boneheadedly wrong because + # the surrounding lexical context determines whether + # an escape sequences should be valid or not. + # For example, inside a comment, there is no such thing + # as an invalid escape sequence. And this cause \050 which is + # a valid escape sequence, parenthesis, to get treated like + # a grouping symbol inside of a string. + # ... + # # In the following loop, we look for and replace escape # sequences. The current character under consideration is at # self.code[self.pos]. When an escape sequence is found at diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 5532e1a..90a41f4 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -574,6 +574,13 @@ def t_String(self, match: re.Match) -> Token: # We have a \ at the end of a line. self.incomplete() skipped_chars.append(self.pos) + + # Code below is in pre-scanner. We might decide + # later to move that code here. + # elif self.code[self.pos + 1] in "01234567": + # # See if we have an octal number. + # try_parse_base(1, 4, 8) + else: # newlines (\n), tabs (\t) and double backslash # "\\" have the backslash preserved. 
From ab2239376901d308da1998d1525b78eaadcb9d5d Mon Sep 17 00:00:00 2001
From: rocky
Date: Wed, 11 Dec 2024 13:14:25 -0500
Subject: [PATCH 2/3] Add more tests, and tests for the new behavior

---
 mathics_scanner/prescanner.py | 11 +++++++++++
 mathics_scanner/tokeniser.py  |  9 ++++++++-
 test/test_string_tokens.py    | 26 ++++++++++++++++++++------
 3 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py
index 6099151..c470a87 100644
--- a/mathics_scanner/prescanner.py
+++ b/mathics_scanner/prescanner.py
@@ -98,6 +98,7 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
                 self.feeder.message("Syntax", "sntoct2")
             elif last == 3:
                 self.feeder.message("Syntax", "sntoct1")
+                raise ScanError()
             elif last == 4:
                 self.feeder.message("Syntax", "snthex")
             else:
@@ -151,6 +152,16 @@ def try_parse_named_character(start_shift: int):
                 # Stay in same line fragment, but advance the cursor position.
                 self.pos = i + 1

+        # FIXME:
+        # The following code is boneheadedly wrong, because the
+        # surrounding lexical context determines whether an escape
+        # sequence should be valid or not. For example, inside a
+        # comment there is no such thing as an invalid escape
+        # sequence. And this causes \050, which is a valid escape
+        # sequence for an open parenthesis, to get treated like a
+        # grouping symbol inside of a string.
+        # ...
+        #
         # In the following loop, we look for and replace escape
         # sequences. The current character under consideration is at
         # self.code[self.pos]. When an escape sequence is found at
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 5532e1a..90a41f4 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -574,6 +574,13 @@ def t_String(self, match: re.Match) -> Token:
                     # We have a \ at the end of a line.
                     self.incomplete()
                     skipped_chars.append(self.pos)
+
+                # Code below is in the pre-scanner. We might decide
+                # later to move that code here.
+                # elif self.code[self.pos + 1] in "01234567":
+                #     # See if we have an octal number.
+                #     try_parse_base(1, 4, 8)
+
                 else:
                     # newlines (\n), tabs (\t) and double backslash
                     # "\\" have the backslash preserved. But for other
                     # characters, the backslash is removed.
                     if self.code[self.pos + 1] not in (
                         "b",  # bell?
                         "f",  # form feed
                         "n",  # newline
                         "r",  # carriage return
                         "t",  # tab
                         "\\",  # Backslash
                         '"',  # FIXME - Remove. Mathics3 code has bugs that rely
                         # on this
                     ):
@@ -591,7 +598,7 @@ def t_String(self, match: re.Match) -> Token:
                         self.feeder.message(
                             "Syntax", "stresc", self.code[self.pos : self.pos + 2]
                         )
-                        return Token("String", "", start)
+                        raise ScanError()

                 self.pos += 2
             else:
                 self.pos += 1
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index 4dc757c..9e73a55 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -5,7 +5,7 @@

 import pytest

-from mathics_scanner.errors import IncompleteSyntaxError
+from mathics_scanner.errors import IncompleteSyntaxError, ScanError
 from mathics_scanner.feed import SingleLineFeeder
 from mathics_scanner.tokeniser import Token, Tokeniser

@@ -17,10 +17,19 @@ def check_string(source_text, expected_text: str):
     assert token.text == expected_text


-def incomplete_error(s: str):
-    with pytest.raises(IncompleteSyntaxError):
+def incomplete_error(s: str, failure_msg: str):
+    with pytest.raises(IncompleteSyntaxError) as excinfo:
         get_tokens(s)

+    assert excinfo, failure_msg
+
+
+def scan_error(s: str, failure_msg: str):
+    with pytest.raises(ScanError) as excinfo:
+        get_tokens(s)
+
+    assert excinfo, failure_msg
+

 def single_token(source_text) -> Token:
     tokens = get_tokens(source_text)
@@ -45,9 +54,14 @@ def test_string():
     for ctrl_char in ("\b", "\f", "\n", "\r", "\t"):
         check_string(f'"a{ctrl_char}"', f'"a{ctrl_char}"')

-    incomplete_error(r'"a\X"')
+    # Broken:
+    #    "a\050", "a\051", "a\052"
+    # Prescanning eagerly replaces the escape sequences with the
+    # symbols "(", ")", or "*" respectively, and this messes up parsing
+    # somehow.
     check_string(r'"abc"', r'"abc"')
-    incomplete_error(r'"abc')
     check_string(r'"abc(*def*)"', r'"abc(*def*)"')
     check_string(r'"a\"b\\c"', r'"a\"b\\c"')
-    incomplete_error(r'"\"')
+    incomplete_error(r'"abc', "String does not have terminating quote")
+    incomplete_error(r'"\"', "Unterminated escape sequence")
+    incomplete_error(r'"a\X"', '"X" is not a valid escape character')
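
Note on patch 2's two failure modes: an unterminated string, or a
backslash at the very end of the input, raises IncompleteSyntaxError
(the caller may supply more input), while a bad escape such as \X now
raises ScanError (the input can never become valid). Patch 2's own test
still calls incomplete_error for \X, which the next patch corrects to
scan_error. A sketch of the distinction, using only the imports the new
tests use; the tokenise_all helper below is illustrative and simply
mirrors get_tokens from test_string_tokens.py:

    import pytest

    from mathics_scanner.errors import IncompleteSyntaxError, ScanError
    from mathics_scanner.feed import SingleLineFeeder
    from mathics_scanner.tokeniser import Tokeniser

    def tokenise_all(text: str) -> None:
        # Drain the tokeniser; scanning errors propagate to the caller.
        tokeniser = Tokeniser(SingleLineFeeder(text))
        while tokeniser.next().tag != "END":
            pass

    with pytest.raises(IncompleteSyntaxError):
        tokenise_all(r'"abc')    # more input could still close the string
    with pytest.raises(ScanError):
        tokenise_all(r'"a\X"')   # "\X" is invalid no matter what follows
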
""" if len(args) > 3: diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 90a41f4..11638e4 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -414,19 +414,17 @@ def incomplete(self): self.prescanner.incomplete() self.code += self.prescanner.replace_escape_sequences() - def sntx_message(self, pos: Optional[int] = None, tag: Optional[str] = None): + def sntx_message(self, pos: Optional[int] = None): """ Send a "syntx{b,f} error message to the input-reading feeder. """ if pos is None: pos = self.pos pre, post = self.code[:pos], self.code[pos:].rstrip("\n") - if tag is None: - tag = "sntxb" if pos == 0 else "sntxf" if pos == 0: - self.feeder.message("Syntax", tag, post) + self.feeder.message("Syntax", "sntxb", post) else: - self.feeder.message("Syntax", tag, pre, post) + self.feeder.message("Syntax", "sntxf", pre, post) # TODO: Convert this to __next__ in the future. def next(self) -> Token: @@ -551,13 +549,13 @@ def t_String(self, match: re.Match) -> Token: "Break out from self.code the next token which is expected to be a String" start, end = self.pos, None self.pos += 1 # skip opening '"' - skipped_chars = [] + newlines = [] while True: if self.pos >= len(self.code): if end is None: # reached end while still inside string self.incomplete() - skipped_chars.append(self.pos) + newlines.append(self.pos) else: break char = self.code[self.pos] @@ -573,7 +571,7 @@ def t_String(self, match: re.Match) -> Token: if self.pos + 1 == len(self.code): # We have a \ at the end of a line. self.incomplete() - skipped_chars.append(self.pos) + newlines.append(self.pos) # Code below is in pre-scanner. We might decide # later to move that code here. @@ -586,7 +584,7 @@ def t_String(self, match: re.Match) -> Token: # "\\" have the backslash preserved. But for other # characters, the backslash is removed. if self.code[self.pos + 1] not in ( - "b", # bell? + "b", # word boundary? "f", # form-feed? "n", # newline "r", # carrage return @@ -604,7 +602,7 @@ def t_String(self, match: re.Match) -> Token: else: self.pos += 1 - indices = [start] + skipped_chars + [end] + indices = [start] + newlines + [end] result = "".join( self.code[indices[i] : indices[i + 1]] for i in range(len(indices) - 1) ) diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 9e73a55..092905a 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -51,8 +51,8 @@ def get_tokens(source_text: str): def test_string(): - for ctrl_char in ("\b", "\f", "\n", "\r", "\t"): - check_string(f'"a{ctrl_char}"', f'"a{ctrl_char}"') + for escape_string in ("\b", "\f", "\n", "\r", "\t"): + check_string(f'"a{escape_string}"', f'"a{escape_string}"') # Broken: # "a\050", "a\051" "a\052" @@ -64,4 +64,4 @@ def test_string(): check_string(r'"a\"b\\c"', r'"a\"b\\c"') incomplete_error(r'"abc', "String does not have terminating quote") incomplete_error(r'"\"', "Unterminated escape sequence") - incomplete_error(r'"a\X"', '"X" is not a valid escape character') + scan_error(r'"a\X"', '"X" is not a valid escape character')