From 4eb5499f2d908312794ec1910d7b7f0374550415 Mon Sep 17 00:00:00 2001
From: rocky
Date: Wed, 11 Dec 2024 06:37:51 -0500
Subject: [PATCH 1/3] Handle backslash better...

Recognize CTRL-characters \n, \t, \r, etc.

feed.py: lint and fix types

Split out the string-token tests from test_tokeniser.py, which is
rather large.
---
 mathics_scanner/feed.py      | 52 ++++++++++++++++++++++----------
 mathics_scanner/tokeniser.py | 58 ++++++++++++++++++++++++++----------
 test/test_string_tokens.py   | 53 ++++++++++++++++++++++++++++++++
 test/test_tokeniser.py       | 27 ++++++++---------
 4 files changed, 145 insertions(+), 45 deletions(-)
 create mode 100644 test/test_string_tokens.py

diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py
index b23bc31..8cc0d8f 100644
--- a/mathics_scanner/feed.py
+++ b/mathics_scanner/feed.py
@@ -20,46 +20,66 @@ def __init__(self, filename: str):
         :param filename: A string that describes the source of the feeder,
             i.e. the filename that is being fed.
         """
-        self.messages: List[str] = []
+
+        # A message is a list that starts out with a "symbol_name", like "Part",
+        # a message tag, like "partw", and a list of arguments to be used in
+        # creating a message in the list of messages.
+        self.messages: List[list] = []
+
        self.lineno = 0
        self.filename = filename

     @abstractmethod
-    def feed(self):
+    def feed(self) -> str:
         """
         Consume and return next line of code. Each line should be followed
         by a newline character. Returns '' after all lines are consumed.
         """
-
-        return ""
+        ...

     @abstractmethod
     def empty(self) -> bool:
         """
         Return True once all lines have been consumed.
         """
+        ...
-
-        return True
-
-    def message(self, sym: str, tag: str, *args) -> None:
+    def message(self, symbol_name: str, tag: str, *args) -> None:
         """
-        Append a generic message of type ``sym`` to the message queue.
+
+        A Generic message appending routine. the ``self.messages`` message queue.
+
+        ``symbol_name`` is usually the string symbol name of the built-in function that
+        is recording the error. "Syntax" error is the exception to this rule.
+
+        ``tag`` specifies a class of errors that this error belongs to.
+
+        ``*args`` are the specific message arguments. Usually (but not here)
+        the arguments are used to fill out a template specified by ``tag``.
+
+        For example, consider this message displayed:
+
+           Part::partw: Part {10} of abcde does not exist.
+
+        "Part" is the symbol_name, "partw" is the tag, and args holds the
+        remaining values used to fill out the message template.
         """
-        if sym == "Syntax":
-            message = self.syntax_message(sym, tag, *args)
+        if symbol_name == "Syntax":
+            message = self.syntax_message(symbol_name, tag, *args)
         else:
-            message = [sym, tag] + list(args)
+            message = [symbol_name, tag] + list(args)
+
         self.messages.append(message)

-    def syntax_message(self, sym: str, tag: str, *args) -> list:
+    def syntax_message(self, symbol_name: str, tag: str, *args) -> list:
         """
-        Append a message concerning syntax errors to the message queue.
+        Append a syntax-message error message to the message queue.
""" if len(args) > 3: raise ValueError("Too many args.") - message = [sym, tag] + message = [symbol_name, tag] for i in range(3): if i < len(args): message.append(f'"{args[i]}"') @@ -93,7 +113,7 @@ def __init__(self, lines, filename=""): else: self.lines = lines - def feed(self): + def feed(self) -> str: if self.lineno < len(self.lines): result = self.lines[self.lineno] self.lineno += 1 @@ -118,7 +138,7 @@ def __init__(self, code: str, filename=""): self.code = code self._empty = False - def feed(self): + def feed(self) -> str: if self._empty: return "" self._empty = True diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 442b434..5532e1a 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -414,20 +414,22 @@ def incomplete(self): self.prescanner.incomplete() self.code += self.prescanner.replace_escape_sequences() - def sntx_message(self, pos: Optional[int] = None): + def sntx_message(self, pos: Optional[int] = None, tag: Optional[str] = None): """ Send a "syntx{b,f} error message to the input-reading feeder. """ if pos is None: pos = self.pos pre, post = self.code[:pos], self.code[pos:].rstrip("\n") + if tag is None: + tag = "sntxb" if pos == 0 else "sntxf" if pos == 0: - self.feeder.message("Syntax", "sntxb", post) + self.feeder.message("Syntax", tag, post) else: - self.feeder.message("Syntax", "sntxf", pre, post) + self.feeder.message("Syntax", tag, pre, post) # TODO: Convert this to __next__ in the future. - def next(self) -> "Token": + def next(self) -> Token: "Returns the next token." self._skip_blank() if self.pos >= len(self.code): @@ -505,7 +507,7 @@ def _skip_blank(self): else: break - def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token": + def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token: """ Pick out the text in ``match``, convert that into a ``Token``, and return that. 
@@ -517,15 +519,15 @@ def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token":
         self._change_token_scanning_mode(mode)
         return Token(tag, text, match.start(0))

-    def t_Filename(self, match: re.Match) -> "Token":
+    def t_Filename(self, match: re.Match) -> Token:
         "Scan for ``Filename`` token and return that"
         return self._token_mode(match, "Filename", "expr")

-    def t_Get(self, match: re.Match) -> "Token":
+    def t_Get(self, match: re.Match) -> Token:
         "Scan for a ``Get`` token from ``match`` and return that token"
         return self._token_mode(match, "Get", "filename")

-    def t_Number(self, match: re.Match) -> "Token":
+    def t_Number(self, match: re.Match) -> Token:
         "Break out from ``match`` the next token which is expected to be a Number"
         text = match.group(0)
         pos = match.end(0)
@@ -537,39 +539,65 @@ def t_Number(self, match: re.Match) -> "Token":
             self.pos = pos
         return Token("Number", text, match.start(0))

-    def t_Put(self, match: re.Match) -> "Token":
+    def t_Put(self, match: re.Match) -> Token:
         "Scan for a ``Put`` token and return that"
         return self._token_mode(match, "Put", "filename")

-    def t_PutAppend(self, match: re.Match) -> "Token":
+    def t_PutAppend(self, match: re.Match) -> Token:
         "Scan for a ``PutAppend`` token and return that"
         return self._token_mode(match, "PutAppend", "filename")

-    def t_String(self, match: re.Match) -> "Token":
+    def t_String(self, match: re.Match) -> Token:
         "Break out from self.code the next token which is expected to be a String"
         start, end = self.pos, None
         self.pos += 1  # skip opening '"'
-        newlines = []
+        skipped_chars = []
         while True:
             if self.pos >= len(self.code):
                 if end is None:
                     # reached end while still inside string
                     self.incomplete()
-                    newlines.append(self.pos)
+                    skipped_chars.append(self.pos)
                 else:
                     break
             char = self.code[self.pos]
+
+            # FIXME: This is wrong. If the previous
+            # character was \ then we don't break.
             if char == '"':
                 self.pos += 1
                 end = self.pos
                 break
             if char == "\\":
-                self.pos += 2
+                if self.pos + 1 == len(self.code):
+                    # We have a \ at the end of a line.
+                    self.incomplete()
+                    skipped_chars.append(self.pos)
+                else:
+                    # newlines (\n), tabs (\t) and double backslash
+                    # "\\" have the backslash preserved. But for other
+                    # characters, the backslash is removed.
+                    if self.code[self.pos + 1] not in (
+                        "b",  # bell?
+                        "f",  # form feed
+                        "n",  # newline
+                        "r",  # carriage return
+                        "t",  # tab
+                        "\\",  # Backslash
+                        '"',  # FIXME - Remove. Mathics3 code has bugs that rely
+                        # on this
+                    ):
+                        self.feeder.message(
+                            "Syntax", "stresc", self.code[self.pos : self.pos + 2]
+                        )
+                        return Token("String", "", start)
+
+                self.pos += 2
             else:
                 self.pos += 1

-        indices = [start] + newlines + [end]
+        indices = [start] + skipped_chars + [end]
         result = "".join(
             self.code[indices[i] : indices[i + 1]] for i in range(len(indices) - 1)
         )
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
new file mode 100644
index 0000000..4dc757c
--- /dev/null
+++ b/test/test_string_tokens.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+"""
+Tests translation from text characters to the token: String
+"""
+
+import pytest
+
+from mathics_scanner.errors import IncompleteSyntaxError
+from mathics_scanner.feed import SingleLineFeeder
+from mathics_scanner.tokeniser import Token, Tokeniser
+
+
+def check_string(source_text, expected_text: str):
+    token = single_token(source_text)
+    assert token is not None
+    assert token.tag == "String"
+    assert token.text == expected_text
+
+
+def incomplete_error(s: str):
+    with pytest.raises(IncompleteSyntaxError):
+        get_tokens(s)
+
+
+def single_token(source_text) -> Token:
+    tokens = get_tokens(source_text)
+    assert len(tokens) == 1
+    token = tokens[0]
+    return token
+
+
+def get_tokens(source_text: str):
+    tokeniser = Tokeniser(SingleLineFeeder(source_text))
+    tokens = []
+    while True:
+        token = tokeniser.next()
+        if token.tag == "END":
+            break
+        else:
+            tokens.append(token)
+    return tokens
+
+
+def test_string():
+    for ctrl_char in ("\b", "\f", "\n", "\r", "\t"):
+        check_string(f'"a{ctrl_char}"', f'"a{ctrl_char}"')
+
+    incomplete_error(r'"a\X"')
+    check_string(r'"abc"', r'"abc"')
+    incomplete_error(r'"abc')
+    check_string(r'"abc(*def*)"', r'"abc(*def*)"')
+    check_string(r'"a\"b\\c"', r'"a\"b\\c"')
+    incomplete_error(r'"\"')
diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py
index bb0d25d..d05df9d 100644
--- a/test/test_tokeniser.py
+++ b/test/test_tokeniser.py
@@ -3,13 +3,14 @@
 Tests translation from strings to sequences of tokens.
""" -import pytest import random import sys -from mathics_scanner.tokeniser import Tokeniser, Token, is_symbol_name -from mathics_scanner.errors import ScanError, IncompleteSyntaxError, InvalidSyntaxError +import pytest + +from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError from mathics_scanner.feed import SingleLineFeeder +from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name def check_number(code): @@ -22,11 +23,6 @@ def check_symbol(code): assert token, Token("Symbol", code, 0) -def check_string(code): - token = single_token(code) - assert token, Token("String", code, 0) - - def incomplete_error(string): with pytest.raises(IncompleteSyntaxError): tokens(string) @@ -184,12 +180,15 @@ def test_precision(): check_number("1.5`10") -def test_string(): - check_string(r'"abc"') - incomplete_error(r'"abc') - check_string(r'"abc(*def*)"') - check_string(r'"a\"b\\c"') - incomplete_error(r'"\"') +# String tests (with many more than those +# below are now in test_string_token.py +# +# def test_string(): +# check_string(r'"abc"') +# incomplete_error(r'"abc') +# check_string(r'"abc(*def*)"') +# check_string(r'"a\"b\\c"') +# incomplete_error(r'"\"') def test_set(): From ab2239376901d308da1998d1525b78eaadcb9d5d Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 11 Dec 2024 13:14:25 -0500 Subject: [PATCH 2/3] Add more test and tests for new stuff --- mathics_scanner/prescanner.py | 11 +++++++++++ mathics_scanner/tokeniser.py | 9 ++++++++- test/test_string_tokens.py | 26 ++++++++++++++++++++------ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py index 6099151..c470a87 100644 --- a/mathics_scanner/prescanner.py +++ b/mathics_scanner/prescanner.py @@ -98,6 +98,7 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None: self.feeder.message("Syntax", "sntoct2") elif last == 3: self.feeder.message("Syntax", "sntoct1") + raise ScanError() elif last == 4: self.feeder.message("Syntax", "snthex") else: @@ -151,6 +152,16 @@ def try_parse_named_character(start_shift: int): # Stay in same line fragment, but advance the cursor position. self.pos = i + 1 + # FIXME: + # The following code is boneheadedly wrong because + # the surrounding lexical context determines whether + # an escape sequences should be valid or not. + # For example, inside a comment, there is no such thing + # as an invalid escape sequence. And this cause \050 which is + # a valid escape sequence, parenthesis, to get treated like + # a grouping symbol inside of a string. + # ... + # # In the following loop, we look for and replace escape # sequences. The current character under consideration is at # self.code[self.pos]. When an escape sequence is found at diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 5532e1a..90a41f4 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -574,6 +574,13 @@ def t_String(self, match: re.Match) -> Token: # We have a \ at the end of a line. self.incomplete() skipped_chars.append(self.pos) + + # Code below is in pre-scanner. We might decide + # later to move that code here. + # elif self.code[self.pos + 1] in "01234567": + # # See if we have an octal number. + # try_parse_base(1, 4, 8) + else: # newlines (\n), tabs (\t) and double backslash # "\\" have the backslash preserved. 
From ab2239376901d308da1998d1525b78eaadcb9d5d Mon Sep 17 00:00:00 2001
From: rocky
Date: Wed, 11 Dec 2024 13:14:25 -0500
Subject: [PATCH 2/3] Add more tests, and tests for the new behavior

---
 mathics_scanner/prescanner.py | 11 +++++++++++
 mathics_scanner/tokeniser.py  |  9 ++++++++-
 test/test_string_tokens.py    | 26 ++++++++++++++++++++------
 3 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py
index 6099151..c470a87 100644
--- a/mathics_scanner/prescanner.py
+++ b/mathics_scanner/prescanner.py
@@ -98,6 +98,7 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
                 self.feeder.message("Syntax", "sntoct2")
             elif last == 3:
                 self.feeder.message("Syntax", "sntoct1")
+                raise ScanError()
             elif last == 4:
                 self.feeder.message("Syntax", "snthex")
             else:
@@ -151,6 +152,16 @@ def try_parse_named_character(start_shift: int):
                 # Stay in same line fragment, but advance the cursor position.
                 self.pos = i + 1

+        # FIXME:
+        # The following code is boneheadedly wrong, because the
+        # surrounding lexical context determines whether an escape
+        # sequence should be valid or not. For example, inside a
+        # comment there is no such thing as an invalid escape
+        # sequence. And this causes \050, which is a valid escape
+        # sequence for an open parenthesis, to get treated like a
+        # grouping symbol inside of a string.
+        # ...
+        #
         # In the following loop, we look for and replace escape
         # sequences. The current character under consideration is at
         # self.code[self.pos]. When an escape sequence is found at
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 5532e1a..90a41f4 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -574,6 +574,13 @@ def t_String(self, match: re.Match) -> Token:
                     # We have a \ at the end of a line.
                     self.incomplete()
                     skipped_chars.append(self.pos)
+
+                # Code below is in the pre-scanner. We might decide
+                # later to move that code here.
+                # elif self.code[self.pos + 1] in "01234567":
+                #     # See if we have an octal number.
+                #     try_parse_base(1, 4, 8)
+
                 else:
                     # newlines (\n), tabs (\t) and double backslash
                     # "\\" have the backslash preserved. But for other
                     # characters, the backslash is removed.
                     if self.code[self.pos + 1] not in (
                         "b",  # bell?
                         "f",  # form feed
                         "n",  # newline
                         "r",  # carriage return
                         "t",  # tab
                         "\\",  # Backslash
                         '"',  # FIXME - Remove. Mathics3 code has bugs that rely
                         # on this
                     ):
@@ -591,7 +598,7 @@ def t_String(self, match: re.Match) -> Token:
                         self.feeder.message(
                             "Syntax", "stresc", self.code[self.pos : self.pos + 2]
                         )
-                        return Token("String", "", start)
+                        raise ScanError()

                 self.pos += 2
             else:
                 self.pos += 1
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index 4dc757c..9e73a55 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -5,7 +5,7 @@

 import pytest

-from mathics_scanner.errors import IncompleteSyntaxError
+from mathics_scanner.errors import IncompleteSyntaxError, ScanError
 from mathics_scanner.feed import SingleLineFeeder
 from mathics_scanner.tokeniser import Token, Tokeniser

@@ -17,10 +17,19 @@ def check_string(source_text, expected_text: str):
     assert token.text == expected_text


-def incomplete_error(s: str):
-    with pytest.raises(IncompleteSyntaxError):
+def incomplete_error(s: str, failure_msg: str):
+    with pytest.raises(IncompleteSyntaxError) as excinfo:
         get_tokens(s)

+    assert excinfo, failure_msg
+
+
+def scan_error(s: str, failure_msg: str):
+    with pytest.raises(ScanError) as excinfo:
+        get_tokens(s)
+
+    assert excinfo, failure_msg
+

 def single_token(source_text) -> Token:
     tokens = get_tokens(source_text)
@@ -45,9 +54,14 @@ def test_string():
     for ctrl_char in ("\b", "\f", "\n", "\r", "\t"):
         check_string(f'"a{ctrl_char}"', f'"a{ctrl_char}"')

-    incomplete_error(r'"a\X"')
+    # Broken:
+    #    "a\050", "a\051", "a\052"
+    # Prescanning eagerly replaces the escape sequences with the
+    # symbols "(", ")", or "*" respectively, and this messes up parsing
+    # somehow.
     check_string(r'"abc"', r'"abc"')
-    incomplete_error(r'"abc')
     check_string(r'"abc(*def*)"', r'"abc(*def*)"')
     check_string(r'"a\"b\\c"', r'"a\"b\\c"')
-    incomplete_error(r'"\"')
+    incomplete_error(r'"abc', "String does not have terminating quote")
+    incomplete_error(r'"\"', "Unterminated escape sequence")
+    incomplete_error(r'"a\X"', '"X" is not a valid escape character')
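
Note on patch 2's two failure modes: an unterminated string, or a
backslash at the very end of the input, raises IncompleteSyntaxError
(the caller may supply more input), while a bad escape such as \X now
raises ScanError (the input can never become valid). Patch 2's own test
still calls incomplete_error for \X, which the next patch corrects to
scan_error. A sketch of the distinction, using only the imports the new
tests use; the tokenise_all helper below is illustrative and simply
mirrors get_tokens from test_string_tokens.py:

    import pytest

    from mathics_scanner.errors import IncompleteSyntaxError, ScanError
    from mathics_scanner.feed import SingleLineFeeder
    from mathics_scanner.tokeniser import Tokeniser

    def tokenise_all(text: str) -> None:
        # Drain the tokeniser; scanning errors propagate to the caller.
        tokeniser = Tokeniser(SingleLineFeeder(text))
        while tokeniser.next().tag != "END":
            pass

    with pytest.raises(IncompleteSyntaxError):
        tokenise_all(r'"abc')    # more input could still close the string
    with pytest.raises(ScanError):
        tokenise_all(r'"a\X"')   # "\X" is invalid no matter what follows
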
""" if len(args) > 3: diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 90a41f4..11638e4 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -414,19 +414,17 @@ def incomplete(self): self.prescanner.incomplete() self.code += self.prescanner.replace_escape_sequences() - def sntx_message(self, pos: Optional[int] = None, tag: Optional[str] = None): + def sntx_message(self, pos: Optional[int] = None): """ Send a "syntx{b,f} error message to the input-reading feeder. """ if pos is None: pos = self.pos pre, post = self.code[:pos], self.code[pos:].rstrip("\n") - if tag is None: - tag = "sntxb" if pos == 0 else "sntxf" if pos == 0: - self.feeder.message("Syntax", tag, post) + self.feeder.message("Syntax", "sntxb", post) else: - self.feeder.message("Syntax", tag, pre, post) + self.feeder.message("Syntax", "sntxf", pre, post) # TODO: Convert this to __next__ in the future. def next(self) -> Token: @@ -551,13 +549,13 @@ def t_String(self, match: re.Match) -> Token: "Break out from self.code the next token which is expected to be a String" start, end = self.pos, None self.pos += 1 # skip opening '"' - skipped_chars = [] + newlines = [] while True: if self.pos >= len(self.code): if end is None: # reached end while still inside string self.incomplete() - skipped_chars.append(self.pos) + newlines.append(self.pos) else: break char = self.code[self.pos] @@ -573,7 +571,7 @@ def t_String(self, match: re.Match) -> Token: if self.pos + 1 == len(self.code): # We have a \ at the end of a line. self.incomplete() - skipped_chars.append(self.pos) + newlines.append(self.pos) # Code below is in pre-scanner. We might decide # later to move that code here. @@ -586,7 +584,7 @@ def t_String(self, match: re.Match) -> Token: # "\\" have the backslash preserved. But for other # characters, the backslash is removed. if self.code[self.pos + 1] not in ( - "b", # bell? + "b", # word boundary? "f", # form-feed? "n", # newline "r", # carrage return @@ -604,7 +602,7 @@ def t_String(self, match: re.Match) -> Token: else: self.pos += 1 - indices = [start] + skipped_chars + [end] + indices = [start] + newlines + [end] result = "".join( self.code[indices[i] : indices[i + 1]] for i in range(len(indices) - 1) ) diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 9e73a55..092905a 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -51,8 +51,8 @@ def get_tokens(source_text: str): def test_string(): - for ctrl_char in ("\b", "\f", "\n", "\r", "\t"): - check_string(f'"a{ctrl_char}"', f'"a{ctrl_char}"') + for escape_string in ("\b", "\f", "\n", "\r", "\t"): + check_string(f'"a{escape_string}"', f'"a{escape_string}"') # Broken: # "a\050", "a\051" "a\052" @@ -64,4 +64,4 @@ def test_string(): check_string(r'"a\"b\\c"', r'"a\"b\\c"') incomplete_error(r'"abc', "String does not have terminating quote") incomplete_error(r'"\"', "Unterminated escape sequence") - incomplete_error(r'"a\X"', '"X" is not a valid escape character') + scan_error(r'"a\X"', '"X" is not a valid escape character')