Revert "String tokenization fix (#108)" (#109)
This reverts commit e26f5bc.
rocky authored Dec 12, 2024
1 parent e26f5bc commit 3a55964
Showing 5 changed files with 39 additions and 171 deletions.
54 changes: 16 additions & 38 deletions mathics_scanner/feed.py
@@ -20,68 +20,46 @@ def __init__(self, filename: str):
:param filename: A string that describes the source of the feeder, i.e.
the filename that is being fed.
"""

# A message is a list that starts out with a "symbol_name", like "Part",
# a message tag, like "partw", and a list of arguments to be used in
# creating a message in the list of messages.
self.messages: List[list] = []

self.messages: List[str] = []
self.lineno = 0
self.filename = filename

@abstractmethod
def feed(self) -> str:
def feed(self):
"""
Consume and return the next line of code. Each line should be followed by a
newline character. Returns '' after all lines are consumed.
"""
...

return ""

@abstractmethod
def empty(self) -> bool:
"""
Return True once all lines have been consumed.
"""
...

def message(self, symbol_name: str, tag: str, *args) -> None:
"""
A generic routine for appending a message to the ``self.messages`` message
queue.
``symbol_name`` is usually the string symbol name of the built-in function that
is recording the error. "Syntax" error is the exception to this rule.

``tag`` specifies a class of errors that this error belongs to.
``*args`` are the specific message arguments. Usually (but not here)
the arguments are used to fill out a template specified by ``tag``.
For example, consider this displayed message:
Part::partw: Part {10} of abcde does not exist.
"Part" is the symbol_name, "partw" is the tag and args is:
(<ListExpression: (<Integer: 10>,)>, <String: "abcde">)
return True

def message(self, sym: str, tag: str, *args) -> None:
"""
Append a generic message of type ``sym`` to the message queue.
"""

if symbol_name == "Syntax":
message = self.syntax_message(symbol_name, tag, *args)
if sym == "Syntax":
message = self.syntax_message(sym, tag, *args)
else:
message = [symbol_name, tag] + list(args)

message = [sym, tag] + list(args)
self.messages.append(message)

def syntax_message(self, symbol_name: str, tag: str, *args) -> list:
def syntax_message(self, sym: str, tag: str, *args) -> list:
"""
Append a "Syntax" error message to the message queue.
Append a message concerning syntax errors to the message queue.
"""

if len(args) > 3:
raise ValueError("Too many args.")
message = [symbol_name, tag]
message = [sym, tag]
for i in range(3):
if i < len(args):
message.append(f'"{args[i]}"')
@@ -115,7 +93,7 @@ def __init__(self, lines, filename=""):
else:
self.lines = lines

def feed(self) -> str:
def feed(self):
if self.lineno < len(self.lines):
result = self.lines[self.lineno]
self.lineno += 1
@@ -140,7 +118,7 @@ def __init__(self, code: str, filename=""):
self.code = code
self._empty = False

def feed(self) -> str:
def feed(self):
if self._empty:
return ""
self._empty = True
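For orientation, here is a minimal sketch of how the restored message/syntax_message pair behaves. It is illustrative only: the SingleLineFeeder constructor and the "sntxf" tag appear elsewhere in this commit, but this exact call site is invented, and the padding of unused "Syntax" argument slots depends on code outside this hunk.

    from mathics_scanner.feed import SingleLineFeeder

    feeder = SingleLineFeeder("f[x", filename="<example>")

    # Non-"Syntax" messages are stored verbatim as [sym, tag, *args].
    feeder.message("Part", "partw", "{10}", "abcde")

    # "Syntax" messages are routed through syntax_message(), which wraps
    # each of the (at most three) arguments in double quotes.
    feeder.message("Syntax", "sntxf", "f[", "x")

    for entry in feeder.messages:
        print(entry)
    # ['Part', 'partw', '{10}', 'abcde']
    # ['Syntax', 'sntxf', '"f["', '"x"']  (arguments quoted)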
11 changes: 0 additions & 11 deletions mathics_scanner/prescanner.py
@@ -98,7 +98,6 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
self.feeder.message("Syntax", "sntoct2")
elif last == 3:
self.feeder.message("Syntax", "sntoct1")
raise ScanError()
elif last == 4:
self.feeder.message("Syntax", "snthex")
else:
@@ -152,16 +151,6 @@ def try_parse_named_character(start_shift: int):
# Stay in same line fragment, but advance the cursor position.
self.pos = i + 1

# FIXME:
# The following code is boneheadedly wrong because
# the surrounding lexical context determines whether
# an escape sequences should be valid or not.
# For example, inside a comment, there is no such thing
# as an invalid escape sequence. And this cause \050 which is
# a valid escape sequence, parenthesis, to get treated like
# a grouping symbol inside of a string.
# ...
#
# In the following loop, we look for and replace escape
# sequences. The current character under consideration is at
# self.code[self.pos]. When an escape sequence is found at
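The numeric-escape machinery referenced above is easier to see in isolation. A toy sketch of the idea behind try_parse_base — a hypothetical standalone helper, not the prescanner's actual code:

    def decode_escape(digits: str, base: int) -> str:
        """Decode the digits of a numeric character escape, e.g. octal \\050."""
        return chr(int(digits, base))

    print(decode_escape("050", 8))   # "(" -- 0o50 == 40; this is why the removed
                                     # FIXME notes that \050 inside a string must
                                     # not be treated as a grouping parenthesis
    print(decode_escape("41", 16))   # "A"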
51 changes: 9 additions & 42 deletions mathics_scanner/tokeniser.py
@@ -427,7 +427,7 @@ def sntx_message(self, pos: Optional[int] = None):
self.feeder.message("Syntax", "sntxf", pre, post)

# TODO: Convert this to __next__ in the future.
def next(self) -> Token:
def next(self) -> "Token":
"Returns the next token."
self._skip_blank()
if self.pos >= len(self.code):
@@ -505,7 +505,7 @@ def _skip_blank(self):
else:
break

def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token:
def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token":
"""
Pick out the text in ``match``, convert that into a ``Token``, and
return that.
@@ -517,15 +517,15 @@ def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token:
self._change_token_scanning_mode(mode)
return Token(tag, text, match.start(0))

def t_Filename(self, match: re.Match) -> Token:
def t_Filename(self, match: re.Match) -> "Token":
"Scan for ``Filename`` token and return that"
return self._token_mode(match, "Filename", "expr")

def t_Get(self, match: re.Match) -> Token:
def t_Get(self, match: re.Match) -> "Token":
"Scan for a ``Get`` token from ``match`` and return that token"
return self._token_mode(match, "Get", "filename")

def t_Number(self, match: re.Match) -> Token:
def t_Number(self, match: re.Match) -> "Token":
"Break out from ``match`` the next token which is expected to be a Number"
text = match.group(0)
pos = match.end(0)
@@ -537,15 +537,15 @@ def t_Number(self, match: re.Match) -> Token:
self.pos = pos
return Token("Number", text, match.start(0))

def t_Put(self, match: re.Match) -> Token:
def t_Put(self, match: re.Match) -> "Token":
"Scan for a ``Put`` token and return that"
return self._token_mode(match, "Put", "filename")

def t_PutAppend(self, match: re.Match) -> Token:
def t_PutAppend(self, match: re.Match) -> "Token":
"Scan for a ``PutAppend`` token and return that"
return self._token_mode(match, "PutAppend", "filename")

def t_String(self, match: re.Match) -> Token:
def t_String(self, match: re.Match) -> "Token":
"Break out from self.code the next token which is expected to be a String"
start, end = self.pos, None
self.pos += 1 # skip opening '"'
@@ -559,46 +559,13 @@ def t_String(self, match: re.Match) -> Token:
else:
break
char = self.code[self.pos]

# FIXME: This is wrong. If the previous
# character was \ then we don't break.
if char == '"':
self.pos += 1
end = self.pos
break

if char == "\\":
if self.pos + 1 == len(self.code):
# We have a \ at the end of a line.
self.incomplete()
newlines.append(self.pos)

# Code below is in pre-scanner. We might decide
# later to move that code here.
# elif self.code[self.pos + 1] in "01234567":
# # See if we have an octal number.
# try_parse_base(1, 4, 8)

else:
# newlines (\n), tabs (\t) and double backslash
# "\\" have the backslash preserved. But for other
# characters, the backslash is removed.
if self.code[self.pos + 1] not in (
"b", # word boundary?
"f", # form-feed?
"n", # newline
"r", # carrage return
"t", # tab
"\\", # Backslash
'"', # FIXME - Remove. Mathics3 code has bugs that rely
# on this
):
self.feeder.message(
"Syntax", "stresc", self.code[self.pos : self.pos + 2]
)
raise ScanError()

self.pos += 2
self.pos += 2
else:
self.pos += 1

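The practical effect of the t_String change: the scanner again skips two characters at every backslash instead of validating the escape, so an escaped quote stays inside the token. A sketch, assuming the Tokeniser(feeder) construction and the Token fields (tag, text) used by the tests below:

    from mathics_scanner.feed import SingleLineFeeder
    from mathics_scanner.tokeniser import Tokeniser

    tokeniser = Tokeniser(SingleLineFeeder(r'"a\"b\\c"'))
    token = tokeniser.next()
    print(token.tag, token.text)  # String "a\"b\\c" -- the \" no longer
                                  # terminates the string or raises ScanError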
67 changes: 0 additions & 67 deletions test/test_string_tokens.py

This file was deleted.

27 changes: 14 additions & 13 deletions test/test_tokeniser.py
@@ -3,14 +3,13 @@
Tests translation from strings to sequences of tokens.
"""

import pytest
import random
import sys

import pytest

from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError
from mathics_scanner.tokeniser import Tokeniser, Token, is_symbol_name
from mathics_scanner.errors import ScanError, IncompleteSyntaxError, InvalidSyntaxError
from mathics_scanner.feed import SingleLineFeeder
from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name


def check_number(code):
@@ -23,6 +22,11 @@ def check_symbol(code):
assert token, Token("Symbol", code, 0)


def check_string(code):
token = single_token(code)
assert token, Token("String", code, 0)


def incomplete_error(string):
with pytest.raises(IncompleteSyntaxError):
tokens(string)
@@ -180,15 +184,12 @@ def test_precision():
check_number("1.5`10")


# String tests (with many more than those
# below are now in test_string_token.py
#
# def test_string():
# check_string(r'"abc"')
# incomplete_error(r'"abc')
# check_string(r'"abc(*def*)"')
# check_string(r'"a\"b\\c"')
# incomplete_error(r'"\"')
def test_string():
check_string(r'"abc"')
incomplete_error(r'"abc')
check_string(r'"abc(*def*)"')
check_string(r'"a\"b\\c"')
incomplete_error(r'"\"')


def test_set():
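The restored test_string relies on helpers defined earlier in test_tokeniser.py that this hunk does not show. A plausible reconstruction — hypothetical rather than the file's verbatim code:

    def tokens(code):
        # Scan code to exhaustion; assumes the end-of-input token carries
        # the "END" tag (an assumption -- the real helper is outside this hunk).
        tokeniser = Tokeniser(SingleLineFeeder(code))
        result = []
        while True:
            token = tokeniser.next()
            if token.tag == "END":
                break
            result.append(token)
        return result

    def single_token(code):
        token_list = tokens(code)
        assert len(token_list) == 1
        return token_list[0]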
