Skip to content

Commit

Permalink
Revert "String tokenization fix (#108)"
Browse files Browse the repository at this point in the history
This reverts commit e26f5bc.
  • Loading branch information
rocky authored Dec 12, 2024
1 parent e26f5bc commit 501bc44
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 171 deletions.
54 changes: 16 additions & 38 deletions mathics_scanner/feed.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,68 +20,46 @@ def __init__(self, filename: str):
:param filename: A string that describes the source of the feeder, i.e.
the filename that is being fed.
"""

# A message is a list that starts out with a "symbol_name", like "Part",
# a message tag, like "partw", and a list of arguments to be used in
# creating a message in the list of messages.
self.messages: List[list] = []

self.messages: List[str] = []
self.lineno = 0
self.filename = filename

@abstractmethod
def feed(self) -> str:
def feed(self):
"""
Consume and return next line of code. Each line should be followed by a
newline character. Returns '' after all lines are consumed.
"""
...

return ""

@abstractmethod
def empty(self) -> bool:
"""
Return True once all lines have been consumed.
"""
...

def message(self, symbol_name: str, tag: str, *args) -> None:
"""
A Generic routine for appending a message to the ``self.messages`` message
queue.
``symbol_name`` is usually the string symbol name of the built-in function that
is recording the error. "Syntax" error is the exception to this rule.

``tag`` specifies a class of errors that this error belongs to.
``*args`` are the specific message arguments. Usually (but not here)
the arguments are used to fill out a template specified by ``tag``.
For example, consider this message displayed:
Part::partw: Part {10} of abcde does not exist.
"Part" is the symbol_name, "partw" is the tag and args is:
(<ListExpression: (<Integer: 10>,)>, <String: "abcde">)
return True

def message(self, sym: str, tag: str, *args) -> None:
"""
Append a generic message of type ``sym`` to the message queue.
"""

if symbol_name == "Syntax":
message = self.syntax_message(symbol_name, tag, *args)
if sym == "Syntax":
message = self.syntax_message(sym, tag, *args)
else:
message = [symbol_name, tag] + list(args)

message = [sym, tag] + list(args)
self.messages.append(message)

def syntax_message(self, symbol_name: str, tag: str, *args) -> list:
def syntax_message(self, sym: str, tag: str, *args) -> list:
"""
Append a "Syntax" error message to the message queue.
Append a message concerning syntax errors to the message queue.
"""

if len(args) > 3:
raise ValueError("Too many args.")
message = [symbol_name, tag]
message = [sym, tag]
for i in range(3):
if i < len(args):
message.append(f'"{args[i]}"')
Expand Down Expand Up @@ -115,7 +93,7 @@ def __init__(self, lines, filename=""):
else:
self.lines = lines

def feed(self) -> str:
def feed(self):
if self.lineno < len(self.lines):
result = self.lines[self.lineno]
self.lineno += 1
Expand All @@ -140,7 +118,7 @@ def __init__(self, code: str, filename=""):
self.code = code
self._empty = False

def feed(self) -> str:
def feed(self):
if self._empty:
return ""
self._empty = True
Expand Down
11 changes: 0 additions & 11 deletions mathics_scanner/prescanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
self.feeder.message("Syntax", "sntoct2")
elif last == 3:
self.feeder.message("Syntax", "sntoct1")
raise ScanError()
elif last == 4:
self.feeder.message("Syntax", "snthex")
else:
Expand Down Expand Up @@ -152,16 +151,6 @@ def try_parse_named_character(start_shift: int):
# Stay in same line fragment, but advance the cursor position.
self.pos = i + 1

# FIXME:
# The following code is boneheadedly wrong because
# the surrounding lexical context determines whether
# an escape sequence should be valid or not.
# For example, inside a comment, there is no such thing
# as an invalid escape sequence. And this causes \050, which is
# a valid escape sequence (a parenthesis), to get treated like
# a grouping symbol inside of a string.
# ...
#
# In the following loop, we look for and replace escape
# sequences. The current character under consideration is at
# self.code[self.pos]. When an escape sequence is found at
Expand Down
51 changes: 9 additions & 42 deletions mathics_scanner/tokeniser.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def sntx_message(self, pos: Optional[int] = None):
self.feeder.message("Syntax", "sntxf", pre, post)

# TODO: Convert this to __next__ in the future.
def next(self) -> Token:
def next(self) -> "Token":
"Returns the next token."
self._skip_blank()
if self.pos >= len(self.code):
Expand Down Expand Up @@ -505,7 +505,7 @@ def _skip_blank(self):
else:
break

def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token:
def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token":
"""
Pick out the text in ``match``, convert that into a ``Token``, and
return that.
Expand All @@ -517,15 +517,15 @@ def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token:
self._change_token_scanning_mode(mode)
return Token(tag, text, match.start(0))

def t_Filename(self, match: re.Match) -> Token:
def t_Filename(self, match: re.Match) -> "Token":
"Scan for ``Filename`` token and return that"
return self._token_mode(match, "Filename", "expr")

def t_Get(self, match: re.Match) -> Token:
def t_Get(self, match: re.Match) -> "Token":
"Scan for a ``Get`` token from ``match`` and return that token"
return self._token_mode(match, "Get", "filename")

def t_Number(self, match: re.Match) -> Token:
def t_Number(self, match: re.Match) -> "Token":
"Break out from ``match`` the next token which is expected to be a Number"
text = match.group(0)
pos = match.end(0)
Expand All @@ -537,15 +537,15 @@ def t_Number(self, match: re.Match) -> Token:
self.pos = pos
return Token("Number", text, match.start(0))

def t_Put(self, match: re.Match) -> Token:
def t_Put(self, match: re.Match) -> "Token":
"Scan for a ``Put`` token and return that"
return self._token_mode(match, "Put", "filename")

def t_PutAppend(self, match: re.Match) -> Token:
def t_PutAppend(self, match: re.Match) -> "Token":
"Scan for a ``PutAppend`` token and return that"
return self._token_mode(match, "PutAppend", "filename")

def t_String(self, match: re.Match) -> Token:
def t_String(self, match: re.Match) -> "Token":
"Break out from self.code the next token which is expected to be a String"
start, end = self.pos, None
self.pos += 1 # skip opening '"'
Expand All @@ -559,46 +559,13 @@ def t_String(self, match: re.Match) -> Token:
else:
break
char = self.code[self.pos]

# FIXME: This is wrong. If the previous
# character was \ then we don't break.
if char == '"':
self.pos += 1
end = self.pos
break

if char == "\\":
if self.pos + 1 == len(self.code):
# We have a \ at the end of a line.
self.incomplete()
newlines.append(self.pos)

# Code below is in pre-scanner. We might decide
# later to move that code here.
# elif self.code[self.pos + 1] in "01234567":
# # See if we have an octal number.
# try_parse_base(1, 4, 8)

else:
# newlines (\n), tabs (\t) and double backslash
# "\\" have the backslash preserved. But for other
# characters, the backslash is removed.
if self.code[self.pos + 1] not in (
"b", # word boundary?
"f", # form-feed?
"n", # newline
"r", # carrage return
"t", # tab
"\\", # Backslash
'"', # FIXME - Remove. Mathics3 code has bugs that rely
# on this
):
self.feeder.message(
"Syntax", "stresc", self.code[self.pos : self.pos + 2]
)
raise ScanError()

self.pos += 2
self.pos += 2
else:
self.pos += 1

Expand Down
67 changes: 0 additions & 67 deletions test/test_string_tokens.py

This file was deleted.

27 changes: 14 additions & 13 deletions test/test_tokeniser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@
Tests translation from strings to sequences of tokens.
"""

import pytest
import random
import sys

import pytest

from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError
from mathics_scanner.tokeniser import Tokeniser, Token, is_symbol_name
from mathics_scanner.errors import ScanError, IncompleteSyntaxError, InvalidSyntaxError
from mathics_scanner.feed import SingleLineFeeder
from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name


def check_number(code):
Expand All @@ -23,6 +22,11 @@ def check_symbol(code):
assert token, Token("Symbol", code, 0)


def check_string(code):
token = single_token(code)
assert token, Token("String", code, 0)


def incomplete_error(string):
with pytest.raises(IncompleteSyntaxError):
tokens(string)
Expand Down Expand Up @@ -180,15 +184,12 @@ def test_precision():
check_number("1.5`10")


# String tests (with many more than those
# below) are now in test_string_tokens.py
#
# def test_string():
# check_string(r'"abc"')
# incomplete_error(r'"abc')
# check_string(r'"abc(*def*)"')
# check_string(r'"a\"b\\c"')
# incomplete_error(r'"\"')
def test_string():
check_string(r'"abc"')
incomplete_error(r'"abc')
check_string(r'"abc(*def*)"')
check_string(r'"a\"b\\c"')
incomplete_error(r'"\"')


def test_set():
Expand Down

0 comments on commit 501bc44

Please sign in to comment.