
String tokenization fix #108

Merged
merged 3 commits into from
Dec 12, 2024
54 changes: 38 additions & 16 deletions mathics_scanner/feed.py
@@ -20,46 +20,68 @@ def __init__(self, filename: str):
:param filename: A string that describes the source of the feeder, i.e.
the filename that is being fed.
"""
self.messages: List[str] = []

# A message is a list that starts out with a "symbol_name", like "Part",
# a message tag, like "partw", and a list of arguments to be used in
# creating a message in the list of messages.
self.messages: List[list] = []

self.lineno = 0
self.filename = filename

@abstractmethod
def feed(self):
def feed(self) -> str:
"""
Consume and return the next line of code. Each line should be followed by a
newline character. Returns '' after all lines are consumed.
"""

return ""
...

@abstractmethod
def empty(self) -> bool:
"""
Return True once all lines have been consumed.
"""
...

return True

def message(self, sym: str, tag: str, *args) -> None:
def message(self, symbol_name: str, tag: str, *args) -> None:
"""
Append a generic message of type ``sym`` to the message queue.

A generic routine for appending a message to the ``self.messages`` message
queue.

``symbol_name`` is usually the string symbol name of the built-in function that
is recording the error. "Syntax" errors are the exception to this rule.

``tag`` specifies a class of errors that this error belongs to.

``*args`` are the specific message arguments. Usually (but not here)
the arguments are used to fill out a template specified by ``tag``.

For example, consider this message displayed:

Part::partw: Part {10} of abcde does not exist.

"Part" is the symbol_name, "partw" is the tag and args is:
(<ListExpression: (<Integer: 10>,)>, <String: "abcde">)

"""

if sym == "Syntax":
message = self.syntax_message(sym, tag, *args)
if symbol_name == "Syntax":
message = self.syntax_message(symbol_name, tag, *args)
else:
message = [sym, tag] + list(args)
message = [symbol_name, tag] + list(args)

self.messages.append(message)

def syntax_message(self, sym: str, tag: str, *args) -> list:
def syntax_message(self, symbol_name: str, tag: str, *args) -> list:
"""
Append a message concerning syntax errors to the message queue.
Append a "Syntax" error message to the message queue.
"""

if len(args) > 3:
raise ValueError("Too many args.")
message = [sym, tag]
message = [symbol_name, tag]
for i in range(3):
if i < len(args):
message.append(f'"{args[i]}"')
@@ -93,7 +115,7 @@ def __init__(self, lines, filename=""):
else:
self.lines = lines

def feed(self):
def feed(self) -> str:
if self.lineno < len(self.lines):
result = self.lines[self.lineno]
self.lineno += 1
Expand All @@ -118,7 +140,7 @@ def __init__(self, code: str, filename=""):
self.code = code
self._empty = False

def feed(self):
def feed(self) -> str:
if self._empty:
return ""
self._empty = True
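Not part of this PR: a minimal sketch of how the revised feeder API fits together. ListLineFeeder below is a hypothetical stand-in for the concrete feeders defined further down in feed.py; the method names and behavior follow the abstract class in this diff.

# Hypothetical illustration only, not code from this PR.
from typing import List

class ListLineFeeder:
    """Feed lines from an in-memory list, following the LineFeeder API."""

    def __init__(self, lines: List[str], filename: str = ""):
        # Each message is a list: [symbol_name, tag, *args].
        self.messages: List[list] = []
        self.lineno = 0
        self.filename = filename
        self.lines = lines

    def feed(self) -> str:
        # Consume and return the next line; "" once all lines are consumed.
        if self.lineno < len(self.lines):
            line = self.lines[self.lineno]
            self.lineno += 1
            return line
        return ""

    def empty(self) -> bool:
        return self.lineno >= len(self.lines)

feeder = ListLineFeeder(["1 + 2\n", "f[x]\n"])
while not feeder.empty():
    print(feeder.feed(), end="")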
11 changes: 11 additions & 0 deletions mathics_scanner/prescanner.py
@@ -98,6 +98,7 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
self.feeder.message("Syntax", "sntoct2")
elif last == 3:
self.feeder.message("Syntax", "sntoct1")
raise ScanError()
elif last == 4:
self.feeder.message("Syntax", "snthex")
else:
@@ -151,6 +152,16 @@ def try_parse_named_character(start_shift: int):
# Stay in the same line fragment, but advance the cursor position.
self.pos = i + 1

# FIXME:
# The following code is boneheadedly wrong because
# the surrounding lexical context determines whether
# an escape sequence should be valid or not.
# For example, inside a comment, there is no such thing
# as an invalid escape sequence. And this causes \050, which is
# a valid escape sequence for a parenthesis, to get treated like
# a grouping symbol inside of a string.
# ...
#
# In the following loop, we look for and replace escape
# sequences. The current character under consideration is at
# self.code[self.pos]. When an escape sequence is found at
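To make the FIXME above concrete, here is a small self-contained illustration (not part of the PR) of why \050 depends on lexical context:

# Illustration only. \050 is octal 50 = decimal 40, the code
# point of "(".
assert int("050", 8) == 40
assert chr(int("050", 8)) == "("

# So a prescanner that rewrites \050 to "(" without consulting the
# lexical context turns string data like "a\050b" into what later
# stages see as a grouping symbol in need of a matching ")".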
51 changes: 42 additions & 9 deletions mathics_scanner/tokeniser.py
@@ -427,7 +427,7 @@ def sntx_message(self, pos: Optional[int] = None):
self.feeder.message("Syntax", "sntxf", pre, post)

# TODO: Convert this to __next__ in the future.
def next(self) -> "Token":
def next(self) -> Token:
"Returns the next token."
self._skip_blank()
if self.pos >= len(self.code):
@@ -505,7 +505,7 @@ def _skip_blank(self):
else:
break

def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token":
def _token_mode(self, match: re.Match, tag: str, mode: str) -> Token:
"""
Pick out the text in ``match``, convert that into a ``Token``, and
return that.
Expand All @@ -517,15 +517,15 @@ def _token_mode(self, match: re.Match, tag: str, mode: str) -> "Token":
self._change_token_scanning_mode(mode)
return Token(tag, text, match.start(0))

def t_Filename(self, match: re.Match) -> "Token":
def t_Filename(self, match: re.Match) -> Token:
"Scan for ``Filename`` token and return that"
return self._token_mode(match, "Filename", "expr")

def t_Get(self, match: re.Match) -> "Token":
def t_Get(self, match: re.Match) -> Token:
"Scan for a ``Get`` token from ``match`` and return that token"
return self._token_mode(match, "Get", "filename")

def t_Number(self, match: re.Match) -> "Token":
def t_Number(self, match: re.Match) -> Token:
"Break out from ``match`` the next token which is expected to be a Number"
text = match.group(0)
pos = match.end(0)
Expand All @@ -537,15 +537,15 @@ def t_Number(self, match: re.Match) -> "Token":
self.pos = pos
return Token("Number", text, match.start(0))

def t_Put(self, match: re.Match) -> "Token":
def t_Put(self, match: re.Match) -> Token:
"Scan for a ``Put`` token and return that"
return self._token_mode(match, "Put", "filename")

def t_PutAppend(self, match: re.Match) -> "Token":
def t_PutAppend(self, match: re.Match) -> Token:
"Scan for a ``PutAppend`` token and return that"
return self._token_mode(match, "PutAppend", "filename")

def t_String(self, match: re.Match) -> "Token":
def t_String(self, match: re.Match) -> Token:
"Break out from self.code the next token which is expected to be a String"
start, end = self.pos, None
self.pos += 1 # skip opening '"'
Expand All @@ -559,13 +559,46 @@ def t_String(self, match: re.Match) -> "Token":
else:
break
char = self.code[self.pos]

# FIXME: This is wrong. If the previous
# character was \ then we should not break here.
if char == '"':
self.pos += 1
end = self.pos
break

if char == "\\":
self.pos += 2
if self.pos + 1 == len(self.code):
# We have a \ at the end of a line.
self.incomplete()
newlines.append(self.pos)

# Code below is in pre-scanner. We might decide
# later to move that code here.
# elif self.code[self.pos + 1] in "01234567":
# # See if we have an octal number.
# try_parse_base(1, 4, 8)

else:
# Newlines (\n), tabs (\t), and double backslashes
# ("\\") have the backslash preserved. But for other
# characters, the backslash is removed.
if self.code[self.pos + 1] not in (
"b",  # backspace
"f",  # form feed
"n",  # newline
"r",  # carriage return
"t",  # tab
"\\",  # backslash
'"',  # FIXME - Remove. Mathics3 code has bugs that rely
# on this
):
self.feeder.message(
"Syntax", "stresc", self.code[self.pos : self.pos + 2]
)
raise ScanError()

self.pos += 2
else:
self.pos += 1

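A standalone sketch (not code from this PR) of the validation rule the new t_String branch enforces; the set below mirrors the tuple in the diff:

# Illustration only.
VALID_STRING_ESCAPES = {"b", "f", "n", "r", "t", "\\", '"'}

def is_valid_string_escape(escaped_char: str) -> bool:
    # True for \b, \f, \n, \r, \t, \\ and \"; anything else is
    # reported as Syntax::stresc and aborts scanning with ScanError.
    return escaped_char in VALID_STRING_ESCAPES

assert is_valid_string_escape("n")
assert not is_valid_string_escape("X")  # cf. the "a\X" test case below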
67 changes: 67 additions & 0 deletions test/test_string_tokens.py
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
"""
Tests translation from text characters to the token: String
"""

import pytest

from mathics_scanner.errors import IncompleteSyntaxError, ScanError
from mathics_scanner.feed import SingleLineFeeder
from mathics_scanner.tokeniser import Token, Tokeniser


def check_string(source_text, expected_text: str):
token = single_token(source_text)
assert token is not None
assert token.tag == "String"
assert token.text == expected_text


def incomplete_error(s: str, failure_msg: str):
with pytest.raises(IncompleteSyntaxError) as excinfo:
get_tokens(s)

assert excinfo, failure_msg


def scan_error(s: str, failure_msg: str):
with pytest.raises(ScanError) as excinfo:
get_tokens(s)

assert excinfo, failure_msg


def single_token(source_text) -> Token:
tokens = get_tokens(source_text)
assert len(tokens) == 1
token = tokens[0]
return token


def get_tokens(source_text: str):
tokeniser = Tokeniser(SingleLineFeeder(source_text))
tokens = []
while True:
token = tokeniser.next()
if token.tag == "END":
break
else:
tokens.append(token)
return tokens


def test_string():
for escape_string in ("\b", "\f", "\n", "\r", "\t"):
check_string(f'"a{escape_string}"', f'"a{escape_string}"')

# Broken:
# "a\050", "a\051", "a\052"
# Prescanning eagerly replaces these escape sequences with the
# symbols "(", ")", or "*" respectively, and this messes up
# parsing somehow.
check_string(r'"abc"', r'"abc"')
check_string(r'"abc(*def*)"', r'"abc(*def*)"')
check_string(r'"a\"b\\c"', r'"a\"b\\c"')
incomplete_error(r'"abc', "String does not have terminating quote")
incomplete_error(r'"\"', "Unterminated escape sequence")
scan_error(r'"a\X"', '"X" is not a valid escape character')
27 changes: 13 additions & 14 deletions test/test_tokeniser.py
@@ -3,13 +3,14 @@
Tests translation from strings to sequences of tokens.
"""

import pytest
import random
import sys

from mathics_scanner.tokeniser import Tokeniser, Token, is_symbol_name
from mathics_scanner.errors import ScanError, IncompleteSyntaxError, InvalidSyntaxError
import pytest

from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError
from mathics_scanner.feed import SingleLineFeeder
from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name


def check_number(code):
Expand All @@ -22,11 +23,6 @@ def check_symbol(code):
assert token, Token("Symbol", code, 0)


def check_string(code):
token = single_token(code)
assert token, Token("String", code, 0)


def incomplete_error(string):
with pytest.raises(IncompleteSyntaxError):
tokens(string)
@@ -184,12 +180,15 @@ def test_precision():
check_number("1.5`10")


def test_string():
check_string(r'"abc"')
incomplete_error(r'"abc')
check_string(r'"abc(*def*)"')
check_string(r'"a\"b\\c"')
incomplete_error(r'"\"')
# String tests (with many more than those
# below) are now in test_string_tokens.py
#
# def test_string():
#     check_string(r'"abc"')
#     incomplete_error(r'"abc')
#     check_string(r'"abc(*def*)"')
#     check_string(r'"a\"b\\c"')
#     incomplete_error(r'"\"')


def test_set():