Skip to content

Commit

Permalink
Fixed unclosed text literal tokenizing.
Browse files Browse the repository at this point in the history
  • Loading branch information
amyjko committed Oct 24, 2023
1 parent c414ae2 commit 0ec4d43
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 15 deletions.
12 changes: 6 additions & 6 deletions src/nodes/Token.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ export default class Token extends Node {
text instanceof UnicodeString ? text : new UnicodeString(text);

// No token is allowed to be empty except the end token.
if (
this.text.isEmpty() &&
!this.isSymbol(Sym.End) &&
!this.isSymbol(Sym.Words)
)
throw Error('This token has no text');
// if (
// this.text.isEmpty() &&
// !this.isSymbol(Sym.End) &&
// !this.isSymbol(Sym.Words)
// )
// throw Error('This token has no text');
}

// NODE CONTRACT
Expand Down
43 changes: 34 additions & 9 deletions src/parser/Tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -387,16 +387,23 @@ export function tokenize(source: string): TokenList {
}
// If we're not in a doc, then slurp preceding space before the next token.
else {
const spaceMatch = source.match(/^[ \t\n]+/);
space = spaceMatch === null ? '' : spaceMatch[0];
space = getNextSpace(source);
}

// Trim the space we found.
source = source.substring(space.length);

// Tokenize the next token. We tokenize in documentation mode if we're inside a doc and the eval depth
// has not changed since we've entered.
const nextToken = getNextToken(source, context);
const stuff = getNextToken(source, context);

// Did the next token pull out some unexpected space? Override the space
const nextToken = Array.isArray(stuff) ? stuff[0] : stuff;
if (Array.isArray(stuff) && stuff[1] !== undefined) {
const extraSpace = stuff[1];
source = source.substring(extraSpace.length);
space = extraSpace;
}

// Add the new token to the list
tokens.push(nextToken);
Expand Down Expand Up @@ -457,10 +464,16 @@ export function tokenize(source: string): TokenList {
return new TokenList(tokens, spaces);
}

function getNextToken(source: string, context: Token[]): Token {
function getNextToken(
source: string,
context: Token[]
): Token | [Token, string | undefined] {
// If there's nothing left after trimming source, return an end of file token.
if (source.length === 0) return new Token('', Sym.End);

// Any extra space we find along the way, primarily if we end an unclosed text literal.
let space: string | undefined = undefined;

if (context.length > 0) {
const container = context[0];
// If we're in text, keep reading until the next code open, text close, end of line, or end of source,
Expand All @@ -482,8 +495,7 @@ function getNextToken(source: string, context: Token[]): Token {
// If we ended this text with a newline, then shift out of the context.
if (stopIndex === lineIndex) context.shift();

// If we found more than one words characters, make a word.
// Otherwise, tokenize whatever comes next.
// If we found one or more word characters, make a word token for the text.
if (stopIndex > 0)
return new Token(
source.substring(
Expand All @@ -494,6 +506,12 @@ function getNextToken(source: string, context: Token[]): Token {
),
Sym.Words
);
// Otherwise, read any preceding space for the next token, and tokenize whatever comes next.
else {
space = getNextSpace(source);
source = source.substring(space.length);
if (source.length === 0) return [new Token('', Sym.End), space];
}
}
// If we're in a doc, special case a few token types that only appear in docs (URL, WORDS)
else if (
Expand Down Expand Up @@ -522,21 +540,23 @@ function getNextToken(source: string, context: Token[]): Token {
typeof pattern.pattern === 'string' &&
source.startsWith(pattern.pattern)
)
return new Token(pattern.pattern, pattern.types);
return [new Token(pattern.pattern, pattern.types), space];
else if (pattern.pattern instanceof RegExp) {
const match = source.match(pattern.pattern);
// If we found a match, return it if
// 1) It's _not_ a text close, or
// 2) It is, but there are either no open templates (syntax error!), or
// 3) There is an open template and its closing delimiter matches the current open text delimiter.
if (match !== null) return new Token(match[0], pattern.types);
if (match !== null)
return [new Token(match[0], pattern.types), space];
}
}

// Otherwise, we fail and return an error token that contains all of the text until the next recognizable token.
// This is a recursive call: it tries to tokenize the next character, skipping this one, going all the way to the
// end of the source if necessary, but stopping at the nearest recognizable token.
const next = getNextToken(source.substring(1), context);
const stuff = getNextToken(source.substring(1), context);
const next = Array.isArray(stuff) ? stuff[0] : stuff;
return new Token(
source.substring(
0,
Expand All @@ -547,3 +567,8 @@ function getNextToken(source: string, context: Token[]): Token {
Sym.Unknown
);
}

/**
 * Returns the run of spaces, tabs, and newlines at the start of the given
 * source, or the empty string if the source starts with something else.
 */
function getNextSpace(source: string) {
    const match = /^[ \t\n]+/.exec(source);
    return match ? match[0] : '';
}

0 comments on commit 0ec4d43

Please sign in to comment.