Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nathan/refine gdscript parser #88

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
241 changes: 208 additions & 33 deletions mdx-utils/src/gdscript/parser_gdscript.nim
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
## Minimal GDScript parser specialized for code include shortcodes. Tokenizes symbol definitions and their body and collects all their content.
## Minimal GDScript parser specialized for code include shortcodes. Tokenizes
## symbol definitions and their body and collects all their content.
##
## Preprocesses GDScript code to extract code between anchor comments, like
## #ANCHOR:anchor_name
## ... Code here
## #END:anchor_name
##
## This works in 2 passes:
##
## 1. Preprocesses the code to extract the code between anchor comments and remove anchor comments.
## 2. Parses the preprocessed code to tokenize symbols and their content.
##
## Users can then query and retrieve the code between anchors or the definition
## and body of a symbol.
##
## This was originally written as a tool to only parse GDScript symbols, with
## the anchor preprocessing added later, so the approach may not be the most
## efficient.
import std/[tables, unittest, strutils, times]
when compileOption("profiler"):
import std/nimprof
Expand Down Expand Up @@ -26,13 +44,46 @@ type
children: seq[Token]

Scanner = object
# TODO: Cache the source elsewhere for reading the content of tokens after parsing.
source: string
current: int
indentLevel: int
bracketDepth: int
peekIndex: int

CodeAnchor = object
## A code anchor is how we call comments used to mark a region in the code, with the form
## #ANCHOR:anchor_name
## ...
## #END:anchor_name
##
## This object is used to extract the code between the anchor and the end tag.
nameStart, nameEnd: int
codeStart, codeEnd: int
# Used to remove the anchor tags from the final code
# codeStart marks the end of the anchor tag, codeEnd marks the start of the end tag
anchorTagStart, endTagEnd: int

GDScriptFile = object
## Represents a parsed GDScript file with its symbols and source code
filePath: string
source: string
## Map of symbol names to their tokens
symbols: Table[string, Token]
## Map of anchor names to their code anchors
anchors: Table[string, CodeAnchor]

SymbolQuery = object
## Represents a query to get a symbol from a GDScript file, like
## ClassName.definition or func_name.body or var_name.
name: string
isDefinition: bool
isBody: bool
isClass: bool
childName: string

# Caches parsed GDScript files
var gdscriptFiles = initTable[string, GDScriptFile]()

proc getCode(token: Token, source: string): string {.inline.} =
return source[token.range.start ..< token.range.end]

Expand Down Expand Up @@ -109,8 +160,9 @@ proc matchString(s: var Scanner, expected: string): bool {.inline.} =
return true
return false

proc countIndentation(s: var Scanner): int {.inline.} =
proc countIndentationAndAdvance(s: var Scanner): int {.inline.} =
## Counts the number of spaces and tabs starting from the current position
## Advances the scanner as it counts the indentation
## Call this function at the start of a line to count the indentation
result = 0
while true:
Expand Down Expand Up @@ -155,6 +207,9 @@ proc scanIdentifier(s: var Scanner): tuple[start: int, `end`: int] {.inline.} =
result = (start, s.current)

proc scanToEndOfLine(s: var Scanner): tuple[start, `end`: int] {.inline.} =
## Scans to the end of the current line, returning the start (the current
## position when the function was called) and end positions (the \n character
## at the end of the line).
let start = s.current
let length = s.source.len
var offset = 0
Expand Down Expand Up @@ -198,29 +253,119 @@ proc isNewDefinition(s: var Scanner): bool {.inline.} =
s.skipWhitespace()
result =
s.peekString("func") or s.peekString("var") or s.peekString("const") or
s.peekString("class") or s.peekString("signal") or s.peekString("enum")
s.peekString("class") or s.peekString("signal") or s.peekString("enum") or
# TODO: Consider how to handle regular comments vs anchors
s.peekString("#ANCHOR") or s.peekString("#END") or s.peekString("# ANCHOR") or
s.peekString("# END")
s.current = savedPos
return result

proc scanBody(s: var Scanner, startIndent: int): tuple[bodyStart, bodyEnd: int] =
let start = s.current
while not s.isAtEnd():
let currentIndent = s.countIndentation()
let currentIndent = s.countIndentationAndAdvance()
if currentIndent <= startIndent and not s.isAtEnd():
if isNewDefinition(s):
break

discard scanToEndOfLine(s)
# s.current points to the first letter of the next token, after the
# indentation. We need to backtrack to find the actual end of the body.
var index = s.current - 1
while index > 0 and s.source[index] in [' ', '\r', '\t', '\n']:
index -= 1
s.current = index + 1
result = (start, s.current)

proc scanAnchor(s: var Scanner): CodeAnchor =
## Scans from a #ANCHOR tag to the matching #END tag
## This is used in a preprocessing pass.

result.anchorTagStart = s.current

# Skip the #ANCHOR: or # ANCHOR: part
while s.getCurrentChar() != ':':
s.current += 1
s.skipWhitespace()
# Skip the colon to parse the anchor name
s.current += 1

let (nameStart, nameEnd) = s.scanIdentifier()
discard s.scanToEndOfLine()

result.nameStart = nameStart
result.nameEnd = nameEnd
result.codeStart = s.current

# Look for the matching END tag
let anchorName = s.source[nameStart ..< nameEnd]
var foundEndTag = false

let endTag = "#END:" & anchorName
let endTagWithSpace = "# END:" & anchorName
while not s.isAtEnd():
s.skipWhitespace()
if s.peekString(endTag) or s.peekString(endTagWithSpace):
foundEndTag = true
result.codeEnd = s.current
discard s.scanToEndOfLine()
result.endTagEnd = s.current
break

if not s.isAtEnd():
discard s.advance()

if not foundEndTag:
# The anchor is not closed, reset the scanner to after the anchor's opening
# tag, raise an error, and allow continuing with parsing.
s.current = result.codeStart
raise newException(
ValueError, "Anchor region " & anchorName & " is missing an #END tag."
)

proc preprocessAnchors(
source: string
): tuple[anchors: Table[string,CodeAnchor], processed: string] =
## Preprocesses the source code to extract the code between anchor comments
## and remove the anchor comments.
##
## Returns a table that maps anchor names to the CodeAnchor objects for easy
## lookup and the processed source code, with the anchor comments removed.
var anchors: seq[CodeAnchor] = @[]
var newSource = newStringOfCap(source.len)
var s =
Scanner(source: source, current: 0, indentLevel: 0, bracketDepth: 0, peekIndex: 0)

var lastEnd = 0
while not s.isAtEnd():
let c = s.getCurrentChar()
if c == '#':
if s.peekString("#ANCHOR") or s.peekString("# ANCHOR"):
let anchor = scanAnchor(s)
anchors.add(anchor)
newSource.add(source[lastEnd ..< anchor.anchorTagStart])
newSource.add(source[anchor.codeStart ..< anchor.codeEnd])
lastEnd = anchor.endTagEnd
s.current += 1

newSource.add(source[lastEnd ..< source.len])
var anchorsTable = initTable[string, CodeAnchor](anchors.len)
for anchor in anchors:
anchorsTable[s.source[anchor.nameStart ..< anchor.nameEnd]] = anchor
return (anchorsTable, newSource)

proc scanToken(s: var Scanner): Token =
while not s.isAtEnd():
s.indentLevel = s.countIndentation()
s.indentLevel = s.countIndentationAndAdvance()
s.skipWhitespace()

let startPos = s.current
let c = s.getCurrentChar()
case c
# Comment, skip to end of line and continue
of '#':
discard s.scanToEndOfLine()
continue
# Function definition
of 'f':
if s.matchString("func"):
Expand Down Expand Up @@ -350,7 +495,8 @@ proc parseClass(s: var Scanner, classToken: var Token) =
let classIndent = s.indentLevel
s.current = classToken.range.bodyStart
while not s.isAtEnd():
let currentIndent = s.countIndentation()
#Problem: s is on the first char of the token instead of the beginning of the line
let currentIndent = s.countIndentationAndAdvance()
if currentIndent <= classIndent:
if isNewDefinition(s):
break
Expand All @@ -374,24 +520,28 @@ proc parseGDScript(source: string): seq[Token] =
token.range.end = scanner.current
result.add(token)

type GDScriptFile = object
filePath: string
source: string
symbols: Table[string, Token]

# Caches parsed GDScript files
var gdscriptFiles = initTable[string, GDScriptFile]()

proc parseGDScriptFile(path: string) =
# Parses a GDScript file and caches it
## Parses a GDScript file and caches it in the gdscriptFiles table.
## The parsing happens in two passes:
##
## 1. We preprocess the source code to extract the code between anchor comments and remove these comment lines.
## 2. We parse the preprocessed source code to tokenize symbols and their content.
##
## Preprocessing makes the symbol parsing easier afterwards, although it means we scan the file twice.
let source = readFile(path)
let tokens = parseGDScript(source)
let (anchors, processedSource) = preprocessAnchors(source)
let tokens = parseGDScript(processedSource)
var symbols = initTable[string, Token]()
for token in tokens:
let name = token.getName(source)
let name = token.getName(processedSource)
symbols[name] = token

gdscriptFiles[path] = GDScriptFile(filePath: path, source: source, symbols: symbols)
gdscriptFiles[path] = GDScriptFile(
filePath: path,
source: source,
symbols: symbols,
anchors: anchors
)

proc getTokenFromCache(symbolName: string, filePath: string): Token =
# Gets a token from the cache given a symbol name and the path to the GDScript file
Expand All @@ -407,12 +557,6 @@ proc getTokenFromCache(symbolName: string, filePath: string): Token =

return file.symbols[symbolName]

proc getGDScriptCodeFromCache(filePath: string): var string =
# Gets the code of a GDScript file from the cache given its path
if not gdscriptFiles.hasKey(filePath):
parseGDScriptFile(filePath)
return gdscriptFiles[filePath].source

proc getSymbolText(symbolName: string, path: string): string =
# Gets the text of a symbol given its name and the path to the file
let token = getTokenFromCache(symbolName, path)
Expand All @@ -437,13 +581,6 @@ proc getSymbolBody(symbolName: string, path: string): string =
let file = gdscriptFiles[path]
return token.getBody(file.source)

type SymbolQuery = object
name: string
isDefinition: bool
isBody: bool
isClass: bool
childName: string

proc parseSymbolQuery(query: string): SymbolQuery =
## Turns a symbol query string like ClassName.body or ClassName.function.definition
## into a SymbolQuery object for easier processing.
Expand Down Expand Up @@ -500,7 +637,6 @@ proc getCode*(symbolQuery: string, filePath: string): string =
result = getSymbolBody(query.name, filePath)
else:
result = getSymbolText(query.name, filePath)
result = result.strip(trailing = true)

proc runPerformanceTest() =
let codeTest =
Expand Down Expand Up @@ -647,6 +783,45 @@ class StateMachine extends Node:
classToken.children[2].tokenType == TokenType.Variable
classToken.children[3].tokenType == TokenType.Function

test "Parse larger inner class with anchors":
let code =
"""
#ANCHOR:class_StateDie
class StateDie extends State:

const SmokeExplosionScene = preload("res://assets/vfx/smoke_vfx/smoke_explosion.tscn")

func _init(init_mob: Mob3D) -> void:
super("Die", init_mob)

func enter() -> void:
mob.skin.play("die")

var smoke_explosion := SmokeExplosionScene.instantiate()
mob.add_sibling(smoke_explosion)
smoke_explosion.global_position = mob.global_position

mob.skin.animation_finished.connect(func (_animation_name: String) -> void:
mob.queue_free()
)
#END:class_StateDie
"""
let (anchors, processedSource) = preprocessAnchors(code)
let tokens = parseGDScript(processedSource)
check:
tokens.len == 1
if tokens.len == 1:
let classToken = tokens[0]
check:
classToken.tokenType == TokenType.Class
classToken.getName(processedSource) == "StateDie"
classToken.children.len == 3
# Trailing anchor comments should not be included in the token
not classToken.getBody(processedSource).contains("#END")
else:
echo "Found tokens: ", tokens.len
printTokens(tokens, processedSource)

when isMainModule:
runUnitTests()
#runPerformanceTest()