From 9fd1ab90922198d123f4e30ea2bec7d26cf8adbe Mon Sep 17 00:00:00 2001 From: Kaz Wesley Date: Wed, 11 Oct 2023 06:04:38 -0700 Subject: [PATCH] Parser TS bindings (#7881) Generate TS bindings and lazy deserialization for the parser types. # Important Notes - The new API is imported into `ffi.ts`, but not yet used. - I have tested the generated code in isolation, but cannot commit tests as we are not currently able to load WASM modules when running in `vitest`. --- Cargo.lock | 13 + Cargo.toml | 1 + app/gui/language/parser/src/translation.rs | 4 +- app/gui2/.prettierignore | 4 +- app/gui2/env.d.ts | 2 +- app/gui2/eslint.config.js | 3 +- app/gui2/package.json | 8 +- app/gui2/parser-codegen/codegen.ts | 413 +++++++++++++ app/gui2/parser-codegen/index.ts | 17 + app/gui2/parser-codegen/schema.ts | 35 ++ app/gui2/parser-codegen/serialization.ts | 372 ++++++++++++ app/gui2/parser-codegen/util.ts | 91 +++ app/gui2/rust-ffi/src/lib.rs | 12 +- app/gui2/src/generated/.gitkeep | 0 app/gui2/src/stores/graph.ts | 2 +- .../__tests__/lsUpdate.test.ts | 2 +- .../suggestionDatabase/documentation.ts | 2 +- .../src/stores/suggestionDatabase/entry.ts | 3 +- .../src/stores/suggestionDatabase/lsUpdate.ts | 2 +- app/gui2/src/util/ast.ts | 78 +++ app/gui2/src/util/docParser.ts | 72 +++ app/gui2/src/util/ffi.ts | 524 +---------------- app/gui2/src/util/parserSupport.ts | 255 ++++++++ app/gui2/tsconfig.app.json | 1 - app/gui2/tsconfig.node.json | 1 + app/gui2/vite.config.ts | 2 +- app/gui2/vitest.config.ts | 2 +- lib/rust/metamodel/lexpr/src/lib.rs | 6 +- lib/rust/metamodel/src/data_structures.rs | 8 + lib/rust/metamodel/src/java/from_meta.rs | 4 +- lib/rust/metamodel/src/java/implementation.rs | 2 +- lib/rust/metamodel/src/java/mod.rs | 24 +- lib/rust/metamodel/src/lib.rs | 1 + lib/rust/metamodel/src/meta/graphviz.rs | 3 +- lib/rust/metamodel/src/meta/mod.rs | 23 +- lib/rust/parser/Cargo.toml | 5 +- lib/rust/parser/debug/Cargo.toml | 1 + lib/rust/parser/debug/src/bin/binary_ast.rs | 24 + lib/rust/parser/debug/src/bin/json_ast.rs | 20 + lib/rust/parser/debug/src/lib.rs | 1 + lib/rust/parser/schema/Cargo.toml | 19 + lib/rust/parser/schema/src/lib.rs | 369 ++++++++++++ lib/rust/parser/schema/src/main.rs | 27 + lib/rust/parser/src/format.rs | 552 ++++++++++++++++++ lib/rust/parser/src/lexer.rs | 278 +++++---- lib/rust/parser/src/lib.rs | 21 +- lib/rust/parser/src/macros/built_in.rs | 19 +- lib/rust/parser/src/macros/resolver.rs | 7 +- lib/rust/parser/src/main.rs | 32 +- lib/rust/parser/src/serialization.rs | 20 +- lib/rust/parser/src/source/code.rs | 155 ++++- lib/rust/parser/src/source/span.rs | 47 +- lib/rust/parser/src/syntax/token.rs | 39 +- lib/rust/parser/src/syntax/tree.rs | 23 +- package-lock.json | 44 +- package.json | 1 + 56 files changed, 2874 insertions(+), 822 deletions(-) create mode 100644 app/gui2/parser-codegen/codegen.ts create mode 100644 app/gui2/parser-codegen/index.ts create mode 100644 app/gui2/parser-codegen/schema.ts create mode 100644 app/gui2/parser-codegen/serialization.ts create mode 100644 app/gui2/parser-codegen/util.ts create mode 100644 app/gui2/src/generated/.gitkeep create mode 100644 app/gui2/src/util/ast.ts create mode 100644 app/gui2/src/util/docParser.ts create mode 100644 app/gui2/src/util/parserSupport.ts create mode 100644 lib/rust/parser/debug/src/bin/binary_ast.rs create mode 100644 lib/rust/parser/debug/src/bin/json_ast.rs create mode 100644 lib/rust/parser/schema/Cargo.toml create mode 100644 lib/rust/parser/schema/src/lib.rs create mode 100644 
lib/rust/parser/schema/src/main.rs create mode 100644 lib/rust/parser/src/format.rs diff --git a/Cargo.lock b/Cargo.lock index 9f91c6369124..17f1d958945d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2365,6 +2365,7 @@ dependencies = [ "serde", "serde_json", "uuid 1.2.2", + "wasm-bindgen-test", ] [[package]] @@ -2377,6 +2378,7 @@ dependencies = [ "enso-reflect", "lexpr", "serde", + "serde_json", ] [[package]] @@ -2400,6 +2402,17 @@ dependencies = [ "jni", ] +[[package]] +name = "enso-parser-schema" +version = "0.1.0" +dependencies = [ + "enso-metamodel", + "enso-parser", + "enso-reflect", + "serde", + "serde_json", +] + [[package]] name = "enso-parser-syntax-tree-visitor" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 1f673f5807e2..3acba36dfbc5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "lib/rust/parser/src/syntax/tree/visitor", "lib/rust/parser/jni", "lib/rust/parser/generate-java", + "lib/rust/parser/schema", "lib/rust/parser/debug", "lib/rust/ensogl/pack", "lib/rust/profiler/data", diff --git a/app/gui/language/parser/src/translation.rs b/app/gui/language/parser/src/translation.rs index 0a891640bb4b..78e1eb77d17c 100644 --- a/app/gui/language/parser/src/translation.rs +++ b/app/gui/language/parser/src/translation.rs @@ -81,7 +81,7 @@ impl Translate { let space = span.left_offset.code.repr.len(); self.space_after.insert(self.offset, space); self.offset += space; - span.left_offset.visible.width_in_spaces + usize::try_from(span.left_offset.visible.width_in_spaces).unwrap() } /// This must be called at the beginning of each [`Token`], as they are processed in depth-first @@ -93,7 +93,7 @@ impl Translate { /// This must be called at the beginning of each [`Token`], as they are processed in depth-first /// order. It updates the internal counter for the token's bytes, and returns its contents. fn visit_token_ref(&mut self, token: syntax::token::Ref) -> WithInitialSpace { - let space = token.left_offset.visible.width_in_spaces; + let space = usize::try_from(token.left_offset.visible.width_in_spaces).unwrap(); let body = token.code.to_string(); self.space_after.insert(self.offset, space); self.offset += token.left_offset.code.repr.len(); diff --git a/app/gui2/.prettierignore b/app/gui2/.prettierignore index 766e0a799a36..bb6b39b75420 100644 --- a/app/gui2/.prettierignore +++ b/app/gui2/.prettierignore @@ -1,2 +1,4 @@ *.html -*.css \ No newline at end of file +*.css + +**/generated diff --git a/app/gui2/env.d.ts b/app/gui2/env.d.ts index 3ae511733df5..814a0e7e22f2 100644 --- a/app/gui2/env.d.ts +++ b/app/gui2/env.d.ts @@ -1,7 +1,7 @@ /// declare const PROJECT_MANAGER_URL: string -declare const RUNNING_VTEST: boolean +declare const RUNNING_VITEST: boolean // This is an augmentation to the built-in `ImportMeta` interface. // This file MUST NOT contain any top-level imports. 
diff --git a/app/gui2/eslint.config.js b/app/gui2/eslint.config.js index a282d3d43b2c..220caf4e65b4 100644 --- a/app/gui2/eslint.config.js +++ b/app/gui2/eslint.config.js @@ -9,7 +9,7 @@ const DIR_NAME = path.dirname(url.fileURLToPath(import.meta.url)) const conf = [ { - ignores: ['rust-ffi/pkg', 'dist'], + ignores: ['rust-ffi/pkg', 'dist', 'src/generated'], }, ...compat.extends('plugin:vue/vue3-recommended'), eslintJs.configs.recommended, @@ -26,6 +26,7 @@ const conf = [ './tsconfig.server.json', './tsconfig.app.vitest.json', './tsconfig.server.vitest.json', + './parser-codegen/tsconfig.json', ], }, }, diff --git a/app/gui2/package.json b/app/gui2/package.json index c716657686b9..86d3cb5f5d32 100644 --- a/app/gui2/package.json +++ b/app/gui2/package.json @@ -19,8 +19,10 @@ "typecheck": "vue-tsc --noEmit -p tsconfig.app.json --composite false", "lint": "eslint .", "format": "prettier --write src/ && eslint . --fix", - "build-rust-ffi": "cd rust-ffi && wasm-pack build --release --target web", - "preinstall": "npm run build-rust-ffi" + "build-rust-ffi": "wasm-pack build ./rust-ffi --release --target web", + "generate-ast-schema": "cargo run -p enso-parser-schema > src/generated/ast-schema.json", + "generate-ast-types": "tsx ./parser-codegen/index.ts src/generated/ast-schema.json src/generated/ast.ts", + "preinstall": "npm run build-rust-ffi && npm run generate-ast-schema && npm run generate-ast-types" }, "dependencies": { "@babel/parser": "^7.22.16", @@ -75,6 +77,7 @@ "@vue/tsconfig": "^0.4.0", "ag-grid-community": "^30.1.0", "ag-grid-enterprise": "^30.1.0", + "change-case": "^5.0.2", "d3": "^7.4.0", "esbuild": "^0.19.3", "eslint": "^8.49.0", @@ -86,6 +89,7 @@ "shuffle-seed": "^1.1.6", "sql-formatter": "^13.0.0", "tailwindcss": "^3.2.7", + "tsx": "^3.12.6", "typescript": "~5.2.2", "vite": "^4.4.9", "vite-plugin-inspect": "^0.7.38", diff --git a/app/gui2/parser-codegen/codegen.ts b/app/gui2/parser-codegen/codegen.ts new file mode 100644 index 000000000000..a60f7dac04e3 --- /dev/null +++ b/app/gui2/parser-codegen/codegen.ts @@ -0,0 +1,413 @@ +/** + * Generates TypeScript bindings from a schema describing types and their serialization. + * + * Internally, the generated types deserialize their data on demand. This benefits performance: If we eagerly + * deserialized a serialized tree to a tree of objects in memory, creating the tree would produce many heap-allocated + * objects, and visiting the tree would require dereferencing chains of heap pointers. Deserializing while traversing + * allows the optimizer to stack-allocate the temporary objects, saving time and reducing GC pressure. 
+ */ + +import ts from 'typescript' +import * as Schema from './schema.js' +import { + Type, + abstractTypeDeserializer, + fieldDeserializer, + fieldDynValue, + seekCursor, + support, + supportImports, +} from './serialization.js' +import { + assignmentStatement, + forwardToSuper, + mapIdent, + modifiers, + namespacedName, + toCamel, + toPascal, +} from './util.js' +const tsf = ts.factory + +// === Public API === + +export function implement(schema: Schema.Schema): string { + const file = ts.createSourceFile('source.ts', '', ts.ScriptTarget.ESNext, false, ts.ScriptKind.TS) + const printer = ts.createPrinter({ newLine: ts.NewLineKind.LineFeed }) + let output = '// *** THIS FILE GENERATED BY `parser-codegen` ***\n' + + function emit(data: ts.Node) { + output += printer.printNode(ts.EmitHint.Unspecified, data, file) + output += '\n' + } + + emit( + tsf.createImportDeclaration( + [], + tsf.createImportClause( + false, + undefined, + tsf.createNamedImports( + Array.from(Object.entries(supportImports), ([name, isTypeOnly]) => + tsf.createImportSpecifier(isTypeOnly, undefined, tsf.createIdentifier(name)), + ), + ), + ), + tsf.createStringLiteral('@/util/parserSupport', true), + undefined, + ), + ) + for (const id in schema.types) { + const ty = schema.types[id] + if (ty?.parent == null) { + const discriminants = schema.serialization[id]?.discriminants + if (discriminants == null) { + emit(makeConcreteType(id, schema)) + } else { + const ty = makeAbstractType(id, discriminants, schema) + emit(ty.module) + emit(ty.export) + } + } else { + // Ignore child types; they are generated when `makeAbstractType` processes the parent. + } + } + output += `export function deserializeTree(data: ArrayBuffer): Tree { + const cursor = new Cursor(data, data.byteLength - 4) + return Tree.read(cursor.readPointer()) + }` + return output +} + +// === Implementation === + +function makeType(ref: Schema.TypeRef, schema: Schema.Schema): Type { + const c = ref.class + switch (c) { + case 'type': { + const ty = schema.types[ref.id] + if (!ty) throw new Error(`Invalid type ref: ${ref.id}`) + const parent = ty.parent != null ? 
schema.types[ty.parent] : undefined + const typeName = namespacedName(ty.name, parent?.name) + const layout = schema.serialization[ref.id] + if (!layout) throw new Error(`Invalid serialization ref: ${ref.id}`) + if (layout.discriminants != null) { + return Type.Abstract(typeName) + } else { + return Type.Concrete(typeName, layout.size) + } + } + case 'primitive': { + const p = ref.type + switch (p) { + case 'bool': + return Type.Boolean + case 'u32': + return Type.UInt32 + case 'i32': + return Type.Int32 + case 'u64': + return Type.UInt64 + case 'i64': + return Type.Int64 + case 'char': + return Type.Char + case 'string': + return Type.String + default: { + const _ = p satisfies never + throw new Error(`unreachable: PrimitiveType.type='${p}'`) + } + } + } + case 'sequence': + return Type.Sequence(makeType(ref.type, schema)) + case 'option': + return Type.Option(makeType(ref.type, schema)) + case 'result': + return Type.Result(makeType(ref.type0, schema), makeType(ref.type1, schema)) + default: { + const _ = c satisfies never + throw new Error(`unreachable: TypeRef.class='${c}' in ${JSON.stringify(ref)}`) + } + } +} + +type Field = { + name: string + type: Type + offset: number +} + +function makeField( + name: string, + typeRef: Schema.TypeRef, + offset: number, + schema: Schema.Schema, +): Field { + return { + name: mapIdent(toCamel(name)), + type: makeType(typeRef, schema), + offset: offset, + } +} + +function makeGetter(field: Field): ts.GetAccessorDeclaration { + return fieldDeserializer(tsf.createIdentifier(field.name), field.type, field.offset) +} + +function makeConcreteType(id: string, schema: Schema.Schema): ts.ClassDeclaration { + const ident = tsf.createIdentifier(toPascal(schema.types[id]!.name)) + const paramIdent = tsf.createIdentifier('cursor') + const cursorParam = tsf.createParameterDeclaration( + [], + undefined, + paramIdent, + undefined, + support.Cursor, + undefined, + ) + return makeClass( + [modifiers.export], + ident, + [ + forwardToSuper(paramIdent, support.Cursor), + tsf.createMethodDeclaration( + [modifiers.static], + undefined, + 'read', + undefined, + [], + [cursorParam], + tsf.createTypeReferenceNode(ident), + tsf.createBlock([ + tsf.createReturnStatement(tsf.createNewExpression(ident, [], [paramIdent])), + ]), + ), + ], + id, + schema, + ) +} + +function makeDebugFunction(fields: Field[], typeName?: string): ts.MethodDeclaration { + const ident = tsf.createIdentifier('fields') + const fieldAssignments = fields.map((field) => + tsf.createArrayLiteralExpression([ + tsf.createStringLiteral(field.name), + fieldDynValue(field.type, field.offset), + ]), + ) + if (typeName != null) { + fieldAssignments.push( + tsf.createArrayLiteralExpression([ + tsf.createStringLiteral('type'), + tsf.createObjectLiteralExpression([ + tsf.createPropertyAssignment('type', tsf.createStringLiteral('primitive')), + tsf.createPropertyAssignment('value', tsf.createStringLiteral(typeName)), + ]), + ]), + ) + } + return tsf.createMethodDeclaration( + [], + undefined, + ident, + undefined, + [], + [], + tsf.createTypeReferenceNode(`[string, ${support.DynValue}][]`), + tsf.createBlock([ + tsf.createReturnStatement( + tsf.createArrayLiteralExpression([ + tsf.createSpreadElement( + tsf.createCallExpression( + tsf.createPropertyAccessExpression(tsf.createSuper(), ident), + undefined, + undefined, + ), + ), + ...fieldAssignments, + ]), + ), + ]), + ) +} + +function makeGetters(id: string, schema: Schema.Schema, typeName?: string): ts.ClassElement[] { + const serialization = 
schema.serialization[id] + const type = schema.types[id] + if (serialization == null || type == null) throw new Error(`Invalid type id: ${id}`) + const fields = serialization.fields.map(([name, offset]: [string, number]) => { + const field = type.fields[name] + if (field == null) throw new Error(`Invalid field name '${name}' for type '${type.name}'`) + return makeField(name, field, offset, schema) + }) + return [...fields.map(makeGetter), makeDebugFunction(fields, typeName)] +} + +function makeClass( + modifiers: ts.Modifier[], + name: ts.Identifier, + members: ts.ClassElement[], + id: string, + schema: Schema.Schema, +): ts.ClassDeclaration { + return tsf.createClassDeclaration( + modifiers, + name, + undefined, + [ + tsf.createHeritageClause(ts.SyntaxKind.ExtendsKeyword, [ + tsf.createExpressionWithTypeArguments(support.LazyObject, []), + ]), + ], + [...members, ...makeGetters(id, schema)], + ) +} + +type ChildType = { + definition: ts.ClassDeclaration + reference: ts.TypeNode + enumMember: ts.EnumMember + case: ts.CaseClause +} + +function makeChildType( + base: ts.Identifier, + id: string, + discriminant: string, + schema: Schema.Schema, +): ChildType { + const ty = schema.types[id] + if (ty == null) throw new Error(`Invalid type id: ${id}`) + const name = toPascal(ty.name) + const ident = tsf.createIdentifier(name) + const cursorIdent = tsf.createIdentifier('cursor') + const cursorParam = tsf.createParameterDeclaration( + [], + undefined, + cursorIdent, + undefined, + support.Cursor, + undefined, + ) + const discriminantInt = tsf.createNumericLiteral(parseInt(discriminant, 10)) + return { + definition: tsf.createClassDeclaration( + [modifiers.export], + name, + undefined, + [ + tsf.createHeritageClause(ts.SyntaxKind.ExtendsKeyword, [ + tsf.createExpressionWithTypeArguments(base, []), + ]), + ], + [ + tsf.createPropertyDeclaration( + [modifiers.readonly], + 'type', + undefined, + tsf.createTypeReferenceNode('Type.' + name), + undefined, + ), + tsf.createConstructorDeclaration( + [], + [ + tsf.createParameterDeclaration( + [], + undefined, + cursorIdent, + undefined, + support.Cursor, + undefined, + ), + ], + tsf.createBlock([ + tsf.createExpressionStatement( + tsf.createCallExpression(tsf.createIdentifier('super'), [], [cursorIdent]), + ), + assignmentStatement( + tsf.createPropertyAccessExpression(tsf.createIdentifier('this'), 'type'), + tsf.createPropertyAccessExpression(tsf.createIdentifier('Type'), name), + ), + ]), + ), + tsf.createMethodDeclaration( + [modifiers.static], + undefined, + 'read', + undefined, + [], + [cursorParam], + tsf.createTypeReferenceNode(ident), + tsf.createBlock([ + tsf.createReturnStatement(tsf.createNewExpression(ident, [], [cursorIdent])), + ]), + ), + ...makeGetters(id, schema, name), + ], + ), + reference: tsf.createTypeReferenceNode(name), + enumMember: tsf.createEnumMember(toPascal(ty.name), discriminantInt), + case: tsf.createCaseClause(discriminantInt, [ + tsf.createReturnStatement(tsf.createNewExpression(ident, [], [seekCursor(cursorIdent, 4)])), + ]), + } +} + +type AbstractType = { + module: ts.ModuleDeclaration + export: ts.TypeAliasDeclaration +} + +function makeAbstractType( + id: string, + discriminants: Schema.DiscriminantMap, + schema: Schema.Schema, +): AbstractType { + const ty = schema.types[id]! 
+ const name = toPascal(ty.name) + const ident = tsf.createIdentifier(name) + const baseIdent = tsf.createIdentifier('AbstractBase') + const childTypes = Array.from(Object.entries(discriminants), ([discrim, id]: [string, string]) => + makeChildType(baseIdent, id, discrim, schema), + ) + const cursorIdent = tsf.createIdentifier('cursor') + const moduleDecl = tsf.createModuleDeclaration( + [modifiers.export], + ident, + tsf.createModuleBlock([ + makeClass( + [modifiers.abstract], + baseIdent, + [forwardToSuper(cursorIdent, support.Cursor, [modifiers.protected])], + id, + schema, + ), + tsf.createEnumDeclaration( + [modifiers.export, modifiers.const], + 'Type', + childTypes.map((child) => child.enumMember), + ), + ...childTypes.map((child) => child.definition), + tsf.createTypeAliasDeclaration( + [modifiers.export], + ident, + undefined, + tsf.createUnionTypeNode(childTypes.map((child) => child.reference)), + ), + abstractTypeDeserializer( + ident, + childTypes.map((child) => child.case), + ), + ]), + ) + const abstractTypeExport = tsf.createTypeAliasDeclaration( + [modifiers.export], + ident, + undefined, + tsf.createTypeReferenceNode(name + '.' + name), + ) + return { module: moduleDecl, export: abstractTypeExport } +} diff --git a/app/gui2/parser-codegen/index.ts b/app/gui2/parser-codegen/index.ts new file mode 100644 index 000000000000..ab06131db292 --- /dev/null +++ b/app/gui2/parser-codegen/index.ts @@ -0,0 +1,17 @@ +import * as fs from 'node:fs' +import * as process from 'node:process' +import * as codegen from './codegen.js' +import * as Schema from './schema.js' + +const schemaPath = process.argv[2] +const outputPath = process.argv[3] + +if (!schemaPath || !outputPath) { + console.error('Usage: parser-codegen ') + process.exit(1) +} + +console.log(`Generating ${outputPath} from ${schemaPath}.`) +const schema: Schema.Schema = JSON.parse(fs.readFileSync(schemaPath, 'utf8')) +const code = codegen.implement(schema) +fs.writeFileSync(outputPath, code) diff --git a/app/gui2/parser-codegen/schema.ts b/app/gui2/parser-codegen/schema.ts new file mode 100644 index 000000000000..85ebea74a23f --- /dev/null +++ b/app/gui2/parser-codegen/schema.ts @@ -0,0 +1,35 @@ +export type Schema = { + types: Types + serialization: Serialization +} +export type TypeId = string +export type Types = { + [id: TypeId]: Type +} +export type Type = { + name: string + fields: Fields + parent?: string +} +export type Fields = { + [name: string]: TypeRef +} +export type TypeRef = Class | Primitive | Sequence | Option | Result +export type Class = { class: 'type'; id: TypeId } +export type Primitive = { class: 'primitive'; type: PrimitiveType } +export type Sequence = { class: 'sequence'; type: TypeRef } +export type Option = { class: 'option'; type: TypeRef } +export type Result = { class: 'result'; type0: TypeRef; type1: TypeRef } +export type PrimitiveType = 'bool' | 'u32' | 'u64' | 'i32' | 'i64' | 'char' | 'string' + +export type Serialization = { + [id: TypeId]: Layout +} +export type Layout = { + discriminants?: DiscriminantMap + fields: [name: string, offset: number][] + size: number +} +export type DiscriminantMap = { + [discriminant: number]: TypeId +} diff --git a/app/gui2/parser-codegen/serialization.ts b/app/gui2/parser-codegen/serialization.ts new file mode 100644 index 000000000000..add65487dce0 --- /dev/null +++ b/app/gui2/parser-codegen/serialization.ts @@ -0,0 +1,372 @@ +/** Generates code lazily deserializing from an application-specific binary format. 
*/ + +import ts from 'typescript' +import { casesOrThrow, modifiers } from './util.js' + +const { factory: tsf } = ts + +// === Definitions === + +const noneType = tsf.createTypeReferenceNode('undefined') +const cursorFieldIdent = tsf.createIdentifier('lazyObjectData') +const POINTER_SIZE: number = 4 +// Symbols exported by the `parserSupport` module. +export const supportImports = { + LazyObject: false, + Cursor: false, + Result: true, + DynValue: true, + Dyn: false, +} as const +export const support = { + LazyObject: tsf.createIdentifier('LazyObject'), + Cursor: tsf.createTypeReferenceNode(tsf.createIdentifier('Cursor')), + Result: (t0: ts.TypeNode, t1: ts.TypeNode) => + tsf.createTypeReferenceNode(tsf.createIdentifier('Result'), [t0, t1]), + DynValue: 'DynValue', + Dyn: tsf.createIdentifier('Dyn'), +} as const + +const cursorMethods = { + readString: primitiveReader('readString'), + readBool: primitiveReader('readBool'), + readU32: primitiveReader('readU32'), + readI32: primitiveReader('readI32'), + readU64: primitiveReader('readU64'), + readI64: primitiveReader('readI64'), + readPointer: primitiveReader('readPointer'), + readSequence: readerTransformerSized('readSequence'), + readOption: readerTransformer('readOption'), + readResult: readerTransformerTwoTyped('readResult'), +} as const +const dynBuilders = { + Primitive: dynReader('Primitive'), + Result: dynReader('Result'), + Sequence: dynReader('Sequence'), + Option: dynReader('Option'), + Object: dynReader('Object'), +} as const + +type ExpressionTransformer = (expression: ts.Expression) => ts.Expression + +// === Public API === + +export class Type { + readonly type: ts.TypeNode + readonly reader: ExpressionTransformer + readonly dynReader: ExpressionTransformer + readonly size: number + + private constructor( + type: ts.TypeNode, + reader: ExpressionTransformer, + dynReader: ExpressionTransformer, + size: number, + ) { + this.type = type + this.reader = reader + this.dynReader = dynReader + this.size = size + } + + static Abstract(name: string): Type { + const valueReader = abstractTypeReader(name) + return new Type( + tsf.createTypeReferenceNode(name), + valueReader, + dynBuilders.Object(valueReader), + POINTER_SIZE, + ) + } + + static Concrete(name: string, size: number): Type { + const valueReader = concreteTypeReader(name) + return new Type( + tsf.createTypeReferenceNode(name), + valueReader, + dynBuilders.Object(valueReader), + size, + ) + } + + static Sequence(element: Type): Type { + return new Type( + tsf.createTypeReferenceNode('Iterable', [element.type]), + cursorMethods.readSequence(element.reader, element.size), + dynBuilders.Sequence(cursorMethods.readSequence(element.dynReader, element.size)), + POINTER_SIZE, + ) + } + + static Option(element: Type): Type { + return new Type( + tsf.createUnionTypeNode([element.type, noneType]), + cursorMethods.readOption(element.reader), + dynBuilders.Option(cursorMethods.readOption(element.dynReader)), + POINTER_SIZE + 1, + ) + } + + static Result(ok: Type, err: Type): Type { + return new Type( + support.Result(ok.type, err.type), + cursorMethods.readResult(ok.reader, err.reader), + dynBuilders.Result(cursorMethods.readResult(ok.dynReader, err.dynReader)), + POINTER_SIZE, + ) + } + + static Boolean: Type = new Type( + tsf.createTypeReferenceNode('boolean'), + cursorMethods.readBool, + dynBuilders.Primitive(cursorMethods.readBool), + 1, + ) + static UInt32: Type = new Type( + tsf.createTypeReferenceNode('number'), + cursorMethods.readU32, + 
dynBuilders.Primitive(cursorMethods.readU32), + 4, + ) + static Int32: Type = new Type( + tsf.createTypeReferenceNode('number'), + cursorMethods.readI32, + dynBuilders.Primitive(cursorMethods.readI32), + 4, + ) + static UInt64: Type = new Type( + tsf.createTypeReferenceNode('bigint'), + cursorMethods.readU64, + dynBuilders.Primitive(cursorMethods.readU64), + 8, + ) + static Int64: Type = new Type( + tsf.createTypeReferenceNode('bigint'), + cursorMethods.readI64, + dynBuilders.Primitive(cursorMethods.readI64), + 8, + ) + static Char: Type = new Type( + tsf.createTypeReferenceNode('number'), + cursorMethods.readU32, + dynBuilders.Primitive(cursorMethods.readU32), + 4, + ) + static String: Type = new Type( + tsf.createTypeReferenceNode('string'), + cursorMethods.readString, + dynBuilders.Primitive(cursorMethods.readString), + POINTER_SIZE, + ) +} + +export function seekCursor(cursor: ts.Expression, offset: number): ts.Expression { + if (offset === 0) { + return cursor + } else { + return tsf.createCallExpression( + tsf.createPropertyAccessExpression(cursor, 'seek'), + [], + [tsf.createNumericLiteral(offset)], + ) + } +} + +export function abstractTypeDeserializer( + ident: ts.Identifier, + cases: ts.CaseClause[], +): ts.FunctionDeclaration { + const cursorIdent = tsf.createIdentifier('cursor') + return tsf.createFunctionDeclaration( + [modifiers.export], + undefined, + 'read', + [], + [ + tsf.createParameterDeclaration( + [], + undefined, + cursorIdent, + undefined, + support.Cursor, + undefined, + ), + ], + tsf.createTypeReferenceNode(ident), + tsf.createBlock([ + tsf.createSwitchStatement( + cursorMethods.readU32(cursorIdent), + casesOrThrow(cases, 'Unexpected discriminant while deserializing.'), + ), + ]), + ) +} + +export function fieldDeserializer( + ident: ts.Identifier, + type: Type, + offset: number, +): ts.GetAccessorDeclaration { + return tsf.createGetAccessorDeclaration( + [], + ident, + [], + type.type, + tsf.createBlock([ + tsf.createReturnStatement( + type.reader( + seekCursor( + tsf.createPropertyAccessExpression(tsf.createThis(), cursorFieldIdent), + offset, + ), + ), + ), + ]), + ) +} + +export function fieldDynValue(type: Type, offset: number): ts.Expression { + return type.dynReader( + seekCursor(tsf.createPropertyAccessExpression(tsf.createThis(), cursorFieldIdent), offset), + ) +} + +// === Implementation === + +/** Returns a function that, given an expression evaluating to a [`Cursor`], returns an expression applying a + * deserialization method with the given name to the cursor. */ +function primitiveReader(name: string): ExpressionTransformer { + return (cursor) => + tsf.createCallExpression(tsf.createPropertyAccessExpression(cursor, name), [], []) +} + +/** + * Given the name of a runtime `Cursor` method that deserializes a derived type given a function to deserialize a + * base type, return a codegen-time function that generates a *reader* for a derived type from a *reader* for the base + * type, where a *reader* is a function producing a deserialization expression from an expression that evaluates to a + * `Cursor`. + * + * For example, if we have a reader produced by `primitiveReader('readU32')`, we can use it to create an expression + * representing the deserialization of a number from an expression that will evaluate to a location in the input. If we + * create a `readerTransformer('readOption')`, we can apply it to the number reader to yield an optional-number reader. 
+ */ +function readerTransformer( + name: string, +): (readElement: ExpressionTransformer) => ExpressionTransformer { + const innerParameter = tsf.createIdentifier('element') + return (readElement: ExpressionTransformer) => (cursor: ts.Expression) => { + return tsf.createCallExpression( + tsf.createPropertyAccessExpression(cursor, name), + [], + [ + tsf.createArrowFunction( + [], + [], + [ + tsf.createParameterDeclaration( + [], + undefined, + innerParameter, + undefined, + support.Cursor, + undefined, + ), + ], + undefined, + undefined, + readElement(innerParameter), + ), + ], + ) + } +} + +/** Similar to [`readerTransformer`], but for deserialization-transformers that produce a reader by combining two input + * readers. */ +function readerTransformerTwoTyped( + name: string, +): (readOk: ExpressionTransformer, readErr: ExpressionTransformer) => ExpressionTransformer { + function makeArrow(reader: ExpressionTransformer, data: ts.Identifier) { + return tsf.createArrowFunction( + [], + [], + [tsf.createParameterDeclaration([], undefined, data, undefined, support.Cursor, undefined)], + undefined, + undefined, + reader(data), + ) + } + + const okData = tsf.createIdentifier('okData') + const errData = tsf.createIdentifier('errData') + return (readOk: ExpressionTransformer, readErr: ExpressionTransformer) => + (cursor: ts.Expression) => { + return tsf.createCallExpression( + tsf.createPropertyAccessExpression(cursor, name), + [], + [makeArrow(readOk, okData), makeArrow(readErr, errData)], + ) + } +} + +/** Similar to [`readerTransformer`], but for deserialization-transformers are parameterized by the size of their + * element. */ +function readerTransformerSized( + name: string, +): (readElement: ExpressionTransformer, size: number) => ExpressionTransformer { + const innerParameter = tsf.createIdentifier('element') + return (readElement: ExpressionTransformer, size: number) => (cursor: ts.Expression) => { + return tsf.createCallExpression( + tsf.createPropertyAccessExpression(cursor, name), + [], + [ + tsf.createArrowFunction( + [], + [], + [ + tsf.createParameterDeclaration( + [], + undefined, + innerParameter, + undefined, + support.Cursor, + undefined, + ), + ], + undefined, + undefined, + readElement(innerParameter), + ), + tsf.createNumericLiteral(size), + ], + ) + } +} + +function dynReader(name: string): (readValue: ExpressionTransformer) => ExpressionTransformer { + return (readValue: ExpressionTransformer) => (cursor: ts.Expression) => { + return tsf.createCallExpression( + tsf.createPropertyAccessExpression(support.Dyn, name), + [], + [readValue(cursor)], + ) + } +} + +function abstractTypeReader(name: string): ExpressionTransformer { + return (cursor: ts.Expression) => + tsf.createCallExpression( + tsf.createPropertyAccessExpression(tsf.createIdentifier(name), 'read'), + [], + [cursorMethods.readPointer(cursor)], + ) +} + +function concreteTypeReader(name: string): ExpressionTransformer { + return (cursor: ts.Expression) => + tsf.createCallExpression( + tsf.createPropertyAccessExpression(tsf.createIdentifier(name), 'read'), + [], + [cursor], + ) +} diff --git a/app/gui2/parser-codegen/util.ts b/app/gui2/parser-codegen/util.ts new file mode 100644 index 000000000000..11ea74b4db8e --- /dev/null +++ b/app/gui2/parser-codegen/util.ts @@ -0,0 +1,91 @@ +import * as changeCase from 'change-case' +import ts from 'typescript' +const tsf = ts.factory + +// === Identifier utilities === + +export function toPascal(ident: string): string { + if (ident.includes('.')) throw new Error('toPascal 
cannot be applied to a namespaced name.') + return changeCase.pascalCase(ident) +} + +export function toCamel(ident: string): string { + if (ident.includes('.')) throw new Error('toCamel cannot be applied to a namespaced name.') + return changeCase.camelCase(ident) +} + +const RENAME = new Map([ + // TS reserved words. + ['constructor', 'ident'], + ['type', 'typeNode'], + // Rename source references to reflect our usage: + // - In `Tree`s: + ['spanLeftOffsetCodeOffsetUtf16', 'whitespaceStartInCodeParsed'], + ['spanLeftOffsetCodeUtf16', 'whitespaceLengthInCodeParsed'], + ['spanCodeLengthUtf16', 'childrenLengthInCodeParsed'], + // - In `Tokens`s: + ['leftOffsetCodeOffsetUtf16', 'whitespaceStartInCodeBuffer'], + ['leftOffsetCodeUtf16', 'whitespaceLengthInCodeBuffer'], + ['codeUtf16', 'lengthInCodeBuffer'], + ['codeOffsetUtf16', 'startInCodeBuffer'], +]) + +export function mapIdent(ident: string): string { + return RENAME.get(ident) ?? ident +} + +export function namespacedName(name: string, namespace?: string): string { + if (namespace == null) { + return toPascal(name) + } else { + return toPascal(namespace) + '.' + toPascal(name) + } +} + +// === AST utilities === + +export const modifiers = { + export: tsf.createModifier(ts.SyntaxKind.ExportKeyword), + const: tsf.createModifier(ts.SyntaxKind.ConstKeyword), + readonly: tsf.createModifier(ts.SyntaxKind.ReadonlyKeyword), + abstract: tsf.createModifier(ts.SyntaxKind.AbstractKeyword), + static: tsf.createModifier(ts.SyntaxKind.StaticKeyword), + protected: tsf.createModifier(ts.SyntaxKind.ProtectedKeyword), +} as const + +export function assignmentStatement(left: ts.Expression, right: ts.Expression): ts.Statement { + return tsf.createExpressionStatement( + tsf.createBinaryExpression(left, ts.SyntaxKind.EqualsToken, right), + ) +} + +export function forwardToSuper( + ident: ts.Identifier, + type: ts.TypeNode, + modifiers?: ts.ModifierLike[], +) { + return tsf.createConstructorDeclaration( + modifiers, + [tsf.createParameterDeclaration([], undefined, ident, undefined, type, undefined)], + tsf.createBlock([ + tsf.createExpressionStatement( + tsf.createCallExpression(tsf.createIdentifier('super'), [], [ident]), + ), + ]), + ) +} + +export function casesOrThrow(cases: ts.CaseClause[], error: string): ts.CaseBlock { + return tsf.createCaseBlock([ + ...cases, + tsf.createDefaultClause([ + tsf.createThrowStatement( + tsf.createNewExpression( + tsf.createIdentifier('Error'), + [], + [tsf.createStringLiteral(error)], + ), + ), + ]), + ]) +} diff --git a/app/gui2/rust-ffi/src/lib.rs b/app/gui2/rust-ffi/src/lib.rs index 22f994116208..bf7d75703615 100644 --- a/app/gui2/rust-ffi/src/lib.rs +++ b/app/gui2/rust-ffi/src/lib.rs @@ -14,18 +14,18 @@ thread_local! 
{ pub static PARSER: Parser = Parser::new(); } -#[wasm_bindgen] -pub fn parse_to_json(code: &str) -> String { - let ast = PARSER.with(|parser| parser.run(code)); - serde_json::to_string(&ast).expect("Failed to serialize AST to JSON") -} - #[wasm_bindgen] pub fn parse_doc_to_json(docs: &str) -> String { let docs = enso_doc_parser::parse(docs); serde_json::to_string(&docs).expect("Failed to serialize Doc Sections to JSON") } +#[wasm_bindgen] +pub fn parse(code: &str) -> Vec<u8> { + let ast = PARSER.with(|parser| parser.run(code)); + enso_parser::format::serialize(&ast).expect("Failed to serialize AST to binary format") +} + #[wasm_bindgen(start)] fn main() { console_error_panic_hook::set_once(); diff --git a/app/gui2/src/generated/.gitkeep b/app/gui2/src/generated/.gitkeep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/app/gui2/src/stores/graph.ts b/app/gui2/src/stores/graph.ts index edbd675c1532..e991cfc706c6 100644 --- a/app/gui2/src/stores/graph.ts +++ b/app/gui2/src/stores/graph.ts @@ -1,6 +1,6 @@ import { assert, assertNever } from '@/util/assert' +import { Ast, parseEnso } from '@/util/ast' import { useObserveYjs } from '@/util/crdt' -import { parseEnso, type Ast } from '@/util/ffi' import type { Opt } from '@/util/opt' import { Vec2 } from '@/util/vec2' import * as map from 'lib0/map' diff --git a/app/gui2/src/stores/suggestionDatabase/__tests__/lsUpdate.test.ts b/app/gui2/src/stores/suggestionDatabase/__tests__/lsUpdate.test.ts index 89105a8ab4ba..cc1fab58ff23 100644 --- a/app/gui2/src/stores/suggestionDatabase/__tests__/lsUpdate.test.ts +++ b/app/gui2/src/stores/suggestionDatabase/__tests__/lsUpdate.test.ts @@ -1,4 +1,4 @@ -import { parseDocs } from '@/util/ffi' +import { parseDocs } from '@/util/docParser' import { tryIdentifier, tryQualifiedName } from '@/util/qualifiedName' import { unwrap } from '@/util/result' import * as lsTypes from 'shared/languageServerTypes/suggestions' diff --git a/app/gui2/src/stores/suggestionDatabase/documentation.ts b/app/gui2/src/stores/suggestionDatabase/documentation.ts index 05e98bff300e..29b2b9ad1bca 100644 --- a/app/gui2/src/stores/suggestionDatabase/documentation.ts +++ b/app/gui2/src/stores/suggestionDatabase/documentation.ts @@ -1,6 +1,6 @@ import type { Group } from '@/stores/suggestionDatabase' import { findIndexOpt } from '@/util/array' -import { parseDocs, type Doc } from '@/util/ffi' +import { parseDocs, type Doc } from '@/util/docParser' import { isSome, type Opt } from '@/util/opt' import { tryQualifiedName, type QualifiedName } from '@/util/qualifiedName' import { unwrap } from '@/util/result' diff --git a/app/gui2/src/stores/suggestionDatabase/entry.ts b/app/gui2/src/stores/suggestionDatabase/entry.ts index 51a063e597cd..d1bedeaf02b3 100644 --- a/app/gui2/src/stores/suggestionDatabase/entry.ts +++ b/app/gui2/src/stores/suggestionDatabase/entry.ts @@ -1,5 +1,5 @@ import { assert } from '@/util/assert' -import type { Doc } from '@/util/ffi' +import type { Doc } from '@/util/docParser' import { isIdentifier, isQualifiedName, @@ -13,7 +13,6 @@ import type { SuggestionEntryArgument, SuggestionEntryScope, } from 'shared/languageServerTypes/suggestions' -export type { Doc } from '@/util/ffi' export type { SuggestionEntryArgument, SuggestionEntryScope, diff --git a/app/gui2/src/stores/suggestionDatabase/lsUpdate.ts b/app/gui2/src/stores/suggestionDatabase/lsUpdate.ts index e4a6f27960ee..5925bcf927be 100644 --- a/app/gui2/src/stores/suggestionDatabase/lsUpdate.ts +++ b/app/gui2/src/stores/suggestionDatabase/lsUpdate.ts @@
-1,13 +1,13 @@ import { SuggestionDb, type Group } from '@/stores/suggestionDatabase' import { SuggestionKind, - type Doc, type SuggestionEntry, type SuggestionEntryArgument, type SuggestionEntryScope, type Typename, } from '@/stores/suggestionDatabase/entry' import { assert } from '@/util/assert' +import type { Doc } from '@/util/docParser' import { type Opt } from '@/util/opt' import { qnJoin, diff --git a/app/gui2/src/util/ast.ts b/app/gui2/src/util/ast.ts new file mode 100644 index 000000000000..c096a8bf5113 --- /dev/null +++ b/app/gui2/src/util/ast.ts @@ -0,0 +1,78 @@ +import * as Ast from '@/generated/ast' +import { debug, validateSpans } from '@/util/parserSupport' +import { parse } from './ffi' + +export { Ast } + +export function parseEnso(code: string): Ast.Tree { + const blob = parse(code) + return Ast.deserializeTree(blob.buffer) +} + +if (import.meta.vitest) { + const { test, expect } = import.meta.vitest + test('testParse', () => { + const identInput = ' foo bar\n' + const tree = parseEnso(identInput) + expect(debug(tree)).toMatchObject({ + childrenLengthInCodeParsed: 8, + whitespaceStartInCodeParsed: 0, + whitespaceLengthInCodeParsed: 1, + statements: [ + { + expression: { + arg: { + childrenLengthInCodeParsed: 3, + whitespaceStartInCodeParsed: 4, + whitespaceLengthInCodeParsed: 1, + token: { + startInCodeBuffer: 5, + lengthInCodeBuffer: 3, + whitespaceLengthInCodeBuffer: 0, + }, + type: 'Ident', + }, + func: { + childrenLengthInCodeParsed: 3, + whitespaceLengthInCodeParsed: 0, + token: { + startInCodeBuffer: 1, + lengthInCodeBuffer: 3, + whitespaceLengthInCodeBuffer: 0, + }, + type: 'Ident', + }, + childrenLengthInCodeParsed: 7, + whitespaceLengthInCodeParsed: 0, + type: 'App', + }, + newline: { + lengthInCodeBuffer: 0, + whitespaceLengthInCodeBuffer: 0, + }, + }, + { + expression: undefined, + newline: { + startInCodeBuffer: 8, + lengthInCodeBuffer: 1, + whitespaceLengthInCodeBuffer: 0, + }, + }, + ], + type: 'BodyBlock', + }) + }) + test('testCase', () => { + const input = 'Data.read\n2 + 2' + const tree = parseEnso(input) + const endPos = validateSpans(tree) + expect(endPos).toStrictEqual(input.length) + }) + test('testSpans', () => { + const input = ' foo bar\n' + const tree = parseEnso(input) + const endPos = validateSpans(tree) + expect(endPos).toStrictEqual(input.length) + }) +} diff --git a/app/gui2/src/util/docParser.ts b/app/gui2/src/util/docParser.ts new file mode 100644 index 000000000000..20641718191e --- /dev/null +++ b/app/gui2/src/util/docParser.ts @@ -0,0 +1,72 @@ +import { parse_doc_to_json } from './ffi' + +export function parseDocs(docs: string): Doc.Section[] { + const json = parse_doc_to_json(docs) + return JSON.parse(json) +} + +export namespace Doc { + export type HtmlString = string + export type Tag = + | 'Added' + | 'Advanced' + | 'Alias' + | 'Deprecated' + | 'Icon' + | 'Group' + | 'Modified' + | 'Private' + | 'Removed' + | 'TextOnly' + | 'Unstable' + | 'Upcoming' + export type Mark = 'Important' | 'Info' | 'Example' + + export interface Argument { + name: string + description: HtmlString + } + + export type Section = + | { Tag: Section.Tag } + | { Paragraph: Section.Paragraph } + | { List: Section.List } + | { Arguments: Section.Arguments } + | { Keyed: Section.Keyed } + | { Marked: Section.Marked } + + export namespace Section { + /** The documentation tag. */ + export interface Tag { + tag: Doc.Tag + body: HtmlString + } + + /** The paragraph of the text. */ + export interface Paragraph { + body: HtmlString + } + + /** A list of items. 
Each item starts with a dash (`-`). */ + export interface List { + items: HtmlString[] + } + + /** A list of items, but each item is an [`Argument`]. Starts with `Arguments:` keyword. */ + export interface Arguments { + args: Argument[] + } + + /** The section that starts with the key followed by the colon and the body. */ + export interface Keyed { + key: String + body: HtmlString + } + /** The section that starts with the mark followed by the header and the body. */ + export interface Marked { + mark: Mark + header?: string + body: HtmlString + } + } +} diff --git a/app/gui2/src/util/ffi.ts b/app/gui2/src/util/ffi.ts index 88ff18edfc0e..1bc80d6773e6 100644 --- a/app/gui2/src/util/ffi.ts +++ b/app/gui2/src/util/ffi.ts @@ -1,8 +1,6 @@ -import type { NonEmptyArray } from '@/util/array' -import type { Opt } from '@/util/opt' -import init, { parse_doc_to_json, parse_to_json } from '../../rust-ffi/pkg/rust_ffi' +import init, { parse, parse_doc_to_json } from '../../rust-ffi/pkg/rust_ffi' -if (RUNNING_VTEST) { +if (RUNNING_VITEST) { const fs = await import('node:fs/promises') const buffer = await fs.readFile('./rust-ffi/pkg/rust_ffi_bg.wasm') await init(buffer) @@ -10,519 +8,5 @@ if (RUNNING_VTEST) { await init() } -export function parseEnso(code: string): Ast.Tree { - const json = parse_to_json(code) - return JSON.parse(json) -} - -export namespace Ast { - export interface Tree { - span: Span - variant: Variant - } - - export type Token = - | Token.AutoScope - | Token.CloseSymbol - | Token.Digits - | Token.Ident - | Token.Newline - | Token.NumberBase - | Token.OpenSymbol - | Token.Operator - | Token.TextEnd - | Token.TextEscape - | Token.TextSection - | Token.TextStart - | Token.Wildcard - - namespace Token { - declare const Brand: unique symbol - - interface TokenBase { - left_offset: Offset - code: Code - variant: T & { [Brand]: B } - } - - export type AutoScope = TokenBase<'AutoScope'> - export type CloseSymbol = TokenBase<'CloseSymbol'> - export type Digits = TokenBase<'Digits', { base: Opt<'Binary' | 'Octal' | 'Hexadecimal'> }> - export type Ident = TokenBase< - 'Ident', - { - is_free: boolean - lift_level: number - is_type: boolean - is_operator_lexically: boolean - } - > - export type Newline = TokenBase<'Newline'> - export type NumberBase = TokenBase<'NumberBase'> - export type OpenSymbol = TokenBase<'OpenSymbol'> - export type Operator = TokenBase<'Operator'> - export type TextEnd = TokenBase<'TextEnd'> - export type TextEscape = TokenBase< - 'TextEscape', - { - /** - * Escaped character Unicode scalar value, Serialized from Rust's `char`. 
- * https://doc.rust-lang.org/std/primitive.char.html - */ - value: Opt - } - > - export type TextSection = TokenBase<'TextSection'> - export type TextStart = TokenBase<'TextStart'> - export type Wildcard = TokenBase<'Wildcard', { lift_level: number }> - } - - export interface Span { - code_length: Length - left_offset: Offset - } - - export interface Code { - repr: CowStrPtr - utf16: number - } - - export interface Length { - utf8: number - utf16: number - } - - export interface Offset { - visible: VisibleOffset - code: Code - } - - export interface CowStrPtr { - begin: number - len: number - } - - export interface VisibleOffset { - width_in_spaces: number - } - - export type Variant = - | { Invalid: Variant.Invalid } - | { BodyBlock: Variant.BodyBlock } - | { ArgumentBlockApplication: Variant.ArgumentBlockApplication } - | { OperatorBlockApplication: Variant.OperatorBlockApplication } - | { Ident: Variant.Ident } - | { Number: Variant.Number } - | { Wildcard: Variant.Wildcard } - | { AutoScope: Variant.AutoScope } - | { TextLiteral: Variant.TextLiteral } - | { App: Variant.App } - | { NamedApp: Variant.NamedApp } - | { DefaultApp: Variant.DefaultApp } - | { OprApp: Variant.OprApp } - | { UnaryOprApp: Variant.UnaryOprApp } - | { OprSectionBoundary: Variant.OprSectionBoundary } - | { TemplateFunction: Variant.TemplateFunction } - | { MultiSegmentApp: Variant.MultiSegmentApp } - | { TypeDef: Variant.TypeDef } - | { Assignment: Variant.Assignment } - | { Function: Variant.Function } - | { ForeignFunction: Variant.ForeignFunction } - | { Import: Variant.Import } - | { Export: Variant.Export } - | { Group: Variant.Group } - | { TypeSignature: Variant.TypeSignature } - | { TypeAnnotated: Variant.TypeAnnotated } - | { CaseOf: Variant.CaseOf } - | { Lambda: Variant.Lambda } - | { Array: Variant.Array } - | { Tuple: Variant.Tuple } - | { Annotated: Variant.Annotated } - | { AnnotatedBuiltin: Variant.AnnotatedBuiltin } - | { Documented: Variant.Documented } - | { ConstructorDefinition: Variant.ConstructorDefinition } - - export namespace Variant { - export interface Invalid { - error: Error - ast: Tree - } - - export interface BlockLine { - newline: Token.Newline - expression: Opt - } - - export interface BlockOperatorLine {} - - export interface BodyBlock { - statements: BlockLine[] - } - - export interface ArgumentBlockApplication { - lhs: Opt - arguments: BlockLine[] - } - - export interface OperatorBlockApplication { - lhs: Opt - expressions: BlockOperatorLine[] - excess: BlockLine[] - } - export interface Ident { - token: Token.Ident - } - - export interface FractionalDigits { - dot: Token.Operator - digits: Token.Digits - } - - export interface Number { - base: Opt - integer: Opt - fractional_digits: Opt - } - export interface Wildcard { - token: Token.Wildcard - de_bruijn_index: Opt - } - - export interface AutoScope { - token: Token.AutoScope - } - - export interface TextLiteral { - open: Opt - newline: Opt - elements: TextElement[] - close: Opt - } - - export interface App { - func: Tree - arg: Tree - } - - export interface NamedApp { - func: Tree - open: Opt - name: Token.Ident - equals: Token.Operator - arg: Tree - close: Opt - } - - export interface DefaultApp { - func: Tree - default: Token.Ident - } - - type Result = { Ok: T } | { Err: E } - - export interface OprApp { - lhs: Opt - opr: Result - rhs: Opt - } - - export interface UnaryOprApp { - opr: Token.Operator - rhs: Opt - } - - export interface OprSectionBoundary { - arguments: number - ast: Tree - } - - export interface 
TemplateFunction { - arguments: number - ast: Tree - } - - export interface MultiSegmentApp { - segments: NonEmptyArray - } - - export interface MultiSegmentAppSegment { - header: Token - body: Opt - } - - export interface ArgumentType { - operator: Token.Operator - type: Tree - } - - export interface ArgumentDefault { - equals: Token.Operator - expression: Tree - } - - /** A function argument definition. */ - export interface ArgumentDefinition { - /** Opening parenthesis (outer). */ - open: Opt - /** Opening parenthesis (inner). */ - open2: Opt - /** An optional execution-suspension unary operator (~). */ - suspension: Opt - /** The pattern being bound to an argument. */ - pattern: Tree - /** An optional type ascribed to an argument. */ - type: Opt - /** Closing parenthesis (inner). */ - close2: Opt - /** An optional default value for an argument. */ - default: Opt - /** Closing parenthesis (outer). */ - close: Opt - } - - export interface ArgumentDefinitionLine { - newline: Token.Newline - argument: Opt - } - - export interface TypeDef { - keyword: Token.Ident - name: Token.Ident - params: ArgumentDefinition[] - body: BlockLine[] - } - - export interface Assignment { - pattern: Tree - equals: Token.Operator - expr: Tree - } - - export interface Function { - name: Tree - args: ArgumentDefinition[] - equals: Token.Operator - body: Opt - } - - export interface ForeignFunction { - foreign: Token.Ident - language: Token.Ident - name: Token.Ident - args: ArgumentDefinition[] - equals: Token.Operator - body: Tree - } - - export interface Import { - polyglot: Opt - from: Opt - import: MultiSegmentAppSegment - all: Opt - as_: Opt - hiding: Opt - } - - export interface Export { - from: Opt - export: MultiSegmentAppSegment - all: Opt - as_: Opt - hiding: Opt - } - - export interface Group { - open: Opt - body: Opt - close: Opt - } - - export interface TypeSignature { - variable: Tree - operator: Token.Operator - type_: Tree - } - - export interface TypeAnnotated { - expression: Tree - operator: Token.Operator - type_: Tree - } - - export interface CaseLine { - newline: Opt - case: Opt - } - - export interface Case { - documentation: Opt - pattern: Opt - arrow: Opt - expression: Opt - } - - export interface CaseOf { - case: Token.Ident - expression: Opt - of: Token.Ident - cases: CaseLine[] - } - - export interface Lambda { - operator: Token.Operator - arrow: Opt - } - - export interface OperatorDelimitedTree { - operator: Token.Operator - body: Opt - } - - export interface Array { - left: Token.OpenSymbol - first: Opt - rest: OperatorDelimitedTree[] - right: Token.CloseSymbol - } - - export interface Tuple { - left: Token.OpenSymbol - first: Opt - rest: OperatorDelimitedTree[] - right: Token.CloseSymbol - } - - export interface Annotated { - token: Token.Operator - annotation: Token.Ident - argument: Opt - newlines: Token.Newline[] - expression: Opt - } - - export interface AnnotatedBuiltin { - token: Token.Operator - annotation: Token.Ident - newlines: Token.Newline[] - expression: Opt - } - - export interface Documented { - documentation: DocComment - expression: Opt - } - - export interface ConstructorDefinition { - constructor: Token.Ident - arguments: ArgumentDefinition[] - block: ArgumentDefinitionLine[] - } - } - - export interface DocComment { - open: Token.TextStart - elements: TextElement[] - newlines: Token.Newline[] - } - - export type TextElement = - | { Section: TextElement.Section } - | { Escape: TextElement.Escape } - | { Newline: TextElement.Newline } - | { Splice: 
TextElement.Splice } - - export namespace TextElement { - export interface Section { - text: Token.TextSection - } - - export interface Escape { - token: Token.TextEscape - } - - export interface Newline { - newline: Token.Newline - } - - export interface Splice { - open: Token.OpenSymbol - expression: Opt - close: Token.CloseSymbol - } - } - - export interface Error { - message: string - } -} - -export function parseDocs(docs: string): Doc.Section[] { - const json = parse_doc_to_json(docs) - return JSON.parse(json) -} - -export namespace Doc { - export type HtmlString = string - export type Tag = - | 'Added' - | 'Advanced' - | 'Alias' - | 'Deprecated' - | 'Icon' - | 'Group' - | 'Modified' - | 'Private' - | 'Removed' - | 'TextOnly' - | 'Unstable' - | 'Upcoming' - export type Mark = 'Important' | 'Info' | 'Example' - - export interface Argument { - name: string - description: HtmlString - } - - export type Section = - | { Tag: Section.Tag } - | { Paragraph: Section.Paragraph } - | { List: Section.List } - | { Arguments: Section.Arguments } - | { Keyed: Section.Keyed } - | { Marked: Section.Marked } - - export namespace Section { - /** The documentation tag. */ - export interface Tag { - tag: Doc.Tag - body: HtmlString - } - - /** The paragraph of the text. */ - export interface Paragraph { - body: HtmlString - } - - /** A list of items. Each item starts with a dash (`-`). */ - export interface List { - items: HtmlString[] - } - - /** A list of items, but each item is an [`Argument`]. Starts with `Arguments:` keyword. */ - export interface Arguments { - args: Argument[] - } - - /** The section that starts with the key followed by the colon and the body. */ - export interface Keyed { - key: String - body: HtmlString - } - /** The section that starts with the mark followed by the header and the body. */ - export interface Marked { - mark: Mark - header?: string - body: HtmlString - } - } -} +// eslint-disable-next-line camelcase +export { parse, parse_doc_to_json } diff --git a/app/gui2/src/util/parserSupport.ts b/app/gui2/src/util/parserSupport.ts new file mode 100644 index 000000000000..72f47b5305ad --- /dev/null +++ b/app/gui2/src/util/parserSupport.ts @@ -0,0 +1,255 @@ +/** This file supports the module in `../generated/ast.ts` that is produced by `parser-codegen`. */ + +export { type Result } from '@/util/result' +import { Err, Error, Ok, type Result } from '@/util/result' + +export type Primitive = { + type: 'primitive' + value: boolean | number | bigint | string +} +export type DynValue = Primitive | DynSequence | DynResult | DynOption | DynObject +export type DynResult = { + type: 'result' + value: Result +} +export type DynSequence = { + type: 'sequence' + value: Iterable +} +export type DynOption = { + type: 'option' + value: DynValue | undefined +} +export type DynObject = { + type: 'object' + getFields: () => [string, DynValue][] +} +export const Dyn = { + Primitive: (value: boolean | number | bigint | string): DynValue => ({ + type: 'primitive', + value: value, + }), + Result: (value: Result): DynValue => ({ type: 'result', value: value }), + Sequence: (value: Iterable): DynValue => ({ type: 'sequence', value: value }), + Option: (value: DynValue | undefined): DynValue => ({ type: 'option', value: value }), + Object: (value: LazyObject): DynValue => ({ + type: 'object', + getFields: value.fields.bind(value), + }), +} as const + +/** Base class for objects that lazily deserialize fields when accessed. 
*/ +export abstract class LazyObject { + protected readonly lazyObjectData: Cursor + + protected constructor(data: Cursor) { + this.lazyObjectData = data + } + + fields(): [string, DynValue][] { + return [] + } +} + +export const builtin = { + Array: Array, +} as const + +export class Cursor { + private readonly blob: DataView + + constructor(buffer: ArrayBuffer, address: number) { + this.blob = new DataView(buffer, address) + } + + *readSequence<T>(readElement: (cursor: Cursor) => T, elementSize: number): Iterable<T> { + const data = this.readPointer() + let count = data.readU32() + let offset = 4 + while (count > 0) { + yield readElement(data.seek(offset)) + count-- + offset += elementSize + } + } + + readOption<T>(readElement: (cursor: Cursor) => T): T | undefined { + const discriminant = this.readU8() + switch (discriminant) { + case 0: + return undefined + case 1: + return readElement(this.seek(1).readPointer()) + default: + throw new Error(`Invalid Option discriminant: 0x${discriminant.toString(16)}.`) + } + } + + readResult<Ok, Err>( + readOk: (cursor: Cursor) => Ok, + readErr: (cursor: Cursor) => Err, + ): Result<Ok, Err> { + const data = this.readPointer() + const discriminant = data.readU32() + switch (discriminant) { + case 0: + return Ok(readOk(data.seek(4))) + case 1: + return Err(readErr(data.seek(4))) + default: + throw new Error(`Invalid Result discriminant: 0x${discriminant.toString(16)}.`) + } + } + + readPointer(): Cursor { + const pointee = this.readU32() + return new Cursor(this.blob.buffer, pointee) + } + + readU8(): number { + return this.blob.getUint8(0) + } + + readU32(): number { + return this.blob.getUint32(0, true) + } + + readI32(): number { + return this.blob.getInt32(0, true) + } + + readU64(): bigint { + return this.blob.getBigUint64(0, true) + } + + readI64(): bigint { + return this.blob.getBigInt64(0, true) + } + + readBool(): boolean { + const value = this.readU8() + switch (value) { + case 0: + return false + case 1: + return true + default: + throw new Error( + `Invalid boolean: 0x${value.toString(16)} @ 0x${this.blob.byteOffset.toString(16)}.`, + ) + } + } + + readString(): string { + const data = this.readPointer() + const len = data.readU32() + const bytes = data.blob.buffer.slice(data.blob.byteOffset + 4, data.blob.byteOffset + 4 + len) + return new TextDecoder().decode(bytes) + } + + seek(offset: number): Cursor { + return new Cursor(this.blob.buffer, this.blob.byteOffset + offset) + } + + address(): number { + return this.blob.byteOffset + } +} + +export function debug(obj: LazyObject): any { + return debug_(Dyn.Object(obj)) +} + +function debug_(value: DynValue): any { + switch (value.type) { + case 'sequence': + return Array.from(value.value, debug_) + case 'result': + if (value.value.ok) return Ok(debug_(value.value.value)) + else return Err(debug_(value.value.error.payload)) + case 'option': + if (value.value != null) return debug_(value.value) + else return undefined + case 'object': { + // FIXME: Include the `hide` reflect property in the schema, and apply it during code generation to avoid magic + // strings here.
+ const hide = [ + 'codeReprBegin', + 'codeReprLen', + 'leftOffsetCodeReprBegin', + 'leftOffsetCodeReprLen', + 'leftOffsetVisible', + 'spanLeftOffsetCodeReprBegin', + 'spanLeftOffsetCodeReprLen', + 'spanLeftOffsetVisible', + ] + return Object.fromEntries( + value + .getFields() + .filter(([name, _]) => !hide.includes(name)) + .map(([name, value]) => [name, debug_(value)]), + ) + } + case 'primitive': + return value.value + } +} + +export function validateSpans(obj: LazyObject, initialPos?: number): number { + const state = { pos: initialPos ?? 0 } + validateSpans_(Dyn.Object(obj), state) + return state.pos +} + +function validateSpans_(value: DynValue, state: { pos: number }) { + switch (value.type) { + case 'sequence': + for (const elem of value.value) validateSpans_(elem, state) + break + case 'result': + if (value.value.ok) validateSpans_(value.value.value, state) + else validateSpans_(value.value.error.payload, state) + break + case 'option': + if (value.value != null) validateSpans_(value.value, state) + break + case 'object': + return validateObjectSpans(value, state) + case 'primitive': + break + } +} + +function validateObjectSpans(value: DynObject, state: { pos: number }) { + const fields = new Map(value.getFields()) + const whitespaceStart = + fields.get('whitespaceStartInCodeParsed') ?? fields.get('whitespaceStartInCodeBuffer') + const whitespaceLength = + fields.get('whitespaceLengthInCodeParsed') ?? fields.get('whitespaceLengthInCodeBuffer') + const codeStart = fields.get('startInCodeBuffer') + const codeLength = fields.get('lengthInCodeBuffer') + const childrenCodeLength = fields.get('childrenLengthInCodeParsed') + if ( + !( + whitespaceLength?.type === 'primitive' && + whitespaceLength.value === 0 && + codeLength?.type === 'primitive' && + codeLength?.value === 0 + ) + ) { + if (whitespaceStart?.type === 'primitive' && whitespaceStart.value !== state.pos) + throw new Error(`Span error (whitespace) in: ${JSON.stringify(debug_(value))}.`) + if (whitespaceLength?.type === 'primitive') state.pos += whitespaceLength.value as number + if (codeStart?.type === 'primitive' && codeStart.value !== state.pos) + throw new Error('Span error (code).') + if (codeLength?.type === 'primitive') state.pos += codeLength.value as number + } + let endPos: number | undefined + if (childrenCodeLength?.type === 'primitive') + endPos = state.pos + (childrenCodeLength.value as number) + for (const entry of fields) { + const [_name, value] = entry + validateSpans_(value, state) + } + if (endPos != null && state.pos !== endPos) throw new Error('Span error (children).') +} diff --git a/app/gui2/tsconfig.app.json b/app/gui2/tsconfig.app.json index a2ad1558f021..51b4efe3bfd1 100644 --- a/app/gui2/tsconfig.app.json +++ b/app/gui2/tsconfig.app.json @@ -16,7 +16,6 @@ "composite": true, "outDir": "../../node_modules/.cache/tsc", "baseUrl": ".", - "noEmit": true, "allowImportingTsExtensions": true, "noUncheckedIndexedAccess": true, "exactOptionalPropertyTypes": true, diff --git a/app/gui2/tsconfig.node.json b/app/gui2/tsconfig.node.json index cc1982ee8cbd..c333414101e9 100644 --- a/app/gui2/tsconfig.node.json +++ b/app/gui2/tsconfig.node.json @@ -6,6 +6,7 @@ "playwright.config.*", "eslint.config.js", "e2e/**/*", + "parser-codegen/**/*", "node.env.d.ts" ], "compilerOptions": { diff --git a/app/gui2/vite.config.ts b/app/gui2/vite.config.ts index 6a3cd2c1abac..8a3814761963 100644 --- a/app/gui2/vite.config.ts +++ b/app/gui2/vite.config.ts @@ -29,7 +29,7 @@ export default defineConfig({ IS_DEV_MODE: 
JSON.stringify(process.env.NODE_ENV !== 'production'), CLOUD_ENV: process.env.ENSO_CLOUD_ENV != null ? JSON.stringify(process.env.ENSO_CLOUD_ENV) : 'undefined', - RUNNING_VTEST: false, + RUNNING_VITEST: false, 'import.meta.vitest': false, // Single hardcoded usage of `global` in by aws-amplify. 'global.TYPED_ARRAY_SUPPORT': true, diff --git a/app/gui2/vitest.config.ts b/app/gui2/vitest.config.ts index 2bf6a232ed23..c7ff56109912 100644 --- a/app/gui2/vitest.config.ts +++ b/app/gui2/vitest.config.ts @@ -12,7 +12,7 @@ export default mergeConfig( root: fileURLToPath(new URL('./', import.meta.url)), }, define: { - RUNNING_VTEST: true, + RUNNING_VITEST: true, }, }), ) diff --git a/lib/rust/metamodel/lexpr/src/lib.rs b/lib/rust/metamodel/lexpr/src/lib.rs index 0bcd9d2e7301..b2502f3e3117 100644 --- a/lib/rust/metamodel/lexpr/src/lib.rs +++ b/lib/rust/metamodel/lexpr/src/lib.rs @@ -128,7 +128,7 @@ impl<'g> ToSExpr<'g> { let mut child = None; for id in hierarchy.iter().rev() { let ty = &self.graph[id]; - let mut fields = ty.data.as_struct().unwrap(); + let mut fields = ty.data.fields().unwrap(); if let Some(i) = ty.child_field { fields = &fields[..i]; } @@ -138,14 +138,14 @@ impl<'g> ToSExpr<'g> { if !discriminants.is_empty() { let discriminant_index = read_u32(data); let id = discriminants[&(discriminant_index as usize)]; - let fields = self.graph[id].data.as_struct().unwrap(); + let fields = self.graph[id].data.fields().unwrap(); out.extend(fields.iter().filter_map(|field| self.field(field, data))); child = Some(id); } for id in hierarchy { let ty = &self.graph[id]; if let Some(i) = ty.child_field { - let mut fields = ty.data.as_struct().unwrap(); + let mut fields = ty.data.fields().unwrap(); fields = &fields[i..]; out.extend(fields.iter().filter_map(|field| self.field(field, data))); } diff --git a/lib/rust/metamodel/src/data_structures.rs b/lib/rust/metamodel/src/data_structures.rs index 655f21c1fdeb..ce814bb3e0bd 100644 --- a/lib/rust/metamodel/src/data_structures.rs +++ b/lib/rust/metamodel/src/data_structures.rs @@ -160,6 +160,14 @@ impl std::ops::IndexMut<&Key> for VecMap { &mut self[*key] } } +impl<'a, T> IntoIterator for &'a VecMap { + type Item = (Key, &'a T); + type IntoIter = impl Iterator; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} /// Types used by `VecMap`. pub mod vecmap { diff --git a/lib/rust/metamodel/src/java/from_meta.rs b/lib/rust/metamodel/src/java/from_meta.rs index d5a1114ed1c2..4e9bd864bf35 100644 --- a/lib/rust/metamodel/src/java/from_meta.rs +++ b/lib/rust/metamodel/src/java/from_meta.rs @@ -9,10 +9,10 @@ //! this stage, although [`Dynamic`] methods are used so that if any classes are modified before //! the model is rendered to syntax, the generated methods will reflect the changes. -use crate::java::*; - use crate::meta; +use crate::java::*; + // ====================== diff --git a/lib/rust/metamodel/src/java/implementation.rs b/lib/rust/metamodel/src/java/implementation.rs index fdea1f6d8b9a..0588740bc601 100644 --- a/lib/rust/metamodel/src/java/implementation.rs +++ b/lib/rust/metamodel/src/java/implementation.rs @@ -11,7 +11,7 @@ use std::fmt::Write; // === Implementing Java Datatypes === // =================================== -/// Produce Java syntax implement all the types modeled in a [`TypeGraph`]. +/// Produce Java syntax implementing all the types modeled in a [`TypeGraph`]. 
pub fn implement(graph: &TypeGraph, package: &str) -> Vec { let mut implementations = BTreeMap::new(); for (id, class) in graph.classes.iter() { diff --git a/lib/rust/metamodel/src/java/mod.rs b/lib/rust/metamodel/src/java/mod.rs index 737a0ecb6663..90c5c3138188 100644 --- a/lib/rust/metamodel/src/java/mod.rs +++ b/lib/rust/metamodel/src/java/mod.rs @@ -1,27 +1,27 @@ //! Representation of datatype definitions in the Java typesystem. +mod from_meta; +#[cfg(feature = "graphviz")] +mod graphviz; +mod implementation; + +use crate::data_structures::VecMap; +use derive_more::Index; +use derive_more::IndexMut; +use std::collections::BTreeMap; + + // ============== // === Export === // ============== pub mod bincode; - - - -mod from_meta; -#[cfg(feature = "graphviz")] -mod graphviz; -mod implementation; pub mod syntax; pub mod transform; -use crate::data_structures::VecMap; -use derive_more::Index; -use derive_more::IndexMut; pub use from_meta::from_meta; pub use implementation::implement as to_syntax; -use std::collections::BTreeMap; @@ -46,8 +46,6 @@ pub const STRING: &str = "String"; pub type FieldId = crate::data_structures::Id; /// Identifies a Java class within a `TypeGraph`. pub type ClassId = crate::data_structures::vecmap::Key; -/// Identifier for a class whose value hasn't been set yet. -pub type UnboundClassId = crate::data_structures::vecmap::UnboundKey; diff --git a/lib/rust/metamodel/src/lib.rs b/lib/rust/metamodel/src/lib.rs index 4992f7d798be..cd5411980947 100644 --- a/lib/rust/metamodel/src/lib.rs +++ b/lib/rust/metamodel/src/lib.rs @@ -45,6 +45,7 @@ // === Features === #![feature(option_get_or_insert_default)] +#![feature(type_alias_impl_trait)] // === Standard Linter Configuration === #![deny(non_ascii_idents)] #![warn(unsafe_code)] diff --git a/lib/rust/metamodel/src/meta/graphviz.rs b/lib/rust/metamodel/src/meta/graphviz.rs index 0f6a669c89e0..a2c11ad807cb 100644 --- a/lib/rust/metamodel/src/meta/graphviz.rs +++ b/lib/rust/metamodel/src/meta/graphviz.rs @@ -28,8 +28,7 @@ pub fn graph(typegraph: &TypeGraph) -> Graph { let primitive = matches!(&ty.data, Data::Primitive(_)); let label = ty.name.to_string(); graph.nodes.insert(sname.clone(), Node { primitive, node_type, label }); - let parentlike = ty.parent.iter().chain(&ty.mixins); - for id in parentlike { + if let Some(id) = ty.parent.as_ref() { let sparent = format!("{}{}", types[id].name, id); graph.edges.push((sparent.clone(), sname.clone(), EdgeType::Subtype)); } diff --git a/lib/rust/metamodel/src/meta/mod.rs b/lib/rust/metamodel/src/meta/mod.rs index ee5a471253c8..9e1db21434c6 100644 --- a/lib/rust/metamodel/src/meta/mod.rs +++ b/lib/rust/metamodel/src/meta/mod.rs @@ -48,8 +48,6 @@ pub struct Type { pub data: Data, /// The parent type, if any. pub parent: Option, - /// Types that this type inherits from that are not the parent. - pub mixins: Vec, /// If true, this type cannot be instantiated. pub abstract_: bool, /// If true, this type is not open to extension by children outside those defined with it. @@ -66,12 +64,11 @@ impl Type { /// Create a new datatype, with defaults for most fields. 
pub fn new(name: TypeName, data: Data) -> Self { let parent = Default::default(); - let mixins = Default::default(); let abstract_ = Default::default(); let closed = Default::default(); let child_field = Default::default(); let discriminants = Default::default(); - Type { name, data, parent, mixins, abstract_, closed, child_field, discriminants } + Type { name, data, parent, abstract_, closed, child_field, discriminants } } } @@ -86,7 +83,7 @@ pub enum Data { impl Data { /// If this is a [`Data::Struct`], return its fields. - pub fn as_struct(&self) -> Option<&[Field]> { + pub fn fields(&self) -> Option<&[Field]> { match self { Data::Struct(fields) => Some(&fields[..]), _ => None, @@ -271,6 +268,10 @@ impl TypeName { pub fn to_pascal_case(&self) -> String { self.0.to_pascal_case() } + /// Render in snake_case. + pub fn to_snake_case(&self) -> String { + self.0.to_snake_case() + } /// Append another `TypeName` to the end of `self`. See `Identifier::append`. pub fn append(&mut self, other: Self) { self.0.append(other.0) @@ -302,6 +303,13 @@ impl FieldName { ident => Some(ident), } } + /// Render in snake_case. + pub fn to_snake_case(&self) -> Option { + match self.0.to_snake_case() { + ident if ident.is_empty() => None, + ident => Some(ident), + } + } /// Append another `FieldName` to the end of `self`. See `Identifier::append`. pub fn append(&mut self, other: Self) { self.0.append(other.0) @@ -355,9 +363,6 @@ impl TypeGraph { if let Some(parent) = &mut ty.parent { rewrite(parent); } - for parent in &mut ty.mixins { - rewrite(parent); - } match &mut ty.data { Data::Struct(fields) => for field in fields { @@ -391,7 +396,6 @@ impl TypeGraph { name: _, data, parent, - mixins, abstract_: _, closed: _, child_field: _, @@ -404,7 +408,6 @@ impl TypeGraph { if let Some(parent) = parent { to_visit.insert(*parent); } - to_visit.extend(mixins); to_visit.extend(discriminants.values()); match data { Data::Struct(fields) => to_visit.extend(fields.iter().map(|field| field.type_)), diff --git a/lib/rust/parser/Cargo.toml b/lib/rust/parser/Cargo.toml index 815caa6c57e9..06f2abfc7cd5 100644 --- a/lib/rust/parser/Cargo.toml +++ b/lib/rust/parser/Cargo.toml @@ -21,10 +21,13 @@ serde_json = { workspace = true } uuid = { version = "1.1", features = ["serde"] } bincode = "1.3" -[dev-dependencies] +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] enso-metamodel = { path = "../metamodel", features = ["rust"] } enso-metamodel-lexpr = { path = "../metamodel/lexpr" } lexpr = "0.2.6" rand = "0.8.5" rand_chacha = "0.3.1" rand_distr = "0.4.3" + +[target.'cfg(target_arch = "wasm32")'.dev-dependencies] +wasm-bindgen-test = { workspace = true } diff --git a/lib/rust/parser/debug/Cargo.toml b/lib/rust/parser/debug/Cargo.toml index ba073126fdba..adb3b586def9 100644 --- a/lib/rust/parser/debug/Cargo.toml +++ b/lib/rust/parser/debug/Cargo.toml @@ -16,3 +16,4 @@ enso-metamodel-lexpr = { path = "../../metamodel/lexpr" } enso-reflect = { path = "../../reflect" } lexpr = "0.2.6" serde = { version = "1.0", features = ["derive"] } +serde_json = { workspace = true } diff --git a/lib/rust/parser/debug/src/bin/binary_ast.rs b/lib/rust/parser/debug/src/bin/binary_ast.rs new file mode 100644 index 000000000000..f41990209c12 --- /dev/null +++ b/lib/rust/parser/debug/src/bin/binary_ast.rs @@ -0,0 +1,24 @@ +//! Run the parser from the command line, and output the raw binary serialization of the AST for +//! debugging. 
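For example, the binary output can be captured from a Node/TypeScript script and handed to the generated deserializer. The crate and binary names below are assumptions taken from the file paths in this patch and may not match the actual package name:

```ts
import { execSync } from 'node:child_process'

// Hypothetical invocation; adjust `-p enso-parser-debug` to the real crate name of parser/debug.
const source = 'main = 1 + 2'
const bytes = execSync('cargo run -q -p enso-parser-debug --bin binary_ast', { input: source })
console.log(`${bytes.length} bytes of serialized AST`)
```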
+ +// === Features === +// === Standard Linter Configuration === +#![deny(non_ascii_idents)] +#![warn(unsafe_code)] +// === Non-Standard Linter Configuration === + +use std::io::Write; + +fn main() { + use std::io::Read; + let mut input = String::new(); + std::io::stdin().read_to_string(&mut input).unwrap(); + let mut code = input.as_str(); + if let Some((_meta, code_)) = enso_parser::metadata::parse(code) { + code = code_; + } + let ast = enso_parser::Parser::new().run(code); + let data = + enso_parser::format::serialize(&ast).expect("Failed to serialize AST to binary format"); + std::io::stdout().write_all(&data).unwrap(); +} diff --git a/lib/rust/parser/debug/src/bin/json_ast.rs b/lib/rust/parser/debug/src/bin/json_ast.rs new file mode 100644 index 000000000000..4fc7ee2bb8b4 --- /dev/null +++ b/lib/rust/parser/debug/src/bin/json_ast.rs @@ -0,0 +1,20 @@ +//! Run the parser from the command line, and output the a JSON serialization of the AST for +//! debugging. + +// === Features === +// === Standard Linter Configuration === +#![deny(non_ascii_idents)] +#![warn(unsafe_code)] +// === Non-Standard Linter Configuration === + +fn main() { + use std::io::Read; + let mut input = String::new(); + std::io::stdin().read_to_string(&mut input).unwrap(); + let mut code = input.as_str(); + if let Some((_meta, code_)) = enso_parser::metadata::parse(code) { + code = code_; + } + let ast = enso_parser::Parser::new().run(code); + serde_json::to_writer(std::io::stdout(), &ast).unwrap(); +} diff --git a/lib/rust/parser/debug/src/lib.rs b/lib/rust/parser/debug/src/lib.rs index 6914ceaf48a7..98c0ec59dd70 100644 --- a/lib/rust/parser/debug/src/lib.rs +++ b/lib/rust/parser/debug/src/lib.rs @@ -123,6 +123,7 @@ fn strip_hidden_fields(tree: Value) -> Value { ":spanLeftOffsetCodeReprBegin", ":spanLeftOffsetCodeReprLen", ":spanLeftOffsetCodeUtf16", + ":spanLeftOffsetCodeOffsetUtf16", ":spanCodeLengthUtf8", ":spanCodeLengthUtf16", ]; diff --git a/lib/rust/parser/schema/Cargo.toml b/lib/rust/parser/schema/Cargo.toml new file mode 100644 index 000000000000..8c701480873b --- /dev/null +++ b/lib/rust/parser/schema/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "enso-parser-schema" +version = "0.1.0" +authors = ["Enso Team "] +edition = "2021" +description = "Generates schema describing Enso Parser AST types." +readme = "README.md" +homepage = "https://github.com/enso-org/enso" +repository = "https://github.com/enso-org/enso" +license-file = "../../LICENSE" + +[dependencies] +enso-metamodel = { path = "../../metamodel", features = ["rust"] } +enso-parser = { path = ".." } +enso-reflect = { path = "../../reflect", features = ["graphviz"] } +serde = { version = "1", features = ["derive"] } + +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +serde_json = { workspace = true } diff --git a/lib/rust/parser/schema/src/lib.rs b/lib/rust/parser/schema/src/lib.rs new file mode 100644 index 000000000000..f8edb52d1c92 --- /dev/null +++ b/lib/rust/parser/schema/src/lib.rs @@ -0,0 +1,369 @@ +//! Supports generation of a schema describing `enso-parser`'s AST types. 
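As a rough sketch of what a TypeScript consumer (such as `parser-codegen`) sees, the JSON emitted from the types defined below has approximately this shape. The type definitions are inferred from the serde derives (`camelCase`, `tag = "class"`, lowercase variant names) and are an approximation, not an authoritative schema:

```ts
// Approximate TypeScript view of the emitted schema JSON (assumption, inferred from the serde derives).
type TypeId = string // generated ids look like "type_0", "type_1", ...
type Primitive = 'bool' | 'u32' | 'u64' | 'i32' | 'i64' | 'char' | 'string'
type TypeRef =
  | { class: 'type'; id: TypeId }
  | { class: 'primitive'; type: Primitive }
  | { class: 'sequence'; type: TypeRef }
  | { class: 'option'; type: TypeRef }
  | { class: 'result'; type0: TypeRef; type1: TypeRef }
interface Type {
  name: string // snake_case type name
  fields: Record<string, TypeRef> // keyed by snake_case field name
  parent?: TypeId
}
interface Layout {
  fields: [string, number][] // [field name, byte offset], in declaration order
  discriminants?: Record<string, TypeId> // discriminant value (stringified in JSON) -> child type
  size: number // shallow size in bytes
}
interface Schema {
  types: Record<TypeId, Type>
  serialization: Record<TypeId, Layout>
}
```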
+ +// === Standard Linter Configuration === +#![deny(non_ascii_idents)] +#![warn(unsafe_code)] +#![allow(clippy::bool_to_int_with_if)] +#![allow(clippy::let_and_return)] +// === Non-Standard Linter Configuration === +#![allow(clippy::option_map_unit_fn)] +#![allow(clippy::precedence)] +#![allow(dead_code)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unused_import_braces)] +#![warn(unused_qualifications)] + +use enso_metamodel::meta; +use enso_metamodel::meta::Data; +use enso_reflect::Reflect; +use std::collections::BTreeMap; +use std::collections::HashMap; +use std::rc::Rc; + + + +// =================== +// === Entry Point === +// =================== + +/// Return a serializable [`Schema`] describing the parser types. +pub fn schema() -> impl serde::Serialize { + let (graph, _) = enso_metamodel::rust::to_meta(enso_parser::syntax::Tree::reflect()); + let Types { types, ids } = types(&graph); + let serialization = serialization(&graph) + .filter_map(|(k, v)| ids.get(&k).map(|k| (k.clone(), v.map_ids(|k| ids[&k].clone())))) + .collect(); + Schema { types, serialization } +} + + + +// ============== +// === Schema === +// ============== + +/// Describes a set of types and their serialization. +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct Schema { + /// The type definitions. + pub types: HashMap, + /// Serialization information for the types. + pub serialization: HashMap, +} + + +// === Type graph === + +/// Describes a type. +#[derive(Debug, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Type { + /// The type's name, in snake case. + pub name: Rc, + /// The type's fields. + pub fields: HashMap, + /// The type's parent, if any. + #[serde(skip_serializing_if = "Option::is_none")] + pub parent: Option, +} + +/// Arbitrary key uniquely identifying a type within the schema. +#[derive(Debug, serde::Serialize, serde::Deserialize, Clone, Hash, PartialEq, Eq)] +pub struct TypeId(Rc); + +/// The name of a field, in snake case. +#[derive(Debug, serde::Serialize, serde::Deserialize, Clone, Hash, PartialEq, Eq)] +pub struct FieldName(Rc); + +/// A reference to a type, which may be a [`Type`] defined in the schema, a primitive, or a +/// parameterized generic type. +#[derive(Debug, serde::Serialize, serde::Deserialize, Clone, Hash, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +#[serde(tag = "class")] +pub enum TypeRef { + /// A type defined in the schema. + Type { + /// Identifies the type. + id: TypeId, + }, + /// A [`Primitive`]. + Primitive { + /// A [`Primitive`]. + r#type: Primitive, + }, + /// A sequence of values of a type. + Sequence { + /// The type of the elements of the sequence. + r#type: Box, + }, + /// An optional value of a type. + Option { + /// The type of the value, if present. + r#type: Box, + }, + /// A value that may indicate success or error. + Result { + /// The type of the value, in case of success. + r#type0: Box, + /// The type of the value, in case of error. + r#type1: Box, + }, +} + +/// The base types that all [`Type`]s defined in the schema are ultimately composed of. +#[derive(Debug, serde::Serialize, serde::Deserialize, Copy, Clone, Hash, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum Primitive { + /// A boolean value. + Bool, + /// A 32-bit unsigned integer. + U32, + /// A 64-bit unsigned integer. 
+ U64, + /// A 32-bit signed integer. + I32, + /// A 64-bit signed integer. + I64, + /// A unicode codepoint, in the range 0x0000..=0x10FFFF. + Char, + /// A UTF-8 string. + String, +} + + +// === Serialization === + +/// Describes the serialized layout of a type. +#[derive(Debug, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Layout { + /// The fields, in order. Names are references to the fields defined in the [`Type`]. + pub fields: Vec<(FieldName, usize)>, + /// Values that encode the possible child types of this type. + #[serde(skip_serializing_if = "Option::is_none")] + pub discriminants: Option>, + /// The number of bytes this type's encoding takes as a field of a containing struct, element + /// of a sequence, or parent of another type. + pub size: usize, +} + +/// Number distinguishing different possible child types. +#[derive(Debug, serde::Serialize, serde::Deserialize, Clone, PartialEq, Eq, PartialOrd, Ord, Copy)] +pub struct Discriminant(u32); + + + +// ================== +// === Type Graph === +// ================== + +struct Types { + types: HashMap, + ids: HashMap, +} + +fn types(graph: &meta::TypeGraph) -> Types { + let mut next_type_id = 0; + let mut next_type_id = || { + let result = TypeId(format!("type_{next_type_id}").into()); + next_type_id += 1; + result + }; + let mut type_refs = HashMap::new(); + let mut ids = HashMap::new(); + let mut primitives = vec![]; + // Map struct types; gather primitive types. + for (key, ty) in &graph.types { + match &ty.data { + Data::Struct(_) => { + let id = next_type_id(); + ids.insert(key, id.clone()); + type_refs.insert(key, TypeRef::Type { id }); + } + Data::Primitive(prim) => { + primitives.push((key, prim)); + } + } + } + // Convert primitive types, handling dependencies in a topological order. 
+ while !primitives.is_empty() { + primitives.retain(|(key, prim)| { + let ty = match prim { + meta::Primitive::Bool => Some(TypeRef::Primitive { r#type: Primitive::Bool }), + meta::Primitive::U32 => Some(TypeRef::Primitive { r#type: Primitive::U32 }), + meta::Primitive::U64 => Some(TypeRef::Primitive { r#type: Primitive::U64 }), + meta::Primitive::I32 => Some(TypeRef::Primitive { r#type: Primitive::I32 }), + meta::Primitive::I64 => Some(TypeRef::Primitive { r#type: Primitive::I64 }), + meta::Primitive::Char => Some(TypeRef::Primitive { r#type: Primitive::Char }), + meta::Primitive::String => Some(TypeRef::Primitive { r#type: Primitive::String }), + meta::Primitive::Sequence(id) => + type_refs.get(id).cloned().map(|ty| TypeRef::Sequence { r#type: Box::new(ty) }), + meta::Primitive::Option(id) => + type_refs.get(id).cloned().map(|ty| TypeRef::Option { r#type: Box::new(ty) }), + meta::Primitive::Result(id0, id1) => type_refs.get(id0).cloned().and_then(|ty0| { + type_refs.get(id1).cloned().map(|ty1| TypeRef::Result { + r#type0: Box::new(ty0), + r#type1: Box::new(ty1), + }) + }), + }; + if let Some(ty) = ty { + type_refs.insert(*key, ty); + false + } else { + true + } + }); + } + let types: HashMap<_, _> = graph + .types + .iter() + .filter_map(|(key, ty)| { + ty.data.fields().map(|fields| { + let key_to_id = |id| ids[id].clone(); + (key_to_id(&key), Type { + name: ty.name.to_snake_case().into(), + fields: fields + .iter() + .map(|f| { + let name = f.name.to_snake_case().expect("Tuples not supported."); + let r#type = type_refs[&f.type_].clone(); + (FieldName(name.into()), r#type) + }) + .collect(), + parent: ty.parent.as_ref().map(key_to_id), + }) + }) + }) + .collect(); + Types { types, ids } +} + + + +// ===================== +// === Serialization === +// ===================== + +fn serialization( + graph: &meta::TypeGraph, +) -> impl Iterator)> + '_ { + let sizes = solve_sizes(graph); + layouts(graph, sizes) +} + +const POINTER: usize = 4; + +/// Returns the inheritance-size of the object, if `sizes` contains all the needed information to +/// compute this type. The inheritance-size is the shallow size of all the type's fields, including +/// fields inherited from ancestor types. +fn compute_size( + graph: &meta::TypeGraph, + key: meta::TypeId, + sizes: &HashMap, +) -> Option { + use meta::Primitive; + let ty = &graph[key]; + Some(match &ty.data { + Data::Primitive(Primitive::Bool) => 1, + Data::Primitive(Primitive::U32 | Primitive::I32 | Primitive::Char) => 4, + Data::Primitive(Primitive::U64 | Primitive::I64) => 8, + Data::Primitive(Primitive::Option(_)) => 1 + POINTER, + Data::Primitive(Primitive::String | Primitive::Sequence(_) | Primitive::Result(_, _)) => + POINTER, + Data::Struct(fields) => { + let inherited_size = + if let Some(parent) = ty.parent { *sizes.get(&parent)? } else { 0 }; + let mut fields_size = 0; + for field in fields { + let ty = &graph[&field.type_]; + fields_size += if !ty.discriminants.is_empty() { + POINTER + } else { + *sizes.get(&field.type_)? + }; + } + inherited_size + fields_size + } + }) +} + +fn solve_sizes(graph: &meta::TypeGraph) -> HashMap { + let mut uncomputed: Vec<_> = graph.types.keys().collect(); + let mut sizes = HashMap::new(); + // Termination: Each step will make progress as long as there is no cycle in the type + // dependencies. Only an unconditional reference to a type creates a dependency. A cycle in + // unconditional type references would only occur if the input contained an infinite-sized type. 
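For a concrete sense of what `compute_size` and `layouts` produce, here is a worked example with invented field names: references are 4-byte addresses, an `Option` is a 1-byte tag plus a reference, and struct fields are packed in order with no padding.

```ts
// Shallow sizes, as in `compute_size` above (POINTER = 4).
const SIZE = { bool: 1, u32: 4, i32: 4, char: 4, u64: 8, i64: 8, string: 4, sequence: 4, result: 4, option: 5 }

// A hypothetical struct { visible: bool, span: u32, name: string, doc: Option<string> }:
const fields: [string, number][] = [
  ['visible', SIZE.bool], // offset 0, stored inline
  ['span', SIZE.u32], // offset 1, stored inline
  ['name', SIZE.string], // offset 5, 4-byte reference to out-of-band data
  ['doc', SIZE.option], // offset 9, 1-byte tag + 4-byte reference
]
let offset = 0
for (const [name, size] of fields) {
  console.log(`${name} @ ${offset} (${size} bytes)`)
  offset += size
}
console.log('struct size =', offset) // 14
```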
+ // + // Performance: In the worst case, this implementation requires time quadratic in the number of + // types (for inputs with deep composition graphs). However, it is simpler and more efficient + // for *reasonable* inputs than an implementation with better worst-case behavior. + while !uncomputed.is_empty() { + let uncomputed_before_step = uncomputed.len(); + uncomputed.retain(|key| match compute_size(graph, *key, &sizes) { + Some(size) => { + sizes.insert(*key, size); + false + } + None => true, + }); + assert_ne!(uncomputed.len(), uncomputed_before_step); + } + sizes +} + +/// Given the sizes of all types in the graph, compute the field offsets and return the layouts for +/// all the types. +fn layouts( + graph: &meta::TypeGraph, + sizes: HashMap, +) -> impl Iterator)> + '_ { + graph.types.iter().map(move |(key, ty)| { + (key, { + let mut offset = ty.parent.map_or(0, |key| sizes[&key]); + let fields = ty + .data + .fields() + .map(|fields| { + fields + .iter() + .map(|field| { + let entry = + (FieldName(field.name.to_snake_case().unwrap().into()), offset); + offset += if graph[&field.type_].discriminants.is_empty() { + sizes[&field.type_] + } else { + POINTER + }; + entry + }) + .collect() + }) + .unwrap_or_default(); + if ty.discriminants.is_empty() { + Layout { fields, discriminants: None, size: sizes[&key] } + } else { + let discriminants = ty + .discriminants + .iter() + .map(|(k, v)| (Discriminant((*k).try_into().unwrap()), *v)) + .collect(); + Layout { fields, discriminants: Some(discriminants), size: POINTER } + } + }) + }) +} + +impl Layout { + fn map_ids(self, f: impl Fn(Id) -> Id2) -> Layout { + let Layout { fields, discriminants, size } = self; + let discriminants = discriminants + .map(|discriminants| discriminants.into_iter().map(|(k, v)| (k, f(v))).collect()); + Layout { fields, discriminants, size } + } +} diff --git a/lib/rust/parser/schema/src/main.rs b/lib/rust/parser/schema/src/main.rs new file mode 100644 index 000000000000..c0b1502bbb9d --- /dev/null +++ b/lib/rust/parser/schema/src/main.rs @@ -0,0 +1,27 @@ +//! Generate a schema representing `enso-parser`'s AST types. This schema can be used to generate +//! AST representations and deserialization in other languages, such as TypeScript. +//! +//! The JSON schema data will be emitted to standard output. + +// === Standard Linter Configuration === +#![deny(non_ascii_idents)] +#![warn(unsafe_code)] +// === Non-Standard Linter Configuration === +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unused_import_braces)] +#![warn(unused_qualifications)] + + + +// ========================= +// === Schema Generation === +// ========================= + +fn main() { + serde_json::to_writer_pretty(std::io::stdout(), &enso_parser_schema::schema()).unwrap() +} diff --git a/lib/rust/parser/src/format.rs b/lib/rust/parser/src/format.rs new file mode 100644 index 000000000000..c3e633aadaef --- /dev/null +++ b/lib/rust/parser/src/format.rs @@ -0,0 +1,552 @@ +//! Serializer for a binary format compatible with a lazy deserialization strategy. +//! +//! # Design +//! +//! In order to support lazy deserialization, fields of each object are located at fixed offsets +//! from the object; variable-sized objects (sequences, optional values, and discriminated unions) +//! are stored out of band, with a reference in the owning object identifying the location of the +//! data. +//! +//! 
Consequently, the format produced by this module is not actually "serial". This results in a bit +//! of an impedance mismatch using the `Serializer` trait: `serde` presents each field to the +//! serializer once, but we ultimately need to write to two different places in the output for +//! each "boxed" field (the data, and the reference to it). +//! +//! The approach used here is to maintain a stack of the fields of incomplete objects as we +//! descend in to them; when an object is completed, it is moved to the heap. This requires +//! moving each boxed object once. +//! +//! Alternatives: +//! - ⏰ Deferred: Generate a proper non-serializer with `metamodel`. This would support higher +//! performance, and a recursion-free implementation that would allow arbitrarily-deep trees. +//! - ❌ Rejected: Use the `len` hints provided by `serde` to pre-allocate objects of the correct +//! size: The requirement that every field have the same size representation would be too onerous. + +use serde::ser; +use serde::ser::SerializeSeq; +use serde::Serialize; +use std::fmt::Debug; +use std::fmt::Display; +use std::fmt::Formatter; + + +// ================= +// === Constants === +// ================= + +/// Maximum allowed nesting depth of compound objects. This is empirically determined to be reached +/// before stack overflow on supported targets (see [`test::test_infinite_recursion`] and +/// [`test::wasm::test_infinite_recursion`]). +const RECURSION_LIMIT: usize = 1024; + +/// If enabled, logs debugging info to stderr. +const DEBUG: bool = false; + + + +// ================= +// === Serialize === +// ================= + +/// Generate a binary representation of the value. +pub fn serialize(value: T) -> Result> { + let mut serializer = Serializer::new(); + value.serialize(&mut serializer)?; + serializer.heap.append(&mut serializer.stack); + debug_assert_eq!(serializer.recursion_depth, 0); + debug_assert_eq!(serializer.object_depth, 0); + debug_assert_eq!(&serializer.parent_structs, &[]); + Ok(serializer.heap) +} + + + +// ================== +// === Serializer === +// ================== + +/// Converts Rust values to the portable format. +#[derive(Debug, Default)] +pub struct Serializer { + /// Complete objects, located at their final addresses. + heap: Vec, + /// All the fields of currently-incomplete objects. + stack: Vec, + recursion_depth: usize, + object_depth: usize, + parent_structs: Vec, +} + +impl Serializer { + /// Return a new [`Serializer`]. + pub fn new() -> Self { + Self::default() + } + + fn object_serializer(&mut self) -> Result { + if self.recursion_depth < RECURSION_LIMIT { + self.recursion_depth += 1; + self.object_depth += 1; + let begin = self.stack.len(); + Ok(ObjectSerializer { serializer: self, begin }) + } else { + Err(Error::RecursionLimitExceeded) + } + } + + fn build_object(&mut self, begin: usize) -> Result<()> { + use serde::ser::Serializer; + self.recursion_depth -= 1; + self.object_depth -= 1; + let address = self.heap.len(); + if DEBUG { + eprintln!("-> {address}"); + } + self.heap.extend(self.stack.drain(begin..)); + self.serialize_u32(u32::try_from(address).unwrap()) + } +} + + +// ==== Object Serializer === + +/// Serializes compound types. 
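For a concrete picture of the writer/reader contract, here is a hand-assembled `Option<u32>` in the encoding that `serialize_some`/`serialize_none` produce (a 1-byte tag at the field's offset, followed by a 4-byte little-endian address of the boxed payload), read back with the `Cursor` helpers from `parserSupport.ts`. This is an illustration only; real buffers come from the parser.

```ts
import { Cursor } from '@/util/parserSupport'

const buffer = new ArrayBuffer(16)
const view = new DataView(buffer)
view.setUint8(0, 1) // tag: Some
view.setUint32(1, 8, true) // address of the boxed payload within the same buffer
view.setUint32(8, 42, true) // the payload itself, stored out of band

const value = new Cursor(buffer, 0).readOption((cursor) => cursor.readU32())
console.log(value) // 42
```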
+#[derive(Debug)] +pub struct ObjectSerializer<'a> { + serializer: &'a mut Serializer, + begin: usize, +} + +impl<'a> ObjectSerializer<'a> { + fn finish(self) -> Result<()> { + self.serializer.build_object(self.begin) + } +} + + +// ==== Parent Struct === + +/// Information for transforming a struct into a combined parent/child representation. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +struct ParentStruct { + object_depth_inside: usize, + begin: usize, + // Useful for debugging. + _name: &'static str, +} + + + +// ========================================== +// === Serialization Trait Implementation === +// ========================================== + +impl<'a> ser::Serializer for &'a mut Serializer { + type Ok = Ok; + type Error = Error; + + type SerializeSeq = ObjectSerializer<'a>; + type SerializeTuple = Self; + type SerializeTupleStruct = Self; + type SerializeTupleVariant = ObjectSerializer<'a>; + type SerializeMap = ObjectSerializer<'a>; + type SerializeStruct = Self; + type SerializeStructVariant = ObjectSerializer<'a>; + + fn serialize_bool(self, v: bool) -> Result<()> { + self.stack.push(v as u8); + Ok(()) + } + + fn serialize_i8(self, v: i8) -> Result<()> { + self.stack.push(v as u8); + Ok(()) + } + + fn serialize_i16(self, v: i16) -> Result<()> { + self.stack.extend_from_slice(&v.to_le_bytes()); + Ok(()) + } + + fn serialize_i32(self, v: i32) -> Result<()> { + self.stack.extend_from_slice(&v.to_le_bytes()); + Ok(()) + } + + fn serialize_i64(self, v: i64) -> Result<()> { + self.stack.extend_from_slice(&v.to_le_bytes()); + Ok(()) + } + + fn serialize_u8(self, v: u8) -> Result<()> { + self.stack.push(v); + Ok(()) + } + + fn serialize_u16(self, v: u16) -> Result<()> { + self.stack.extend_from_slice(&v.to_le_bytes()); + Ok(()) + } + + fn serialize_u32(self, v: u32) -> Result<()> { + self.stack.extend_from_slice(&v.to_le_bytes()); + Ok(()) + } + + fn serialize_u64(self, v: u64) -> Result<()> { + self.stack.extend_from_slice(&v.to_le_bytes()); + Ok(()) + } + + fn serialize_f32(self, v: f32) -> Result<()> { + self.serialize_u32(v.to_bits()) + } + + fn serialize_f64(self, v: f64) -> Result<()> { + self.serialize_u64(v.to_bits()) + } + + fn serialize_char(self, v: char) -> Result<()> { + self.serialize_u32(v.into()) + } + + fn serialize_str(self, v: &str) -> Result<()> { + self.serialize_bytes(v.as_bytes()) + } + + fn serialize_bytes(self, v: &[u8]) -> Result<()> { + let ser = self.serialize_seq(Some(v.len()))?; + ser.serializer.stack.extend_from_slice(v); + ser.finish() + } + + fn serialize_none(self) -> Result<()> { + self.serialize_u8(0)?; + self.serialize_u32(0xcdcdcdcd) + } + + fn serialize_some(self, value: &T) -> Result<()> + where T: ?Sized + Serialize { + self.serialize_u8(1)?; + let object = self.object_serializer()?; + value.serialize(&mut *object.serializer)?; + object.finish() + } + + fn serialize_unit(self) -> Result<()> { + Ok(()) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result<()> { + self.serialize_unit() + } + + fn serialize_unit_variant( + self, + _name: &'static str, + variant_index: u32, + _variant: &'static str, + ) -> Result<()> { + let object = self.object_serializer()?; + variant_index.serialize(&mut *object.serializer)?; + object.finish() + } + + fn serialize_newtype_struct(self, _name: &'static str, value: &T) -> Result<()> + where T: ?Sized + Serialize { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + name: &'static str, + variant_index: u32, + variant: &'static str, + value: &T, + ) -> Result<()> + where + T: 
?Sized + Serialize, + { + if name == "Variant" + && let Some(ancestor) = self.parent_structs.last() + && ancestor.object_depth_inside == self.object_depth { + let parent_start = ancestor.begin; + let _ancestor_name = ancestor._name; + // Add the child's fields to the stack (following the parent's fields). + value.serialize(&mut *self)?; + // Build the object on the heap. + let address = self.heap.len(); + self.heap.extend_from_slice(&variant_index.to_le_bytes()); + self.heap.extend(self.stack.drain(parent_start..)); + let end_address = self.heap.len(); + if DEBUG { + eprintln!(">> {address}-{end_address} [{_ancestor_name}::{variant}]"); + } + self.serialize_u32(u32::try_from(address).unwrap())?; + } else { + let mut ser = self.object_serializer()?; + ser.serialize_element(&variant_index)?; + ser.serialize_element(value)?; + ser.finish()?; + } + Ok(()) + } + + fn serialize_seq(self, len: Option) -> Result { + let len = len.unwrap(); + let mut ser = self.object_serializer()?; + ser.serialize_element(&u32::try_from(len).unwrap())?; + Ok(ser) + } + + fn serialize_tuple(self, _len: usize) -> Result { + self.object_depth += 1; + Ok(self) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize, + ) -> Result { + self.object_depth += 1; + Ok(self) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + variant_index: u32, + _variant: &'static str, + _len: usize, + ) -> Result { + let ser = self.object_serializer()?; + variant_index.serialize(&mut *ser.serializer)?; + Ok(ser) + } + + fn serialize_map(self, len: Option) -> Result { + self.serialize_seq(len) + } + + fn serialize_struct(self, name: &'static str, _len: usize) -> Result { + self.object_depth += 1; + if matches!(name, "Tree" | "Token") { + let object_depth_inside = self.object_depth; + let begin = self.stack.len(); + self.parent_structs.push(ParentStruct { object_depth_inside, begin, _name: name }); + } + Ok(self) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + variant_index: u32, + _variant: &'static str, + _len: usize, + ) -> Result { + let ser = self.object_serializer()?; + variant_index.serialize(&mut *ser.serializer)?; + Ok(ser) + } +} + + +// === Inline Compound Type Trait Implementations === + +impl ser::SerializeStruct for &'_ mut Serializer { + type Ok = Ok; + type Error = Error; + + fn serialize_field(&mut self, _key: &'static str, value: &T) -> Result<()> + where T: ?Sized + Serialize { + value.serialize(&mut **self) + } + + fn end(self) -> Result<()> { + if let Some(ancestor) = self.parent_structs.last() { + if ancestor.object_depth_inside == self.object_depth { + self.parent_structs.pop(); + } + } + self.object_depth -= 1; + Ok(()) + } +} + +impl ser::SerializeTuple for &'_ mut Serializer { + type Ok = Ok; + type Error = Error; + + fn serialize_element(&mut self, value: &T) -> Result<()> + where T: ?Sized + Serialize { + value.serialize(&mut **self) + } + + fn end(self) -> Result<()> { + self.object_depth -= 1; + Ok(()) + } +} + +impl ser::SerializeTupleStruct for &'_ mut Serializer { + type Ok = Ok; + type Error = Error; + + fn serialize_field(&mut self, value: &T) -> Result<()> + where T: ?Sized + Serialize { + value.serialize(&mut **self) + } + + fn end(self) -> Result<()> { + self.object_depth -= 1; + Ok(()) + } +} + + +// === Boxed Compound Type Trait Implementations === + +impl ser::SerializeStructVariant for ObjectSerializer<'_> { + type Ok = Ok; + type Error = Error; + + fn serialize_field(&mut self, _key: &'static str, value: &T) -> Result<()> + where T: 
?Sized + Serialize { + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<()> { + self.finish() + } +} + +impl SerializeSeq for ObjectSerializer<'_> { + type Ok = Ok; + type Error = Error; + + fn serialize_element(&mut self, value: &T) -> Result<()> + where T: ?Sized + Serialize { + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<()> { + self.finish() + } +} + +impl ser::SerializeTupleVariant for ObjectSerializer<'_> { + type Ok = Ok; + type Error = Error; + + fn serialize_field(&mut self, value: &T) -> Result<()> + where T: ?Sized + Serialize { + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<()> { + self.finish() + } +} + +impl ser::SerializeMap for ObjectSerializer<'_> { + type Ok = Ok; + type Error = Error; + + fn serialize_key(&mut self, key: &T) -> Result<()> + where T: ?Sized + Serialize { + key.serialize(&mut *self.serializer) + } + + fn serialize_value(&mut self, value: &T) -> Result<()> + where T: ?Sized + Serialize { + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<()> { + self.finish() + } +} + + + +// ==================== +// === Result Types === +// ==================== + +type Ok = (); + +/// Describes a serialization failure. +#[derive(Debug)] +pub enum Error { + /// Indicates that the nested object depth of the input exceeded [`RECURSION_LIMIT`], and + /// serialization was aborted to prevent a stack overflow. This is not expected to occur for + /// "reasonable" syntax trees. + RecursionLimitExceeded, + /// A serialization failure described by a message. + Custom(String), +} + +impl ser::StdError for Error {} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Debug::fmt(self, f) + } +} + +impl ser::Error for Error { + fn custom(msg: T) -> Self + where T: Display { + Self::Custom(msg.to_string()) + } +} + +/// The result of a serialization attempt. +pub type Result = std::result::Result; + + +// ============= +// === Tests === +// ============= + +#[cfg(test)] +mod test { + use serde::Serialize; + + #[test] + fn test_infinite_recursion() { + use std::cell::RefCell; + use std::rc::Rc; + /// A serializable object containing a reference to itself. + #[derive(Serialize)] + struct Cyclic { + this: RefCell>>, + } + impl Cyclic { + fn new() -> Rc { + let cyclic = Rc::new(Cyclic { this: RefCell::new(None) }); + *cyclic.this.borrow_mut() = Some(Rc::clone(&cyclic)); + cyclic + } + } + // Note that if recursion is not adequately limited the expected failure mode is aborting + // due to stack overflow. We are just checking `is_err` here for good measure. + assert!(super::serialize(Cyclic::new()).is_err()); + } + + #[cfg(target_arch = "wasm32")] + mod wasm { + + use wasm_bindgen_test::wasm_bindgen_test; + use wasm_bindgen_test::wasm_bindgen_test_configure; + + wasm_bindgen_test_configure!(run_in_browser); + + #[wasm_bindgen_test] + fn test_infinite_recursion() { + super::test_infinite_recursion() + } + } +} diff --git a/lib/rust/parser/src/lexer.rs b/lib/rust/parser/src/lexer.rs index 150b1a76d80a..029e2d1e81cc 100644 --- a/lib/rust/parser/src/lexer.rs +++ b/lib/rust/parser/src/lexer.rs @@ -18,7 +18,7 @@ use std::str; /// An optimization constant. Based on it, the estimated memory is allocated on the beginning of /// parsing. -pub const AVERAGE_TOKEN_LEN: usize = 5; +const AVERAGE_TOKEN_LEN: usize = 5; /// Within an indented text block, this sets the minimum whitespace to be trimmed from the start of /// each line. 
const MIN_TEXT_TRIM: VisibleOffset = VisibleOffset(4); @@ -32,7 +32,7 @@ const MIN_TEXT_TRIM: VisibleOffset = VisibleOffset(4); /// Allows checking if the incoming char matches a predicate. The predicate can be another char /// (then this is simply check for equality), or a function `FnMut(char) -> bool`. This trait allows /// defining parsers which can work with both simple and function-based matchers. -pub trait Pattern { +trait Pattern { /// Check whether [`input`] matches this pattern. fn match_pattern(&mut self, input: char) -> bool; } @@ -81,31 +81,53 @@ pattern_impl_for_char_slice!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); pub struct Lexer<'s> { #[deref] #[deref_mut] - pub state: LexerState, - pub input: &'s str, - pub iterator: str::CharIndices<'s>, - pub output: Vec>, + state: LexerState, + input: &'s str, + iterator: str::CharIndices<'s>, + output: Vec>, /// Memory for storing tokens, reused as an optimization. - pub token_storage: VecAllocation>, + token_storage: VecAllocation>, +} + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +struct StrOffset { + utf8: Bytes, + utf16: u32, +} + +impl Sub for StrOffset { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self { utf8: self.utf8 - rhs.utf8, utf16: self.utf16 - rhs.utf16 } + } +} + +impl Add for StrOffset { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self { utf8: self.utf8 + rhs.utf8, utf16: self.utf16 + rhs.utf16 } + } } /// Internal state of the [`Lexer`]. #[derive(Debug, Default)] #[allow(missing_docs)] pub struct LexerState { - pub current_char: Option, - pub current_offset: Bytes, - pub last_spaces_offset: Bytes, - pub last_spaces_visible_offset: VisibleOffset, - pub current_block_indent: VisibleOffset, - pub block_indent_stack: Vec, - pub internal_error: Option, - pub stack: Vec, + current_char: Option, + current_offset: StrOffset, + last_spaces_offset: StrOffset, + last_spaces_visible_offset: VisibleOffset, + current_block_indent: VisibleOffset, + block_indent_stack: Vec, + internal_error: Option, + stack: Vec, } /// Suspended states. #[derive(Debug, PartialEq, Eq, Copy, Clone)] -pub enum State { +enum State { /// Reading a single-line text literal. InlineText, /// Reading a multi-line text literal. @@ -117,6 +139,8 @@ pub enum State { }, } +type Mark<'s> = (StrOffset, Offset<'s>); + impl<'s> Lexer<'s> { /// Constructor. pub fn new(input: &'s str) -> Self { @@ -139,11 +163,18 @@ impl<'s> Lexer<'s> { fn next_input_char(&mut self) -> bool { let next = self.iterator.next(); if let Some((current_offset, current_char)) = next { - self.current_offset = Bytes(current_offset); + self.current_offset = StrOffset { + utf8: Bytes(current_offset), + utf16: self.current_offset.utf16 + + self.current_char.map_or(0, |c| c.len_utf16() as u32), + }; self.current_char = Some(current_char); true - } else if self.current_char.is_some() { - self.current_offset = Bytes(self.input.len()); + } else if let Some(c) = self.current_char { + self.current_offset = StrOffset { + utf8: Bytes(self.input.len()), + utf16: self.current_offset.utf16 + c.len_utf16() as u32, + }; self.current_char = None; true } else { @@ -153,7 +184,7 @@ impl<'s> Lexer<'s> { /// Run the provided function and compute how much input it consumed. 
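The lexer now tracks every position as a `StrOffset` carrying both a UTF-8 byte offset and a UTF-16 code-unit offset, presumably so the TypeScript side, where string indices are UTF-16 code units, can map spans directly onto the source string. A quick illustration of why the two counts diverge:

```ts
// JavaScript string indices count UTF-16 code units, while the parser's byte
// offsets count UTF-8 bytes; the two diverge on any non-ASCII source.
const source = 'α😀x'
console.log(new TextEncoder().encode(source).length) // 7 UTF-8 bytes (2 + 4 + 1)
console.log(source.length) // 4 UTF-16 code units (1 + 2 + 1)
console.log(source.slice(1, 3)) // '😀' — sliced by UTF-16 units
```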
#[inline(always)] - pub fn run_and_get_offset(&mut self, f: impl FnOnce(&mut Self) -> T) -> (T, Bytes) { + fn run_and_get_offset(&mut self, f: impl FnOnce(&mut Self) -> T) -> (T, StrOffset) { let start_offset = self.current_offset; let out = f(self); let len = self.current_offset - start_offset; @@ -162,8 +193,8 @@ impl<'s> Lexer<'s> { /// Run the provided function and check if it consumed any input. #[inline(always)] - pub fn run_and_check_if_progressed(&mut self, f: impl FnOnce(&mut Self)) -> bool { - self.run_and_get_offset(f).1.is_positive() + fn run_and_check_if_progressed(&mut self, f: impl FnOnce(&mut Self)) -> bool { + self.run_and_get_offset(f).1.utf8.is_positive() } /// Consume spaces after parsing a [`Token`] and update the internal spacing info. @@ -184,41 +215,45 @@ impl<'s> Lexer<'s> { /// Run the provided function. If it consumed any chars, return the [`Token`] containing the /// provided function output. Returns [`None`] otherwise. #[inline(always)] - pub fn token(&mut self, f: impl FnOnce(&mut Self) -> T) -> Option> { + fn token(&mut self, f: impl FnOnce(&mut Self) -> T) -> Option> { let start = self.current_offset; let (elem, len) = self.run_and_get_offset(f); - len.is_positive().as_some_from(|| { + len.utf8.is_positive().as_some_from(|| { let end = start + len; - let code = self.input.slice(start..end); let left_offset_start = start - self.last_spaces_offset; - let offset_code = self.input.slice(left_offset_start..start); + let (offset_code, code) = self + .input + .slice(left_offset_start.utf8..end.utf8) + .split_at(self.last_spaces_offset.utf8.unchecked_raw()); let visible_offset = self.last_spaces_visible_offset; - let offset = Offset(visible_offset, offset_code); + let offset = Offset( + visible_offset, + Code::from_str_at_offset(offset_code, left_offset_start.utf16), + ); self.spaces_after_lexeme(); - Token(offset, code, elem) + Token(offset, Code::from_str_at_offset(code, start.utf16), elem) }) } /// A zero-length token which is placed before the last consumed spaces if they were not /// followed by any token. #[inline(always)] - pub fn marker_token(&mut self, elem: T) -> Token<'s, T> { + fn marker_token(&mut self, elem: T) -> Token<'s, T> { let visible_offset = VisibleOffset(0); let start = self.current_offset - self.last_spaces_offset; - let code = self.input.slice(start..start); - let offset = Offset(visible_offset, code); - Token(offset, code, elem) + let offset = Offset(visible_offset, Code::empty(start.utf16)); + Token(offset, Code::empty(start.utf16), elem) } /// Push the [`token`] to the result stream. #[inline(always)] - pub fn submit_token(&mut self, token: Token<'s>) { + fn submit_token(&mut self, token: Token<'s>) { self.output.push(token); } /// Start a new block. #[inline(always)] - pub fn start_block(&mut self, new_indent: VisibleOffset) { + fn start_block(&mut self, new_indent: VisibleOffset) { let current_block_indent = self.current_block_indent; self.block_indent_stack.push(current_block_indent); self.current_block_indent = new_indent; @@ -226,7 +261,7 @@ impl<'s> Lexer<'s> { /// Finish the current block. #[inline(always)] - pub fn end_block(&mut self) -> Option { + fn end_block(&mut self) -> Option { self.block_indent_stack.pop().map(|prev| { let out = self.current_block_indent; self.current_block_indent = prev; @@ -244,13 +279,13 @@ impl<'s> Lexer<'s> { impl<'s> Lexer<'s> { /// Consume the next character, unconditionally. 
#[inline(always)] - pub fn take_next(&mut self) -> bool { + fn take_next(&mut self) -> bool { self.next_input_char() } /// Consume exactly one character if it matches the pattern. Returns [`true`] if it succeeded. #[inline(always)] - pub fn take_1(&mut self, mut pat: impl Pattern) -> bool { + fn take_1(&mut self, mut pat: impl Pattern) -> bool { match self.current_char.map(|t| pat.match_pattern(t)) { Some(true) => self.next_input_char(), _ => false, @@ -259,13 +294,13 @@ impl<'s> Lexer<'s> { /// Version of [`take_1`] that discards its result. #[inline(always)] - pub fn take_1_(&mut self, pat: impl Pattern) { + fn take_1_(&mut self, pat: impl Pattern) { self.take_1(pat); } /// Consume characters as long as they match the pattern. #[inline(always)] - pub fn take_while(&mut self, mut pat: impl Pattern) { + fn take_while(&mut self, mut pat: impl Pattern) { while let Some(true) = self.current_char.map(|t| pat.match_pattern(t)) { self.next_input_char(); } @@ -274,7 +309,7 @@ impl<'s> Lexer<'s> { /// Consume characters as long as they match the pattern. Returns [`true`] if at least one /// character was consumed. #[inline(always)] - pub fn take_while_1(&mut self, f: impl Copy + Pattern) -> bool { + fn take_while_1(&mut self, f: impl Copy + Pattern) -> bool { let ok = self.take_1(f); if ok { self.take_while(f); @@ -284,7 +319,7 @@ impl<'s> Lexer<'s> { /// Version of [`take_while_1`] that discards its result. #[inline(always)] - pub fn take_while_1_(&mut self, f: impl Copy + Pattern) { + fn take_while_1_(&mut self, f: impl Copy + Pattern) { self.take_while_1(f); } } @@ -378,7 +413,7 @@ impl<'s> Lexer<'s> { /// Check whether the provided character is a newline character. #[inline(always)] -pub fn is_newline_char(t: char) -> bool { +fn is_newline_char(t: char) -> bool { t == '\n' || t == '\r' } @@ -508,9 +543,9 @@ fn is_operator_body_char(t: char) -> bool { /// Info about identifier being parsed. #[derive(Clone, Copy, Debug)] #[allow(missing_docs)] -pub struct IdentInfo { +struct IdentInfo { starts_with_underscore: bool, - lift_level: usize, + lift_level: u32, starts_with_uppercase: bool, is_default: bool, } @@ -518,9 +553,9 @@ pub struct IdentInfo { impl IdentInfo { /// Constructor. #[inline(always)] - pub fn new(repr: &str) -> Self { + fn new(repr: &str) -> Self { let starts_with_underscore = repr.starts_with('_'); - let lift_level = repr.chars().rev().take_while(|t| *t == '\'').count(); + let lift_level = repr.chars().rev().take_while(|t| *t == '\'').count() as u32; let starts_with_uppercase = repr.chars().next().map(|c| c.is_uppercase()).unwrap_or_default(); let is_default = repr == "default"; @@ -544,7 +579,7 @@ impl token::Variant { /// Convert the provided string to ident. The provided repr should contain valid identifier /// characters. This condition will not be checked. #[inline(always)] - pub fn new_ident_unchecked(repr: &str) -> token::variant::Ident { + fn new_ident_unchecked(repr: &str) -> token::variant::Ident { let info = IdentInfo::new(repr); let is_operator = false; token::variant::Ident( @@ -559,9 +594,9 @@ impl token::Variant { /// Convert the provided string to ident or wildcard. The provided repr should contain valid /// identifier characters. This condition will not be checked. 
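The rules above are compact: the lift level is simply the number of trailing apostrophes, and an identifier consisting of an underscore followed only by apostrophes lexes as a wildcard. A small TypeScript illustration mirroring the Rust logic (not part of the codebase):

```ts
// Mirror of `IdentInfo`'s lift-level rule and the wildcard test above (illustration only).
function liftLevel(repr: string): number {
  let n = 0
  for (let i = repr.length - 1; i >= 0 && repr[i] === "'"; i--) n++
  return n
}
const isWildcard = (repr: string) => repr.startsWith('_') && repr.length === 1 + liftLevel(repr)
console.log(liftLevel("x'"), isWildcard("x'")) // 1 false
console.log(liftLevel("_'"), isWildcard("_'")) // 1 true
console.log(liftLevel('_foo'), isWildcard('_foo')) // 0 false
```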
#[inline(always)] - pub fn new_ident_or_wildcard_unchecked(repr: &str) -> token::Variant { + fn new_ident_or_wildcard_unchecked(repr: &str) -> token::Variant { let info = IdentInfo::new(repr); - if info.starts_with_underscore && repr.len() == 1 + info.lift_level { + if info.starts_with_underscore && repr.len() as u32 == 1 + info.lift_level { token::Variant::wildcard(info.lift_level) } else { let is_free = info.starts_with_underscore; @@ -640,7 +675,7 @@ impl<'s> Lexer<'s> { // have different precedences; this is a special case here because the distinction // requires lookahead. "." if self.last_spaces_visible_offset.width_in_spaces == 0 - && let Some(char) = self.current_char && char.is_ascii_digit() => { + && let Some(char) = self.current_char && char.is_ascii_digit() => { let opr = token::OperatorProperties::new() .with_binary_infix_precedence(81) .as_decimal(); @@ -858,9 +893,13 @@ impl<'s> Lexer<'s> { self.token(|this| this.take_while(is_hexadecimal_digit)), }; let joiner = token::OperatorProperties::new() - .with_binary_infix_precedence(usize::MAX) + .with_binary_infix_precedence(u32::MAX) .as_token_joiner(); - self.submit_token(Token("", "", token::Variant::operator(joiner))); + self.submit_token(Token( + Code::empty_without_offset(), + Code::empty_without_offset(), + token::Variant::operator(joiner), + )); // Every number has a digits-token, even if it's zero-length. let token = token.unwrap_or_default(); self.submit_token(token.with_variant(token::Variant::digits(Some(base)))); @@ -897,7 +936,7 @@ impl<'s> Lexer<'s> { let indent = self.current_block_indent; let open_quote_start = self.mark(); self.last_spaces_visible_offset = VisibleOffset(0); - self.last_spaces_offset = Bytes(0); + self.last_spaces_offset = default(); self.take_next(); // At least two quote characters. if let Some(char) = self.current_char && char == quote_char { @@ -916,17 +955,17 @@ impl<'s> Lexer<'s> { // Exactly two quote characters: Open and shut case. let close_quote_end = self.mark(); let token = self.make_token(open_quote_start, close_quote_start.clone(), - token::Variant::text_start()); + token::Variant::text_start()); self.output.push(token); let token = self.make_token(close_quote_start, close_quote_end, - token::Variant::text_end()); + token::Variant::text_end()); self.output.push(token); } } else { // One quote followed by non-quote character: Inline quote. 
let open_quote_end = self.mark(); let token = self.make_token(open_quote_start, open_quote_end, - token::Variant::text_start()); + token::Variant::text_start()); self.output.push(token); self.inline_quote(quote_char, text_type); } @@ -935,7 +974,7 @@ impl<'s> Lexer<'s> { fn multiline_text( &mut self, - open_quote_start: (Bytes, Offset<'s>), + open_quote_start: Mark<'s>, block_indent: VisibleOffset, text_type: TextType, ) { @@ -1037,12 +1076,18 @@ impl<'s> Lexer<'s> { } if let Some(indent) = new_indent { if indent <= *block_indent { - self.output.push(Token::from(token::text_end("", ""))); + self.output.push(Token::from(token::text_end( + Code::empty_without_offset(), + Code::empty_without_offset(), + ))); self.end_blocks(indent); self.output.extend(newlines); if self.current_offset == text_start.0 { self.last_spaces_visible_offset = text_start.1.visible; - self.last_spaces_offset = text_start.1.code.len(); + self.last_spaces_offset = StrOffset { + utf8: text_start.1.code.len(), + utf16: text_start.1.code.len_utf16(), + }; } return TextEndedAt::End; } @@ -1107,17 +1152,13 @@ impl<'s> Lexer<'s> { let close_quote_end = self.mark(); self.make_token(text_end, close_quote_end, token::Variant::text_end()) } else { - Token::from(token::text_end("", "")) + Token::from(token::text_end(Code::empty_without_offset(), Code::empty_without_offset())) }; self.output.push(end_token); TextEndedAt::End } - fn text_escape( - &mut self, - backslash_start: (Bytes, Offset<'s>), - char: char, - ) -> (Bytes, Offset<'s>) { + fn text_escape(&mut self, backslash_start: Mark<'s>, char: char) -> Mark<'s> { let leader = match char { 'x' => Some((2, false)), 'u' => Some((4, true)), @@ -1180,27 +1221,25 @@ impl<'s> Lexer<'s> { } } - fn mark(&mut self) -> (Bytes, Offset<'s>) { + fn mark(&mut self) -> Mark<'s> { let start = self.current_offset; let left_offset_start = start - self.last_spaces_offset; - let offset_code = self.input.slice(left_offset_start..start); + let offset_code = self.input.slice(left_offset_start.utf8..start.utf8); let visible_offset = self.last_spaces_visible_offset; self.last_spaces_visible_offset = VisibleOffset(0); - self.last_spaces_offset = Bytes(0); - (start, Offset(visible_offset, offset_code)) + self.last_spaces_offset = default(); + ( + start, + Offset(visible_offset, Code::from_str_at_offset(offset_code, left_offset_start.utf16)), + ) } - fn make_token( - &self, - from: (Bytes, Offset<'s>), - to: (Bytes, Offset<'s>), - variant: token::Variant, - ) -> Token<'s> { + fn make_token(&self, from: Mark<'s>, to: Mark<'s>, variant: token::Variant) -> Token<'s> { let (start, offset) = from; let end = to.0; - let start = start.unchecked_raw(); - let end = end.unchecked_raw(); - Token(offset, &self.input[start..end], variant) + let start8 = start.utf8.unchecked_raw(); + let end8 = end.utf8.unchecked_raw(); + Token(offset, Code::from_str_at_offset(&self.input[start8..end8], start.utf16), variant) } } @@ -1364,11 +1403,14 @@ impl<'s> Lexer<'s> { } if self.last_spaces_visible_offset != VisibleOffset(0) { let left_offset_start = self.current_offset - self.last_spaces_offset; - let offset_code = self.input.slice(left_offset_start..self.current_offset); + let offset_code = self.input.slice(left_offset_start.utf8..self.current_offset.utf8); let visible_offset = self.last_spaces_visible_offset; - let offset = Offset(visible_offset, offset_code); + let offset = Offset( + visible_offset, + Code::from_str_at_offset(offset_code, left_offset_start.utf16), + ); let eof = 
token::variant::Variant::Newline(token::variant::Newline()); - self.submit_token(Token(offset, "", eof)); + self.submit_token(Token(offset, Code::empty(self.current_offset.utf16), eof)); } let mut internal_error = self.internal_error.take(); if self.current_char.is_some() { @@ -1398,24 +1440,35 @@ pub mod test { use super::*; pub use token::*; + fn test_code(code: &str) -> Code { + Code::from_str_without_offset(code) + } + /// Constructor. pub fn ident_<'s>(left_offset: &'s str, code: &'s str) -> Token<'s> { let is_free = code.starts_with('_'); - let lift_level = code.chars().rev().take_while(|t| *t == '\'').count(); + let lift_level = code.chars().rev().take_while(|t| *t == '\'').count() as u32; let is_uppercase = code.chars().next().map(|c| c.is_uppercase()).unwrap_or_default(); let is_operator = false; + let left_offset = test_code(left_offset); + let code = test_code(code); token::ident_(left_offset, code, is_free, lift_level, is_uppercase, is_operator, false) } /// Constructor. pub fn wildcard_<'s>(left_offset: &'s str, code: &'s str) -> Token<'s> { - let lift_level = code.chars().rev().take_while(|t| *t == '\'').count(); + let lift_level = code.chars().rev().take_while(|t| *t == '\'').count() as u32; + let left_offset = test_code(left_offset); + let code = test_code(code); token::wildcard_(left_offset, code, lift_level) } /// Constructor. pub fn operator_<'s>(left_offset: &'s str, code: &'s str) -> Token<'s> { - Token(left_offset, code, token::Variant::operator(analyze_operator(code))) + let variant = token::Variant::operator(analyze_operator(code)); + let left_offset = test_code(left_offset); + let code = test_code(code); + Token(left_offset, code, variant) } } @@ -1424,6 +1477,14 @@ mod tests { use super::test::*; use super::*; + fn empty<'a>() -> Code<'a> { + Code::empty_without_offset() + } + + fn test_code(code: &str) -> Code { + Code::from_str_without_offset(code) + } + fn test_lexer_many<'s>(inputs: Vec<(&'s str, Vec>)>) { for (input, output) in inputs { test_lexer(input, output) @@ -1431,7 +1492,10 @@ mod tests { } fn test_lexer<'s>(input: &'s str, expected: Vec>) { - assert_eq!(run(input).unwrap(), expected); + let result: Vec<_> = + run(input).unwrap().into_iter().map(|token| token.without_offsets()).collect(); + let expected: Vec<_> = expected.into_iter().map(|token| token.without_offsets()).collect(); + assert_eq!(result, expected); } fn lexer_case_idents<'s>(idents: &[&'s str]) -> Vec<(&'s str, Vec>)> { @@ -1452,43 +1516,45 @@ mod tests { #[test] fn test_case_block() { + let newline = newline_(empty(), test_code("\n")); test_lexer_many(vec![ - ("\n", vec![newline_("", "\n")]), + ("\n", vec![newline_(empty(), test_code("\n"))]), ("\n foo\n bar", vec![ - block_start_("", ""), - newline_("", "\n"), + block_start_(empty(), empty()), + newline.clone(), ident_(" ", "foo"), - newline_("", "\n"), + newline.clone(), ident_(" ", "bar"), - block_end_("", ""), + block_end_(empty(), empty()), ]), ("foo\n +", vec![ ident_("", "foo"), - block_start_("", ""), - newline_("", "\n"), + block_start_(empty(), empty()), + newline, operator_(" ", "+"), - block_end_("", ""), + block_end_(empty(), empty()), ]), ]); } #[test] fn test_case_block_bad_indents() { + let newline = newline_(empty(), test_code("\n")); #[rustfmt::skip] test_lexer_many(vec![ ("\n foo\n bar\nbaz", vec![ - block_start_("", ""), - newline_("", "\n"), ident_(" ", "foo"), - newline_("", "\n"), ident_(" ", "bar"), - block_end_("", ""), - newline_("", "\n"), ident_("", "baz"), + block_start_(empty(), empty()), + newline.clone(), 
ident_(" ", "foo"), + newline.clone(), ident_(" ", "bar"), + block_end_(empty(), empty()), + newline.clone(), ident_("", "baz"), ]), ("\n foo\n bar\n baz", vec![ - block_start_("", ""), - newline_("", "\n"), ident_(" ", "foo"), - newline_("", "\n"), ident_(" ", "bar"), - newline_("", "\n"), ident_(" ", "baz"), - block_end_("", ""), + block_start_(empty(), empty()), + newline.clone(), ident_(" ", "foo"), + newline.clone(), ident_(" ", "bar"), + newline, ident_(" ", "baz"), + block_end_(empty(), empty()), ]), ]); } @@ -1497,8 +1563,8 @@ mod tests { fn test_case_whitespace_only_line() { test_lexer_many(vec![("foo\n \nbar", vec![ ident_("", "foo"), - newline_("", "\n"), - newline_(" ", "\n"), + newline_(empty(), test_code("\n")), + newline_(test_code(" "), test_code("\n")), ident_("", "bar"), ])]); } @@ -1523,7 +1589,7 @@ mod tests { #[test] fn test_numeric_literal() { - test_lexer("10", vec![digits_("", "10", None)]); + test_lexer("10", vec![digits_(empty(), test_code("10"), None)]); } #[test] diff --git a/lib/rust/parser/src/lib.rs b/lib/rust/parser/src/lib.rs index 7b1164adab27..6a01af76f865 100644 --- a/lib/rust/parser/src/lib.rs +++ b/lib/rust/parser/src/lib.rs @@ -77,12 +77,8 @@ #![recursion_limit = "256"] // === Features === -#![allow(incomplete_features)] #![feature(let_chains)] -#![feature(allocator_api)] -#![feature(exact_size_is_empty)] #![feature(test)] -#![feature(specialization)] #![feature(if_let_guard)] #![feature(box_patterns)] #![feature(option_get_or_insert_default)] @@ -107,10 +103,12 @@ use crate::prelude::*; + // ============== // === Export === // ============== +pub mod format; pub mod lexer; pub mod macros; pub mod metadata; @@ -119,7 +117,6 @@ pub mod source; pub mod syntax; - /// Popular utilities, imported by most modules of this crate. pub mod prelude { pub use enso_prelude::serde_reexports::*; @@ -129,10 +126,6 @@ pub mod prelude { pub use enso_types::traits::*; pub use enso_types::unit2::Bytes; - /// Wraps return value for functions whose implementations don't handle all cases yet. When the - /// parser is complete, this type will be eliminated. - pub type WipResult = Result; - /// Return type for functions that will only fail in case of a bug in the implementation. #[derive(Debug, Default)] pub struct ParseResult { @@ -208,7 +201,7 @@ impl Default for Parser { /// interpreted as a variable assignment or method definition. fn expression_to_statement(mut tree: syntax::Tree<'_>) -> syntax::Tree<'_> { use syntax::tree::*; - let mut left_offset = source::span::Offset::default(); + let mut left_offset = tree.span.left_offset.position_before(); if let Tree { variant: box Variant::Annotated(annotated), .. 
} = &mut tree { annotated.expression = annotated.expression.take().map(expression_to_statement); return tree; @@ -351,7 +344,7 @@ pub fn parse_argument_application<'s>( match &mut expression.variant { box Variant::App(App { func, arg }) => { let arg = parse_argument_definition(arg.clone()); - func.span.left_offset += mem::take(&mut expression.span.left_offset); + func.span.left_offset += expression.span.left_offset.take_as_prefix(); *expression = func.clone(); Some(arg) } @@ -365,7 +358,7 @@ pub fn parse_argument_application<'s>( let close2 = default(); let type_ = default(); let default = Some(ArgumentDefault { equals, expression: arg.clone() }); - func.span.left_offset += mem::take(&mut expression.span.left_offset); + func.span.left_offset += expression.span.left_offset.take_as_prefix(); *expression = func.clone(); Some(ArgumentDefinition { open, @@ -380,7 +373,7 @@ pub fn parse_argument_application<'s>( } box Variant::DefaultApp(DefaultApp { func, default: default_ }) => { let pattern = Tree::ident(default_.clone()); - func.span.left_offset += mem::take(&mut expression.span.left_offset); + func.span.left_offset += expression.span.left_offset.take_as_prefix(); *expression = func.clone(); Some(ArgumentDefinition { open: default(), @@ -485,6 +478,7 @@ mod benches { } #[bench] + #[cfg(not(target_arch = "wasm32"))] fn bench_blocks(bencher: &mut Bencher) { use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -526,6 +520,7 @@ mod benches { } #[bench] + #[cfg(not(target_arch = "wasm32"))] fn bench_expressions(bencher: &mut Bencher) { use rand::prelude::*; use rand_chacha::ChaCha8Rng; diff --git a/lib/rust/parser/src/macros/built_in.rs b/lib/rust/parser/src/macros/built_in.rs index 50abe53d1b83..40ddedc48a25 100644 --- a/lib/rust/parser/src/macros/built_in.rs +++ b/lib/rust/parser/src/macros/built_in.rs @@ -3,6 +3,7 @@ use crate::macros::pattern::*; use crate::macros::*; +use crate::source::Code; use crate::syntax::operator; @@ -339,7 +340,7 @@ fn to_body_statement(mut line_expression: syntax::Tree<'_>) -> syntax::Tree<'_> return line_expression; } let mut last_argument_default = default(); - let mut left_offset = crate::source::Offset::default(); + let mut left_offset = line_expression.span.left_offset.position_before(); let lhs = match &line_expression { Tree { variant: box Variant::OprApp(OprApp { lhs: Some(lhs), opr: Ok(opr), rhs: Some(rhs) }), @@ -437,8 +438,10 @@ fn case_body<'s>( _ => initial_case.push(item), } } - if !initial_case.is_empty() { - let newline = syntax::token::newline("", ""); + if let Some(_first) = initial_case.first() { + // FIXME: Create 0-length span at offset preceding `_first`. 
+ let newline = + syntax::token::newline(Code::empty_without_offset(), Code::empty_without_offset()); case_builder.push(syntax::item::Line { newline, items: initial_case }); } block.into_iter().for_each(|line| case_builder.push(line)); @@ -823,6 +826,14 @@ fn expect_qualified(tree: syntax::Tree) -> syntax::Tree { } fn expected_nonempty<'s>() -> syntax::Tree<'s> { - let empty = syntax::Tree::ident(syntax::token::ident("", "", false, 0, false, false, false)); + let empty = syntax::Tree::ident(syntax::token::ident( + Code::empty_without_offset(), + Code::empty_without_offset(), + false, + 0, + false, + false, + false, + )); empty.with_error("Expected tokens.") } diff --git a/lib/rust/parser/src/macros/resolver.rs b/lib/rust/parser/src/macros/resolver.rs index a0f944adde2a..3e8c85060e70 100644 --- a/lib/rust/parser/src/macros/resolver.rs +++ b/lib/rust/parser/src/macros/resolver.rs @@ -26,6 +26,7 @@ use crate::prelude::*; use crate::macros; use crate::macros::pattern; +use crate::source::Code; use crate::syntax; use crate::syntax::token; use crate::syntax::token::Token; @@ -142,8 +143,10 @@ impl<'s> Resolver<'s> { /// Create a new resolver, in statement context. pub fn new_statement() -> Self { let scopes = default(); - let open_blocks = - vec![syntax::item::Line { newline: token::newline("", ""), items: default() }]; + let open_blocks = vec![syntax::item::Line { + newline: token::newline(Code::empty(0), Code::empty(0)), + items: default(), + }]; let macro_stack = default(); let segments = default(); let items = default(); diff --git a/lib/rust/parser/src/main.rs b/lib/rust/parser/src/main.rs index 42edc729e2d4..aa7fa97bc8ac 100644 --- a/lib/rust/parser/src/main.rs +++ b/lib/rust/parser/src/main.rs @@ -75,25 +75,21 @@ fn check_file(path: &str, mut code: &str, parser: &mut enso_parser::Parser) { }); for (error, span) in &*errors.borrow() { let whitespace = &span.left_offset.code.repr; - if matches!(whitespace, Cow::Borrowed(_)) { - let start = whitespace.as_ptr() as usize + whitespace.len() - code.as_ptr() as usize; - let mut line = 1; - let mut char = 0; - for (i, c) in code.char_indices() { - if i >= start { - break; - } - if c == '\n' { - line += 1; - char = 0; - } else { - char += 1; - } + let start = whitespace.as_ptr() as usize + whitespace.len() - code.as_ptr() as usize; + let mut line = 1; + let mut char = 0; + for (i, c) in code.char_indices() { + if i >= start { + break; } - eprintln!("{path}:{line}:{char}: {}", &error); - } else { - eprintln!("{path}:?:?: {}", &error); - }; + if c == '\n' { + line += 1; + char = 0; + } else { + char += 1; + } + } + eprintln!("{path}:{line}:{char}: {}", &error); } for (parsed, original) in ast.code().lines().zip(code.lines()) { assert_eq!(parsed, original, "Bug: dropped tokens, while parsing: {path}"); diff --git a/lib/rust/parser/src/serialization.rs b/lib/rust/parser/src/serialization.rs index 7b6c73bed343..4bebb3dc9517 100644 --- a/lib/rust/parser/src/serialization.rs +++ b/lib/rust/parser/src/serialization.rs @@ -5,6 +5,8 @@ use crate::prelude::*; +use crate::source::code::StrRef; + // ============ @@ -41,25 +43,17 @@ pub(crate) struct Code { } /// Serde wrapper to serialize a `Cow` as the `Code` representation. -#[allow(clippy::ptr_arg)] // This is the signature required by serde. 
-pub(crate) fn serialize_cow<S>(cow: &Cow<'_, str>, ser: S) -> Result<S::Ok, S::Error>
+pub(crate) fn serialize_cow<S>(s: &StrRef, ser: S) -> Result<S::Ok, S::Error>
 where S: serde::Serializer {
-    let s = match cow {
-        Cow::Borrowed(s) => {
-            let begin = str::as_ptr(s) as u32;
-            let len = s.len() as u32;
-            Code { begin, len }
-        }
-        Cow::Owned(s) if s.is_empty() => Code { begin: 0, len: 0 },
-        Cow::Owned(_) => panic!(),
-    };
+    let s = s.0;
+    let s = Code { begin: str::as_ptr(s) as u32, len: s.len() as u32 };
     s.serialize(ser)
 }
 
-pub(crate) fn deserialize_cow<'c, 'de, D>(deserializer: D) -> Result<Cow<'c, str>, D::Error>
+pub(crate) fn deserialize_cow<'c, 'de, D>(deserializer: D) -> Result<StrRef<'c>, D::Error>
 where D: serde::Deserializer<'de> {
     let _ = deserializer.deserialize_u64(DeserializeU64);
-    Ok(Cow::Owned(String::new()))
+    Ok(StrRef(""))
 }
diff --git a/lib/rust/parser/src/source/code.rs b/lib/rust/parser/src/source/code.rs
index fde5f6f1d66f..7be3332fd7b2 100644
--- a/lib/rust/parser/src/source/code.rs
+++ b/lib/rust/parser/src/source/code.rs
@@ -8,20 +8,102 @@ use crate::prelude::*;
 // === Code ===
 // ============
 
+/// Wrap a `&str` that is skipped while deserializing; serde doesn't allow a custom deserializer to
+/// produce a `&'s str` without borrowing from the input.
+#[derive(Debug, Clone, Default, Eq, PartialEq, Deref)]
+pub struct StrRef<'s>(pub &'s str);
+
 /// A code representation. It can either be a borrowed source code or a modified owned one.
-#[derive(Clone, Default, Eq, PartialEq, Serialize, Reflect, Deserialize, Deref)]
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Reflect, Deserialize, Deref)]
 #[allow(missing_docs)]
 pub struct Code<'s> {
     #[serde(serialize_with = "crate::serialization::serialize_cow")]
     #[serde(deserialize_with = "crate::serialization::deserialize_cow")]
     #[reflect(as = "crate::serialization::Code", flatten, hide)]
     #[deref]
-    pub repr: Cow<'s, str>,
+    pub repr: StrRef<'s>,
+    #[reflect(hide)]
+    offset_utf16: u32,
     #[reflect(hide)]
-    pub utf16: usize,
+    utf16: u32,
 }
 
 impl<'s> Code<'s> {
+    /// Return a code reference from the given source and offset within the document.
+    #[inline(always)]
+    pub fn from_str_at_offset(repr: &'s str, offset_utf16: u32) -> Self {
+        let utf16 = repr.chars().map(|c| c.len_utf16() as u32).sum();
+        let repr = StrRef(repr);
+        Self { repr, offset_utf16, utf16 }
+    }
+
+    /// Return a code reference at the beginning of the document. This can be used in testing, when
+    /// accurate code references are not needed.
+    #[inline(always)]
+    pub fn from_str_without_offset(repr: &'s str) -> Self {
+        Self::from_str_at_offset(repr, 0)
+    }
+
+    /// Return a copy of this value, and set this value to a 0-length value following the returned
+    /// value.
+    #[inline(always)]
+    pub fn take_as_prefix(&mut self) -> Self {
+        let end = self.offset_utf16 + self.utf16;
+        Self {
+            repr: mem::take(&mut self.repr),
+            offset_utf16: mem::replace(&mut self.offset_utf16, end),
+            utf16: mem::take(&mut self.utf16),
+        }
+    }
+
+    /// Return a 0-length `Code` located immediately before the start of this `Code`.
+    pub fn position_before(&self) -> Self {
+        Self { repr: default(), offset_utf16: self.offset_utf16, utf16: default() }
+    }
+
+    /// Return a 0-length `Code` located immediately after the end of this `Code`.
+    pub fn position_after(&self) -> Self {
+        Self {
+            repr: default(),
+            offset_utf16: self.offset_utf16 + self.utf16,
+            utf16: default(),
+        }
+    }
+
+    /// Return the length in UTF-16 code units.
+    pub fn len_utf16(&self) -> u32 {
+        self.utf16
+    }
+
+    /// Split the UTF-8 code at the given byte offset.
+    pub fn split_at(&self, offset: usize) -> (Self, Self) {
+        let (left, right) = self.repr.split_at(offset);
+        let left_utf16 = left.chars().map(|c| c.len_utf16() as u32).sum();
+        let right_utf16 = self.utf16 - left_utf16;
+        (
+            Self {
+                repr: StrRef(left),
+                offset_utf16: self.offset_utf16,
+                utf16: left_utf16,
+            },
+            Self {
+                repr: StrRef(right),
+                offset_utf16: self.offset_utf16 + left_utf16,
+                utf16: right_utf16,
+            },
+        )
+    }
+
+    /// Return a reference to an empty string, not associated with any location in the document.
+    pub fn empty_without_offset() -> Self {
+        Self { repr: StrRef(""), offset_utf16: 0, utf16: 0 }
+    }
+
+    /// Return a reference to an empty string.
+    pub fn empty(offset: u32) -> Self {
+        Self { repr: StrRef(""), offset_utf16: offset, utf16: 0 }
+    }
+
     /// Length of the code in bytes.
     #[inline(always)]
     pub fn len(&self) -> Bytes {
@@ -39,48 +121,31 @@ impl<'s> Code<'s> {
     pub fn is_empty(&self) -> bool {
         self.repr.is_empty()
     }
-}
 
-impl<'a> From<Cow<'a, str>> for Code<'a> {
-    #[inline(always)]
-    fn from(repr: Cow<'a, str>) -> Self {
-        let utf16 = repr.encode_utf16().count();
-        Self { repr, utf16 }
-    }
-}
-
-impl<'a> From<&'a str> for Code<'a> {
-    #[inline(always)]
-    fn from(str: &'a str) -> Self {
-        let utf16 = str.encode_utf16().count();
-        let repr = str.into();
-        Self { repr, utf16 }
+    /// Return this value with its start position removed (set to 0). This can be used to compare
+    /// values ignoring offsets.
+    pub fn without_offset(&self) -> Self {
+        Self { repr: self.repr.clone(), offset_utf16: default(), utf16: self.utf16 }
     }
 }
 
 impl<'s> Display for Code<'s> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        Display::fmt(&self.repr, f)
-    }
-}
-
-impl<'s> Debug for Code<'s> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        Debug::fmt(&self.repr, f)
+        Display::fmt(&*self.repr, f)
     }
 }
 
 impl<'a, 'b> PartialEq<&'b str> for Code<'a> {
     #[inline(always)]
     fn eq(&self, other: &&'b str) -> bool {
-        self.repr.eq(other)
+        self.repr.0.eq(*other)
     }
 }
 
 impl AsRef<str> for Code<'_> {
     #[inline(always)]
     fn as_ref(&self) -> &str {
-        &self.repr
+        self.repr.0
     }
 }
 
@@ -94,16 +159,40 @@ impl std::borrow::Borrow<str> for Code<'_> {
 impl<'s> AddAssign<Code<'s>> for Code<'s> {
     #[inline(always)]
     fn add_assign(&mut self, other: Code<'s>) {
-        self.repr.add_assign(other.repr);
-        self.utf16.add_assign(other.utf16);
+        self.add_assign(&other)
     }
 }
 
 impl<'s> AddAssign<&Code<'s>> for Code<'s> {
     #[inline(always)]
     fn add_assign(&mut self, other: &Code<'s>) {
-        self.repr.add_assign(other.repr.clone());
-        self.utf16.add_assign(other.utf16);
+        match (self.is_empty(), other.is_empty()) {
+            (false, true) => (),
+            (true, true) => {
+                // The span builder works by starting with `Span::empty_without_offset()`, and
+                // appending to the right side. In order to ensure every span has an offset: When
+                // the LHS is empty, take the location from the RHS even if the RHS is also empty.
+                self.offset_utf16 = other.offset_utf16;
+            }
+            (true, false) => {
+                *self = other.clone();
+            }
+            (false, false) => {
+                let range = self.repr.as_bytes().as_ptr_range();
+                #[allow(unsafe_code)] // See comments in block.
+                unsafe {
+                    // Combining two slices is sound if:
+                    // - They have the same element lifetime (ensured by the type signature).
+                    // - The second ends where the first begins (checked below).
+                    assert_eq!(range.end, other.repr.as_ptr());
+                    let joined =
+                        slice::from_raw_parts(range.start, self.repr.len() + other.repr.len());
+                    // Concatenating two UTF-8 strings always yields a valid UTF-8 string.
+                    self.repr = StrRef(std::str::from_utf8_unchecked(joined));
+                }
+                self.utf16 += other.utf16;
+            }
+        }
     }
 }
 
@@ -113,8 +202,10 @@ impl<'s> AddAssign<&Code<'s>> for Code<'s> {
 /// The length of a [`Code`] object.
 #[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Serialize, Reflect, Deserialize)]
 pub struct Length {
+    #[reflect(skip)]
+    #[serde(skip)]
     utf8: usize,
-    utf16: usize,
+    utf16: u32,
 }
 
 impl Length {
diff --git a/lib/rust/parser/src/source/span.rs b/lib/rust/parser/src/source/span.rs
index eb825432ca84..cd243c348a2e 100644
--- a/lib/rust/parser/src/source/span.rs
+++ b/lib/rust/parser/src/source/span.rs
@@ -28,12 +28,12 @@ pub mod traits {
 #[allow(missing_docs)]
 #[reflect(transparent)]
 pub struct VisibleOffset {
-    pub width_in_spaces: usize,
+    pub width_in_spaces: u32,
 }
 
 /// Constructor.
 #[allow(non_snake_case)]
-pub const fn VisibleOffset(width_in_spaces: usize) -> VisibleOffset {
+pub const fn VisibleOffset(width_in_spaces: u32) -> VisibleOffset {
     VisibleOffset { width_in_spaces }
 }
 
@@ -70,8 +70,7 @@ pub struct Offset<'s> {
 /// Constructor.
 #[allow(non_snake_case)]
-pub fn Offset<'s>(visible: VisibleOffset, code: impl Into<Code<'s>>) -> Offset<'s> {
-    let code = code.into();
+pub fn Offset(visible: VisibleOffset, code: Code) -> Offset {
     Offset { visible, code }
 }
 
@@ -87,6 +86,29 @@ impl<'s> Offset<'s> {
     pub fn exists(&self) -> bool {
         !self.is_empty()
     }
+
+    /// Return a copy of this value, and set this value to a 0-length offset following the returned
+    /// value.
+    #[inline(always)]
+    pub fn take_as_prefix(&mut self) -> Self {
+        Self { visible: mem::take(&mut self.visible), code: self.code.take_as_prefix() }
+    }
+
+    /// Return a 0-length `Span` representing the position before the start of this `Span`.
+    pub fn position_before(&self) -> Self {
+        Self { visible: default(), code: self.code.position_before() }
+    }
+
+    /// Return a 0-length `Span` representing the position after the end of this `Span`.
+    pub fn position_after(&self) -> Self {
+        Self { visible: default(), code: self.code.position_before() }
+    }
+
+    /// Return this value with its start position removed (set to 0). This can be used to compare
+    /// spans ignoring offsets.
+    pub fn without_offset(&self) -> Self {
+        Self { visible: self.visible, code: self.code.without_offset() }
+    }
 }
 
 impl<'s> AsRef<Offset<'s>> for Offset<'s> {
@@ -95,10 +117,10 @@ impl<'s> AsRef<Offset<'s>> for Offset<'s> {
     }
 }
 
-impl<'s> From<&'s str> for Offset<'s> {
+impl<'s> From<Code<'s>> for Offset<'s> {
     #[inline(always)]
-    fn from(code: &'s str) -> Self {
-        Offset(code.into(), code)
+    fn from(code: Code<'s>) -> Self {
+        Offset((*code.repr).into(), code)
     }
 }
 
@@ -117,7 +139,6 @@ impl<'s> AddAssign<&Offset<'s>> for Offset<'s> {
 }
 
 
-
 // ============
 // === Span ===
 // ============
 
@@ -127,7 +148,7 @@ impl<'s> AddAssign<&Offset<'s>> for Offset<'s> {
 /// element. This is done in order to not duplicate the data. For example, some AST nodes contain a
 /// lot of tokens. They need to remember their span, but they do not need to remember their code,
 /// because it is already stored in the tokens.
-#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Reflect, Deserialize)]
+#[derive(Clone, Debug, Eq, PartialEq, Serialize, Reflect, Deserialize)]
 #[allow(missing_docs)]
 pub struct Span<'s> {
     #[reflect(hide, flatten)]
@@ -139,8 +160,8 @@ pub struct Span<'s> {
 
 impl<'s> Span<'s> {
     /// Constructor.
-    pub fn new() -> Self {
-        default()
+    pub fn empty_without_offset() -> Self {
+        Self { left_offset: Code::empty_without_offset().into(), code_length: default() }
     }
 
     /// Check whether the span is empty.
@@ -258,7 +279,7 @@ pub trait FirstChildTrim<'s> { impl<'s> FirstChildTrim<'s> for Span<'s> { #[inline(always)] fn trim_as_first_child(&mut self) -> Span<'s> { - let left_offset = mem::take(&mut self.left_offset); + let left_offset = self.left_offset.take_as_prefix(); let code_length = self.code_length; Span { left_offset, code_length } } @@ -275,7 +296,7 @@ impl<'s> FirstChildTrim<'s> for Span<'s> { #[macro_export] macro_rules! span_builder { ($($arg:ident),* $(,)?) => { - $crate::source::span::Span::new() $(.add(&mut $arg))* + $crate::source::span::Span::empty_without_offset() $(.add(&mut $arg))* }; } diff --git a/lib/rust/parser/src/syntax/token.rs b/lib/rust/parser/src/syntax/token.rs index 11d973b7bbea..c5ee27186ddf 100644 --- a/lib/rust/parser/src/syntax/token.rs +++ b/lib/rust/parser/src/syntax/token.rs @@ -105,7 +105,7 @@ use enso_shapely_macros::tagged_enum; // ============= /// The lexical token definition. See the module docs to learn more about its usage scenarios. -#[derive(Clone, Default, Deref, DerefMut, Eq, PartialEq, Serialize, Reflect, Deserialize)] +#[derive(Clone, Debug, Default, Deref, DerefMut, Eq, PartialEq, Serialize, Reflect, Deserialize)] #[allow(missing_docs)] pub struct Token<'s, T = Variant> { #[reflect(flatten, hide)] @@ -123,11 +123,10 @@ pub struct Token<'s, T = Variant> { #[allow(non_snake_case)] pub fn Token<'s, T>( left_offset: impl Into>, - code: impl Into>, + code: Code<'s>, variant: T, ) -> Token<'s, T> { let left_offset = left_offset.into(); - let code = code.into(); Token { variant, left_offset, code } } @@ -138,9 +137,10 @@ impl<'s, T> Token<'s, T> { #[inline(always)] pub fn split_at(self, offset: Bytes) -> (Token<'s, ()>, Token<'s, ()>, T) { let left_lexeme_offset = self.left_offset; - let right_lexeme_offset = Offset::default(); - let left = Token(left_lexeme_offset, self.code.slice(Bytes(0)..offset), ()); - let right = Token(right_lexeme_offset, self.code.slice(offset..), ()); + let right_lexeme_offset = self.code.position_after(); + let (left_code, right_code) = self.code.split_at(offset.unchecked_raw()); + let left = Token(left_lexeme_offset, left_code, ()); + let right = Token(right_lexeme_offset, right_code, ()); (left, right, self.variant) } @@ -170,10 +170,15 @@ impl<'s, T> Token<'s, T> { } } -impl<'s, T: Debug> Debug for Token<'s, T> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "[{}:{:?}] ", self.left_offset.visible, self.code)?; - Debug::fmt(&self.variant, f) +impl<'s, V: Clone> Token<'s, V> { + /// Return this value with all source references stripped of positions. This supports comparing + /// tokens irrespective of their locations in the source. + pub fn without_offsets(&self) -> Self { + Self { + left_offset: self.left_offset.without_offset(), + code: self.code.without_offset(), + variant: self.variant.clone(), + } } } @@ -186,7 +191,7 @@ impl<'s, T: PartialEq> PartialEq> for &Token<'s, T> { impl<'s, T> FirstChildTrim<'s> for Token<'s, T> { #[inline(always)] fn trim_as_first_child(&mut self) -> Span<'s> { - let left_offset = mem::take(&mut self.left_offset); + let left_offset = self.left_offset.take_as_prefix(); let code_length = self.code.length(); Span { left_offset, code_length } } @@ -259,12 +264,12 @@ macro_rules! with_token_definition { ($f:ident ($($args:tt)*)) => { $f! 
{ $($arg BlockStart, BlockEnd, Wildcard { - pub lift_level: usize + pub lift_level: u32 }, AutoScope, Ident { pub is_free: bool, - pub lift_level: usize, + pub lift_level: u32, #[reflect(rename = "is_type_or_constructor")] pub is_type: bool, pub is_operator_lexically: bool, @@ -355,7 +360,7 @@ impl OperatorProperties { } /// Return a copy of this operator, with the given binary infix precedence. - pub fn with_binary_infix_precedence(self, value: usize) -> Self { + pub fn with_binary_infix_precedence(self, value: u32) -> Self { let precedence = Precedence { value }; debug_assert!(precedence > Precedence::min()); Self { binary_infix_precedence: Some(precedence), ..self } @@ -528,7 +533,7 @@ impl OperatorProperties { #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Reflect, Deserialize, PartialOrd, Ord)] pub struct Precedence { /// A numeric value determining precedence order. - value: usize, + value: u32, } impl Precedence { @@ -608,7 +613,7 @@ macro_rules! generate_token_aliases { /// Constructor. pub fn [<$variant:snake:lower>]<'s> ( left_offset: impl Into>, - code: impl Into>, + code: Code<'s>, $($($field : $field_ty),*)? ) -> $variant<'s> { Token(left_offset, code, variant::$variant($($($field),*)?)) @@ -617,7 +622,7 @@ macro_rules! generate_token_aliases { /// Constructor. pub fn [<$variant:snake:lower _>]<'s> ( left_offset: impl Into>, - code: impl Into>, + code: Code<'s>, $($($field : $field_ty),*)? ) -> Token<'s> { Token(left_offset, code, variant::$variant($($($field),*)?)).into() diff --git a/lib/rust/parser/src/syntax/tree.rs b/lib/rust/parser/src/syntax/tree.rs index 55a358d5989a..0d0107b706ba 100644 --- a/lib/rust/parser/src/syntax/tree.rs +++ b/lib/rust/parser/src/syntax/tree.rs @@ -23,7 +23,7 @@ pub mod block; // ============ /// The Abstract Syntax Tree of the language. 
-#[derive(Clone, Deref, DerefMut, Eq, PartialEq, Serialize, Reflect, Deserialize)] +#[derive(Clone, Debug, Deref, DerefMut, Eq, PartialEq, Serialize, Reflect, Deserialize)] #[allow(missing_docs)] pub struct Tree<'s> { #[reflect(flatten, hide)] @@ -41,19 +41,6 @@ pub fn Tree<'s>(span: Span<'s>, variant: impl Into>) -> Tree<'s> { Tree { variant, span } } -impl<'s> Debug for Tree<'s> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let max_code_len = 30; - let ellipsis = "..."; - let mut code = self.code(); - if code.len() > max_code_len { - code = format!("{}{}", &code[..max_code_len - ellipsis.len()], ellipsis); - } - write!(f, "[{}:{}:\"{}\"] ", self.span.left_offset.visible, self.span.code_length, code)?; - Debug::fmt(&self.variant, f) - } -} - impl<'s> AsRef> for Tree<'s> { fn as_ref(&self) -> &Span<'s> { &self.span @@ -64,7 +51,7 @@ impl<'s> Default for Tree<'s> { fn default() -> Self { Self { variant: Box::new(Variant::Ident(Ident { token: Default::default() })), - span: Default::default(), + span: Span::empty_without_offset(), } } } @@ -782,7 +769,7 @@ pub fn apply<'s>(mut func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> { func } (_, Variant::ArgumentBlockApplication(block)) if block.lhs.is_none() => { - let func_left_offset = mem::take(&mut func.span.left_offset); + let func_left_offset = func.span.left_offset.take_as_prefix(); let arg_left_offset = mem::replace(&mut arg.span.left_offset, func_left_offset); if let Some(first) = block.arguments.first_mut() { first.newline.left_offset += arg_left_offset; @@ -791,7 +778,7 @@ pub fn apply<'s>(mut func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> { arg } (_, Variant::OperatorBlockApplication(block)) if block.lhs.is_none() => { - let func_left_offset = mem::take(&mut func.span.left_offset); + let func_left_offset = func.span.left_offset.take_as_prefix(); let arg_left_offset = mem::replace(&mut arg.span.left_offset, func_left_offset); if let Some(first) = block.expressions.first_mut() { first.newline.left_offset += arg_left_offset; @@ -921,7 +908,7 @@ pub fn apply_operator<'s>( if let Variant::ArgumentBlockApplication(block) = &mut *rhs_.variant { if block.lhs.is_none() { if let Some(first) = block.arguments.first_mut() { - first.newline.left_offset += mem::take(&mut rhs_.span.left_offset); + first.newline.left_offset += rhs_.span.left_offset.take_as_prefix(); } let ArgumentBlockApplication { lhs: _, arguments } = block; let arguments = mem::take(arguments); diff --git a/package-lock.json b/package-lock.json index 8afb02cce039..9bbdacc8376f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14,6 +14,7 @@ ], "dependencies": { "chromedriver": "^106.0.1", + "enso-gui2": "^0.1.0", "lint": "^0.8.19", "run": "^1.4.0", "tslib": "^2.6.2" @@ -83,6 +84,7 @@ "@vue/tsconfig": "^0.4.0", "ag-grid-community": "^30.1.0", "ag-grid-enterprise": "^30.1.0", + "change-case": "^5.0.2", "d3": "^7.4.0", "esbuild": "^0.19.3", "eslint": "^8.49.0", @@ -94,6 +96,7 @@ "shuffle-seed": "^1.1.6", "sql-formatter": "^13.0.0", "tailwindcss": "^3.2.7", + "tsx": "^3.12.6", "typescript": "~5.2.2", "vite": "^4.4.9", "vite-plugin-inspect": "^0.7.38", @@ -4516,7 +4519,8 @@ }, "node_modules/aggregate-error": { "version": "3.1.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/aggregate-error/-/aggregate-error-3.1.0.tgz", + "integrity": "sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA==", "dependencies": { "clean-stack": "^2.0.0", "indent-string": "^4.0.0" @@ -5011,7 +5015,8 @@ }, "node_modules/axios": { 
"version": "0.27.2", - "license": "MIT", + "resolved": "https://registry.npmjs.org/axios/-/axios-0.27.2.tgz", + "integrity": "sha512-t+yRIyySRTp/wua5xEr+z1q60QmLq8ABsS5O9Me1AsE5dfKqgnCFzwiCZZ/cGNd1lq4/7akDWMxdhVlucjmnOQ==", "dependencies": { "follow-redirects": "^1.14.9", "form-data": "^4.0.0" @@ -5642,6 +5647,12 @@ "url": "https://github.com/chalk/chalk?sponsor=1" } }, + "node_modules/change-case": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/change-case/-/change-case-5.0.2.tgz", + "integrity": "sha512-tH6YZXViaeC2/Mnt8m4gSmbZfNorC2rhwCX2bXw8SYZWr8ljCPB7iA+1TLG9t7yroWBFauc63LlOZ1gucMVCWw==", + "dev": true + }, "node_modules/char-regex": { "version": "2.0.1", "license": "MIT", @@ -5706,8 +5717,9 @@ }, "node_modules/chromedriver": { "version": "106.0.1", + "resolved": "https://registry.npmjs.org/chromedriver/-/chromedriver-106.0.1.tgz", + "integrity": "sha512-thaBvbDEPgGocSp4/SBIajQz3G7UQfUqCOHZBp9TVhRJv7c91eZrUGcjeJUaNF4p9CfSjCYNYzs4EVVryqmddA==", "hasInstallScript": true, - "license": "Apache-2.0", "dependencies": { "@testim/chrome-version": "^1.1.3", "axios": "^0.27.2", @@ -5746,7 +5758,8 @@ }, "node_modules/clean-stack": { "version": "2.2.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/clean-stack/-/clean-stack-2.2.0.tgz", + "integrity": "sha512-4diC9HaTE+KRAMWhDhrGOECgWZxoevMc5TlkObMqNSsVU62PYzXZ/SMTjzyGAFF1YusgxGcSWTEXBhp0CPwQ1A==", "engines": { "node": ">=6" } @@ -5967,7 +5980,8 @@ }, "node_modules/compare-versions": { "version": "5.0.3", - "license": "MIT" + "resolved": "https://registry.npmjs.org/compare-versions/-/compare-versions-5.0.3.tgz", + "integrity": "sha512-4UZlZP8Z99MGEY+Ovg/uJxJuvoXuN4M6B3hKaiackiHrgzQFEe3diJi1mf1PNHbFujM7FvLrK2bpgIaImbtZ1A==" }, "node_modules/concat-map": { "version": "0.0.1", @@ -6775,7 +6789,8 @@ }, "node_modules/del": { "version": "6.1.1", - "license": "MIT", + "resolved": "https://registry.npmjs.org/del/-/del-6.1.1.tgz", + "integrity": "sha512-ua8BhapfP0JUJKC/zV9yHHDW/rDoDxP4Zhn3AkA6/xT6gY7jYXJiaeyBZznYVujhZZET+UgcbZiQ7sN3WqcImg==", "dependencies": { "globby": "^11.0.1", "graceful-fs": "^4.2.4", @@ -8617,14 +8632,15 @@ "license": "ISC" }, "node_modules/follow-redirects": { - "version": "1.15.2", + "version": "1.15.3", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.3.tgz", + "integrity": "sha512-1VzOtuEM8pC9SFU1E+8KfTjZyMztRsgEfwQl44z8A25uy13jSzTj6dyK2Df52iV0vgHCfBwLhDWevLn95w5v6Q==", "funding": [ { "type": "individual", "url": "https://github.com/sponsors/RubenVerborgh" } ], - "license": "MIT", "engines": { "node": ">=4.0" }, @@ -9390,7 +9406,8 @@ }, "node_modules/indent-string": { "version": "4.0.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz", + "integrity": "sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==", "engines": { "node": ">=8" } @@ -9853,7 +9870,8 @@ }, "node_modules/is-path-cwd": { "version": "2.2.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/is-path-cwd/-/is-path-cwd-2.2.0.tgz", + "integrity": "sha512-w942bTcih8fdJPJmQHFzkS76NEP8Kzzvmw92cXsazb8intwLqPibPPdXf4ANdKV3rYMuuQYGIWtvz9JilB3NFQ==", "engines": { "node": ">=6" } @@ -11868,7 +11886,8 @@ }, "node_modules/p-map": { "version": "4.0.0", - "license": "MIT", + "resolved": "https://registry.npmjs.org/p-map/-/p-map-4.0.0.tgz", + "integrity": "sha512-/bjOqmgETBYB5BoEeGVea8dmvHb2m9GLy1E9W43yeyfP6QQCZGFNa+XRceJEuDB6zqr+gKpIAmlLebMpykw/MQ==", "dependencies": { "aggregate-error": "^3.0.0" }, @@ 
-12681,7 +12700,8 @@ }, "node_modules/proxy-from-env": { "version": "1.1.0", - "license": "MIT" + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" }, "node_modules/prr": { "version": "1.0.1", diff --git a/package.json b/package.json index 6bc959dd74c7..0a51427144a3 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ }, "dependencies": { "chromedriver": "^106.0.1", + "enso-gui2": "^0.1.0", "lint": "^0.8.19", "run": "^1.4.0", "tslib": "^2.6.2"