diff --git a/.babelrc b/.babelrc new file mode 100644 index 0000000..10823cf --- /dev/null +++ b/.babelrc @@ -0,0 +1,15 @@ +{ + "ignore": [ + "node_modules/**/*.js" + ], + "compact": false, + "retainLines": false, + "presets": [ + ["env", { + "targets": { + "browsers": ["last 2 versions", "safari >= 7"], + "node": "4.0" + } + }] + ] +} diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..442aed3 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,21 @@ +*.sh text eol=lf +*.bat text eol=crlf +*.php text eol=lf +*.inc text eol=lf +*.html text eol=lf +*.json text eol=lf +*.js text eol=lf +*.css text eol=lf +*.less text eol=lf +*.sass text eol=lf +*.ini text eol=lf +*.txt text eol=lf +*.xml text eol=lf +*.md text eol=lf +*.markdown text eol=lf +*.json5 text eol=lf + +*.pdf binary +*.psd binary +*.pptx binary +*.xlsx binary diff --git a/.gitignore b/.gitignore index b1a6b49..4d7f2ef 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,13 @@ +.DS_Store node_modules/ +npm-debug.log # Editor bak files *~ *.bak *.orig + +# example output +examples/output/ + +/gcc-externs.js diff --git a/.npmignore b/.npmignore new file mode 100644 index 0000000..1ca94d2 --- /dev/null +++ b/.npmignore @@ -0,0 +1,16 @@ +.DS_Store +node_modules/ +npm-debug.log + +# Editor backup files +*.bak +*~ + +# scratch space +/tmp/ + +# Ignore build/publish scripts, etc. +Makefile + +# misc files which are used during development +__patch_*.js diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..f0913fb --- /dev/null +++ b/.travis.yml @@ -0,0 +1,11 @@ +language: node_js +sudo: false + +node_js: + - 8 + - 7 + - 6 + - 5 + - 4 + - node + diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..e8fcb80 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2009-2017 Zachary Carter + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5f747d6 --- /dev/null +++ b/Makefile @@ -0,0 +1,170 @@ + +LEX = node ./dist/cli-cjs-es5.js + +ROLLUP = node_modules/.bin/rollup +BABEL = node_modules/.bin/babel +MOCHA = node_modules/.bin/mocha + + + + +all: build test examples + +prep: npm-install + +npm-install: + npm install + +npm-update: + ncu -a --packageFile=package.json + +build: + node __patch_version_in_js.js + node __patch_lexer_kernel_in_js.js + -mkdir -p dist + $(ROLLUP) -c + $(BABEL) dist/regexp-lexer-cjs.js -o dist/regexp-lexer-cjs-es5.js + $(BABEL) dist/regexp-lexer-umd.js -o dist/regexp-lexer-umd-es5.js + $(ROLLUP) -c rollup.config-cli.js + $(BABEL) dist/cli-cjs.js -o dist/cli-cjs-es5.js + $(BABEL) dist/cli-umd.js -o dist/cli-umd-es5.js + node __patch_nodebang_in_js.js + +test: + $(MOCHA) --timeout 18000 --check-leaks --globals assert tests/ + +examples: \ + example-include \ + example-lex \ + examples_basic2_lex \ + examples_basic_lex \ + examples_c99 \ + examples_ccalc_lex \ + examples_classy \ + examples_codegen_feature_tester_base \ + examples_comments \ + examples_compiled_calc_parse \ + examples_faking \ + examples_floop \ + examples_handlebars \ + examples_issue_url_lexing \ + examples_issue_x1 \ + examples_issue_x2 \ + examples_lex_grammar \ + examples_lexer_comm_debug \ + examples_pascal \ + examples_regex \ + examples_semwhitespace \ + examples_tikiwikiparser \ + examples_unicode2 \ + examples_unicode \ + examples_with_custom_lexer \ + examples_with_includes + +example-lex: + $(LEX) examples/lex.l -o examples/output/ -x + +example-include: + $(LEX) examples/with-includes.test.lex -o examples/output/ -x + +examples_basic2_lex: + $(LEX) examples/basic2_lex.jison -o examples/output/ -x + +examples_basic_lex: + $(LEX) examples/basic_lex.jison -o examples/output/ -x + +examples_c99: + $(LEX) examples/c99.l -o examples/output/ -x + +examples_ccalc_lex: + $(LEX) examples/ccalc-lex.l -o examples/output/ -x + +examples_classy: + $(LEX) examples/classy.jisonlex -o examples/output/ -x + +examples_codegen_feature_tester_base: + $(LEX) examples/codegen-feature-tester-base.jison -o examples/output/ -x + +examples_comments: + $(LEX) examples/comments.jison -o examples/output/ -x + +examples_compiled_calc_parse: + $(LEX) examples/compiled_calc_parse.jison -o examples/output/ -x + +examples_faking: + $(LEX) examples/faking-multiple-start-rules-alt.jison -o examples/output/ -x + +examples_floop: + $(LEX) examples/floop.l -o examples/output/ -x + +examples_handlebars: + $(LEX) examples/handlebars.jison.l -o examples/output/ -x + +examples_issue_x1: + $(LEX) examples/issue-19-jison_lex-fixed.jison -o examples/output/ -x + +examples_issue_x2: + $(LEX) examples/issue-19-jison_lex.jison -o examples/output/ -x + +examples_issue_url_lexing: + $(LEX) examples/issue-357-url-lexing.jison -o examples/output/ -x + +examples_lex_grammar: + $(LEX) examples/lex_grammar.jisonlex -o examples/output/ -x + +examples_lexer_comm_debug: + $(LEX) examples/parser-to-lexer-communication-test-w-debug.jison -o examples/output/ -x + +examples_pascal: + $(LEX) examples/pascal.l -o examples/output/ -x + +examples_regex: + $(LEX) examples/regex.jison -o examples/output/ -x + +examples_semwhitespace: + $(LEX) examples/semwhitespace_lex.jison -o examples/output/ -x + +examples_tikiwikiparser: + $(LEX) examples/tikiwikiparser.jison -o examples/output/ -x + +examples_unicode: + $(LEX) examples/unicode.jison -o examples/output/ -x + +examples_unicode2: + $(LEX) examples/unicode2.jison -o 
examples/output/ -x + +examples_with_includes: + $(LEX) examples/with-includes.jison -o examples/output/ -x + +examples_with_custom_lexer: + $(LEX) examples/with_custom_lexer.jison -o examples/output/ -x + + +# increment the XXX number in the package.json file: version <major>.<minor>.<patch>-<XXX> +bump: + +git-tag: + +publish: + npm run pub + + + + + + +clean: + -rm -rf dist/ + -rm -rf node_modules/ + -rm -f package-lock.json + -rm -rf examples/output/ + +superclean: clean + -find . -type d -name 'node_modules' -exec rm -rf "{}" \; + + + + + +.PHONY: all prep npm-install build test examples clean superclean bump git-tag publish example-lex example-include examples_basic2_lex examples_basic_lex examples_c99 examples_ccalc_lex examples_classy examples_codegen_feature_tester_base examples_comments examples_compiled_calc_parse examples_faking examples_floop examples_handlebars examples_issue_url_lexing examples_issue_x1 examples_issue_x2 examples_lex_grammar examples_lexer_comm_debug examples_pascal examples_regex examples_semwhitespace examples_tikiwikiparser examples_unicode2 examples_unicode examples_with_custom_lexer examples_with_includes + diff --git a/README.md b/README.md index 3d93789..0726425 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,120 @@ -# jison-lex +# jison-lex \[OBSOLETED] + + +[![Join the chat at https://gitter.im/jison-parsers-lexers/Lobby](https://badges.gitter.im/jison-parsers-lexers/Lobby.svg)](https://gitter.im/jison-parsers-lexers/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Build Status](https://travis-ci.org/GerHobbelt/jison-lex.svg?branch=master)](https://travis-ci.org/GerHobbelt/jison-lex) +[![NPM version](https://badge.fury.io/js/jison-gho.svg)](https://badge.fury.io/js/jison-gho) +[![Dependency Status](https://img.shields.io/david/GerHobbelt/jison-lex.svg)](https://david-dm.org/GerHobbelt/jison-lex) +[![npm](https://img.shields.io/npm/dm/@gerhobbelt/jison-lex.svg?maxAge=2592000)]() + + + + A lexical analyzer generator used by [jison](http://jison.org). It takes a lexical grammar definition (either in JSON or Bison's lexical grammar format) and outputs a JavaScript lexer. + + + +> +> # deprecation notice +> +> As of today (2017/oct/15), this jison-lex repository is **obsoleted** +> for the `jison-lex` package/codebase: the **primary source** is the +> [jison](https://github.com/GerHobbelt/jison) +> [monorepo](https://medium.com/netscape/the-case-for-monorepos-907c1361708a)'s `packages/jison-lex/` +> directory. See also https://github.com/GerHobbelt/jison/issues/16. +> +> (For a comparable argument, see also ["Why is Babel a monorepo?"](https://github.com/babel/babel/blob/master/doc/design/monorepo.md)) +> +> Issues, pull requests, etc. for `jison-lex` should therefore be filed there; +> we no longer accept issue reports in this secondary repository. +> +> This repository will track the primary source for a while still, but be +> *very aware* that this particular repository will always be lagging behind! +> + + + ## install -npm install jison-lex -g + +`npm install jison-gho` + +The `jison-lex` library is then located in the subdirectory `packages/jison-lex/` of the `jison-gho` monorepo, i.e. `.../node_modules/jison-gho/packages/jison-lex/`.
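+
+For example, a minimal sketch of pulling the lexer generator in through that deep path (the `require()` path and rule set below are illustrative, assuming a plain CommonJS setup):
+
+```
+// illustrative deep path into the jison-gho monorepo package:
+var RegExpLexer = require('jison-gho/packages/jison-lex');
+
+var lexer = new RegExpLexer({
+    rules: [
+        ['x', "return 'X';"],
+        ['$', "return 'EOF';"]
+    ]
+});
+
+lexer.setInput('x');
+console.log(lexer.lex());    // => 'X'
+```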
+ +Alternatively, the entire `jison-lex` API is also available via the `jison` API itself as can be seen from this internal `jison` code snippet: + +``` +import Lexer from '../packages/jison-lex'; +import ebnfParser from '../packages/ebnf-parser'; +import lexParser from '../packages/lex-parser'; +import grammarPrinter from './util/grammar-printer.js'; +import helpers from '../packages/helpers-lib'; +var rmCommonWS = helpers.rmCommonWS; +var camelCase = helpers.camelCase; +var code_exec = helpers.exec; +import XRegExp from '@gerhobbelt/xregexp'; +import recast from '@gerhobbelt/recast'; +import astUtils from '@gerhobbelt/ast-util'; +import json5 from '@gerhobbelt/json5'; + +// Also export other APIs: the JISON module should act as a 'facade' for the others, +// so applications using the JISON compiler itself can rely on it providing everything +// in a guaranteed compatible version as it allows userland code to use the precise +// same APIs as JISON will be using itself: +Jison.Lexer = Lexer; +Jison.ebnfParser = ebnfParser; +Jison.lexParser = lexParser; +Jison.codeExec = code_exec; +Jison.XRegExp = XRegExp; +Jison.recast = recast; +Jison.astUtils = astUtils; +Jison.JSON5 = json5; +Jison.prettyPrint = grammarPrinter; +Jison.rmCommonWS = rmCommonWS; +Jison.mkStdOptions = mkStdOptions; +Jison.camelCase = camelCase; +Jison.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; +... +Jison.Parser = Parser; + +export default Jison; +``` + +hence you can get at it this way, for example: + +``` +import jisonAPI from 'jison-gho'; +// get a reference to the full `jison-lex` API: +const jisonLexAPI = jisonAPI.Lexer; +``` + + + +## build + +To build the lexer generator yourself, follow the install & build directions of the [monorepo](https://github.com/GerHobbelt/jison). + +> +> ### Note about ES6/rollup usage vs. ES5 +> +> All `dist/` library files are 'self-contained': they include all 'local imports' +> from within this jison monorepo in order to deliver a choice of source files +> for your perusal where you only need to worry about importing **external dependencies** +> (such as `recast`). + +> As such, these `dist/` files **should** be easier to minify and/or use in older +> (ES5) environments. +> +> #### rollup +> +> Iff you use `rollup` or similar tools in an ES6/ES2015/ES2017 setting, then the +> [`package.json::module`](https://github.com/rollup/rollup/wiki/pkg.module) has +> already been set up for you to use the *original sources* instead!
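+>
+> A minimal sketch of such a setup (the plugin and file names here are illustrative assumptions, not part of this repository):
+>
+> ```
+> // rollup.config.js
+> import resolve from 'rollup-plugin-node-resolve';
+>
+> export default {
+>     input: 'src/my-app.js',
+>     output: { file: 'dist/my-app.js', format: 'cjs' },
+>     plugins: [
+>         // prefer the ES6 `package.json::module` entry point over the
+>         // ES5 `main` entry when both are present:
+>         resolve({ module: true })
+>     ]
+> };
+> ```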
+> + ## usage + ``` Usage: jison-lex [file] [options] Options: --version print version and exit ``` -## programatic usage + +## programmatic usage ``` -var JisonLex = require('jison-lex'); +var JisonLex = require('@gerhobbelt/jison-lex'); var grammar = { rules: [ @@ -42,6 +153,29 @@ lexer.lex(); // => 'X' lexer.lex(); // => 'Y' +``` + ## license + MIT + + +## related repositories + +- [jison / jison-gho](https://github.com/GerHobbelt/jison) @ [NPM](https://www.npmjs.com/package/jison-gho) +- [jison-lex](https://github.com/GerHobbelt/jison/tree/master/packages/jison-lex) @ [NPM](https://www.npmjs.com/package/@gerhobbelt/jison-lex) +- [lex-parser](https://github.com/GerHobbelt/jison/tree/master/packages/lex-parser) @ [NPM](https://www.npmjs.com/package/@gerhobbelt/lex-parser) +- [ebnf-parser](https://github.com/GerHobbelt/jison/tree/master/packages/ebnf-parser) @ [NPM](https://www.npmjs.com/package/@gerhobbelt/ebnf-parser) +- [jison2json](https://github.com/GerHobbelt/jison/tree/master/packages/jison2json) @ [NPM](https://www.npmjs.com/package/@gerhobbelt/jison2json) +- [json2jison](https://github.com/GerHobbelt/jison/tree/master/packages/json2jison) @ [NPM](https://www.npmjs.com/package/@gerhobbelt/json2jison) +- [jison-helpers-lib](https://github.com/GerHobbelt/jison/tree/master/packages/helpers-lib) @ [NPM](https://www.npmjs.com/package/jison-helpers-lib) +- ### secondary source repositories + + [jison-lex](https://github.com/GerHobbelt/jison-lex) + + [lex-parser](https://github.com/GerHobbelt/lex-parser) + + [ebnf-parser](https://github.com/GerHobbelt/ebnf-parser) + + [jison2json](https://github.com/GerHobbelt/jison2json) + + [json2jison](https://github.com/GerHobbelt/json2jison) + + [jison-helpers-lib](https://github.com/GerHobbelt/jison-helpers-lib) + diff --git a/__patch_lexer_kernel_in_js.js b/__patch_lexer_kernel_in_js.js new file mode 100644 index 0000000..eb8dbb6 --- /dev/null +++ b/__patch_lexer_kernel_in_js.js @@ -0,0 +1,56 @@ + +const globby = require('globby'); +const fs = require('fs'); + +var kernel = fs.readFileSync('jison-lexer-kernel.js', 'utf8'); +kernel = kernel +.replace(/\\/g, '\\\\') +.replace(/`/g, '\\`') +// strip header comment too: +.replace(/^[^{]*/, '') +.replace(/[\s\r\n]+$/, '') // rtrim() +; + +var errorClassCode = fs.readFileSync('jison-lexer-error-code.js', 'utf8'); +errorClassCode = errorClassCode +.replace(/\\/g, '\\\\') +.replace(/`/g, '\\`') +.trim(); + +globby(['regexp-lexer.js']).then(paths => { + var count = 0; + + //console.log(paths); + paths.forEach(path => { + var updated = false; + + //console.log('path: ', path); + + var src = fs.readFileSync(path, 'utf8'); + src = src + .replace(/(\/\/ --- START lexer kernel ---)[^]+?(\/\/ --- END lexer kernel ---)/, function f(m, p1, p2) { + return p1 + ` +return \`${kernel}\`; + ` + p2; + }) + .replace(/(\/\/ --- START lexer error class ---)[^]+?(\/\/ --- END lexer error class ---)/, function f(m, p1, p2) { + return p1 + ` + +var prelude = \`${errorClassCode}\`; + + ` + p2; + }); + updated = true; + + if (updated) { + count++; + console.log('updated: ', path); + fs.writeFileSync(path, src, { + encoding: 'utf8', + flags: 'w' + }); + } + }); + + console.log('\nUpdated', count, 'files\' lexer kernel core code.'); +}); diff --git a/__patch_nodebang_in_js.js b/__patch_nodebang_in_js.js new file mode 100644 index 0000000..50319f1 --- /dev/null +++ b/__patch_nodebang_in_js.js @@ -0,0 +1,30 @@ + +const globby = require('globby'); +const fs = require('fs'); + + +globby(['dist/cli*.js']).then(paths => { + var count = 0; + 
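+    // Background (illustrative note): a Unix hash-bang only works when
+    // '#!/usr/bin/env node' occupies the very first bytes of the script,
+    // so the patch below strips whatever '#!' line the bundler left
+    // behind before prepending a fresh one.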
//console.log(paths); + paths.forEach(path => { + var updated = false; + + //console.log('path: ', path); + + var src = fs.readFileSync(path, 'utf8'); + src = "#!/usr/bin/env node\n\n\n" + src.replace(/^#![^\n]+/, ''); + updated = true; + + if (updated) { + count++; + console.log('updated: ', path); + fs.writeFileSync(path, src, { + encoding: 'utf8', + flags: 'w' + }); + } + }); + + console.log('\nUpdated', count, 'files\' CLI/node hash-bang'); +}); diff --git a/__patch_version_in_js.js b/__patch_version_in_js.js new file mode 100644 index 0000000..21861e7 --- /dev/null +++ b/__patch_version_in_js.js @@ -0,0 +1,37 @@ + +// fetch the version from package.json and patch the specified files + +const version = require('./package.json').version; +const globby = require('globby'); +const fs = require('fs'); + + +globby(['*lexer*.js', '*cli*.js']).then(paths => { + var count = 0; + + //console.log(paths); + paths.forEach(path => { + var updated = false; + + //console.log('path: ', path); + + var src = fs.readFileSync(path, 'utf8'); + src = src.replace(/^(\s*var version = )([^;]+;)/gm, function repl(s, m1, m2) { + if (m2 !== "'" + version + "';") { + updated = true; + } + return m1 + "'" + version + "';"; + }); + + if (updated) { + count++; + console.log('updated: ', path); + fs.writeFileSync(path, src, { + encoding: 'utf8', + flags: 'w' + }); + } + }); + + console.log('\nUpdated', count, 'files\' version info to version', version); +}); diff --git a/cli.js b/cli.js index 8ca2894..df6ecb0 100755 --- a/cli.js +++ b/cli.js @@ -1,86 +1,235 @@ -#!/usr/bin/env node - -var version = require('./package.json').version; - -var path = require('path'); -var fs = require('fs'); -var lexParser = require('lex-parser'); -var RegExpLexer = require('./regexp-lexer.js'); - - -var opts = require("nomnom") - .script('jison-lex') - .option('file', { - flag: true, - position: 0, - help: 'file containing a lexical grammar' - }) - .option('outfile', { - abbr: 'o', - metavar: 'FILE', - help: 'Filename and base module name of the generated parser' - }) - .option('module-type', { - abbr: 't', - default: 'commonjs', - metavar: 'TYPE', - help: 'The type of module to generate (commonjs, js)' - }) - .option('version', { - abbr: 'V', - flag: true, - help: 'print version and exit', - callback: function() { - return version; - } - }); -exports.main = function (opts) { - if (opts.file) { - var raw = fs.readFileSync(path.normalize(opts.file), 'utf8'), - name = path.basename((opts.outfile||opts.file)).replace(/\..*$/g,''); +import fs from 'fs'; +import path from 'path'; +import nomnom from '@gerhobbelt/nomnom'; - fs.writeFileSync(opts.outfile||(name + '.js'), processGrammar(raw, name)); - } else { - readin(function (raw) { - console.log(processGrammar(raw)); - }); +import RegExpLexer from './regexp-lexer.js'; + +var version = '0.6.1-208'; // require('./package.json').version; + + +function getCommandlineOptions() { + 'use strict'; + + var opts = nomnom + .script('jison-lex') + .unknownOptionTreatment(false) // do not accept unknown options! 
+ .options({ + file: { + flag: true, + position: 0, + help: 'file containing a lexical grammar' + }, + json: { + abbr: 'j', + flag: true, + default: false, + help: 'jison will expect a grammar in either JSON/JSON5 or JISON format: the precise format is autodetected' + }, + outfile: { + abbr: 'o', + metavar: 'FILE', + help : 'Filepath and base module name of the generated parser;\nwhen terminated with a / (dir separator) it is treated as the destination directory where the generated output will be stored' + }, + debug: { + abbr: 'd', + flag: true, + default: false, + help: 'Debug mode' + }, + dumpSourceCodeOnFailure: { + full: 'dump-sourcecode-on-failure', + flag: true, + default: true, + help: 'Dump the generated source code to a special named file when the internal generator tests fail, i.e. when the generated source code does not compile in the JavaScript engine. Enabling this option helps you to diagnose/debug crashes (thrown exceptions) in the code generator due to various reasons: you can, for example, load the dumped sourcecode in another environment (e.g. NodeJS) to get more info on the precise location and cause of the compile failure.' + }, + throwErrorOnCompileFailure: { + full: 'throw-on-compile-failure', + flag: true, + default: true, + help: 'Throw an exception when the generated source code fails to compile in the JavaScript engine. **WARNING**: Turning this feature OFF permits the code generator to produce non-working source code and treat that as SUCCESS. This MAY be desirable code generator behaviour, but only rarely.' + }, + reportStats: { + full: 'info', + abbr: 'I', + flag: true, + default: false, + help: 'Report some statistics about the generated parser' + }, + moduleType: { + full: 'module-type', + abbr: 't', + default: 'commonjs', + metavar: 'TYPE', + choices: ['commonjs', 'amd', 'js', 'es'], + help: 'The type of module to generate (commonjs, amd, es, js)' + }, + moduleName: { + full: 'module-name', + abbr: 'n', + metavar: 'NAME', + help: 'The name of the generated parser object, namespace supported' + }, + main: { + full: 'main', + abbr: 'x', + flag: true, + default: false, + help: 'Include .main() entry point in generated commonjs module' + }, + moduleMain: { + full: 'module-main', + abbr: 'y', + metavar: 'NAME', + help: 'The main module function definition' + }, + version: { + abbr: 'V', + flag: true, + help: 'print version and exit', + callback: function () { + return version; + } + } + }).parse(); + + return opts; +} + +var cli = module.exports; + +cli.main = function cliMain(opts) { + 'use strict'; + + opts = RegExpLexer.mkStdOptions(opts); + + function isDirectory(fp) { + try { + return fs.lstatSync(fp).isDirectory(); + } catch (e) { + return false; + } } -}; -function processGrammar (file, name) { - var grammar; - try { - grammar = lexParser.parse(file); - } catch (e) { + function mkdirp(fp) { + if (!fp || fp === '.' || fp.length === 0) { + return false; + } try { - grammar = JSON.parse(file); - } catch (e2) { - throw e; + fs.mkdirSync(fp); + return true; + } catch (e) { + if (e.code === 'ENOENT') { + var parent = path.dirname(fp); + // Did we hit the root directory by now? If so, abort! + // Else, create the parent; iff that fails, we fail too... 
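+            // Example (illustrative): mkdirp('examples/output') first fails
+            // with ENOENT while 'examples' is missing, recursively creates
+            // 'examples', then retries and succeeds creating 'examples/output'.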
+ if (parent !== fp && mkdirp(parent)) { + try { + // Retry creating the original directory: it should succeed now + fs.mkdirSync(fp); + return true; + } catch (e) { + return false; + } + } + } } + return false; } - var settings = grammar.options || {}; - if (!settings.moduleType) settings.moduleType = opts['module-type']; - if (!settings.moduleName && name) settings.moduleName = name.replace(/-\w/g, function (match){ return match.charAt(1).toUpperCase(); }); + function processInputFile() { + // getting raw files + var lex; + var original_cwd = process.cwd(); - grammar.options = settings; + var raw = fs.readFileSync(path.normalize(opts.file), 'utf8'); - return RegExpLexer.generate(grammar); -} + // making best guess at json mode + opts.json = path.extname(opts.file) === '.json' || opts.json; + + // When only the directory part of the output path was specified, then we + // do NOT have the target module name in there as well! + var outpath = opts.outfile; + if (/[\\\/]$/.test(outpath) || isDirectory(outpath)) { + opts.outfile = null; + outpath = outpath.replace(/[\\\/]$/, ''); + } + if (outpath && outpath.length > 0) { + outpath += '/'; + } else { + outpath = ''; + } + + // setting output file name and module name based on input file name + // if they aren't specified. + var name = path.basename(opts.outfile || opts.file); + + // get the base name (i.e. the file name without extension) + // i.e. strip off only the extension and keep any other dots in the filename + name = path.basename(name, path.extname(name)); -function readin (cb) { - var stdin = process.openStdin(), + opts.outfile = opts.outfile || (outpath + name + '.js'); + if (!opts.moduleName && name) { + opts.moduleName = opts.defaultModuleName = name.replace(/-\w/g, + function (match) { + return match.charAt(1).toUpperCase(); + }); + } + + // Change CWD to the directory where the source grammar resides: this helps us properly + // %include any files mentioned in the grammar with relative paths: + var new_cwd = path.dirname(path.normalize(opts.file)); + process.chdir(new_cwd); + + var lexer = cli.generateLexerString(raw, opts); + + // and change back to the CWD we started out with: + process.chdir(original_cwd); + + mkdirp(path.dirname(opts.outfile)); + fs.writeFileSync(opts.outfile, lexer); + console.log('JISON-LEX output for module [' + opts.moduleName + '] has been written to file:', opts.outfile); + } + + function readin(cb) { + var stdin = process.openStdin(), data = ''; - stdin.setEncoding('utf8'); - stdin.addListener('data', function (chunk) { - data += chunk; - }); - stdin.addListener('end', function () { - cb(data); - }); + stdin.setEncoding('utf8'); + stdin.addListener('data', function (chunk) { + data += chunk; + }); + stdin.addListener('end', function () { + cb(data); + }); + } + + function processStdin() { + readin(function processStdinReadInCallback(raw) { + console.log(cli.generateLexerString(raw, opts)); + }); + } + + // if an input file wasn't given, assume input on stdin + if (opts.file) { + processInputFile(); + } else { + processStdin(); + } +}; + +cli.generateLexerString = function generateLexerString(lexerSpec, opts) { + 'use strict'; + + // var settings = RegExpLexer.mkStdOptions(opts); + var predefined_tokens = null; + + return RegExpLexer.generate(lexerSpec, predefined_tokens, opts); +}; + + +if (require.main === module) { + var opts = getCommandlineOptions(); + cli.main(opts); } -if (require.main === module) - exports.main(opts.parse()); diff --git a/dist/cli-cjs-es5.js b/dist/cli-cjs-es5.js new file mode 
100644 index 0000000..4594750 --- /dev/null +++ b/dist/cli-cjs-es5.js @@ -0,0 +1,2775 @@ +#!/usr/bin/env node + + +'use strict'; + +var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) { return typeof obj; } : function (obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }; + +var _templateObject = _taggedTemplateLiteral(['\n var __hacky_counter__ = 0;\n\n /**\n * @constructor\n * @nocollapse\n */\n function XRegExp(re, f) {\n this.re = re;\n this.flags = f;\n this._getUnicodeProperty = function (k) {};\n var fake = /./; // WARNING: this exact \'fake\' is also depended upon by the xregexp unit test!\n __hacky_counter__++;\n fake.__hacky_backy__ = __hacky_counter__;\n return fake;\n }\n '], ['\n var __hacky_counter__ = 0;\n\n /**\n * @constructor\n * @nocollapse\n */\n function XRegExp(re, f) {\n this.re = re;\n this.flags = f;\n this._getUnicodeProperty = function (k) {};\n var fake = /./; // WARNING: this exact \'fake\' is also depended upon by the xregexp unit test!\n __hacky_counter__++;\n fake.__hacky_backy__ = __hacky_counter__;\n return fake;\n }\n ']), + _templateObject2 = _taggedTemplateLiteral(['\n return ', ';\n'], ['\n return ', ';\n']), + _templateObject3 = _taggedTemplateLiteral(['\n // Code Generator Information Report\n // ---------------------------------\n //\n // Options:\n //\n // backtracking: .................... ', '\n // location.ranges: ................. ', '\n // location line+column tracking: ... ', '\n //\n //\n // Forwarded Parser Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses lexer values: ............... ', ' / ', '\n // location tracking: ............... ', '\n // location assignment: ............. ', '\n //\n //\n // Lexer Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses ParseError API: ............. ', '\n // uses yyerror: .................... ', '\n // uses location tracking & editing: ', '\n // uses more() API: ................. ', '\n // uses unput() API: ................ ', '\n // uses reject() API: ............... ', '\n // uses less() API: ................. ', '\n // uses display APIs pastInput(), upcomingInput(), showPosition():\n // ............................. ', '\n // uses describeYYLLOC() API: ....... ', '\n //\n // --------- END OF REPORT -----------\n\n '], ['\n // Code Generator Information Report\n // ---------------------------------\n //\n // Options:\n //\n // backtracking: .................... ', '\n // location.ranges: ................. ', '\n // location line+column tracking: ... ', '\n //\n //\n // Forwarded Parser Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses lexer values: ............... ', ' / ', '\n // location tracking: ............... ', '\n // location assignment: ............. ', '\n //\n //\n // Lexer Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... 
', '\n // uses ParseError API: ............. ', '\n // uses yyerror: .................... ', '\n // uses location tracking & editing: ', '\n // uses more() API: ................. ', '\n // uses unput() API: ................ ', '\n // uses reject() API: ............... ', '\n // uses less() API: ................. ', '\n // uses display APIs pastInput(), upcomingInput(), showPosition():\n // ............................. ', '\n // uses describeYYLLOC() API: ....... ', '\n //\n // --------- END OF REPORT -----------\n\n ']), + _templateObject4 = _taggedTemplateLiteral(['\n var lexer = {\n '], ['\n var lexer = {\n ']), + _templateObject5 = _taggedTemplateLiteral([',\n JisonLexerError: JisonLexerError,\n performAction: ', ',\n simpleCaseActionClusters: ', ',\n rules: [\n ', '\n ],\n conditions: ', '\n };\n '], [',\n JisonLexerError: JisonLexerError,\n performAction: ', ',\n simpleCaseActionClusters: ', ',\n rules: [\n ', '\n ],\n conditions: ', '\n };\n ']), + _templateObject6 = _taggedTemplateLiteral(['\n /* lexer generated by jison-lex ', ' */\n\n /*\n * Returns a Lexer object of the following structure:\n *\n * Lexer: {\n * yy: {} The so-called "shared state" or rather the *source* of it;\n * the real "shared state" `yy` passed around to\n * the rule actions, etc. is a direct reference!\n *\n * This "shared context" object was passed to the lexer by way of \n * the `lexer.setInput(str, yy)` API before you may use it.\n *\n * This "shared context" object is passed to the lexer action code in `performAction()`\n * so userland code in the lexer actions may communicate with the outside world \n * and/or other lexer rules\' actions in more or less complex ways.\n *\n * }\n *\n * Lexer.prototype: {\n * EOF: 1,\n * ERROR: 2,\n *\n * yy: The overall "shared context" object reference.\n *\n * JisonLexerError: function(msg, hash),\n *\n * performAction: function lexer__performAction(yy, yyrulenumber, YY_START),\n *\n * The function parameters and `this` have the following value/meaning:\n * - `this` : reference to the `lexer` instance. \n * `yy_` is an alias for `this` lexer instance reference used internally.\n *\n * - `yy` : a reference to the `yy` "shared state" object which was passed to the lexer\n * by way of the `lexer.setInput(str, yy)` API before.\n *\n * Note:\n * The extra arguments you specified in the `%parse-param` statement in your\n * **parser** grammar definition file are passed to the lexer via this object\n * reference as member variables.\n *\n * - `yyrulenumber` : index of the matched lexer rule (regex), used internally.\n *\n * - `YY_START`: the current lexer "start condition" state.\n *\n * parseError: function(str, hash, ExceptionClass),\n *\n * constructLexErrorInfo: function(error_message, is_recoverable),\n * Helper function.\n * Produces a new errorInfo \'hash object\' which can be passed into `parseError()`.\n * See it\'s use in this lexer kernel in many places; example usage:\n *\n * var infoObj = lexer.constructParseErrorInfo(\'fail!\', true);\n * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);\n *\n * options: { ... lexer %options ... 
},\n *\n * lex: function(),\n * Produce one token of lexed input, which was passed in earlier via the `lexer.setInput()` API.\n * You MAY use the additional `args...` parameters as per `%parse-param` spec of the **lexer** grammar:\n * these extra `args...` are added verbatim to the `yy` object reference as member variables.\n *\n * WARNING:\n * Lexer\'s additional `args...` parameters (via lexer\'s `%parse-param`) MAY conflict with\n * any attributes already added to `yy` by the **parser** or the jison run-time; \n * when such a collision is detected an exception is thrown to prevent the generated run-time \n * from silently accepting this confusing and potentially hazardous situation! \n *\n * cleanupAfterLex: function(do_not_nuke_errorinfos),\n * Helper function.\n *\n * This helper API is invoked when the **parse process** has completed: it is the responsibility\n * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. \n *\n * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.\n *\n * setInput: function(input, [yy]),\n *\n *\n * input: function(),\n *\n *\n * unput: function(str),\n *\n *\n * more: function(),\n *\n *\n * reject: function(),\n *\n *\n * less: function(n),\n *\n *\n * pastInput: function(n),\n *\n *\n * upcomingInput: function(n),\n *\n *\n * showPosition: function(),\n *\n *\n * test_match: function(regex_match_array, rule_index),\n *\n *\n * next: function(),\n *\n *\n * begin: function(condition),\n *\n *\n * pushState: function(condition),\n *\n *\n * popState: function(),\n *\n *\n * topState: function(),\n *\n *\n * _currentRules: function(),\n *\n *\n * stateStackSize: function(),\n *\n *\n * performAction: function(yy, yy_, yyrulenumber, YY_START),\n *\n *\n * rules: [...],\n *\n *\n * conditions: {associative list: name ==> set},\n * }\n *\n *\n * token location info (`yylloc`): {\n * first_line: n,\n * last_line: n,\n * first_column: n,\n * last_column: n,\n * range: [start_number, end_number]\n * (where the numbers are indexes into the input string, zero-based)\n * }\n *\n * ---\n *\n * The `parseError` function receives a \'hash\' object with these members for lexer errors:\n *\n * {\n * text: (matched text)\n * token: (the produced terminal token, if any)\n * token_id: (the produced terminal token numeric ID, if any)\n * line: (yylineno)\n * loc: (yylloc)\n * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n * available for this particular error)\n * yy: (object: the current parser internal "shared state" `yy`\n * as is also available in the rule actions; this can be used,\n * for instance, for advanced error analysis and reporting)\n * lexer: (reference to the current lexer instance used by the parser)\n * }\n *\n * while `this` will reference the current lexer instance.\n *\n * When `parseError` is invoked by the lexer, the default implementation will\n * attempt to invoke `yy.parser.parseError()`; when this callback is not provided\n * it will try to invoke `yy.parseError()` instead. 
When that callback is also not\n * provided, a `JisonLexerError` exception will be thrown containing the error\n * message and `hash`, as constructed by the `constructLexErrorInfo()` API.\n *\n * Note that the lexer\'s `JisonLexerError` error class is passed via the\n * `ExceptionClass` argument, which is invoked to construct the exception\n * instance to be thrown, so technically `parseError` will throw the object\n * produced by the `new ExceptionClass(str, hash)` JavaScript expression.\n *\n * ---\n *\n * You can specify lexer options by setting / modifying the `.options` object of your Lexer instance.\n * These options are available:\n *\n * (Options are permanent.)\n * \n * yy: {\n * parseError: function(str, hash, ExceptionClass)\n * optional: overrides the default `parseError` function.\n * }\n *\n * lexer.options: {\n * pre_lex: function()\n * optional: is invoked before the lexer is invoked to produce another token.\n * `this` refers to the Lexer object.\n * post_lex: function(token) { return token; }\n * optional: is invoked when the lexer has produced a token `token`;\n * this function can override the returned token value by returning another.\n * When it does not return any (truthy) value, the lexer will return\n * the original `token`.\n * `this` refers to the Lexer object.\n *\n * WARNING: the next set of options are not meant to be changed. They echo the abilities of\n * the lexer as per when it was compiled!\n *\n * ranges: boolean\n * optional: `true` ==> token location info will include a .range[] member.\n * flex: boolean\n * optional: `true` ==> flex-like lexing behaviour where the rules are tested\n * exhaustively to find the longest match.\n * backtrack_lexer: boolean\n * optional: `true` ==> lexer regexes are tested in order and for invoked;\n * the lexer terminates the scan when a token is returned by the action code.\n * xregexp: boolean\n * optional: `true` ==> lexer rule regexes are "extended regex format" requiring the\n * `XRegExp` library. When this %option has not been specified at compile time, all lexer\n * rule regexes have been written as standard JavaScript RegExp expressions.\n * }\n */\n '], ['\n /* lexer generated by jison-lex ', ' */\n\n /*\n * Returns a Lexer object of the following structure:\n *\n * Lexer: {\n * yy: {} The so-called "shared state" or rather the *source* of it;\n * the real "shared state" \\`yy\\` passed around to\n * the rule actions, etc. is a direct reference!\n *\n * This "shared context" object was passed to the lexer by way of \n * the \\`lexer.setInput(str, yy)\\` API before you may use it.\n *\n * This "shared context" object is passed to the lexer action code in \\`performAction()\\`\n * so userland code in the lexer actions may communicate with the outside world \n * and/or other lexer rules\' actions in more or less complex ways.\n *\n * }\n *\n * Lexer.prototype: {\n * EOF: 1,\n * ERROR: 2,\n *\n * yy: The overall "shared context" object reference.\n *\n * JisonLexerError: function(msg, hash),\n *\n * performAction: function lexer__performAction(yy, yyrulenumber, YY_START),\n *\n * The function parameters and \\`this\\` have the following value/meaning:\n * - \\`this\\` : reference to the \\`lexer\\` instance. 
\n * \\`yy_\\` is an alias for \\`this\\` lexer instance reference used internally.\n *\n * - \\`yy\\` : a reference to the \\`yy\\` "shared state" object which was passed to the lexer\n * by way of the \\`lexer.setInput(str, yy)\\` API before.\n *\n * Note:\n * The extra arguments you specified in the \\`%parse-param\\` statement in your\n * **parser** grammar definition file are passed to the lexer via this object\n * reference as member variables.\n *\n * - \\`yyrulenumber\\` : index of the matched lexer rule (regex), used internally.\n *\n * - \\`YY_START\\`: the current lexer "start condition" state.\n *\n * parseError: function(str, hash, ExceptionClass),\n *\n * constructLexErrorInfo: function(error_message, is_recoverable),\n * Helper function.\n * Produces a new errorInfo \\\'hash object\\\' which can be passed into \\`parseError()\\`.\n * See it\\\'s use in this lexer kernel in many places; example usage:\n *\n * var infoObj = lexer.constructParseErrorInfo(\\\'fail!\\\', true);\n * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);\n *\n * options: { ... lexer %options ... },\n *\n * lex: function(),\n * Produce one token of lexed input, which was passed in earlier via the \\`lexer.setInput()\\` API.\n * You MAY use the additional \\`args...\\` parameters as per \\`%parse-param\\` spec of the **lexer** grammar:\n * these extra \\`args...\\` are added verbatim to the \\`yy\\` object reference as member variables.\n *\n * WARNING:\n * Lexer\'s additional \\`args...\\` parameters (via lexer\'s \\`%parse-param\\`) MAY conflict with\n * any attributes already added to \\`yy\\` by the **parser** or the jison run-time; \n * when such a collision is detected an exception is thrown to prevent the generated run-time \n * from silently accepting this confusing and potentially hazardous situation! \n *\n * cleanupAfterLex: function(do_not_nuke_errorinfos),\n * Helper function.\n *\n * This helper API is invoked when the **parse process** has completed: it is the responsibility\n * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. 
\n *\n * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.\n *\n * setInput: function(input, [yy]),\n *\n *\n * input: function(),\n *\n *\n * unput: function(str),\n *\n *\n * more: function(),\n *\n *\n * reject: function(),\n *\n *\n * less: function(n),\n *\n *\n * pastInput: function(n),\n *\n *\n * upcomingInput: function(n),\n *\n *\n * showPosition: function(),\n *\n *\n * test_match: function(regex_match_array, rule_index),\n *\n *\n * next: function(),\n *\n *\n * begin: function(condition),\n *\n *\n * pushState: function(condition),\n *\n *\n * popState: function(),\n *\n *\n * topState: function(),\n *\n *\n * _currentRules: function(),\n *\n *\n * stateStackSize: function(),\n *\n *\n * performAction: function(yy, yy_, yyrulenumber, YY_START),\n *\n *\n * rules: [...],\n *\n *\n * conditions: {associative list: name ==> set},\n * }\n *\n *\n * token location info (\\`yylloc\\`): {\n * first_line: n,\n * last_line: n,\n * first_column: n,\n * last_column: n,\n * range: [start_number, end_number]\n * (where the numbers are indexes into the input string, zero-based)\n * }\n *\n * ---\n *\n * The \\`parseError\\` function receives a \\\'hash\\\' object with these members for lexer errors:\n *\n * {\n * text: (matched text)\n * token: (the produced terminal token, if any)\n * token_id: (the produced terminal token numeric ID, if any)\n * line: (yylineno)\n * loc: (yylloc)\n * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n * available for this particular error)\n * yy: (object: the current parser internal "shared state" \\`yy\\`\n * as is also available in the rule actions; this can be used,\n * for instance, for advanced error analysis and reporting)\n * lexer: (reference to the current lexer instance used by the parser)\n * }\n *\n * while \\`this\\` will reference the current lexer instance.\n *\n * When \\`parseError\\` is invoked by the lexer, the default implementation will\n * attempt to invoke \\`yy.parser.parseError()\\`; when this callback is not provided\n * it will try to invoke \\`yy.parseError()\\` instead. 
When that callback is also not\n * provided, a \\`JisonLexerError\\` exception will be thrown containing the error\n * message and \\`hash\\`, as constructed by the \\`constructLexErrorInfo()\\` API.\n *\n * Note that the lexer\\\'s \\`JisonLexerError\\` error class is passed via the\n * \\`ExceptionClass\\` argument, which is invoked to construct the exception\n * instance to be thrown, so technically \\`parseError\\` will throw the object\n * produced by the \\`new ExceptionClass(str, hash)\\` JavaScript expression.\n *\n * ---\n *\n * You can specify lexer options by setting / modifying the \\`.options\\` object of your Lexer instance.\n * These options are available:\n *\n * (Options are permanent.)\n * \n * yy: {\n * parseError: function(str, hash, ExceptionClass)\n * optional: overrides the default \\`parseError\\` function.\n * }\n *\n * lexer.options: {\n * pre_lex: function()\n * optional: is invoked before the lexer is invoked to produce another token.\n * \\`this\\` refers to the Lexer object.\n * post_lex: function(token) { return token; }\n * optional: is invoked when the lexer has produced a token \\`token\\`;\n * this function can override the returned token value by returning another.\n * When it does not return any (truthy) value, the lexer will return\n * the original \\`token\\`.\n * \\`this\\` refers to the Lexer object.\n *\n * WARNING: the next set of options are not meant to be changed. They echo the abilities of\n * the lexer as per when it was compiled!\n *\n * ranges: boolean\n * optional: \\`true\\` ==> token location info will include a .range[] member.\n * flex: boolean\n * optional: \\`true\\` ==> flex-like lexing behaviour where the rules are tested\n * exhaustively to find the longest match.\n * backtrack_lexer: boolean\n * optional: \\`true\\` ==> lexer regexes are tested in order and for invoked;\n * the lexer terminates the scan when a token is returned by the action code.\n * xregexp: boolean\n * optional: \\`true\\` ==> lexer rule regexes are "extended regex format" requiring the\n * \\`XRegExp\\` library. When this %option has not been specified at compile time, all lexer\n * rule regexes have been written as standard JavaScript RegExp expressions.\n * }\n */\n ']), + _templateObject7 = _taggedTemplateLiteral(['\n export {\n lexer,\n yylex as lex\n };\n '], ['\n export {\n lexer,\n yylex as lex\n };\n ']); + +function _taggedTemplateLiteral(strings, raw) { return Object.freeze(Object.defineProperties(strings, { raw: { value: Object.freeze(raw) } })); } + +function _interopDefault(ex) { + return ex && (typeof ex === 'undefined' ? 'undefined' : _typeof(ex)) === 'object' && 'default' in ex ? ex['default'] : ex; +} + +var fs = _interopDefault(require('fs')); +var path = _interopDefault(require('path')); +var nomnom = _interopDefault(require('@gerhobbelt/nomnom')); +var XRegExp = _interopDefault(require('@gerhobbelt/xregexp')); +var json5 = _interopDefault(require('@gerhobbelt/json5')); +var lexParser = _interopDefault(require('@gerhobbelt/lex-parser')); +var assert = _interopDefault(require('assert')); +var helpers = _interopDefault(require('jison-helpers-lib')); + +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. 
to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +var XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +var CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; +var SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +var NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +var SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + +var UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + +// The expanded regex sets which are equivalent to the given `\\{c}` escapes: +// +// `/\s/`: +var WHITESPACE_SETSTR$1 = ' \f\n\r\t\x0B\xA0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\uFEFF'; +// `/\d/`: +var DIGIT_SETSTR$1 = '0-9'; +// `/\w/`: +var WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + +// Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex +function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: + // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: + // '[' + return '\\['; + + case 92: + // '\\' + return '\\\\'; + + case 93: + // ']' + return '\\]'; + + case 94: + // ']' + return '\\^'; + } + if (i < 32 || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || i >= 0xD800 && i <= 0xDFFF /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go... + x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); +} + +// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating +// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a +// `\\p{NAME}` shorthand to represent [part of] the bitarray: +var Pcodes_bitarray_cache = {}; +var Pcodes_bitarray_cache_test_order = []; + +// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by +// a single regex 'escape', e.g. `\d` for digits 0-9. +var EscCode_bitarray_output_refs; + +// now initialize the EscCodes_... 
table above: +init_EscCode_lookup_table(); + +function init_EscCode_lookup_table() { + var s, + bitarr, + set2esc = {}, + esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); +} + +function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. + // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as than this pcode + // won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... 
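+    // Example (illustrative): every character matched by \d (digits 0-9) is
+    // also matched by \w and \S, so \d has no uniquely identifying character
+    // and must be picked up by the minimum-span fallback loop further below.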
+ var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; +} + +// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. +function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. \\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for characters set. Inversion means the complement set of character is st instead. + // This results in an OR operations when sets are joined/chained. 
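+    // Example (illustrative): for the inverted set '^a-c' the loop below marks
+    // 'a'..'c' in the scratch bitarray; the final inversion pass then flags
+    // every other codepoint in the caller's bitarr, so chained set fragments
+    // still OR together as described above.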
+ + while (s.length) { + c1 = s.match(CHR_RE$1); + if (!c1) { + // hit an illegal escape sequence? cope anyway! + c1 = s[0]; + } else { + c1 = c1[0]; + // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those + // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit + // XRegExp support, but alas, we'll get there when we get there... ;-) + switch (c1) { + case '\\p': + s = s.substr(c1.length); + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + // do we have this one cached already? + var pex = c1 + c2; + var ba4p = Pcodes_bitarray_cache[pex]; + if (!ba4p) { + // expand escape: + var xr = new XRegExp('[' + pex + ']'); // TODO: case-insensitive grammar??? + // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`: + var xs = '' + xr; + // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside: + xs = xs.substr(1, xs.length - 2); + + ba4p = reduceRegexToSetBitArray(xs, pex, opts); + + Pcodes_bitarray_cache[pex] = ba4p; + updatePcodesBitarrayCacheTestOrder(opts); + } + // merge bitarrays: + add2bitarray(bitarr, ba4p); + continue; + } + break; + + case '\\S': + case '\\s': + case '\\W': + case '\\w': + case '\\d': + case '\\D': + // these can't participate in a range, but need to be treated special: + s = s.substr(c1.length); + // check for \S, \s, \D, \d, \W, \w and expand them: + var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]]; + assert(ba4e); + add2bitarray(bitarr, ba4e); + continue; + + case '\\b': + // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace + c1 = '\b'; + break; + } + } + var v1 = eval_escaped_code(c1); + // propagate deferred exceptions = error reports. + if (v1 instanceof Error) { + return v1; + } + v1 = v1.charCodeAt(0); + s = s.substr(c1.length); + + if (s[0] === '-' && s.length >= 2) { + // we can expect a range like 'a-z': + s = s.substr(1); + c2 = s.match(CHR_RE$1); + if (!c2) { + // hit an illegal escape sequence? cope anyway! + c2 = s[0]; + } else { + c2 = c2[0]; + } + var v2 = eval_escaped_code(c2); + // propagate deferred exceptions = error reports. + if (v2 instanceof Error) { + return v1; + } + v2 = v2.charCodeAt(0); + s = s.substr(c2.length); + + // legal ranges go UP, not /DOWN! + if (v1 <= v2) { + mark(v1, v2); + } else { + console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 }); + mark(v1); + mark('-'.charCodeAt(0)); + mark(v2); + } + continue; + } + mark(v1); + } + + // When we have marked all slots, '^' NEGATES the set, hence we flip all slots. + // + // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK + // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire + // range then. + if (set_is_inverted) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!bitarr[i]) { + bitarr_orig[i] = true; + } + } + } + } + return false; +} + +// convert a simple bitarray back into a regex set `[...]` content: +function bitarray2set(l, output_inverted_variant, output_minimized) { + // construct the inverse(?) 
set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... 
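+        //
+        // A hedged illustration of the two output variants (commented out, not executed):
+        //
+        //     var ba = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1);
+        //     set2bitarray(ba, '0-9', {});
+        //     bitarray2set(ba, false, true);    // content of the set itself, e.g. '\\d'
+        //     bitarray2set(ba, true, true);     // content of the *complement* set
+        //
+        // Either way the caller is expected to wrap the returned content in `[...]`
+        // (or `[^...]`) itself; see `produceOptimizedRegex4Set()` below.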
+        cnt = 0;
+        for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) {
+            if (l[i]) {
+                cnt++;
+            }
+        }
+        if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) {
+            // When we find the entire Unicode range is in the output match set, we replace this with
+            // a shorthand regex: `[\S\s]`
+            return '\\S\\s';
+        } else if (cnt === 0) {
+            // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`.
+            return '^\\S\\s';
+        }
+
+        // Now see if we can replace several bits by an escape / pcode:
+        if (output_minimized) {
+            lut = Pcodes_bitarray_cache_test_order;
+            for (tn = 0; lut[tn]; tn++) {
+                tspec = lut[tn];
+                // check if the uniquely identifying char is in the set:
+                if (l[tspec[0]]) {
+                    // check if the pcode is covered by the set:
+                    pcode = tspec[1];
+                    ba4pcode = Pcodes_bitarray_cache[pcode];
+                    match = 0;
+                    for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                        if (ba4pcode[j]) {
+                            if (l[j]) {
+                                // match in current bitset, i.e. there's at
+                                // least one 'new' bit covered by this pcode/escape:
+                                match++;
+                            } else if (!l_orig[j]) {
+                                // mismatch!
+                                match = false;
+                                break;
+                            }
+                        }
+                    }
+
+                    // We're only interested in matches which actually cover some
+                    // yet uncovered bits: `match !== 0 && match !== false`.
+                    //
+                    // Apply the heuristic that the pcode/escape is only going to be used
+                    // when it covers *more* characters than its own identifier's length:
+                    if (match && match > pcode.length) {
+                        rv.push(pcode);
+
+                        // and nuke the bits in the array which match the given pcode:
+                        // make sure these edits are visible outside this function as
+                        // `l` is an INPUT parameter (~ not modified)!
+                        if (!bitarr_is_cloned) {
+                            l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1);
+                            for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                                l2[j] = l[j] && !ba4pcode[j];
+                            }
+                            // recreate sentinel
+                            l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1;
+                            l = l2;
+                            bitarr_is_cloned = true;
+                        } else {
+                            for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                                l[j] = l[j] && !ba4pcode[j];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        i = 0;
+        while (i <= UNICODE_BASE_PLANE_MAX_CP$1) {
+            // find first character in original set:
+            while (!l[i]) {
+                i++;
+            }
+            if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) {
+                break;
+            }
+            // find next character not in original set:
+            for (j = i + 1; l[j]; j++) {} /* empty loop */
+            if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) {
+                j = UNICODE_BASE_PLANE_MAX_CP$1 + 1;
+            }
+            // generate subset:
+            rv.push(i2c(i));
+            if (j - 1 > i) {
+                rv.push((j - 2 > i ? '-' : '') + i2c(j - 1));
+            }
+            i = j;
+        }
+    }
+
+    assert(rv.length);
+    var s = rv.join('');
+    assert(s);
+
+    // Check if the set is better represented by one of the regex escapes:
+    var esc4s = EscCode_bitarray_output_refs.set2esc[s];
+    if (esc4s) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return '\\' + esc4s;
+    }
+    return s;
+}
+
+// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip the outer `[...]` when it's there;
+// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`.
+function reduceRegexToSetBitArray(s, name, opts) {
+    var orig = s;
+
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+
+    var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1);
+    var internal_state = 0;
+    var derr;
+
+    while (s.length) {
+        var c1 = s.match(CHR_RE$1);
+        if (!c1) {
+            // cope with illegal escape sequences too!
+ return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"'); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + while (s.length) { + var inner = s.match(SET_PART_RE$1); + if (!inner) { + inner = s.match(CHR_RE$1); + if (!inner) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: ' + s + '" of regex "' + orig + '"'); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + var c2 = s.match(CHR_RE$1); + if (!c2) { + // cope with illegal escape sequences too! + return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error('regex set expression is broken in regex: ' + orig); + } + s = s.substr(c2.length); + + var se = set_content.join(''); + if (!internal_state) { + derr = set2bitarray(l, se, opts); + // propagate deferred exceptions = error reports. + if (derr instanceof Error) { + return derr; + } + + // a set is to use like a single character in a longer literal phrase, hence input `[abc]word[def]` would thus produce output `[abc]`: + internal_state = 1; + } + break; + + // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into + // something ready for use inside a regex set, e.g. `\\r\\n`. + // + // > Of course, we realize that converting more complex piped constructs this way + // > will produce something you might not expect, e.g. `A|WORD2` which + // > would end up as the set `[AW]` which is something else than the input + // > entirely. + // > + // > However, we can only depend on the user (grammar writer) to realize this and + // > prevent this from happening by not creating such oddities in the input grammar. + case '|': + // a|b --> [ab] + internal_state = 0; + break; + + case '(': + // (a) --> a + // + // TODO - right now we treat this as 'too complex': + + // Strip off some possible outer wrappers which we know how to remove. + // We don't worry about 'damaging' the regex as any too-complex regex will be caught + // in the validation check at the end; our 'strippers' here would not damage useful + // regexes anyway and them damaging the unacceptable ones is fine. + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... + s = s.replace(/^\^?(.*?)\$?$/, '$1'); // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: one before, once after this) + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... + + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + case '.': + case '*': + case '+': + case '?': + // wildcard + // + // TODO - right now we treat this as 'too complex': + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + case '{': + // range, e.g. `x{1,3}`, or macro? 
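+            // (For example, a hypothetical macro defined as `a{2,4}` is rejected for
+            // *set* usage by the error below, while it may still be perfectly usable
+            // in the non-set expansion path; see `reduceRegex()` and `prepareMacros()`
+            // further below.)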
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        default:
+            // literal character or word: take the first character only and ignore the rest, so that
+            // the constructed set for `word|noun` would be `[wn]`:
+            if (!internal_state) {
+                derr = set2bitarray(l, c1, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                internal_state = 2;
+            }
+            break;
+        }
+    }
+
+    s = bitarray2set(l);
+
+    // When this result is suitable for use in a set, then we should be able to compile
+    // it into a regex; that way we can easily validate whether macro X is fit to be used
+    // inside a regex set:
+    try {
+        var re;
+        assert(s);
+        assert(!(s instanceof Error));
+        re = new XRegExp('[' + s + ']');
+        re.test(s[0]);
+
+        // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`,
+        // so we check for lingering UNESCAPED brackets in here as those cannot be legal:
+        if (/[^\\][\[\]]/.exec(s)) {
+            throw new Error('unescaped brackets in set data');
+        }
+    } catch (ex) {
+        // make sure we produce a set range expression which will fail badly when it is used
+        // in actual code:
+        s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+    }
+
+    assert(s);
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+    return l;
+}
+
+// Convert a bitarray representing, for example, `'0-9'` to the regex string `[0-9]`
+// -- or in this example it can be further optimized to only `\d`!
+function produceOptimizedRegex4Set(bitarr) {
+    // First try to produce a minimum regex from the bitarray directly:
+    var s1 = bitarray2set(bitarr, false, true);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s1;
+    } else {
+        s1 = '[' + s1 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s2 = bitarray2set(bitarr, true, true);
+
+    if (s2[0] === '^') {
+        s2 = s2.substr(1);
+        if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s2;
+        }
+    } else {
+        s2 = '^' + s2;
+    }
+    s2 = '[' + s2 + ']';
+
+    // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end,
+    // we also check against the plain, unadulterated regex set expressions:
+    //
+    // First try to produce a minimum regex from the bitarray directly:
+    var s3 = bitarray2set(bitarr, false, false);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s3.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s3;
+    } else {
+        s3 = '[' + s3 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+ var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; +} + +var setmgmt = { + XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1, + CHR_RE: CHR_RE$1, + SET_PART_RE: SET_PART_RE$1, + NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1, + SET_IS_SINGLE_PCODE_RE: SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1, + + WHITESPACE_SETSTR: WHITESPACE_SETSTR$1, + DIGIT_SETSTR: DIGIT_SETSTR$1, + WORDCHAR_SETSTR: WORDCHAR_SETSTR$1, + + set2bitarray: set2bitarray, + bitarray2set: bitarray2set, + produceOptimizedRegex4Set: produceOptimizedRegex4Set, + reduceRegexToSetBitArray: reduceRegexToSetBitArray +}; + +// Basic Lexer implemented using JavaScript regular expressions +// Zachary Carter +// MIT Licensed + +var rmCommonWS = helpers.rmCommonWS; +var camelCase = helpers.camelCase; +var code_exec = helpers.exec; +// import recast from '@gerhobbelt/recast'; +// import astUtils from '@gerhobbelt/ast-util'; +var version$1 = '0.6.1-205'; // require('./package.json').version; + + +var XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +var CHR_RE = setmgmt.CHR_RE; +var SET_PART_RE = setmgmt.SET_PART_RE; +var NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; +var UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + +// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) +// +// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! +var ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + +// see also ./lib/cli.js +/** +@public +@nocollapse +*/ +var defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. start+end indexes in the input string + trackPosition: true, // track line+column position in the input string + caseInsensitive: false, + showSource: false, + exportSourceCode: false, + exportAST: false, + prettyCfg: true, + pre_lex: undefined, + post_lex: undefined +}; + +// Merge sets of options. +// +// Convert alternative jison option names to their base option. +// +// The *last* option set which overrides the default wins, where 'override' is +// defined as specifying a not-undefined value which is not equal to the +// default value. +// +// When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the +// default values avialable in Jison.defaultJisonOptions. +// +// Return a fresh set of options. 
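+//
+// A hedged usage sketch (commented out; see `defaultJisonLexOptions` above for the
+// available option names):
+//
+//     var opts = mkStdOptions({ xregexp: true }, { 'case-insensitive': true });
+//     opts.xregexp;           // => true  (overrides the `false` default)
+//     opts.caseInsensitive;   // => true  (dashed names are camelCased on input)
+//
+//     // "NODEFAULT" as the first argument skips mixing in the defaults:
+//     var bare = mkStdOptions("NODEFAULT", { debug: true });   // => { debug: true }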
+/** @public */ +function mkStdOptions() /*...args*/{ + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; +} + +// set up export/output attributes of the `options` object instance +function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || (typeof exportSourceCode === 'undefined' ? 'undefined' : _typeof(exportSourceCode)) !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; +} + +// Autodetect if the input lexer spec is in JSON or JISON +// format when the `options.json` flag is `true`. +// +// Produce the JSON lexer spec result when these are JSON formatted already as that +// would save us the trouble of doing this again, anywhere else in the JISON +// compiler/generator. +// +// Otherwise return the *parsed* lexer spec as it has +// been processed through LexParser. +function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! 
Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} + +// expand macros and convert matchers to RegExp's +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, + i, + k, + rule, + action, + conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. 
+ // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count + }; +} + +// expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or +// elsewhere, which requires two different treatments to expand these macros. +function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. 
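+            // E.g. both `{DIGIT}` (a hypothetical lexer macro) and `\p{Number}` (an
+            // XRegExp escape) contain a `{`, so either one marks this set as potentially
+            // expandable and we then stick with the optimized rewrite `s1` computed above.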
+ var has_expansions = se.indexOf('{') >= 0; + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. + c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; +} + +// expand macros within macros and cache the result +function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. 
+ // + // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}` + // macros here: + if (XRegExp._getUnicodeProperty(k)) { + // Work-around so that you can use `\p{ascii}` for a XRegExp slug, a.k.a. + // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories, + // while using `\p{ASCII}` as a *macro expansion* of the `ASCII` + // macro: + if (k.toUpperCase() !== k) { + m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.'); + break; + } + } + + a = m.split('{' + k + '}'); + if (a.length > 1) { + var x = expandMacroInSet(k); + assert(x); + if (x instanceof Error) { + m = x; + break; + } + m = a.join(x); + } + } + } + } + + var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts); + + var s1; + + // propagate deferred exceptions = error reports. + if (mba instanceof Error) { + s1 = mba; + } else { + s1 = setmgmt.bitarray2set(mba, false); + + m = s1; + } + + macros[i] = { + in_set: s1, + elsewhere: null, + raw: dict_macros[i] + }; + } else { + m = macros[i].in_set; + + if (m instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + return new Error(m.message); + } + + // detect definition loop: + if (m === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + } + + return m; + } + + function expandMacroElsewhere(i) { + var k, a, m; + + if (macros[i].elsewhere == null) { + m = dict_macros[i]; + + // set up our own record so we can detect definition loops: + macros[i].elsewhere = false; + + // the macro MAY contain other macros which MAY be inside a `[...]` set in this + // macro or elsewhere, hence we must parse the regex: + m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (m instanceof Error) { + return m; + } + + macros[i].elsewhere = m; + } else { + m = macros[i].elsewhere; + + if (m instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + return m; + } + + // detect definition loop: + if (m === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + } + + return m; + } + + function expandAllMacrosInSet(s) { + var i, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroInSet(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message); + } + s = a.join(x); + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, x; + + // When we process the remaining macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. 
+ // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + // These are all submacro expansions, hence non-capturing grouping is applied: + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroElsewhere(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message); + } + s = a.join('(?:' + x + ')'); + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + var m, i; + + if (opts.debug) console.log('\n############## RAW macros: ', dict_macros); + + // first we create the part of the dictionary which is targeting the use of macros + // *inside* `[...]` sets; once we have completed that half of the expansions work, + // we then go and expand the macros for when they are used elsewhere in a regex: + // iff we encounter submacros then which are used *inside* a set, we can use that + // first half dictionary to speed things up a bit as we can use those expansions + // straight away! + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroInSet(i); + } + } + + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroElsewhere(i); + } + } + + if (opts.debug) console.log('\n############### expanded macros: ', macros); + + return macros; +} + +// expand macros in a regex; expands them recursively +function expandMacros(src, macros, opts) { + var expansion_count = 0; + + // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already! + // Hence things should be easy in there: + + function expandAllMacrosInSet(s) { + var i, m, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = m.in_set; + + assert(x); + if (x instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + throw x; + } + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join(x); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, m, x; + + // When we process the main macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! 
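+        //
+        // For illustration (a sketch; `DIGIT` is a hypothetical macro defined as `[0-9]`):
+        // a rule regex `{DIGIT}+` is expanded here into its own CAPTURING group, e.g.
+        // `(\d)+` after set optimization, while the same `{DIGIT}` used *inside another
+        // macro* would have received the NON-capturing `(?:...)` treatment in
+        // `prepareMacros()` above.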
+ if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + // These are all main macro expansions, hence CAPTURING grouping is applied: + x = m.elsewhere; + assert(x); + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join('(' + x + ')'); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + // When we process the macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will have expanded any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (s2 instanceof Error) { + throw s2; + } + + // only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex() + // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on his own, + // as long as no macros are involved... + // + // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`, + // unless the `xregexp` output option has been enabled. + if (expansion_count > 0 || src.indexOf('\\p{') >= 0 && !opts.options.xregexp) { + src = s2; + } else { + // Check if the reduced regex is smaller in size; when it is, we still go with the new one! + if (s2.length < src.length) { + src = s2; + } + } + + return src; +} + +function prepareStartConditions(conditions) { + var sc, + hash = {}; + for (sc in conditions) { + if (conditions.hasOwnProperty(sc)) { + hash[sc] = { rules: [], inclusive: !conditions[sc] }; + } + } + return hash; +} + +function buildActions(dict, tokens, opts) { + var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;']; + var tok; + var toks = {}; + var caseHelper = []; + + // tokens: map/array of token numbers to token names + for (tok in tokens) { + var idx = parseInt(tok); + if (idx && idx > 0) { + toks[tokens[tok]] = idx; + } + } + + if (opts.options.flex && dict.rules) { + dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule! 
*/']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: 'function lexer__performAction(yy, yyrulenumber, YY_START) {\n var yy_ = this;\n\n ' + fun + '\n }', + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count + }; +} + +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + + var prelude = '/**\n * See also:\n * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508\n * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility\n * with userland code which might access the derived class in a \'classic\' way.\n *\n * @public\n * @constructor\n * @nocollapse\n */\nfunction JisonLexerError(msg, hash) {\n Object.defineProperty(this, \'name\', {\n enumerable: false,\n writable: false,\n value: \'JisonLexerError\'\n });\n\n if (msg == null) msg = \'???\';\n\n Object.defineProperty(this, \'message\', {\n enumerable: false,\n writable: true,\n value: msg\n });\n\n this.hash = hash;\n\n var stacktrace;\n if (hash && hash.exception instanceof Error) {\n var ex2 = hash.exception;\n this.message = ex2.message || msg;\n stacktrace = ex2.stack;\n }\n if (!stacktrace) {\n if (Error.hasOwnProperty(\'captureStackTrace\')) { // V8\n Error.captureStackTrace(this, this.constructor);\n } else {\n stacktrace = (new Error(msg)).stack;\n }\n }\n if (stacktrace) {\n Object.defineProperty(this, \'stack\', {\n enumerable: false,\n writable: false,\n value: stacktrace\n });\n }\n}\n\nif (typeof Object.setPrototypeOf === \'function\') {\n Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype);\n} else {\n JisonLexerError.prototype = Object.create(Error.prototype);\n}\nJisonLexerError.prototype.constructor = JisonLexerError;\nJisonLexerError.prototype.name = \'JisonLexerError\';'; + + // --- END lexer error class --- + + return prelude; +} + +var jisonLexerErrorDefinition = generateErrorClass(); + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS(_templateObject); +} + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. + // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... 
}; + // ``` + var testcode = ['// provide a local version for test purposes:', jisonLexerErrorDefinition, '', generateFakeXRegExpClassSrcCode(), '', source, '', 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (_typeof(lexer.options) !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, dict.rules && dict.rules.length > 0 ? 'One or more of your lexer state names are possibly botched?' 
: 'Your custom lexer is somehow botched.', ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = dict.rules ? dict.rules.length : 0; i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; +} + +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) +// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. 
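+// A hedged usage sketch for the factory above (commented out; the rule set is hypothetical):
+//
+//     var lexer = new RegExpLexer({
+//         rules: [
+//             ['[0-9]+', 'return "NUMBER";'],
+//             ['\\s+',   '/* skip whitespace */']
+//         ]
+//     }, '42 7');
+//
+//     lexer.lex();          // matches '42' and returns the token set up by the action code
+//     lexer.generate();     // produces the stand-alone lexer source text (see the APIs above)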
+// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- + return '{\n EOF: 1,\n ERROR: 2,\n\n // JisonLexerError: JisonLexerError, /// <-- injected by the code generator\n\n // options: {}, /// <-- injected by the code generator\n\n // yy: ..., /// <-- injected by setInput()\n\n __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state\n\n __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup\n\n __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been \'unfolded\' completely and is now ready for use\n\n done: false, /// INTERNAL USE ONLY\n _backtrack: false, /// INTERNAL USE ONLY\n _input: \'\', /// INTERNAL USE ONLY\n _more: false, /// INTERNAL USE ONLY\n _signaled_error_token: false, /// INTERNAL USE ONLY\n\n conditionStack: [], /// INTERNAL USE ONLY; managed via `pushState()`, `popState()`, `topState()` and `stateStackSize()`\n\n match: \'\', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. `match` is identical to `yytext` except that this one still contains the matched input string after `lexer.performAction()` has been invoked, where userland code MAY have changed/replaced the `yytext` value entirely!\n matched: \'\', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far\n matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt\n yytext: \'\', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the \'token value\' when the parser consumes the lexer token produced through a call to the `lex()` API.\n offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the \'cursor position\' in the input string, i.e. 
the number of characters matched so far\n yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (`yytext`)\n yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: \'line number\' at which the token under construction is located\n yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction\n\n /**\n * INTERNAL USE: construct a suitable error info hash object instance for `parseError`.\n * \n * @public\n * @this {RegExpLexer}\n */\n constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) {\n msg = \'\' + msg;\n\n // heuristic to determine if the error message already contains a (partial) source code dump\n // as produced by either `showPosition()` or `prettyPrintRange()`:\n if (show_input_position == undefined) {\n show_input_position = !(msg.indexOf(\'\\n\') > 0 && msg.indexOf(\'^\') > 0);\n }\n if (this.yylloc && show_input_position) {\n if (typeof this.prettyPrintRange === \'function\') {\n var pretty_src = this.prettyPrintRange(this.yylloc);\n\n if (!/\\n\\s*$/.test(msg)) {\n msg += \'\\n\';\n }\n msg += \'\\n Erroneous area:\\n\' + this.prettyPrintRange(this.yylloc); \n } else if (typeof this.showPosition === \'function\') {\n var pos_str = this.showPosition();\n if (pos_str) {\n if (msg.length && msg[msg.length - 1] !== \'\\n\' && pos_str[0] !== \'\\n\') {\n msg += \'\\n\' + pos_str;\n } else {\n msg += pos_str;\n }\n }\n }\n }\n /** @constructor */\n var pei = {\n errStr: msg,\n recoverable: !!recoverable,\n text: this.match, // This one MAY be empty; userland code should use the `upcomingInput` API to obtain more text which follows the \'lexer cursor position\'...\n token: null,\n line: this.yylineno,\n loc: this.yylloc,\n yy: this.yy,\n lexer: this,\n\n /**\n * and make sure the error info doesn\'t stay due to potential\n * ref cycle via userland code manipulations.\n * These would otherwise all be memory leak opportunities!\n * \n * Note that only array and object references are nuked as those\n * constitute the set of elements which can produce a cyclic ref.\n * The rest of the members is kept intact as they are harmless.\n * \n * @public\n * @this {LexErrorInfo}\n */\n destroy: function destructLexErrorInfo() {\n // remove cyclic references added to error info:\n // info.yy = null;\n // info.lexer = null;\n // ...\n var rec = !!this.recoverable;\n for (var key in this) {\n if (this.hasOwnProperty(key) && typeof key === \'object\') {\n this[key] = undefined;\n }\n }\n this.recoverable = rec;\n }\n };\n // track this instance so we can `destroy()` it once we deem it superfluous and ready for garbage collection!\n this.__error_infos.push(pei);\n return pei;\n },\n\n /**\n * handler which is invoked when a lexer error occurs.\n * \n * @public\n * @this {RegExpLexer}\n */\n parseError: function lexer_parseError(str, hash, ExceptionClass) {\n if (!ExceptionClass) {\n ExceptionClass = this.JisonLexerError;\n }\n if (this.yy) {\n if (this.yy.parser && typeof this.yy.parser.parseError === \'function\') {\n return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;\n } else if (typeof this.yy.parseError === \'function\') {\n return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;\n } \n }\n throw new ExceptionClass(str, hash);\n },\n\n /**\n * method which implements `yyerror(str, ...args)` functionality for use inside lexer actions.\n * \n * 
@public\n * @this {RegExpLexer}\n */\n yyerror: function yyError(str /*, ...args */) {\n var lineno_msg = \'\';\n if (this.yylloc) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': \' + str, this.options.lexerErrorsAreRecoverable);\n\n // Add any extra args to the hash under the name `extra_error_attributes`:\n var args = Array.prototype.slice.call(arguments, 1);\n if (args.length) {\n p.extra_error_attributes = args;\n }\n\n return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n },\n\n /**\n * final cleanup function for when we have completed lexing the input;\n * make it an API so that external code can use this one once userland\n * code has decided it\'s time to destroy any lingering lexer error\n * hash object instances and the like: this function helps to clean\n * up these constructs, which *may* carry cyclic references which would\n * otherwise prevent the instances from being properly and timely\n * garbage-collected, i.e. this function helps prevent memory leaks!\n * \n * @public\n * @this {RegExpLexer}\n */\n cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) {\n // prevent lingering circular references from causing memory leaks:\n this.setInput(\'\', {});\n\n // nuke the error hash info instances created during this run.\n // Userland code must COPY any data/references\n // in the error hash instance(s) it is more permanently interested in.\n if (!do_not_nuke_errorinfos) {\n for (var i = this.__error_infos.length - 1; i >= 0; i--) {\n var el = this.__error_infos[i];\n if (el && typeof el.destroy === \'function\') {\n el.destroy();\n }\n }\n this.__error_infos.length = 0;\n }\n\n return this;\n },\n\n /**\n * clear the lexer token context; intended for internal use only\n * \n * @public\n * @this {RegExpLexer}\n */\n clear: function lexer_clear() {\n this.yytext = \'\';\n this.yyleng = 0;\n this.match = \'\';\n // - DO NOT reset `this.matched`\n this.matches = false;\n this._more = false;\n this._backtrack = false;\n\n var col = (this.yylloc ? 
this.yylloc.last_column : 0);\n this.yylloc = {\n first_line: this.yylineno + 1,\n first_column: col,\n last_line: this.yylineno + 1,\n last_column: col,\n\n range: [this.offset, this.offset]\n };\n },\n\n /**\n * resets the lexer, sets new input\n * \n * @public\n * @this {RegExpLexer}\n */\n setInput: function lexer_setInput(input, yy) {\n this.yy = yy || this.yy || {};\n\n // also check if we\'ve fully initialized the lexer instance,\n // including expansion work to be done to go from a loaded\n // lexer to a usable lexer:\n if (!this.__decompressed) {\n // step 1: decompress the regex list:\n var rules = this.rules;\n for (var i = 0, len = rules.length; i < len; i++) {\n var rule_re = rules[i];\n\n // compression: is the RE an xref to another RE slot in the rules[] table?\n if (typeof rule_re === \'number\') {\n rules[i] = rules[rule_re];\n }\n }\n\n // step 2: unfold the conditions[] set to make these ready for use:\n var conditions = this.conditions;\n for (var k in conditions) {\n var spec = conditions[k];\n\n var rule_ids = spec.rules;\n\n var len = rule_ids.length;\n var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in `lexer_next()` fast and simple!\n var rule_new_ids = new Array(len + 1);\n\n for (var i = 0; i < len; i++) {\n var idx = rule_ids[i];\n var rule_re = rules[idx];\n rule_regexes[i + 1] = rule_re;\n rule_new_ids[i + 1] = idx;\n }\n\n spec.rules = rule_new_ids;\n spec.__rule_regexes = rule_regexes;\n spec.__rule_count = len;\n }\n\n this.__decompressed = true;\n }\n\n this._input = input || \'\';\n this.clear();\n this._signaled_error_token = false;\n this.done = false;\n this.yylineno = 0;\n this.matched = \'\';\n this.conditionStack = [\'INITIAL\'];\n this.__currentRuleSet__ = null;\n this.yylloc = {\n first_line: 1,\n first_column: 0,\n last_line: 1,\n last_column: 0,\n\n range: [0, 0]\n };\n this.offset = 0;\n return this;\n },\n\n /**\n * edit the remaining input via user-specified callback.\n * This can be used to forward-adjust the input-to-parse, \n * e.g. inserting macro expansions and alike in the\n * input which has yet to be lexed.\n * The behaviour of this API contrasts the `unput()` et al\n * APIs as those act on the *consumed* input, while this\n * one allows one to manipulate the future, without impacting\n * the current `yyloc` cursor location or any history. \n * \n * Use this API to help implement C-preprocessor-like\n * `#include` statements, etc.\n * \n * The provided callback must be synchronous and is\n * expected to return the edited input (string).\n *\n * The `cpsArg` argument value is passed to the callback\n * as-is.\n *\n * `callback` interface: \n * `function callback(input, cpsArg)`\n * \n * - `input` will carry the remaining-input-to-lex string\n * from the lexer.\n * - `cpsArg` is `cpsArg` passed into this API.\n * \n * The `this` reference for the callback will be set to\n * reference this lexer instance so that userland code\n * in the callback can easily and quickly access any lexer\n * API. \n *\n * When the callback returns a non-string-type falsey value,\n * we assume the callback did not edit the input and we\n * will using the input as-is.\n *\n * When the callback returns a non-string-type value, it\n * is converted to a string for lexing via the `"" + retval`\n * operation. 
(See also why: http://2ality.com/2012/03/converting-to-string.html \n * -- that way any returned object\'s `toValue()` and `toString()`\n * methods will be invoked in a proper/desirable order.)\n * \n * @public\n * @this {RegExpLexer}\n */\n editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) {\n var rv = callback.call(this, this._input, cpsArg);\n if (typeof rv !== \'string\') {\n if (rv) {\n this._input = \'\' + rv; \n }\n // else: keep `this._input` as is. \n } else {\n this._input = rv; \n }\n return this;\n },\n\n /**\n * consumes and returns one char from the input\n * \n * @public\n * @this {RegExpLexer}\n */\n input: function lexer_input() {\n if (!this._input) {\n //this.done = true; -- don\'t set `done` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <> tokens and perform user action code for a <> match, but only does so *once*)\n return null;\n }\n var ch = this._input[0];\n this.yytext += ch;\n this.yyleng++;\n this.offset++;\n this.match += ch;\n this.matched += ch;\n // Count the linenumber up when we hit the LF (or a stand-alone CR).\n // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo\n // and we advance immediately past the LF as well, returning both together as if\n // it was all a single \'character\' only.\n var slice_len = 1;\n var lines = false;\n if (ch === \'\\n\') {\n lines = true;\n } else if (ch === \'\\r\') {\n lines = true;\n var ch2 = this._input[1];\n if (ch2 === \'\\n\') {\n slice_len++;\n ch += ch2;\n this.yytext += ch2;\n this.yyleng++;\n this.offset++;\n this.match += ch2;\n this.matched += ch2;\n this.yylloc.range[1]++;\n }\n }\n if (lines) {\n this.yylineno++;\n this.yylloc.last_line++;\n this.yylloc.last_column = 0;\n } else {\n this.yylloc.last_column++;\n }\n this.yylloc.range[1]++;\n\n this._input = this._input.slice(slice_len);\n return ch;\n },\n\n /**\n * unshifts one char (or an entire string) into the input\n * \n * @public\n * @this {RegExpLexer}\n */\n unput: function lexer_unput(ch) {\n var len = ch.length;\n var lines = ch.split(/(?:\\r\\n?|\\n)/g);\n\n this._input = ch + this._input;\n this.yytext = this.yytext.substr(0, this.yytext.length - len);\n this.yyleng = this.yytext.length;\n this.offset -= len;\n this.match = this.match.substr(0, this.match.length - len);\n this.matched = this.matched.substr(0, this.matched.length - len);\n\n if (lines.length > 1) {\n this.yylineno -= lines.length - 1;\n\n this.yylloc.last_line = this.yylineno + 1;\n\n // Get last entirely matched line into the `pre_lines[]` array\'s\n // last index slot; we don\'t mind when other previously \n // matched lines end up in the array too. 
\n var pre = this.match;\n var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g);\n if (pre_lines.length === 1) {\n pre = this.matched;\n pre_lines = pre.split(/(?:\\r\\n?|\\n)/g);\n }\n this.yylloc.last_column = pre_lines[pre_lines.length - 1].length;\n } else {\n this.yylloc.last_column -= len;\n }\n\n this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng;\n\n this.done = false;\n return this;\n },\n\n /**\n * cache matched text and append it on next action\n * \n * @public\n * @this {RegExpLexer}\n */\n more: function lexer_more() {\n this._more = true;\n return this;\n },\n\n /**\n * signal the lexer that this rule fails to match the input, so the\n * next matching rule (regex) should be tested instead.\n * \n * @public\n * @this {RegExpLexer}\n */\n reject: function lexer_reject() {\n if (this.options.backtrack_lexer) {\n this._backtrack = true;\n } else {\n // when the `parseError()` call returns, we MUST ensure that the error is registered.\n // We accomplish this by signaling an \'error\' token to be produced for the current\n // `.lex()` run.\n var lineno_msg = \'\';\n if (this.yylloc) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).\', false);\n this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n }\n return this;\n },\n\n /**\n * retain first n characters of the match\n * \n * @public\n * @this {RegExpLexer}\n */\n less: function lexer_less(n) {\n return this.unput(this.match.slice(n));\n },\n\n /**\n * return (part of the) already matched input, i.e. for error\n * messages.\n * \n * Limit the returned string length to `maxSize` (default: 20).\n * \n * Limit the returned string to the `maxLines` number of lines of\n * input (default: 1).\n * \n * Negative limit values equal *unlimited*.\n * \n * @public\n * @this {RegExpLexer}\n */\n pastInput: function lexer_pastInput(maxSize, maxLines) {\n var past = this.matched.substring(0, this.matched.length - this.match.length);\n if (maxSize < 0)\n maxSize = past.length;\n else if (!maxSize)\n maxSize = 20;\n if (maxLines < 0)\n maxLines = past.length; // can\'t ever have more input lines than this!\n else if (!maxLines)\n maxLines = 1;\n // `substr` anticipation: treat \\r\\n as a single character and take a little\n // more than necessary so that we can still properly check against maxSize\n // after we\'ve transformed and limited the newLines in here:\n past = past.substr(-maxSize * 2 - 2);\n // now that we have a significantly reduced string to process, transform the newlines\n // and chop them, then limit them:\n var a = past.replace(/\\r\\n|\\r/g, \'\\n\').split(\'\\n\');\n a = a.slice(-maxLines);\n past = a.join(\'\\n\');\n // When, after limiting to maxLines, we still have too much to return,\n // do add an ellipsis prefix...\n if (past.length > maxSize) {\n past = \'...\' + past.substr(-maxSize);\n }\n return past;\n },\n\n /**\n * return (part of the) upcoming input, i.e. for error messages.\n * \n * Limit the returned string length to `maxSize` (default: 20).\n * \n * Limit the returned string to the `maxLines` number of lines of input (default: 1).\n * \n * Negative limit values equal *unlimited*.\n *\n * > ### NOTE ###\n * >\n * > *"upcoming input"* is defined as the whole of the both\n * > the *currently lexed* input, together with any remaining input\n * > following that. 
*"currently lexed"* input is the input \n * > already recognized by the lexer but not yet returned with\n * > the lexer token. This happens when you are invoking this API\n * > from inside any lexer rule action code block. \n * >\n * \n * @public\n * @this {RegExpLexer}\n */\n upcomingInput: function lexer_upcomingInput(maxSize, maxLines) {\n var next = this.match;\n if (maxSize < 0)\n maxSize = next.length + this._input.length;\n else if (!maxSize)\n maxSize = 20;\n if (maxLines < 0)\n maxLines = maxSize; // can\'t ever have more input lines than this!\n else if (!maxLines)\n maxLines = 1;\n // `substring` anticipation: treat \\r\\n as a single character and take a little\n // more than necessary so that we can still properly check against maxSize\n // after we\'ve transformed and limited the newLines in here:\n if (next.length < maxSize * 2 + 2) {\n next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8\n }\n // now that we have a significantly reduced string to process, transform the newlines\n // and chop them, then limit them:\n var a = next.replace(/\\r\\n|\\r/g, \'\\n\').split(\'\\n\');\n a = a.slice(0, maxLines);\n next = a.join(\'\\n\');\n // When, after limiting to maxLines, we still have too much to return,\n // do add an ellipsis postfix...\n if (next.length > maxSize) {\n next = next.substring(0, maxSize) + \'...\';\n }\n return next;\n },\n\n /**\n * return a string which displays the character position where the\n * lexing error occurred, i.e. for error messages\n * \n * @public\n * @this {RegExpLexer}\n */\n showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {\n var pre = this.pastInput(maxPrefix).replace(/\\s/g, \' \');\n var c = new Array(pre.length + 1).join(\'-\');\n return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, \' \') + \'\\n\' + c + \'^\';\n },\n\n /**\n * return a string which displays the lines & columns of input which are referenced \n * by the given location info range, plus a few lines of context.\n * \n * This function pretty-prints the indicated section of the input, with line numbers \n * and everything!\n * \n * This function is very useful to provide highly readable error reports, while\n * the location range may be specified in various flexible ways:\n * \n * - `loc` is the location info object which references the area which should be\n * displayed and \'marked up\': these lines & columns of text are marked up by `^`\n * characters below each character in the entire input range.\n * \n * - `context_loc` is the *optional* location info object which instructs this\n * pretty-printer how much *leading* context should be displayed alongside\n * the area referenced by `loc`. 
This can help provide context for the displayed\n * error, etc.\n * \n * When this location info is not provided, a default context of 3 lines is\n * used.\n * \n * - `context_loc2` is another *optional* location info object, which serves\n * a similar purpose to `context_loc`: it specifies the amount of *trailing*\n * context lines to display in the pretty-print output.\n * \n * When this location info is not provided, a default context of 1 line only is\n * used.\n * \n * Special Notes:\n * \n * - when the `loc`-indicated range is very large (about 5 lines or more), then\n * only the first and last few lines of this block are printed while a\n * `...continued...` message will be printed between them.\n * \n * This serves the purpose of not printing a huge amount of text when the `loc`\n * range happens to be huge: this way a manageable & readable output results\n * for arbitrary large ranges.\n * \n * - this function can display lines of input which whave not yet been lexed.\n * `prettyPrintRange()` can access the entire input!\n * \n * @public\n * @this {RegExpLexer}\n */\n prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {\n var error_size = loc.last_line - loc.first_line;\n const CONTEXT = 3;\n const CONTEXT_TAIL = 1;\n const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;\n var input = this.matched + this._input;\n var lines = input.split(\'\\n\');\n //var show_context = (error_size < 5 || context_loc);\n var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));\n var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));\n var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);\n var ws_prefix = new Array(lineno_display_width).join(\' \');\n var nonempty_line_indexes = [];\n var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {\n var lno = index + l0;\n var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);\n var rv = lno_pfx + \': \' + line;\n var errpfx = (new Array(lineno_display_width + 1)).join(\'^\');\n var offset = 2 + 1;\n var len = 0;\n\n if (lno === loc.first_line) {\n offset += loc.first_column;\n\n len = Math.max(\n 2,\n ((lno === loc.last_line ? 
loc.last_column : line.length)) - loc.first_column + 1\n );\n } else if (lno === loc.last_line) {\n len = Math.max(2, loc.last_column + 1);\n } else if (lno > loc.first_line && lno < loc.last_line) {\n len = Math.max(2, line.length + 1);\n }\n\n if (len) {\n var lead = new Array(offset).join(\'.\');\n var mark = new Array(len).join(\'^\');\n rv += \'\\n\' + errpfx + lead + mark;\n\n if (line.trim().length > 0) {\n nonempty_line_indexes.push(index);\n }\n }\n\n rv = rv.replace(/\\t/g, \' \');\n return rv;\n });\n\n // now make sure we don\'t print an overly large amount of error area: limit it \n // to the top and bottom line count:\n if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) {\n var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1;\n var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1;\n\n var intermediate_line = (new Array(lineno_display_width + 1)).join(\' \') + \' (...continued...)\';\n intermediate_line += \'\\n\' + (new Array(lineno_display_width + 1)).join(\'-\') + \' (---------------)\';\n rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line);\n }\n return rv.join(\'\\n\');\n },\n\n /**\n * helper function, used to produce a human readable description as a string, given\n * the input `yylloc` location object.\n * \n * Set `display_range_too` to TRUE to include the string character index position(s)\n * in the description if the `yylloc.range` is available.\n * \n * @public\n * @this {RegExpLexer}\n */\n describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) {\n var l1 = yylloc.first_line;\n var l2 = yylloc.last_line;\n var c1 = yylloc.first_column;\n var c2 = yylloc.last_column;\n var dl = l2 - l1;\n var dc = c2 - c1;\n var rv;\n if (dl === 0) {\n rv = \'line \' + l1 + \', \';\n if (dc <= 1) {\n rv += \'column \' + c1;\n } else {\n rv += \'columns \' + c1 + \' .. \' + c2;\n }\n } else {\n rv = \'lines \' + l1 + \'(column \' + c1 + \') .. \' + l2 + \'(column \' + c2 + \')\';\n }\n if (yylloc.range && display_range_too) {\n var r1 = yylloc.range[0];\n var r2 = yylloc.range[1] - 1;\n if (r2 <= r1) {\n rv += \' {String Offset: \' + r1 + \'}\';\n } else {\n rv += \' {String Offset range: \' + r1 + \' .. \' + r2 + \'}\';\n }\n }\n return rv;\n },\n\n /**\n * test the lexed token: return FALSE when not a match, otherwise return token.\n * \n * `match` is supposed to be an array coming out of a regex match, i.e. 
`match[0]`\n * contains the actually matched text string.\n * \n * Also move the input cursor forward and update the match collectors:\n * \n * - `yytext`\n * - `yyleng`\n * - `match`\n * - `matches`\n * - `yylloc`\n * - `offset`\n * \n * @public\n * @this {RegExpLexer}\n */\n test_match: function lexer_test_match(match, indexed_rule) {\n var token,\n lines,\n backup,\n match_str,\n match_str_len;\n\n if (this.options.backtrack_lexer) {\n // save context\n backup = {\n yylineno: this.yylineno,\n yylloc: {\n first_line: this.yylloc.first_line,\n last_line: this.yylloc.last_line,\n first_column: this.yylloc.first_column,\n last_column: this.yylloc.last_column,\n\n range: this.yylloc.range.slice(0)\n },\n yytext: this.yytext,\n match: this.match,\n matches: this.matches,\n matched: this.matched,\n yyleng: this.yyleng,\n offset: this.offset,\n _more: this._more,\n _input: this._input,\n //_signaled_error_token: this._signaled_error_token,\n yy: this.yy,\n conditionStack: this.conditionStack.slice(0),\n done: this.done\n };\n }\n\n match_str = match[0];\n match_str_len = match_str.length;\n // if (match_str.indexOf(\'\\n\') !== -1 || match_str.indexOf(\'\\r\') !== -1) {\n lines = match_str.split(/(?:\\r\\n?|\\n)/g);\n if (lines.length > 1) {\n this.yylineno += lines.length - 1;\n\n this.yylloc.last_line = this.yylineno + 1;\n this.yylloc.last_column = lines[lines.length - 1].length;\n } else {\n this.yylloc.last_column += match_str_len;\n }\n // }\n this.yytext += match_str;\n this.match += match_str;\n this.matched += match_str;\n this.matches = match;\n this.yyleng = this.yytext.length;\n this.yylloc.range[1] += match_str_len;\n\n // previous lex rules MAY have invoked the `more()` API rather than producing a token:\n // those rules will already have moved this `offset` forward matching their match lengths,\n // hence we must only add our own match length now:\n this.offset += match_str_len;\n this._more = false;\n this._backtrack = false;\n this._input = this._input.slice(match_str_len);\n\n // calling this method:\n //\n // function lexer__performAction(yy, yyrulenumber, YY_START) {...}\n token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */);\n // otherwise, when the action codes are all simple return token statements:\n //token = this.simpleCaseActionClusters[indexed_rule];\n\n if (this.done && this._input) {\n this.done = false;\n }\n if (token) {\n return token;\n } else if (this._backtrack) {\n // recover context\n for (var k in backup) {\n this[k] = backup[k];\n }\n this.__currentRuleSet__ = null;\n return false; // rule action called reject() implying the next rule should be tested instead.\n } else if (this._signaled_error_token) {\n // produce one \'error\' token as `.parseError()` in `reject()`\n // did not guarantee a failure signal by throwing an exception!\n token = this._signaled_error_token;\n this._signaled_error_token = false;\n return token;\n }\n return false;\n },\n\n /**\n * return next match in input\n * \n * @public\n * @this {RegExpLexer}\n */\n next: function lexer_next() {\n if (this.done) {\n this.clear();\n return this.EOF;\n }\n if (!this._input) {\n this.done = true;\n }\n\n var token,\n match,\n tempMatch,\n index;\n if (!this._more) {\n this.clear();\n }\n var spec = this.__currentRuleSet__;\n if (!spec) {\n // Update the ruleset cache as we apparently encountered a state change or just started lexing.\n // The cache is set up for fast lookup -- we assume a lexer will switch states 
much less often than it will\n // invoke the `lex()` token-producing API and related APIs, hence caching the set for direct access helps\n // speed up those activities a tiny bit.\n spec = this.__currentRuleSet__ = this._currentRules();\n // Check whether a *sane* condition has been pushed before: this makes the lexer robust against\n // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19\n if (!spec || !spec.rules) {\n var lineno_msg = \'\';\n if (this.options.trackPosition) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Internal lexer engine error\' + lineno_msg + \': The lex grammar programmer pushed a non-existing condition name "\' + this.topState() + \'"; this is a fatal error and should be reported to the application programmer team!\', false);\n // produce one \'error\' token until this situation has been resolved, most probably by parse termination!\n return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n }\n }\n\n var rule_ids = spec.rules;\n var regexes = spec.__rule_regexes;\n var len = spec.__rule_count;\n\n // Note: the arrays are 1-based, while `len` itself is a valid index,\n // hence the non-standard less-or-equal check in the next loop condition!\n for (var i = 1; i <= len; i++) {\n tempMatch = this._input.match(regexes[i]);\n if (tempMatch && (!match || tempMatch[0].length > match[0].length)) {\n match = tempMatch;\n index = i;\n if (this.options.backtrack_lexer) {\n token = this.test_match(tempMatch, rule_ids[i]);\n if (token !== false) {\n return token;\n } else if (this._backtrack) {\n match = undefined;\n continue; // rule action called reject() implying a rule MISmatch.\n } else {\n // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)\n return false;\n }\n } else if (!this.options.flex) {\n break;\n }\n }\n }\n if (match) {\n token = this.test_match(match, rule_ids[index]);\n if (token !== false) {\n return token;\n }\n // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)\n return false;\n }\n if (!this._input) {\n this.done = true;\n this.clear();\n return this.EOF;\n } else {\n var lineno_msg = \'\';\n if (this.options.trackPosition) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': Unrecognized text.\', this.options.lexerErrorsAreRecoverable);\n\n var pendingInput = this._input;\n var activeCondition = this.topState();\n var conditionStackDepth = this.conditionStack.length;\n\n token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n if (token === this.ERROR) {\n // we can try to recover from a lexer error that `parseError()` did not \'recover\' for us\n // by moving forward at least one character at a time IFF the (user-specified?) 
`parseError()`\n // has not consumed/modified any pending input or changed state in the error handler:\n if (!this.matches && \n // and make sure the input has been modified/consumed ...\n pendingInput === this._input &&\n // ...or the lexer state has been modified significantly enough\n // to merit a non-consuming error handling action right now.\n activeCondition === this.topState() && \n conditionStackDepth === this.conditionStack.length\n ) {\n this.input();\n }\n }\n return token;\n }\n },\n\n /**\n * return next match that has a token\n * \n * @public\n * @this {RegExpLexer}\n */\n lex: function lexer_lex() {\n var r;\n // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer:\n if (typeof this.options.pre_lex === \'function\') {\n r = this.options.pre_lex.call(this);\n }\n\n while (!r) {\n r = this.next();\n }\n\n if (typeof this.options.post_lex === \'function\') {\n // (also account for a userdef function which does not return any value: keep the token as is)\n r = this.options.post_lex.call(this, r) || r;\n }\n return r;\n },\n\n /**\n * backwards compatible alias for `pushState()`;\n * the latter is symmetrical with `popState()` and we advise to use\n * those APIs in any modern lexer code, rather than `begin()`.\n * \n * @public\n * @this {RegExpLexer}\n */\n begin: function lexer_begin(condition) {\n return this.pushState(condition);\n },\n\n /**\n * activates a new lexer condition state (pushes the new lexer\n * condition state onto the condition stack)\n * \n * @public\n * @this {RegExpLexer}\n */\n pushState: function lexer_pushState(condition) {\n this.conditionStack.push(condition);\n this.__currentRuleSet__ = null;\n return this;\n },\n\n /**\n * pop the previously active lexer condition state off the condition\n * stack\n * \n * @public\n * @this {RegExpLexer}\n */\n popState: function lexer_popState() {\n var n = this.conditionStack.length - 1;\n if (n > 0) {\n this.__currentRuleSet__ = null; \n return this.conditionStack.pop();\n } else {\n return this.conditionStack[0];\n }\n },\n\n /**\n * return the currently active lexer condition state; when an index\n * argument is provided it produces the N-th previous condition state,\n * if available\n * \n * @public\n * @this {RegExpLexer}\n */\n topState: function lexer_topState(n) {\n n = this.conditionStack.length - 1 - Math.abs(n || 0);\n if (n >= 0) {\n return this.conditionStack[n];\n } else {\n return \'INITIAL\';\n }\n },\n\n /**\n * (internal) determine the lexer rule set which is active for the\n * currently active lexer condition state\n * \n * @public\n * @this {RegExpLexer}\n */\n _currentRules: function lexer__currentRules() {\n if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) {\n return this.conditions[this.conditionStack[this.conditionStack.length - 1]];\n } else {\n return this.conditions[\'INITIAL\'];\n }\n },\n\n /**\n * return the number of states currently on the stack\n * \n * @public\n * @this {RegExpLexer}\n */\n stateStackSize: function lexer_stateStackSize() {\n return this.conditionStack.length;\n }\n}'; + // --- END lexer kernel --- +} + +RegExpLexer.prototype = new Function(rmCommonWS(_templateObject2, getRegExpLexerPrototype()))(); + +// The lexer code stripper, driven by optimization analysis settings and +// lexer options, which cannot be changed at run-time. +function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... 
${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + + new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS(_templateObject3, opt.options.backtrack_lexer, opt.options.ranges, opt.options.trackPosition, opt.parseActionsUseYYLENG, opt.parseActionsUseYYLINENO, opt.parseActionsUseYYTEXT, opt.parseActionsUseYYLOC, opt.parseActionsUseValueTracking, opt.parseActionsUseValueAssignment, opt.parseActionsUseLocationTracking, opt.parseActionsUseLocationAssignment, opt.lexerActionsUseYYLENG, opt.lexerActionsUseYYLINENO, opt.lexerActionsUseYYTEXT, opt.lexerActionsUseYYLOC, opt.lexerActionsUseParseError, opt.lexerActionsUseYYERROR, opt.lexerActionsUseLocationTracking, opt.lexerActionsUseMore, opt.lexerActionsUseUnput, opt.lexerActionsUseReject, opt.lexerActionsUseLess, opt.lexerActionsUseDisplayAPIs, opt.lexerActionsUseDescribeYYLOC)); + + return new_src; +} + +// generate lexer source from a grammar +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
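+ //
+ // A minimal sketch of how a host environment (typically the jison parser
+ // generator) might feed that analysis in through `build_options`; the flag
+ // values shown here are hypothetical:
+ //
+ //     var src = RegExpLexer.generate(lexSpec, null, {
+ //         parseActionsUseYYLENG: false,  // parser actions never read yyleng
+ //         parseActionsUseYYLOC: true     // parser actions do read location info
+ //     });
+ //
+ // Flags which are not supplied stay `undefined` and are simply copied
+ // through into the generated analytics report.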
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???' + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = dict.actionInclude || ''; + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; +} + +// Assemble the final source from the processed grammar +/** @public */ +function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; +} + +function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = 1 + Math.log10(a.length | 1) | 0; + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return '/* ' + idx_str + ': */ ' + re; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return '/* ' + idx_str + ': */ new XRegExp("' + re_src + '", "' + re.xregexp.flags + '")'; + } else { + return '/* ' + idx_str + ': */ ' + re; + } + }); + return b.join(',\n'); +} + +function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
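+ // How this table is read by the copy loop further below: a truthy value
+ // means "do NOT pass this build-time option into the generated lexer",
+ // while a falsy `0` marks a run-time option which must always survive.
+ // For example:
+ //
+ //     do_not_pass['moduleType'] === 1       // build-time only: stripped
+ //     do_not_pass['backtrack_lexer'] === 0  // consulted by the kernel at run-time: kept
+ //
+ // and `debug: !opts.debug` strips `debug` exactly when it is disabled,
+ // so an enabled debug flag still shows up in the generated output.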
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1 + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(' "(' + ID_REGEX_BASE + ')": ', 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
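+ //
+ // Rough shape of the source assembled below (a sketch only; the actual
+ // text comes from the rmCommonWS template literals):
+ //
+ //     var lexer = {
+ //         /*JISON-LEX-ANALYTICS-REPORT*/    // slot #1, filled in later by stripUnusedLexerCode()
+ //         /* ...the RegExpLexer.prototype members, pasted in as source... */
+ //         options: { /* produceOptions() output */ },
+ //         performAction: function lexer__performAction(yy, yyrulenumber, YY_START) { /* ... */ },
+ //         rules: [ /*  0: */ /.../ /* , ... */ ],
+ //         conditions: { "INITIAL": { rules: [ /* ... */ ], inclusive: true } }
+ //     };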
+ var code = [rmCommonWS(_templateObject4), '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc.replace(/^[\s\r\n]*\{/, '').replace(/\s*\}[\s\r\n]*$/, '').trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS(_templateObject5, performActionCode, simpleCaseActionClustersCode, rulesCode, conditionsCode)); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; +} + +function generateGenericHeaderComment() { + var out = rmCommonWS(_templateObject6, version$1); + + return out; +} + +function prepareOptions(opt) { + opt = opt || {}; + + // check for illegal identifier + if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) { + if (opt.moduleName) { + var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.'; + if (typeof opt.warn_cb === 'function') { + opt.warn_cb(msg); + } else { + // do not treat as warning; barf hairball instead so that this oddity gets noticed right away! + throw new Error(msg); + } + } + opt.moduleName = 'lexer'; + } + + prepExportStructures(opt); + + return opt; +} + +function generateModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var ' + opt.moduleName + ' = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateAMDModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'define([], function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? 
opt.moduleInclude + ';' : '', '', 'return lexer;', '});']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateESModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var lexer = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();', '', 'function yylex() {', ' return lexer.lex.apply(lexer, arguments);', '}', rmCommonWS(_templateObject7)]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateCommonJSModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var ' + opt.moduleName + ' = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();', '', 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', ' exports.lexer = ' + opt.moduleName + ';', ' exports.lex = function () {', ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', ' };', '}']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +RegExpLexer.generate = generate; + +RegExpLexer.version = version$1; +RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; +RegExpLexer.mkStdOptions = mkStdOptions; +RegExpLexer.camelCase = camelCase; +RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + +var version = '0.6.1-205'; // require('./package.json').version; + + +function getCommandlineOptions() { + 'use strict'; + + var opts = nomnom.script('jison-lex').unknownOptionTreatment(false) // do not accept unknown options! + .options({ + file: { + flag: true, + position: 0, + help: 'file containing a lexical grammar' + }, + json: { + abbr: 'j', + flag: true, + default: false, + help: 'jison will expect a grammar in either JSON/JSON5 or JISON format: the precise format is autodetected' + }, + outfile: { + abbr: 'o', + metavar: 'FILE', + help: 'Filepath and base module name of the generated parser;\nwhen terminated with a / (dir separator) it is treated as the destination directory where the generated output will be stored' + }, + debug: { + abbr: 'd', + flag: true, + default: false, + help: 'Debug mode' + }, + dumpSourceCodeOnFailure: { + full: 'dump-sourcecode-on-failure', + flag: true, + default: true, + help: 'Dump the generated source code to a special named file when the internal generator tests fail, i.e. when the generated source code does not compile in the JavaScript engine. Enabling this option helps you to diagnose/debug crashes (thrown exceptions) in the code generator due to various reasons: you can, for example, load the dumped sourcecode in another environment (e.g. NodeJS) to get more info on the precise location and cause of the compile failure.' + }, + throwErrorOnCompileFailure: { + full: 'throw-on-compile-failure', + flag: true, + default: true, + help: 'Throw an exception when the generated source code fails to compile in the JavaScript engine. **WARNING**: Turning this feature OFF permits the code generator to produce non-working source code and treat that as SUCCESS. This MAY be desirable code generator behaviour, but only rarely.' 
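+ //
+ // For reference, typical invocations of the CLI assembled from this
+ // options table look like (illustrative; the file names are made up):
+ //
+ //     jison-lex mylexer.l -o output/mylexer.js -t es
+ //     jison-lex grammar.jisonlex --module-name myLexer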
+ }, + reportStats: { + full: 'info', + abbr: 'I', + flag: true, + default: false, + help: 'Report some statistics about the generated parser' + }, + moduleType: { + full: 'module-type', + abbr: 't', + default: 'commonjs', + metavar: 'TYPE', + choices: ['commonjs', 'amd', 'js', 'es'], + help: 'The type of module to generate (commonjs, amd, es, js)' + }, + moduleName: { + full: 'module-name', + abbr: 'n', + metavar: 'NAME', + help: 'The name of the generated parser object, namespace supported' + }, + main: { + full: 'main', + abbr: 'x', + flag: true, + default: false, + help: 'Include .main() entry point in generated commonjs module' + }, + moduleMain: { + full: 'module-main', + abbr: 'y', + metavar: 'NAME', + help: 'The main module function definition' + }, + version: { + abbr: 'V', + flag: true, + help: 'print version and exit', + callback: function callback() { + return version; + } + } + }).parse(); + + return opts; +} + +var cli = module.exports; + +cli.main = function cliMain(opts) { + 'use strict'; + + opts = RegExpLexer.mkStdOptions(opts); + + function isDirectory(fp) { + try { + return fs.lstatSync(fp).isDirectory(); + } catch (e) { + return false; + } + } + + function mkdirp(fp) { + if (!fp || fp === '.' || fp.length === 0) { + return false; + } + try { + fs.mkdirSync(fp); + return true; + } catch (e) { + if (e.code === 'ENOENT') { + var parent = path.dirname(fp); + // Did we hit the root directory by now? If so, abort! + // Else, create the parent; iff that fails, we fail too... + if (parent !== fp && mkdirp(parent)) { + try { + // Retry creating the original directory: it should succeed now + fs.mkdirSync(fp); + return true; + } catch (e) { + return false; + } + } + } + } + return false; + } + + function processInputFile() { + // getting raw files + var original_cwd = process.cwd(); + + var raw = fs.readFileSync(path.normalize(opts.file), 'utf8'); + + // making best guess at json mode + opts.json = path.extname(opts.file) === '.json' || opts.json; + + // When only the directory part of the output path was specified, then we + // do NOT have the target module name in there as well! + var outpath = opts.outfile; + if (/[\\\/]$/.test(outpath) || isDirectory(outpath)) { + opts.outfile = null; + outpath = outpath.replace(/[\\\/]$/, ''); + } + if (outpath && outpath.length > 0) { + outpath += '/'; + } else { + outpath = ''; + } + + // setting output file name and module name based on input file name + // if they aren't specified. + var name = path.basename(opts.outfile || opts.file); + + // get the base name (i.e. the file name without extension) + // i.e. 
strip off only the extension and keep any other dots in the filename + name = path.basename(name, path.extname(name)); + + opts.outfile = opts.outfile || outpath + name + '.js'; + if (!opts.moduleName && name) { + opts.moduleName = opts.defaultModuleName = name.replace(/-\w/g, function (match) { + return match.charAt(1).toUpperCase(); + }); + } + + // Change CWD to the directory where the source grammar resides: this helps us properly + // %include any files mentioned in the grammar with relative paths: + var new_cwd = path.dirname(path.normalize(opts.file)); + process.chdir(new_cwd); + + var lexer = cli.generateLexerString(raw, opts); + + // and change back to the CWD we started out with: + process.chdir(original_cwd); + + mkdirp(path.dirname(opts.outfile)); + fs.writeFileSync(opts.outfile, lexer); + console.log('JISON-LEX output for module [' + opts.moduleName + '] has been written to file:', opts.outfile); + } + + function readin(cb) { + var stdin = process.openStdin(), + data = ''; + + stdin.setEncoding('utf8'); + stdin.addListener('data', function (chunk) { + data += chunk; + }); + stdin.addListener('end', function () { + cb(data); + }); + } + + function processStdin() { + readin(function processStdinReadInCallback(raw) { + console.log(cli.generateLexerString(raw, opts)); + }); + } + + // if an input file wasn't given, assume input on stdin + if (opts.file) { + processInputFile(); + } else { + processStdin(); + } +}; + +cli.generateLexerString = function generateLexerString(lexerSpec, opts) { + 'use strict'; + + // var settings = RegExpLexer.mkStdOptions(opts); + + var predefined_tokens = null; + + return RegExpLexer.generate(lexerSpec, predefined_tokens, opts); +}; + +if (require.main === module) { + var opts = getCommandlineOptions(); + cli.main(opts); +} diff --git a/dist/cli-cjs.js b/dist/cli-cjs.js new file mode 100644 index 0000000..13171b1 --- /dev/null +++ b/dist/cli-cjs.js @@ -0,0 +1,4282 @@ +#!/usr/bin/env node + + +'use strict'; + +function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'default' in ex) ? ex['default'] : ex; } + +var fs = _interopDefault(require('fs')); +var path = _interopDefault(require('path')); +var nomnom = _interopDefault(require('@gerhobbelt/nomnom')); +var XRegExp = _interopDefault(require('@gerhobbelt/xregexp')); +var json5 = _interopDefault(require('@gerhobbelt/json5')); +var lexParser = _interopDefault(require('@gerhobbelt/lex-parser')); +var assert = _interopDefault(require('assert')); +var helpers = _interopDefault(require('jison-helpers-lib')); + +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +const XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. 
`{Number}` +const CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; +const SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + +const UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + +// The expanded regex sets which are equivalent to the given `\\{c}` escapes: +// +// `/\s/`: +const WHITESPACE_SETSTR$1 = ' \f\n\r\t\v\u00a0\u1680\u180e\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff'; +// `/\d/`: +const DIGIT_SETSTR$1 = '0-9'; +// `/\w/`: +const WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + + + + + +// Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex +function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: // '[' + return '\\['; + + case 92: // '\\' + return '\\\\'; + + case 93: // ']' + return '\\]'; + + case 94: // ']' + return '\\^'; + } + if (i < 32 + || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || (i >= 0xD800 && i <= 0xDFFF) /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go... + x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); +} + + +// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating +// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a +// `\\p{NAME}` shorthand to represent [part of] the bitarray: +var Pcodes_bitarray_cache = {}; +var Pcodes_bitarray_cache_test_order = []; + +// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by +// a single regex 'escape', e.g. `\d` for digits 0-9. +var EscCode_bitarray_output_refs; + +// now initialize the EscCodes_... 
table above: +init_EscCode_lookup_table(); + +function init_EscCode_lookup_table() { + var s, bitarr, set2esc = {}, esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); +} + +function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. + // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as than this pcode + // won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... 
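+ //
+ // Worked illustration with only the six built-in escapes cached: '5' is
+ // matched by `\d`, `\w` and `\S`; 'x' by `\D`, `\w` and `\S`; a space by
+ // `\s`, `\D` and `\W`. Every character sits in exactly one member of each
+ // complementary pair, i.e. in exactly three sets, so no uniquely-owned
+ // marker character exists at all and the minimum-span fallback below has
+ // to pick the least ambiguous slot instead.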
+ var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; +} + + + + + + +// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. +function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. \\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for characters set. Inversion means the complement set of character is st instead. + // This results in an OR operations when sets are joined/chained. 
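+ // Example walk-through (a sketch) for the set body `^a-f\d`: the leading
+ // `^` was consumed above and `set_is_inverted` raised; the loop below then
+ // marks 'a'..'f' through the range branch, merges the cached `\d` bitarray
+ // for the escape, and the final inversion pass flips every unmarked slot
+ // into `bitarr_orig`.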
+
+        while (s.length) {
+            c1 = s.match(CHR_RE$1);
+            if (!c1) {
+                // hit an illegal escape sequence? cope anyway!
+                c1 = s[0];
+            } else {
+                c1 = c1[0];
+                // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those
+                // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit
+                // XRegExp support, but alas, we'll get there when we get there... ;-)
+                switch (c1) {
+                case '\\p':
+                    s = s.substr(c1.length);
+                    c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1);
+                    if (c2) {
+                        c2 = c2[0];
+                        s = s.substr(c2.length);
+                        // do we have this one cached already?
+                        var pex = c1 + c2;
+                        var ba4p = Pcodes_bitarray_cache[pex];
+                        if (!ba4p) {
+                            // expand escape:
+                            var xr = new XRegExp('[' + pex + ']');          // TODO: case-insensitive grammar???
+                            // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`:
+                            var xs = '' + xr;
+                            // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside:
+                            xs = xs.substr(1, xs.length - 2);
+
+                            ba4p = reduceRegexToSetBitArray(xs, pex, opts);
+
+                            Pcodes_bitarray_cache[pex] = ba4p;
+                            updatePcodesBitarrayCacheTestOrder(opts);
+                        }
+                        // merge bitarrays:
+                        add2bitarray(bitarr, ba4p);
+                        continue;
+                    }
+                    break;
+
+                case '\\S':
+                case '\\s':
+                case '\\W':
+                case '\\w':
+                case '\\d':
+                case '\\D':
+                    // these can't participate in a range, but need to be treated specially:
+                    s = s.substr(c1.length);
+                    // check for \S, \s, \D, \d, \W, \w and expand them:
+                    var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]];
+                    assert(ba4e);
+                    add2bitarray(bitarr, ba4e);
+                    continue;
+
+                case '\\b':
+                    // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace
+                    c1 = '\u0008';
+                    break;
+                }
+            }
+            var v1 = eval_escaped_code(c1);
+            // propagate deferred exceptions = error reports.
+            if (v1 instanceof Error) {
+                return v1;
+            }
+            v1 = v1.charCodeAt(0);
+            s = s.substr(c1.length);
+
+            if (s[0] === '-' && s.length >= 2) {
+                // we can expect a range like 'a-z':
+                s = s.substr(1);
+                c2 = s.match(CHR_RE$1);
+                if (!c2) {
+                    // hit an illegal escape sequence? cope anyway!
+                    c2 = s[0];
+                } else {
+                    c2 = c2[0];
+                }
+                var v2 = eval_escaped_code(c2);
+                // propagate deferred exceptions = error reports.
+                if (v2 instanceof Error) {
+                    return v2;
+                }
+                v2 = v2.charCodeAt(0);
+                s = s.substr(c2.length);
+
+                // legal ranges go UP, not DOWN!
+                if (v1 <= v2) {
+                    mark(v1, v2);
+                } else {
+                    console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 });
+                    mark(v1);
+                    mark('-'.charCodeAt(0));
+                    mark(v2);
+                }
+                continue;
+            }
+            mark(v1);
+        }
+
+        // When we have marked all slots, '^' NEGATES the set, hence we flip all slots.
+        //
+        // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK
+        // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire
+        // range then.
+        if (set_is_inverted) {
+            for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) {
+                if (!bitarr[i]) {
+                    bitarr_orig[i] = true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
+
+// convert a simple bitarray back into a regex set `[...]` content:
+function bitarray2set(l, output_inverted_variant, output_minimized) {
+    // construct the inverse(?)
set from the mark-set:
+    //
+    // Before we do that, we inject a sentinel so that our inner loops
+    // below can be simple and fast:
+    l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1;
+    // now reconstruct the regex set:
+    var rv = [];
+    var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2;
+    var bitarr_is_cloned = false;
+    var l_orig = l;
+
+    if (output_inverted_variant) {
+        // generate the inverted set, hence all unmarked slots are part of the output range:
+        cnt = 0;
+        for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) {
+            if (!l[i]) {
+                cnt++;
+            }
+        }
+        if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) {
+            // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`.
+            // BUT... since we output the INVERTED set, we output the match-all set instead:
+            return '\\S\\s';
+        }
+        else if (cnt === 0) {
+            // When we find the entire Unicode range is in the output match set, we replace this with
+            // a shorthand regex: `[\S\s]`
+            // BUT... since we output the INVERTED set, we output the match-nothing set instead:
+            return '^\\S\\s';
+        }
+
+        // Now see if we can replace several bits by an escape / pcode:
+        if (output_minimized) {
+            lut = Pcodes_bitarray_cache_test_order;
+            for (tn = 0; lut[tn]; tn++) {
+                tspec = lut[tn];
+                // check if the uniquely identifying char is in the inverted set:
+                if (!l[tspec[0]]) {
+                    // check if the pcode is covered by the inverted set:
+                    pcode = tspec[1];
+                    ba4pcode = Pcodes_bitarray_cache[pcode];
+                    match = 0;
+                    for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                        if (ba4pcode[j]) {
+                            if (!l[j]) {
+                                // match in current inverted bitset, i.e. there's at
+                                // least one 'new' bit covered by this pcode/escape:
+                                match++;
+                            } else if (l_orig[j]) {
+                                // mismatch!
+                                match = false;
+                                break;
+                            }
+                        }
+                    }
+
+                    // We're only interested in matches which actually cover some
+                    // yet uncovered bits: `match !== 0 && match !== false`.
+                    //
+                    // Apply the heuristic that the pcode/escape is only going to be used
+                    // when it covers *more* characters than its own identifier's length:
+                    if (match && match > pcode.length) {
+                        rv.push(pcode);
+
+                        // and nuke the bits in the array which match the given pcode:
+                        // make sure these edits are *not* visible outside this function as
+                        // `l` is an INPUT parameter (~ not modified)!
+                        if (!bitarr_is_cloned) {
+                            l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1);
+                            for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                                l2[j] = l[j] || ba4pcode[j];    // `!(!l[j] && !ba4pcode[j])`
+                            }
+                            // recreate sentinel
+                            l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1;
+                            l = l2;
+                            bitarr_is_cloned = true;
+                        } else {
+                            for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                                l[j] = l[j] || ba4pcode[j];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        i = 0;
+        while (i <= UNICODE_BASE_PLANE_MAX_CP$1) {
+            // find first character not in original set:
+            while (l[i]) {
+                i++;
+            }
+            if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) {
+                break;
+            }
+            // find next character in original set, i.e. the end of this unmarked run:
+            for (j = i + 1; !l[j]; j++) {} /* empty loop */
+            // generate subset:
+            rv.push(i2c(i));
+            if (j - 1 > i) {
+                rv.push((j - 2 > i ? '-' : '') + i2c(j - 1));
+            }
+            i = j;
+        }
+    } else {
+        // generate the non-inverted set, hence all logic checks are inverted here...
+        cnt = 0;
+        for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) {
+            if (l[i]) {
+                cnt++;
+            }
+        }
+        if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) {
+            // When we find the entire Unicode range is in the output match set, we replace this with
+            // a shorthand regex: `[\S\s]`
+            return '\\S\\s';
+        }
+        else if (cnt === 0) {
+            // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`.
+            return '^\\S\\s';
+        }
+
+        // Now see if we can replace several bits by an escape / pcode:
+        if (output_minimized) {
+            lut = Pcodes_bitarray_cache_test_order;
+            for (tn = 0; lut[tn]; tn++) {
+                tspec = lut[tn];
+                // check if the uniquely identifying char is in the set:
+                if (l[tspec[0]]) {
+                    // check if the pcode is covered by the set:
+                    pcode = tspec[1];
+                    ba4pcode = Pcodes_bitarray_cache[pcode];
+                    match = 0;
+                    for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                        if (ba4pcode[j]) {
+                            if (l[j]) {
+                                // match in current bitset, i.e. there's at
+                                // least one 'new' bit covered by this pcode/escape:
+                                match++;
+                            } else if (!l_orig[j]) {
+                                // mismatch!
+                                match = false;
+                                break;
+                            }
+                        }
+                    }
+
+                    // We're only interested in matches which actually cover some
+                    // yet uncovered bits: `match !== 0 && match !== false`.
+                    //
+                    // Apply the heuristic that the pcode/escape is only going to be used
+                    // when it covers *more* characters than its own identifier's length:
+                    if (match && match > pcode.length) {
+                        rv.push(pcode);
+
+                        // and nuke the bits in the array which match the given pcode:
+                        // make sure these edits are *not* visible outside this function as
+                        // `l` is an INPUT parameter (~ not modified)!
+                        if (!bitarr_is_cloned) {
+                            l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1);
+                            for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                                l2[j] = l[j] && !ba4pcode[j];
+                            }
+                            // recreate sentinel
+                            l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1;
+                            l = l2;
+                            bitarr_is_cloned = true;
+                        } else {
+                            for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) {
+                                l[j] = l[j] && !ba4pcode[j];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        i = 0;
+        while (i <= UNICODE_BASE_PLANE_MAX_CP$1) {
+            // find first character in original set:
+            while (!l[i]) {
+                i++;
+            }
+            if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) {
+                break;
+            }
+            // find next character not in original set:
+            for (j = i + 1; l[j]; j++) {} /* empty loop */
+            if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) {
+                j = UNICODE_BASE_PLANE_MAX_CP$1 + 1;
+            }
+            // generate subset:
+            rv.push(i2c(i));
+            if (j - 1 > i) {
+                rv.push((j - 2 > i ? '-' : '') + i2c(j - 1));
+            }
+            i = j;
+        }
+    }
+
+    assert(rv.length);
+    var s = rv.join('');
+    assert(s);
+
+    // Check if the set is better represented by one of the regex escapes:
+    var esc4s = EscCode_bitarray_output_refs.set2esc[s];
+    if (esc4s) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return '\\' + esc4s;
+    }
+    return s;
+}
+
+
+
+
+
+// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there;
+// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`.
+function reduceRegexToSetBitArray(s, name, opts) {
+    var orig = s;
+
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+
+    var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1);
+    var internal_state = 0;
+    var derr;
+
+    while (s.length) {
+        var c1 = s.match(CHR_RE$1);
+        if (!c1) {
+            // cope with illegal escape sequences too!
+            return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+        } else {
+            c1 = c1[0];
+        }
+        s = s.substr(c1.length);
+
+        switch (c1) {
+        case '[':
+            // this is starting a set within the regex: scan until end of set!
+            var set_content = [];
+            while (s.length) {
+                var inner = s.match(SET_PART_RE$1);
+                if (!inner) {
+                    inner = s.match(CHR_RE$1);
+                    if (!inner) {
+                        // cope with illegal escape sequences too!
+                        return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+                    } else {
+                        inner = inner[0];
+                    }
+                    if (inner === ']') break;
+                } else {
+                    inner = inner[0];
+                }
+                set_content.push(inner);
+                s = s.substr(inner.length);
+            }
+
+            // ensure that we hit the terminating ']':
+            var c2 = s.match(CHR_RE$1);
+            if (!c2) {
+                // cope with illegal escape sequences too!
+                return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"');
+            } else {
+                c2 = c2[0];
+            }
+            if (c2 !== ']') {
+                return new Error('regex set expression is broken in regex: ' + orig);
+            }
+            s = s.substr(c2.length);
+
+            var se = set_content.join('');
+            if (!internal_state) {
+                derr = set2bitarray(l, se, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                // a set is to be used like a single character in a longer literal phrase, hence input `[abc]word[def]` would thus produce output `[abc]`:
+                internal_state = 1;
+            }
+            break;
+
+        // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into
+        // something ready for use inside a regex set, e.g. `\\r\\n`.
+        //
+        // > Of course, we realize that converting more complex piped constructs this way
+        // > will produce something you might not expect, e.g. `A|WORD2` which
+        // > would end up as the set `[AW]` which is something else than the input
+        // > entirely.
+        // >
+        // > However, we can only depend on the user (grammar writer) to realize this and
+        // > prevent this from happening by not creating such oddities in the input grammar.
+        case '|':
+            // a|b --> [ab]
+            internal_state = 0;
+            break;
+
+        case '(':
+            // (a) --> a
+            //
+            // TODO - right now we treat this as 'too complex':
+
+            // Strip off some possible outer wrappers which we know how to remove.
+            // We don't worry about 'damaging' the regex as any too-complex regex will be caught
+            // in the validation check at the end; our 'strippers' here would not damage useful
+            // regexes anyway and it's fine if they damage the unacceptable ones.
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');         // (?:...) -> ... and (...) -> ...
+            s = s.replace(/^\^?(.*?)\$?$/, '$1');               // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: once before, once after this)
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');         // (?:...) -> ... and (...) -> ...
+
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '.':
+        case '*':
+        case '+':
+        case '?':
+            // wildcard
+            //
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '{':            // range, e.g. `x{1,3}`, or macro?
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        default:
+            // literal character or word: take the first character only and ignore the rest, so that
+            // the constructed set for `word|noun` would be `[wb]`:
+            if (!internal_state) {
+                derr = set2bitarray(l, c1, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                internal_state = 2;
+            }
+            break;
+        }
+    }
+
+    s = bitarray2set(l);
+
+    // When this result is suitable for use in a set, then we should be able to compile
+    // it in a regex; that way we can easily validate whether macro X is fit to be used
+    // inside a regex set:
+    try {
+        var re;
+        assert(s);
+        assert(!(s instanceof Error));
+        re = new XRegExp('[' + s + ']');
+        re.test(s[0]);
+
+        // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`
+        // so we check for lingering UNESCAPED brackets in here as those cannot be valid:
+        if (/[^\\][\[\]]/.exec(s)) {
+            throw new Error('unescaped brackets in set data');
+        }
+    } catch (ex) {
+        // make sure we produce a set range expression which will fail badly when it is used
+        // in actual code:
+        s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+    }
+
+    assert(s);
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+    return l;
+}
+
+
+
+
+// Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]`
+// -- or in this example it can be further optimized to only `\d`!
+function produceOptimizedRegex4Set(bitarr) {
+    // First try to produce a minimum regex from the bitarray directly:
+    var s1 = bitarray2set(bitarr, false, true);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s1;
+    } else {
+        s1 = '[' + s1 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s2 = bitarray2set(bitarr, true, true);
+
+    if (s2[0] === '^') {
+        s2 = s2.substr(1);
+        if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s2;
+        }
+    } else {
+        s2 = '^' + s2;
+    }
+    s2 = '[' + s2 + ']';
+
+    // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end,
+    // we also check against the plain, unadulterated regex set expressions:
+    //
+    // First try to produce a minimum regex from the bitarray directly:
+    var s3 = bitarray2set(bitarr, false, false);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s3.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s3;
+    } else {
+        s3 = '[' + s3 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s4 = bitarray2set(bitarr, true, false);
+
+    if (s4[0] === '^') {
+        s4 = s4.substr(1);
+        if (s4.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s4;
+        }
+    } else {
+        s4 = '^' + s4;
+    }
+    s4 = '[' + s4 + ']';
+
+    if (s2.length < s1.length) {
+        s1 = s2;
+    }
+    if (s3.length < s1.length) {
+        s1 = s3;
+    }
+    if (s4.length < s1.length) {
+        s1 = s4;
+    }
+
+    return s1;
+}
+
+
+
+
+
+
+var setmgmt = {
+    XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1,
+    CHR_RE: CHR_RE$1,
+    SET_PART_RE: SET_PART_RE$1,
+    NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1,
+    SET_IS_SINGLE_PCODE_RE,
+
+    UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1,
+
+    WHITESPACE_SETSTR: WHITESPACE_SETSTR$1,
+    DIGIT_SETSTR: DIGIT_SETSTR$1,
+    WORDCHAR_SETSTR: WORDCHAR_SETSTR$1,
+
+    set2bitarray,
+    bitarray2set,
+    produceOptimizedRegex4Set,
+    reduceRegexToSetBitArray,
+};
+
+// Basic Lexer implemented using JavaScript regular expressions
+// Zachary Carter
+// MIT Licensed
+
+var rmCommonWS = helpers.rmCommonWS;
+var camelCase = helpers.camelCase;
+var code_exec = helpers.exec;
+// import recast from '@gerhobbelt/recast';
+// import astUtils from '@gerhobbelt/ast-util';
+var version$1 = '0.6.1-205';                              // require('./package.json').version;
+
+
+
+
+const XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE;              // Matches the XRegExp Unicode escape braced part, e.g. `{Number}`
+const CHR_RE = setmgmt.CHR_RE;
+const SET_PART_RE = setmgmt.SET_PART_RE;
+const NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE;
+const UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP;
+
+// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`)
+//
+// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well!
+const ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*';
+
+
+
+
+// see also ./lib/cli.js
+/**
+@public
+@nocollapse
+*/
+const defaultJisonLexOptions = {
+    moduleType: 'commonjs',
+    debug: false,
+    enableDebugLogs: false,
+    json: false,
+    main: false,                        // CLI: not:(--main option)
+    dumpSourceCodeOnFailure: true,
+    throwErrorOnCompileFailure: true,
+
+    moduleName: undefined,
+    defaultModuleName: 'lexer',
+    file: undefined,
+    outfile: undefined,
+    inputPath: undefined,
+    inputFilename: undefined,
+    warn_cb: undefined,                 // function(msg) | true (= use Jison.Print) | false (= throw Exception)
+
+    xregexp: false,
+    lexerErrorsAreRecoverable: false,
+    flex: false,
+    backtrack_lexer: false,
+    ranges: false,                      // track position range, i.e. start+end indexes in the input string
+    trackPosition: true,                // track line+column position in the input string
+    caseInsensitive: false,
+    showSource: false,
+    exportSourceCode: false,
+    exportAST: false,
+    prettyCfg: true,
+    pre_lex: undefined,
+    post_lex: undefined,
+};
+
+
+// Merge sets of options.
+//
+// Convert alternative jison option names to their base option.
+//
+// The *last* option set which overrides the default wins, where 'override' is
+// defined as specifying a not-undefined value which is not equal to the
+// default value.
+//
+// When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the
+// default values available in Jison.defaultJisonOptions.
+//
+// Return a fresh set of options.
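+//
+// Illustrative example (a sketch; the exact defaults are the
+// defaultJisonLexOptions listed above):
+//
+//      var opts = mkStdOptions({ 'case-insensitive': true }, { main: true });
+//      // --> { ...all defaults..., caseInsensitive: true, noMain: false }
+//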
+/** @public */ +function mkStdOptions(/*...args*/) { + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; +} + +// set up export/output attributes of the `options` object instance +function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || typeof exportSourceCode !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; +} + +// Autodetect if the input lexer spec is in JSON or JISON +// format when the `options.json` flag is `true`. +// +// Produce the JSON lexer spec result when these are JSON formatted already as that +// would save us the trouble of doing this again, anywhere else in the JISON +// compiler/generator. +// +// Otherwise return the *parsed* lexer spec as it has +// been processed through LexParser. +function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! 
Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} + + +// expand macros and convert matchers to RegExp's +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, i, k, rule, action, conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. 
+ // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count, + }; +} + + + + + + + +// expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or +// elsewhere, which requires two different treatments to expand these macros. +function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. 
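+                //
+                // (e.g. both the XRegExp escape '\p{Number}' and the JISON macro
+                // reference '{DIGIT}' contain a '{', so either kind flags this set
+                // as potentially carrying expansions.)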
+                var has_expansions = (se.indexOf('{') >= 0);
+
+                se = '[' + se + ']';
+
+                if (!has_expansions && se.length < s1.length) {
+                    s1 = se;
+                }
+                rv.push(s1);
+                break;
+
+            // XRegExp Unicode escape, e.g. `\\p{Number}`:
+            case '\\p':
+                c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE);
+                if (c2) {
+                    c2 = c2[0];
+                    s = s.substr(c2.length);
+
+                    // nothing to expand.
+                    rv.push(c1 + c2);
+                } else {
+                    // nothing to stretch this match, hence nothing to expand.
+                    rv.push(c1);
+                }
+                break;
+
+            // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`.
+            // Treat it as a macro reference and see if it will expand to anything:
+            case '{':
+                c2 = s.match(NOTHING_SPECIAL_RE);
+                if (c2) {
+                    c2 = c2[0];
+                    s = s.substr(c2.length);
+
+                    var c3 = s[0];
+                    s = s.substr(c3.length);
+                    if (c3 === '}') {
+                        // possibly a macro name in there... Expand if possible:
+                        c2 = c1 + c2 + c3;
+                        if (expandAllMacrosElsewhere_cb) {
+                            c2 = expandAllMacrosElsewhere_cb(c2);
+                            assert(c2);
+                            if (c2 instanceof Error) {
+                                return new Error(errinfo() + ': ' + c2.message);
+                            }
+                        }
+                    } else {
+                        // not a well-terminated macro reference or something completely different:
+                        // we do not even attempt to expand this as there's guaranteed nothing to expand
+                        // in this bit.
+                        c2 = c1 + c2 + c3;
+                    }
+                    rv.push(c2);
+                } else {
+                    // nothing to stretch this match, hence nothing to expand.
+                    rv.push(c1);
+                }
+                break;
+
+            // Recognize some other regex elements, but there's no need to understand them all.
+            //
+            // We are merely interested in any chunks now which do *not* include yet another regex set `[...]`
+            // nor any `{MACRO}` reference:
+            default:
+                // non-set character or word: see how much of this there is for us and then see if there
+                // are any macros still lurking inside there:
+                c2 = s.match(NOTHING_SPECIAL_RE);
+                if (c2) {
+                    c2 = c2[0];
+                    s = s.substr(c2.length);
+
+                    // nothing to expand.
+                    rv.push(c1 + c2);
+                } else {
+                    // nothing to stretch this match, hence nothing to expand.
+                    rv.push(c1);
+                }
+                break;
+            }
+        }
+
+        s = rv.join('');
+
+        // When this result is suitable for use in a set, then we should be able to compile
+        // it in a regex; that way we can easily validate whether macro X is fit to be used
+        // inside a regex set:
+        try {
+            var re;
+            re = new XRegExp(s);
+            re.test(s[0]);
+        } catch (ex) {
+            // make sure we produce a regex expression which will fail badly when it is used
+            // in actual code:
+            return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/');
+        }
+
+        assert(s);
+        return s;
+}
+
+
+// expand macros within macros and cache the result
+function prepareMacros(dict_macros, opts) {
+    var macros = {};
+
+    // expand a `{NAME}` macro which exists inside a `[...]` set:
+    function expandMacroInSet(i) {
+        var k, a, m;
+        if (!macros[i]) {
+            m = dict_macros[i];
+
+            if (m.indexOf('{') >= 0) {
+                // set up our own record so we can detect definition loops:
+                macros[i] = {
+                    in_set: false,
+                    elsewhere: null,
+                    raw: dict_macros[i]
+                };
+
+                for (k in dict_macros) {
+                    if (dict_macros.hasOwnProperty(k) && i !== k) {
+                        // it doesn't matter if the lexer recognized that the inner macro(s)
+                        // were sitting inside a `[...]` set or not: the fact that they are used
+                        // here in macro `i` which itself sits in a set, makes them *all* live in
+                        // a set so all of them get the same treatment: set expansion style.
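+                        //
+                        // For example (sketch): with ALPHA defined as [a-zA-Z] and
+                        // ALNUM defined as [{ALPHA}0-9], expanding ALNUM for in-set use
+                        // must splice in the raw set content of ALPHA, yielding
+                        // [a-zA-Z0-9] rather than an illegal nested [[a-zA-Z]0-9].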
+                        //
+                        // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}`
+                        // macros here:
+                        if (XRegExp._getUnicodeProperty(k)) {
+                            // Work-around so that you can use `\p{ascii}` for an XRegExp slug, a.k.a.
+                            // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories,
+                            // while using `\p{ASCII}` as a *macro expansion* of the `ASCII`
+                            // macro:
+                            if (k.toUpperCase() !== k) {
+                                m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.');
+                                break;
+                            }
+                        }
+
+                        a = m.split('{' + k + '}');
+                        if (a.length > 1) {
+                            var x = expandMacroInSet(k);
+                            assert(x);
+                            if (x instanceof Error) {
+                                m = x;
+                                break;
+                            }
+                            m = a.join(x);
+                        }
+                    }
+                }
+            }
+
+            var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts);
+
+            var s1;
+
+            // propagate deferred exceptions = error reports.
+            if (mba instanceof Error) {
+                s1 = mba;
+            } else {
+                s1 = setmgmt.bitarray2set(mba, false);
+
+                m = s1;
+            }
+
+            macros[i] = {
+                in_set: s1,
+                elsewhere: null,
+                raw: dict_macros[i]
+            };
+        } else {
+            m = macros[i].in_set;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return new Error(m.message);
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandMacroElsewhere(i) {
+        var k, a, m;
+
+        if (macros[i].elsewhere == null) {
+            m = dict_macros[i];
+
+            // set up our own record so we can detect definition loops:
+            macros[i].elsewhere = false;
+
+            // the macro MAY contain other macros which MAY be inside a `[...]` set in this
+            // macro or elsewhere, hence we must parse the regex:
+            m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+            // propagate deferred exceptions = error reports.
+            if (m instanceof Error) {
+                return m;
+            }
+
+            macros[i].elsewhere = m;
+        } else {
+            m = macros[i].elsewhere;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return m;
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandAllMacrosInSet(s) {
+        var i, x;
+
+        // process *all* the macros inside [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroInSet(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message);
+                        }
+                        s = a.join(x);
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, x;
+
+        // When we process the remaining macro occurrences in the regex
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    // These are all submacro expansions, hence non-capturing grouping is applied:
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroElsewhere(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message);
+                        }
+                        s = a.join('(?:' + x + ')');
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+
+    var m, i;
+
+    if (opts.debug) console.log('\n############## RAW macros: ', dict_macros);
+
+    // first we create the part of the dictionary which is targeting the use of macros
+    // *inside* `[...]` sets; once we have completed that half of the expansions work,
+    // we then go and expand the macros for when they are used elsewhere in a regex:
+    // iff we encounter submacros then which are used *inside* a set, we can use that
+    // first half dictionary to speed things up a bit as we can use those expansions
+    // straight away!
+    for (i in dict_macros) {
+        if (dict_macros.hasOwnProperty(i)) {
+            expandMacroInSet(i);
+        }
+    }
+
+    for (i in dict_macros) {
+        if (dict_macros.hasOwnProperty(i)) {
+            expandMacroElsewhere(i);
+        }
+    }
+
+    if (opts.debug) console.log('\n############### expanded macros: ', macros);
+
+    return macros;
+}
+
+
+
+// expand macros in a regex; expands them recursively
+function expandMacros(src, macros, opts) {
+    var expansion_count = 0;
+
+    // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already!
+    // Hence things should be easy in there:
+
+    function expandAllMacrosInSet(s) {
+        var i, m, x;
+
+        // process *all* the macros inside [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    m = macros[i];
+
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = m.in_set;
+
+                        assert(x);
+                        if (x instanceof Error) {
+                            // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                            throw x;
+                        }
+
+                        // detect definition loop:
+                        if (x === false) {
+                            return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                        }
+
+                        s = a.join(x);
+                        expansion_count++;
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, m, x;
+
+        // When we process the main macro occurrences in the regex
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
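+        //
+        // Rough sketch: with DIGIT defined as [0-9] and NUMBER defined as
+        // {DIGIT}+, a rule regex '{NUMBER}' expands to something like
+        // '((?:\d)+)': the rule-level macro gets its own capture group,
+        // while the nested submacro only yields a non-capturing (?:...) group.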
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    m = macros[i];
+
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        // These are all main macro expansions, hence CAPTURING grouping is applied:
+                        x = m.elsewhere;
+                        assert(x);
+
+                        // detect definition loop:
+                        if (x === false) {
+                            return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                        }
+
+                        s = a.join('(' + x + ')');
+                        expansion_count++;
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+
+    // When we process the macro occurrences in the regex
+    // every macro used in a lexer rule will become its own capture group.
+    //
+    // Meanwhile the cached expansion will have expanded any submacros into
+    // *NON*-capturing groups so that the backreference indexes remain as you'd
+    // expect and using macros doesn't require you to know exactly what your
+    // used macro will expand into, i.e. which and how many submacros it has.
+    //
+    // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+    var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+    // propagate deferred exceptions = error reports.
+    if (s2 instanceof Error) {
+        throw s2;
+    }
+
+    // only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex()
+    // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on their own,
+    // as long as no macros are involved...
+    //
+    // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`,
+    // unless the `xregexp` output option has been enabled.
+    if (expansion_count > 0 || (src.indexOf('\\p{') >= 0 && !opts.options.xregexp)) {
+        src = s2;
+    } else {
+        // Check if the reduced regex is smaller in size; when it is, we still go with the new one!
+        if (s2.length < src.length) {
+            src = s2;
+        }
+    }
+
+    return src;
+}
+
+function prepareStartConditions(conditions) {
+    var sc,
+        hash = {};
+    for (sc in conditions) {
+        if (conditions.hasOwnProperty(sc)) {
+            hash[sc] = {rules:[], inclusive: !conditions[sc]};
+        }
+    }
+    return hash;
+}
+
+function buildActions(dict, tokens, opts) {
+    var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;'];
+    var tok;
+    var toks = {};
+    var caseHelper = [];
+
+    // tokens: map/array of token numbers to token names
+    for (tok in tokens) {
+        var idx = parseInt(tok);
+        if (idx && idx > 0) {
+            toks[tokens[tok]] = idx;
+        }
+    }
+
+    if (opts.options.flex && dict.rules) {
+        dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule!
*/']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: `function lexer__performAction(yy, yyrulenumber, YY_START) { + var yy_ = this; + + ${fun} + }`, + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count, + }; +} + +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + +var prelude = `/** + * See also: + * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508 + * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility + * with userland code which might access the derived class in a 'classic' way. + * + * @public + * @constructor + * @nocollapse + */ +function JisonLexerError(msg, hash) { + Object.defineProperty(this, 'name', { + enumerable: false, + writable: false, + value: 'JisonLexerError' + }); + + if (msg == null) msg = '???'; + + Object.defineProperty(this, 'message', { + enumerable: false, + writable: true, + value: msg + }); + + this.hash = hash; + + var stacktrace; + if (hash && hash.exception instanceof Error) { + var ex2 = hash.exception; + this.message = ex2.message || msg; + stacktrace = ex2.stack; + } + if (!stacktrace) { + if (Error.hasOwnProperty('captureStackTrace')) { // V8 + Error.captureStackTrace(this, this.constructor); + } else { + stacktrace = (new Error(msg)).stack; + } + } + if (stacktrace) { + Object.defineProperty(this, 'stack', { + enumerable: false, + writable: false, + value: stacktrace + }); + } +} + +if (typeof Object.setPrototypeOf === 'function') { + Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype); +} else { + JisonLexerError.prototype = Object.create(Error.prototype); +} +JisonLexerError.prototype.constructor = JisonLexerError; +JisonLexerError.prototype.name = 'JisonLexerError';`; + + // --- END lexer error class --- + + return prelude; +} + + +const jisonLexerErrorDefinition = generateErrorClass(); + + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS` + var __hacky_counter__ = 0; + + /** + * @constructor + * @nocollapse + */ + function XRegExp(re, f) { + this.re = re; + this.flags = f; + this._getUnicodeProperty = function (k) {}; + var fake = /./; // WARNING: this exact 'fake' is also depended upon by the xregexp unit test! + __hacky_counter__++; + fake.__hacky_backy__ = __hacky_counter__; + return fake; + } + `; +} + + + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. 
+ // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... }; + // ``` + var testcode = [ + '// provide a local version for test purposes:', + jisonLexerErrorDefinition, + '', + generateFakeXRegExpClassSrcCode(), + '', + source, + '', + 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (typeof lexer.options !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, ((dict.rules && dict.rules.length > 0) ? + 'One or more of your lexer state names are possibly botched?' 
: + 'Your custom lexer is somehow botched.'), ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = (dict.rules ? dict.rules.length : 0); i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; +} + +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) +// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. 
+// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- +return `{ + EOF: 1, + ERROR: 2, + + // JisonLexerError: JisonLexerError, /// <-- injected by the code generator + + // options: {}, /// <-- injected by the code generator + + // yy: ..., /// <-- injected by setInput() + + __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state + + __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup + + __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been 'unfolded' completely and is now ready for use + + done: false, /// INTERNAL USE ONLY + _backtrack: false, /// INTERNAL USE ONLY + _input: '', /// INTERNAL USE ONLY + _more: false, /// INTERNAL USE ONLY + _signaled_error_token: false, /// INTERNAL USE ONLY + + conditionStack: [], /// INTERNAL USE ONLY; managed via \`pushState()\`, \`popState()\`, \`topState()\` and \`stateStackSize()\` + + match: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. \`match\` is identical to \`yytext\` except that this one still contains the matched input string after \`lexer.performAction()\` has been invoked, where userland code MAY have changed/replaced the \`yytext\` value entirely! + matched: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far + matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt + yytext: '', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the 'token value' when the parser consumes the lexer token produced through a call to the \`lex()\` API. + offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the 'cursor position' in the input string, i.e. the number of characters matched so far + yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (\`yytext\`) + yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: 'line number' at which the token under construction is located + yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction + + /** + * INTERNAL USE: construct a suitable error info hash object instance for \`parseError\`. 
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) {
+        msg = '' + msg;
+
+        // heuristic to determine if the error message already contains a (partial) source code dump
+        // as produced by either \`showPosition()\` or \`prettyPrintRange()\`:
+        if (show_input_position == undefined) {
+            show_input_position = !(msg.indexOf('\\n') > 0 && msg.indexOf('^') > 0);
+        }
+        if (this.yylloc && show_input_position) {
+            if (typeof this.prettyPrintRange === 'function') {
+                var pretty_src = this.prettyPrintRange(this.yylloc);
+
+                if (!/\\n\\s*$/.test(msg)) {
+                    msg += '\\n';
+                }
+                msg += '\\n  Erroneous area:\\n' + pretty_src;
+            } else if (typeof this.showPosition === 'function') {
+                var pos_str = this.showPosition();
+                if (pos_str) {
+                    if (msg.length && msg[msg.length - 1] !== '\\n' && pos_str[0] !== '\\n') {
+                        msg += '\\n' + pos_str;
+                    } else {
+                        msg += pos_str;
+                    }
+                }
+            }
+        }
+        /** @constructor */
+        var pei = {
+            errStr: msg,
+            recoverable: !!recoverable,
+            text: this.match,           // This one MAY be empty; userland code should use the \`upcomingInput\` API to obtain more text which follows the 'lexer cursor position'...
+            token: null,
+            line: this.yylineno,
+            loc: this.yylloc,
+            yy: this.yy,
+            lexer: this,
+
+            /**
+             * and make sure the error info doesn't stay due to potential
+             * ref cycle via userland code manipulations.
+             * These would otherwise all be memory leak opportunities!
+             *
+             * Note that only array and object references are nuked as those
+             * constitute the set of elements which can produce a cyclic ref.
+             * The rest of the members are kept intact as they are harmless.
+             *
+             * @public
+             * @this {LexErrorInfo}
+             */
+            destroy: function destructLexErrorInfo() {
+                // remove cyclic references added to error info:
+                // info.yy = null;
+                // info.lexer = null;
+                // ...
+                var rec = !!this.recoverable;
+                for (var key in this) {
+                    if (this.hasOwnProperty(key) && typeof this[key] === 'object') {
+                        this[key] = undefined;
+                    }
+                }
+                this.recoverable = rec;
+            }
+        };
+        // track this instance so we can \`destroy()\` it once we deem it superfluous and ready for garbage collection!
+        this.__error_infos.push(pei);
+        return pei;
+    },
+
+    /**
+     * handler which is invoked when a lexer error occurs.
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    parseError: function lexer_parseError(str, hash, ExceptionClass) {
+        if (!ExceptionClass) {
+            ExceptionClass = this.JisonLexerError;
+        }
+        if (this.yy) {
+            if (this.yy.parser && typeof this.yy.parser.parseError === 'function') {
+                return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;
+            } else if (typeof this.yy.parseError === 'function') {
+                return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;
+            }
+        }
+        throw new ExceptionClass(str, hash);
+    },
+
+    /**
+     * method which implements \`yyerror(str, ...args)\` functionality for use inside lexer actions.
+ * + * @public + * @this {RegExpLexer} + */ + yyerror: function yyError(str /*, ...args */) { + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': ' + str, this.options.lexerErrorsAreRecoverable); + + // Add any extra args to the hash under the name \`extra_error_attributes\`: + var args = Array.prototype.slice.call(arguments, 1); + if (args.length) { + p.extra_error_attributes = args; + } + + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + }, + + /** + * final cleanup function for when we have completed lexing the input; + * make it an API so that external code can use this one once userland + * code has decided it's time to destroy any lingering lexer error + * hash object instances and the like: this function helps to clean + * up these constructs, which *may* carry cyclic references which would + * otherwise prevent the instances from being properly and timely + * garbage-collected, i.e. this function helps prevent memory leaks! + * + * @public + * @this {RegExpLexer} + */ + cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) { + // prevent lingering circular references from causing memory leaks: + this.setInput('', {}); + + // nuke the error hash info instances created during this run. + // Userland code must COPY any data/references + // in the error hash instance(s) it is more permanently interested in. + if (!do_not_nuke_errorinfos) { + for (var i = this.__error_infos.length - 1; i >= 0; i--) { + var el = this.__error_infos[i]; + if (el && typeof el.destroy === 'function') { + el.destroy(); + } + } + this.__error_infos.length = 0; + } + + return this; + }, + + /** + * clear the lexer token context; intended for internal use only + * + * @public + * @this {RegExpLexer} + */ + clear: function lexer_clear() { + this.yytext = ''; + this.yyleng = 0; + this.match = ''; + // - DO NOT reset \`this.matched\` + this.matches = false; + this._more = false; + this._backtrack = false; + + var col = (this.yylloc ? this.yylloc.last_column : 0); + this.yylloc = { + first_line: this.yylineno + 1, + first_column: col, + last_line: this.yylineno + 1, + last_column: col, + + range: [this.offset, this.offset] + }; + }, + + /** + * resets the lexer, sets new input + * + * @public + * @this {RegExpLexer} + */ + setInput: function lexer_setInput(input, yy) { + this.yy = yy || this.yy || {}; + + // also check if we've fully initialized the lexer instance, + // including expansion work to be done to go from a loaded + // lexer to a usable lexer: + if (!this.__decompressed) { + // step 1: decompress the regex list: + var rules = this.rules; + for (var i = 0, len = rules.length; i < len; i++) { + var rule_re = rules[i]; + + // compression: is the RE an xref to another RE slot in the rules[] table? + if (typeof rule_re === 'number') { + rules[i] = rules[rule_re]; + } + } + + // step 2: unfold the conditions[] set to make these ready for use: + var conditions = this.conditions; + for (var k in conditions) { + var spec = conditions[k]; + + var rule_ids = spec.rules; + + var len = rule_ids.length; + var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in \`lexer_next()\` fast and simple! 
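+                // (Illustration, assuming this condition's rules[] list was [0, 7]:
+                //  after unfolding, \`spec.__rule_regexes\` is [<unused>, rules[0], rules[7]]
+                //  and \`spec.rules\` becomes [<unused>, 0, 7]: the two 1-based arrays line up.)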
+                var rule_new_ids = new Array(len + 1);
+
+                for (var i = 0; i < len; i++) {
+                    var idx = rule_ids[i];
+                    var rule_re = rules[idx];
+                    rule_regexes[i + 1] = rule_re;
+                    rule_new_ids[i + 1] = idx;
+                }
+
+                spec.rules = rule_new_ids;
+                spec.__rule_regexes = rule_regexes;
+                spec.__rule_count = len;
+            }
+
+            this.__decompressed = true;
+        }
+
+        this._input = input || '';
+        this.clear();
+        this._signaled_error_token = false;
+        this.done = false;
+        this.yylineno = 0;
+        this.matched = '';
+        this.conditionStack = ['INITIAL'];
+        this.__currentRuleSet__ = null;
+        this.yylloc = {
+            first_line: 1,
+            first_column: 0,
+            last_line: 1,
+            last_column: 0,
+
+            range: [0, 0]
+        };
+        this.offset = 0;
+        return this;
+    },
+
+    /**
+     * edit the remaining input via user-specified callback.
+     * This can be used to forward-adjust the input-to-parse,
+     * e.g. inserting macro expansions and the like in the
+     * input which has yet to be lexed.
+     * The behaviour of this API contrasts with the \`unput()\` et al
+     * APIs as those act on the *consumed* input, while this
+     * one allows one to manipulate the future, without impacting
+     * the current \`yyloc\` cursor location or any history.
+     *
+     * Use this API to help implement C-preprocessor-like
+     * \`#include\` statements, etc.
+     *
+     * The provided callback must be synchronous and is
+     * expected to return the edited input (string).
+     *
+     * The \`cpsArg\` argument value is passed to the callback
+     * as-is.
+     *
+     * \`callback\` interface:
+     * \`function callback(input, cpsArg)\`
+     *
+     * - \`input\` will carry the remaining-input-to-lex string
+     *   from the lexer.
+     * - \`cpsArg\` is \`cpsArg\` passed into this API.
+     *
+     * The \`this\` reference for the callback will be set to
+     * reference this lexer instance so that userland code
+     * in the callback can easily and quickly access any lexer
+     * API.
+     *
+     * When the callback returns a non-string-type falsey value,
+     * we assume the callback did not edit the input and we
+     * will use the input as-is.
+     *
+     * When the callback returns a non-string-type value, it
+     * is converted to a string for lexing via the \`"" + retval\`
+     * operation. (See also why: http://2ality.com/2012/03/converting-to-string.html
+     * -- that way any returned object's \`valueOf()\` and \`toString()\`
+     * methods will be invoked in a proper/desirable order.)
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) {
+        var rv = callback.call(this, this._input, cpsArg);
+        if (typeof rv !== 'string') {
+            if (rv) {
+                this._input = '' + rv;
+            }
+            // else: keep \`this._input\` as is.
+        } else {
+            this._input = rv;
+        }
+        return this;
+    },
+
+    /**
+     * consumes and returns one char from the input
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    input: function lexer_input() {
+        if (!this._input) {
+            //this.done = true;    -- don't set \`done\` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <<EOF>> tokens and perform user action code for a <<EOF>> match, but only does so *once*)
+            return null;
+        }
+        var ch = this._input[0];
+        this.yytext += ch;
+        this.yyleng++;
+        this.offset++;
+        this.match += ch;
+        this.matched += ch;
+        // Count the linenumber up when we hit the LF (or a stand-alone CR).
+        // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo
+        // and we advance immediately past the LF as well, returning both together as if
+        // it was all a single 'character' only.
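+        // (Example of the effect, as a sketch: for the input '\\r\\nQ' a first \`input()\`
+        //  call returns the two-character string '\\r\\n' and bumps \`yylineno\` once;
+        //  the next call then returns 'Q'.)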
+ var slice_len = 1; + var lines = false; + if (ch === '\\n') { + lines = true; + } else if (ch === '\\r') { + lines = true; + var ch2 = this._input[1]; + if (ch2 === '\\n') { + slice_len++; + ch += ch2; + this.yytext += ch2; + this.yyleng++; + this.offset++; + this.match += ch2; + this.matched += ch2; + this.yylloc.range[1]++; + } + } + if (lines) { + this.yylineno++; + this.yylloc.last_line++; + this.yylloc.last_column = 0; + } else { + this.yylloc.last_column++; + } + this.yylloc.range[1]++; + + this._input = this._input.slice(slice_len); + return ch; + }, + + /** + * unshifts one char (or an entire string) into the input + * + * @public + * @this {RegExpLexer} + */ + unput: function lexer_unput(ch) { + var len = ch.length; + var lines = ch.split(/(?:\\r\\n?|\\n)/g); + + this._input = ch + this._input; + this.yytext = this.yytext.substr(0, this.yytext.length - len); + this.yyleng = this.yytext.length; + this.offset -= len; + this.match = this.match.substr(0, this.match.length - len); + this.matched = this.matched.substr(0, this.matched.length - len); + + if (lines.length > 1) { + this.yylineno -= lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + + // Get last entirely matched line into the \`pre_lines[]\` array's + // last index slot; we don't mind when other previously + // matched lines end up in the array too. + var pre = this.match; + var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + if (pre_lines.length === 1) { + pre = this.matched; + pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + } + this.yylloc.last_column = pre_lines[pre_lines.length - 1].length; + } else { + this.yylloc.last_column -= len; + } + + this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng; + + this.done = false; + return this; + }, + + /** + * cache matched text and append it on next action + * + * @public + * @this {RegExpLexer} + */ + more: function lexer_more() { + this._more = true; + return this; + }, + + /** + * signal the lexer that this rule fails to match the input, so the + * next matching rule (regex) should be tested instead. + * + * @public + * @this {RegExpLexer} + */ + reject: function lexer_reject() { + if (this.options.backtrack_lexer) { + this._backtrack = true; + } else { + // when the \`parseError()\` call returns, we MUST ensure that the error is registered. + // We accomplish this by signaling an 'error' token to be produced for the current + // \`.lex()\` run. + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).', false); + this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + return this; + }, + + /** + * retain first n characters of the match + * + * @public + * @this {RegExpLexer} + */ + less: function lexer_less(n) { + return this.unput(this.match.slice(n)); + }, + + /** + * return (part of the) already matched input, i.e. for error + * messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of + * input (default: 1). + * + * Negative limit values equal *unlimited*. 
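+     *
+     * Example (a sketch): with \`matched\` being 'hello world' and \`match\` being
+     * 'world', a \`pastInput(5)\` call yields '...ello ': the tail of the already
+     * consumed input, prefixed with an ellipsis because it got clipped to 5
+     * characters.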
+ * + * @public + * @this {RegExpLexer} + */ + pastInput: function lexer_pastInput(maxSize, maxLines) { + var past = this.matched.substring(0, this.matched.length - this.match.length); + if (maxSize < 0) + maxSize = past.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = past.length; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substr\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + past = past.substr(-maxSize * 2 - 2); + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = past.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(-maxLines); + past = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis prefix... + if (past.length > maxSize) { + past = '...' + past.substr(-maxSize); + } + return past; + }, + + /** + * return (part of the) upcoming input, i.e. for error messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of input (default: 1). + * + * Negative limit values equal *unlimited*. + * + * > ### NOTE ### + * > + * > *"upcoming input"* is defined as the whole of the both + * > the *currently lexed* input, together with any remaining input + * > following that. *"currently lexed"* input is the input + * > already recognized by the lexer but not yet returned with + * > the lexer token. This happens when you are invoking this API + * > from inside any lexer rule action code block. + * > + * + * @public + * @this {RegExpLexer} + */ + upcomingInput: function lexer_upcomingInput(maxSize, maxLines) { + var next = this.match; + if (maxSize < 0) + maxSize = next.length + this._input.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = maxSize; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substring\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + if (next.length < maxSize * 2 + 2) { + next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8 + } + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = next.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(0, maxLines); + next = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis postfix... + if (next.length > maxSize) { + next = next.substring(0, maxSize) + '...'; + } + return next; + }, + + /** + * return a string which displays the character position where the + * lexing error occurred, i.e. 
for error messages
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {
+        var pre = this.pastInput(maxPrefix).replace(/\\s/g, ' ');
+        var c = new Array(pre.length + 1).join('-');
+        return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, ' ') + '\\n' + c + '^';
+    },
+
+    /**
+     * return a string which displays the lines & columns of input which are referenced
+     * by the given location info range, plus a few lines of context.
+     *
+     * This function pretty-prints the indicated section of the input, with line numbers
+     * and everything!
+     *
+     * This function is very useful to provide highly readable error reports, while
+     * the location range may be specified in various flexible ways:
+     *
+     * - \`loc\` is the location info object which references the area which should be
+     *   displayed and 'marked up': these lines & columns of text are marked up by \`^\`
+     *   characters below each character in the entire input range.
+     *
+     * - \`context_loc\` is the *optional* location info object which instructs this
+     *   pretty-printer how much *leading* context should be displayed alongside
+     *   the area referenced by \`loc\`. This can help provide context for the displayed
+     *   error, etc.
+     *
+     *   When this location info is not provided, a default context of 3 lines is
+     *   used.
+     *
+     * - \`context_loc2\` is another *optional* location info object, which serves
+     *   a similar purpose to \`context_loc\`: it specifies the amount of *trailing*
+     *   context lines to display in the pretty-print output.
+     *
+     *   When this location info is not provided, a default context of 1 line only is
+     *   used.
+     *
+     * Special Notes:
+     *
+     * - when the \`loc\`-indicated range is very large (about 5 lines or more), then
+     *   only the first and last few lines of this block are printed while a
+     *   \`...continued...\` message will be printed between them.
+     *
+     *   This serves the purpose of not printing a huge amount of text when the \`loc\`
+     *   range happens to be huge: this way a manageable & readable output results
+     *   for arbitrarily large ranges.
+     *
+     * - this function can display lines of input which have not yet been lexed.
+     *   \`prettyPrintRange()\` can access the entire input!
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {
+        var error_size = loc.last_line - loc.first_line;
+        const CONTEXT = 3;
+        const CONTEXT_TAIL = 1;
+        const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;
+        var input = this.matched + this._input;
+        var lines = input.split('\\n');
+        //var show_context = (error_size < 5 || context_loc);
+        var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));
+        var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));
+        var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);
+        var ws_prefix = new Array(lineno_display_width).join(' ');
+        var nonempty_line_indexes = [];
+        var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {
+            var lno = index + l0;
+            var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);
+            var rv = lno_pfx + ': ' + line;
+            var errpfx = (new Array(lineno_display_width + 1)).join('^');
+            var offset = 2 + 1;
+            var len = 0;
+
+            if (lno === loc.first_line) {
+                offset += loc.first_column;
+
+                len = Math.max(
+                    2,
+                    ((lno === loc.last_line ?
loc.last_column : line.length)) - loc.first_column + 1 + ); + } else if (lno === loc.last_line) { + len = Math.max(2, loc.last_column + 1); + } else if (lno > loc.first_line && lno < loc.last_line) { + len = Math.max(2, line.length + 1); + } + + if (len) { + var lead = new Array(offset).join('.'); + var mark = new Array(len).join('^'); + rv += '\\n' + errpfx + lead + mark; + + if (line.trim().length > 0) { + nonempty_line_indexes.push(index); + } + } + + rv = rv.replace(/\\t/g, ' '); + return rv; + }); + + // now make sure we don't print an overly large amount of error area: limit it + // to the top and bottom line count: + if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) { + var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1; + var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1; + + var intermediate_line = (new Array(lineno_display_width + 1)).join(' ') + ' (...continued...)'; + intermediate_line += '\\n' + (new Array(lineno_display_width + 1)).join('-') + ' (---------------)'; + rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line); + } + return rv.join('\\n'); + }, + + /** + * helper function, used to produce a human readable description as a string, given + * the input \`yylloc\` location object. + * + * Set \`display_range_too\` to TRUE to include the string character index position(s) + * in the description if the \`yylloc.range\` is available. + * + * @public + * @this {RegExpLexer} + */ + describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) { + var l1 = yylloc.first_line; + var l2 = yylloc.last_line; + var c1 = yylloc.first_column; + var c2 = yylloc.last_column; + var dl = l2 - l1; + var dc = c2 - c1; + var rv; + if (dl === 0) { + rv = 'line ' + l1 + ', '; + if (dc <= 1) { + rv += 'column ' + c1; + } else { + rv += 'columns ' + c1 + ' .. ' + c2; + } + } else { + rv = 'lines ' + l1 + '(column ' + c1 + ') .. ' + l2 + '(column ' + c2 + ')'; + } + if (yylloc.range && display_range_too) { + var r1 = yylloc.range[0]; + var r2 = yylloc.range[1] - 1; + if (r2 <= r1) { + rv += ' {String Offset: ' + r1 + '}'; + } else { + rv += ' {String Offset range: ' + r1 + ' .. ' + r2 + '}'; + } + } + return rv; + }, + + /** + * test the lexed token: return FALSE when not a match, otherwise return token. + * + * \`match\` is supposed to be an array coming out of a regex match, i.e. \`match[0]\` + * contains the actually matched text string. 
+ * + * Also move the input cursor forward and update the match collectors: + * + * - \`yytext\` + * - \`yyleng\` + * - \`match\` + * - \`matches\` + * - \`yylloc\` + * - \`offset\` + * + * @public + * @this {RegExpLexer} + */ + test_match: function lexer_test_match(match, indexed_rule) { + var token, + lines, + backup, + match_str, + match_str_len; + + if (this.options.backtrack_lexer) { + // save context + backup = { + yylineno: this.yylineno, + yylloc: { + first_line: this.yylloc.first_line, + last_line: this.yylloc.last_line, + first_column: this.yylloc.first_column, + last_column: this.yylloc.last_column, + + range: this.yylloc.range.slice(0) + }, + yytext: this.yytext, + match: this.match, + matches: this.matches, + matched: this.matched, + yyleng: this.yyleng, + offset: this.offset, + _more: this._more, + _input: this._input, + //_signaled_error_token: this._signaled_error_token, + yy: this.yy, + conditionStack: this.conditionStack.slice(0), + done: this.done + }; + } + + match_str = match[0]; + match_str_len = match_str.length; + // if (match_str.indexOf('\\n') !== -1 || match_str.indexOf('\\r') !== -1) { + lines = match_str.split(/(?:\\r\\n?|\\n)/g); + if (lines.length > 1) { + this.yylineno += lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + this.yylloc.last_column = lines[lines.length - 1].length; + } else { + this.yylloc.last_column += match_str_len; + } + // } + this.yytext += match_str; + this.match += match_str; + this.matched += match_str; + this.matches = match; + this.yyleng = this.yytext.length; + this.yylloc.range[1] += match_str_len; + + // previous lex rules MAY have invoked the \`more()\` API rather than producing a token: + // those rules will already have moved this \`offset\` forward matching their match lengths, + // hence we must only add our own match length now: + this.offset += match_str_len; + this._more = false; + this._backtrack = false; + this._input = this._input.slice(match_str_len); + + // calling this method: + // + // function lexer__performAction(yy, yyrulenumber, YY_START) {...} + token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */); + // otherwise, when the action codes are all simple return token statements: + //token = this.simpleCaseActionClusters[indexed_rule]; + + if (this.done && this._input) { + this.done = false; + } + if (token) { + return token; + } else if (this._backtrack) { + // recover context + for (var k in backup) { + this[k] = backup[k]; + } + this.__currentRuleSet__ = null; + return false; // rule action called reject() implying the next rule should be tested instead. + } else if (this._signaled_error_token) { + // produce one 'error' token as \`.parseError()\` in \`reject()\` + // did not guarantee a failure signal by throwing an exception! + token = this._signaled_error_token; + this._signaled_error_token = false; + return token; + } + return false; + }, + + /** + * return next match in input + * + * @public + * @this {RegExpLexer} + */ + next: function lexer_next() { + if (this.done) { + this.clear(); + return this.EOF; + } + if (!this._input) { + this.done = true; + } + + var token, + match, + tempMatch, + index; + if (!this._more) { + this.clear(); + } + var spec = this.__currentRuleSet__; + if (!spec) { + // Update the ruleset cache as we apparently encountered a state change or just started lexing. 
+ // The cache is set up for fast lookup -- we assume a lexer will switch states much less often than it will + // invoke the \`lex()\` token-producing API and related APIs, hence caching the set for direct access helps + // speed up those activities a tiny bit. + spec = this.__currentRuleSet__ = this._currentRules(); + // Check whether a *sane* condition has been pushed before: this makes the lexer robust against + // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19 + if (!spec || !spec.rules) { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Internal lexer engine error' + lineno_msg + ': The lex grammar programmer pushed a non-existing condition name "' + this.topState() + '"; this is a fatal error and should be reported to the application programmer team!', false); + // produce one 'error' token until this situation has been resolved, most probably by parse termination! + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + } + + var rule_ids = spec.rules; + var regexes = spec.__rule_regexes; + var len = spec.__rule_count; + + // Note: the arrays are 1-based, while \`len\` itself is a valid index, + // hence the non-standard less-or-equal check in the next loop condition! + for (var i = 1; i <= len; i++) { + tempMatch = this._input.match(regexes[i]); + if (tempMatch && (!match || tempMatch[0].length > match[0].length)) { + match = tempMatch; + index = i; + if (this.options.backtrack_lexer) { + token = this.test_match(tempMatch, rule_ids[i]); + if (token !== false) { + return token; + } else if (this._backtrack) { + match = undefined; + continue; // rule action called reject() implying a rule MISmatch. + } else { + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + } else if (!this.options.flex) { + break; + } + } + } + if (match) { + token = this.test_match(match, rule_ids[index]); + if (token !== false) { + return token; + } + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + if (!this._input) { + this.done = true; + this.clear(); + return this.EOF; + } else { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': Unrecognized text.', this.options.lexerErrorsAreRecoverable); + + var pendingInput = this._input; + var activeCondition = this.topState(); + var conditionStackDepth = this.conditionStack.length; + + token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + if (token === this.ERROR) { + // we can try to recover from a lexer error that \`parseError()\` did not 'recover' for us + // by moving forward at least one character at a time IFF the (user-specified?) \`parseError()\` + // has not consumed/modified any pending input or changed state in the error handler: + if (!this.matches && + // and make sure the input has been modified/consumed ... + pendingInput === this._input && + // ...or the lexer state has been modified significantly enough + // to merit a non-consuming error handling action right now. 
+ activeCondition === this.topState() && + conditionStackDepth === this.conditionStack.length + ) { + this.input(); + } + } + return token; + } + }, + + /** + * return next match that has a token + * + * @public + * @this {RegExpLexer} + */ + lex: function lexer_lex() { + var r; + // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer: + if (typeof this.options.pre_lex === 'function') { + r = this.options.pre_lex.call(this); + } + + while (!r) { + r = this.next(); + } + + if (typeof this.options.post_lex === 'function') { + // (also account for a userdef function which does not return any value: keep the token as is) + r = this.options.post_lex.call(this, r) || r; + } + return r; + }, + + /** + * backwards compatible alias for \`pushState()\`; + * the latter is symmetrical with \`popState()\` and we advise to use + * those APIs in any modern lexer code, rather than \`begin()\`. + * + * @public + * @this {RegExpLexer} + */ + begin: function lexer_begin(condition) { + return this.pushState(condition); + }, + + /** + * activates a new lexer condition state (pushes the new lexer + * condition state onto the condition stack) + * + * @public + * @this {RegExpLexer} + */ + pushState: function lexer_pushState(condition) { + this.conditionStack.push(condition); + this.__currentRuleSet__ = null; + return this; + }, + + /** + * pop the previously active lexer condition state off the condition + * stack + * + * @public + * @this {RegExpLexer} + */ + popState: function lexer_popState() { + var n = this.conditionStack.length - 1; + if (n > 0) { + this.__currentRuleSet__ = null; + return this.conditionStack.pop(); + } else { + return this.conditionStack[0]; + } + }, + + /** + * return the currently active lexer condition state; when an index + * argument is provided it produces the N-th previous condition state, + * if available + * + * @public + * @this {RegExpLexer} + */ + topState: function lexer_topState(n) { + n = this.conditionStack.length - 1 - Math.abs(n || 0); + if (n >= 0) { + return this.conditionStack[n]; + } else { + return 'INITIAL'; + } + }, + + /** + * (internal) determine the lexer rule set which is active for the + * currently active lexer condition state + * + * @public + * @this {RegExpLexer} + */ + _currentRules: function lexer__currentRules() { + if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) { + return this.conditions[this.conditionStack[this.conditionStack.length - 1]]; + } else { + return this.conditions['INITIAL']; + } + }, + + /** + * return the number of states currently on the stack + * + * @public + * @this {RegExpLexer} + */ + stateStackSize: function lexer_stateStackSize() { + return this.conditionStack.length; + } +}`; + // --- END lexer kernel --- +} + +RegExpLexer.prototype = (new Function(rmCommonWS` + return ${getRegExpLexerPrototype()}; +`))(); + + +// The lexer code stripper, driven by optimization analysis settings and +// lexer options, which cannot be changed at run-time. +function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. 
${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + +new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS` + // Code Generator Information Report + // --------------------------------- + // + // Options: + // + // backtracking: .................... ${opt.options.backtrack_lexer} + // location.ranges: ................. ${opt.options.ranges} + // location line+column tracking: ... ${opt.options.trackPosition} + // + // + // Forwarded Parser Analysis flags: + // + // uses yyleng: ..................... ${opt.parseActionsUseYYLENG} + // uses yylineno: ................... ${opt.parseActionsUseYYLINENO} + // uses yytext: ..................... ${opt.parseActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.parseActionsUseYYLOC} + // uses lexer values: ............... ${opt.parseActionsUseValueTracking} / ${opt.parseActionsUseValueAssignment} + // location tracking: ............... ${opt.parseActionsUseLocationTracking} + // location assignment: ............. ${opt.parseActionsUseLocationAssignment} + // + // + // Lexer Analysis flags: + // + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses yyerror: .................... ${opt.lexerActionsUseYYERROR} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + // + // --------- END OF REPORT ----------- + + `); + + return new_src; +} + + + + + +// generate lexer source from a grammar +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
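+    // (For illustration -- a sketch, not an exhaustive list: a jison parser build
+    //  may hand us flags such as { parseActionsUseYYTEXT: true, parseActionsUseYYLOC: false },
+    //  which the `stripUnusedLexerCode()` stage uses to drop unused kernel features.)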
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???', + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = (dict.actionInclude || ''); + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; +} + +// Assemble the final source from the processed grammar +/** @public */ +function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; +} + +function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = (1 + Math.log10(a.length | 1) | 0); + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return `/* ${idx_str}: */ ${re}`; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return `/* ${idx_str}: */ new XRegExp("${re_src}", "${re.xregexp.flags}")`; + } else { + return `/* ${idx_str}: */ ${re}`; + } + }); + return b.join(',\n'); +} + +function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1, + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(` "(${ID_REGEX_BASE})": `, 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
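+        // (Rough shape of what gets assembled below -- a sketch, not verbatim
+        //  generator output:
+        //
+        //      var lexer = {
+        //          /* analytics report */
+        //          ...lexer kernel methods & fields...,
+        //          options: { ... },
+        //          JisonLexerError: JisonLexerError,
+        //          performAction: function lexer__performAction(yy, yyrulenumber, YY_START) { ... },
+        //          simpleCaseActionClusters: { ... },
+        //          rules: [ ... ],
+        //          conditions: { ... }
+        //      };
+        // )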
+ var code = [rmCommonWS` + var lexer = { + `, '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc + .replace(/^[\s\r\n]*\{/, '') + .replace(/\s*\}[\s\r\n]*$/, '') + .trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS`, + JisonLexerError: JisonLexerError, + performAction: ${performActionCode}, + simpleCaseActionClusters: ${simpleCaseActionClustersCode}, + rules: [ + ${rulesCode} + ], + conditions: ${conditionsCode} + }; + `); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; +} + +function generateGenericHeaderComment() { + var out = rmCommonWS` + /* lexer generated by jison-lex ${version$1} */ + + /* + * Returns a Lexer object of the following structure: + * + * Lexer: { + * yy: {} The so-called "shared state" or rather the *source* of it; + * the real "shared state" \`yy\` passed around to + * the rule actions, etc. is a direct reference! + * + * This "shared context" object was passed to the lexer by way of + * the \`lexer.setInput(str, yy)\` API before you may use it. + * + * This "shared context" object is passed to the lexer action code in \`performAction()\` + * so userland code in the lexer actions may communicate with the outside world + * and/or other lexer rules' actions in more or less complex ways. + * + * } + * + * Lexer.prototype: { + * EOF: 1, + * ERROR: 2, + * + * yy: The overall "shared context" object reference. + * + * JisonLexerError: function(msg, hash), + * + * performAction: function lexer__performAction(yy, yyrulenumber, YY_START), + * + * The function parameters and \`this\` have the following value/meaning: + * - \`this\` : reference to the \`lexer\` instance. + * \`yy_\` is an alias for \`this\` lexer instance reference used internally. + * + * - \`yy\` : a reference to the \`yy\` "shared state" object which was passed to the lexer + * by way of the \`lexer.setInput(str, yy)\` API before. 
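+     *
+     *          For example (a sketch):
+     *
+     *              lexer.setInput(\'2 + 3\', { parseError: myParseErrorHandler });
+     *
+     *          where \`myParseErrorHandler\` is a hypothetical userland callback;
+     *          see the \`yy.parseError\` option described further below.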
+     *
+     * Note:
+     * The extra arguments you specified in the \`%parse-param\` statement in your
+     * **parser** grammar definition file are passed to the lexer via this object
+     * reference as member variables.
+     *
+     * - \`yyrulenumber\` : index of the matched lexer rule (regex), used internally.
+     *
+     * - \`YY_START\`: the current lexer "start condition" state.
+     *
+     * parseError: function(str, hash, ExceptionClass),
+     *
+     * constructLexErrorInfo: function(error_message, is_recoverable),
+     * Helper function.
+     * Produces a new errorInfo \'hash object\' which can be passed into \`parseError()\`.
+     * See its use in this lexer kernel in many places; example usage:
+     *
+     * var infoObj = lexer.constructLexErrorInfo(\'fail!\', true);
+     * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);
+     *
+     * options: { ... lexer %options ... },
+     *
+     * lex: function(),
+     * Produce one token of lexed input, which was passed in earlier via the \`lexer.setInput()\` API.
+     * You MAY use the additional \`args...\` parameters as per \`%parse-param\` spec of the **lexer** grammar:
+     * these extra \`args...\` are added verbatim to the \`yy\` object reference as member variables.
+     *
+     * WARNING:
+     * Lexer's additional \`args...\` parameters (via lexer's \`%parse-param\`) MAY conflict with
+     * any attributes already added to \`yy\` by the **parser** or the jison run-time;
+     * when such a collision is detected an exception is thrown to prevent the generated run-time
+     * from silently accepting this confusing and potentially hazardous situation!
+     *
+     * cleanupAfterLex: function(do_not_nuke_errorinfos),
+     * Helper function.
+     *
+     * This helper API is invoked when the **parse process** has completed: it is the responsibility
+     * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired.
+     *
+     * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.
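+     *
+     * A typical driver loop, as a sketch:
+     *
+     *     lexer.setInput(src, yy);
+     *     var tok;
+     *     while ((tok = lexer.lex()) !== lexer.EOF) { ... }
+     *     lexer.cleanupAfterLex();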
+ * + * setInput: function(input, [yy]), + * + * + * input: function(), + * + * + * unput: function(str), + * + * + * more: function(), + * + * + * reject: function(), + * + * + * less: function(n), + * + * + * pastInput: function(n), + * + * + * upcomingInput: function(n), + * + * + * showPosition: function(), + * + * + * test_match: function(regex_match_array, rule_index), + * + * + * next: function(), + * + * + * begin: function(condition), + * + * + * pushState: function(condition), + * + * + * popState: function(), + * + * + * topState: function(), + * + * + * _currentRules: function(), + * + * + * stateStackSize: function(), + * + * + * performAction: function(yy, yy_, yyrulenumber, YY_START), + * + * + * rules: [...], + * + * + * conditions: {associative list: name ==> set}, + * } + * + * + * token location info (\`yylloc\`): { + * first_line: n, + * last_line: n, + * first_column: n, + * last_column: n, + * range: [start_number, end_number] + * (where the numbers are indexes into the input string, zero-based) + * } + * + * --- + * + * The \`parseError\` function receives a \'hash\' object with these members for lexer errors: + * + * { + * text: (matched text) + * token: (the produced terminal token, if any) + * token_id: (the produced terminal token numeric ID, if any) + * line: (yylineno) + * loc: (yylloc) + * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule + * available for this particular error) + * yy: (object: the current parser internal "shared state" \`yy\` + * as is also available in the rule actions; this can be used, + * for instance, for advanced error analysis and reporting) + * lexer: (reference to the current lexer instance used by the parser) + * } + * + * while \`this\` will reference the current lexer instance. + * + * When \`parseError\` is invoked by the lexer, the default implementation will + * attempt to invoke \`yy.parser.parseError()\`; when this callback is not provided + * it will try to invoke \`yy.parseError()\` instead. When that callback is also not + * provided, a \`JisonLexerError\` exception will be thrown containing the error + * message and \`hash\`, as constructed by the \`constructLexErrorInfo()\` API. + * + * Note that the lexer\'s \`JisonLexerError\` error class is passed via the + * \`ExceptionClass\` argument, which is invoked to construct the exception + * instance to be thrown, so technically \`parseError\` will throw the object + * produced by the \`new ExceptionClass(str, hash)\` JavaScript expression. + * + * --- + * + * You can specify lexer options by setting / modifying the \`.options\` object of your Lexer instance. + * These options are available: + * + * (Options are permanent.) + * + * yy: { + * parseError: function(str, hash, ExceptionClass) + * optional: overrides the default \`parseError\` function. + * } + * + * lexer.options: { + * pre_lex: function() + * optional: is invoked before the lexer is invoked to produce another token. + * \`this\` refers to the Lexer object. + * post_lex: function(token) { return token; } + * optional: is invoked when the lexer has produced a token \`token\`; + * this function can override the returned token value by returning another. + * When it does not return any (truthy) value, the lexer will return + * the original \`token\`. + * \`this\` refers to the Lexer object. + * + * WARNING: the next set of options are not meant to be changed. They echo the abilities of + * the lexer as per when it was compiled! 
+     *
+     *      ranges: boolean
+     *                 optional: \`true\` ==> token location info will include a .range[] member.
+     *      flex: boolean
+     *                 optional: \`true\` ==> flex-like lexing behaviour where the rules are tested
+     *                 exhaustively to find the longest match.
+     *      backtrack_lexer: boolean
+     *                 optional: \`true\` ==> lexer regexes are tested in order and for each matched
+     *                 regex the action code is invoked; the lexer terminates the scan when a token
+     *                 is returned by the action code.
+     *      xregexp: boolean
+     *                 optional: \`true\` ==> lexer rule regexes are "extended regex format" requiring the
+     *                 \`XRegExp\` library. When this %option has not been specified at compile time, all lexer
+     *                 rule regexes have been written as standard JavaScript RegExp expressions.
+     *  }
+     */
+    `;
+
+    return out;
+}
+
+function prepareOptions(opt) {
+    opt = opt || {};
+
+    // check for illegal identifier
+    if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) {
+        if (opt.moduleName) {
+            var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.';
+            if (typeof opt.warn_cb === 'function') {
+                opt.warn_cb(msg);
+            } else {
+                // do not treat as warning; barf hairball instead so that this oddity gets noticed right away!
+                throw new Error(msg);
+            }
+        }
+        opt.moduleName = 'lexer';
+    }
+
+    prepExportStructures(opt);
+
+    return opt;
+}
+
+function generateModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var ' + opt.moduleName + ' = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '})();'
+    ];
+
+    var src = out.join('\n') + '\n';
+    src = stripUnusedLexerCode(src, opt);
+    opt.exportSourceCode.all = src;
+    return src;
+}
+
+function generateAMDModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'define([], function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '});'
+    ];
+
+    var src = out.join('\n') + '\n';
+    src = stripUnusedLexerCode(src, opt);
+    opt.exportSourceCode.all = src;
+    return src;
+}
+
+function generateESModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var lexer = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '})();',
+        '',
+        'function yylex() {',
+        '    return lexer.lex.apply(lexer, arguments);',
+        '}',
+        rmCommonWS`
+            export {
+                lexer,
+                yylex as lex
+            };
+        `
+    ];
+
+    var src = out.join('\n') + '\n';
+    src = stripUnusedLexerCode(src, opt);
+    opt.exportSourceCode.all = src;
+    return src;
+}
+
+function generateCommonJSModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var ' + opt.moduleName + ' = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ?
opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();', + '', + 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', + ' exports.lexer = ' + opt.moduleName + ';', + ' exports.lex = function () {', + ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', + ' };', + '}' + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +RegExpLexer.generate = generate; + +RegExpLexer.version = version$1; +RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; +RegExpLexer.mkStdOptions = mkStdOptions; +RegExpLexer.camelCase = camelCase; +RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + +var version = '0.6.1-205'; // require('./package.json').version; + + +function getCommandlineOptions() { + 'use strict'; + + var opts = nomnom + .script('jison-lex') + .unknownOptionTreatment(false) // do not accept unknown options! + .options({ + file: { + flag: true, + position: 0, + help: 'file containing a lexical grammar' + }, + json: { + abbr: 'j', + flag: true, + default: false, + help: 'jison will expect a grammar in either JSON/JSON5 or JISON format: the precise format is autodetected' + }, + outfile: { + abbr: 'o', + metavar: 'FILE', + help : 'Filepath and base module name of the generated parser;\nwhen terminated with a / (dir separator) it is treated as the destination directory where the generated output will be stored' + }, + debug: { + abbr: 'd', + flag: true, + default: false, + help: 'Debug mode' + }, + dumpSourceCodeOnFailure: { + full: 'dump-sourcecode-on-failure', + flag: true, + default: true, + help: 'Dump the generated source code to a special named file when the internal generator tests fail, i.e. when the generated source code does not compile in the JavaScript engine. Enabling this option helps you to diagnose/debug crashes (thrown exceptions) in the code generator due to various reasons: you can, for example, load the dumped sourcecode in another environment (e.g. NodeJS) to get more info on the precise location and cause of the compile failure.' + }, + throwErrorOnCompileFailure: { + full: 'throw-on-compile-failure', + flag: true, + default: true, + help: 'Throw an exception when the generated source code fails to compile in the JavaScript engine. **WARNING**: Turning this feature OFF permits the code generator to produce non-working source code and treat that as SUCCESS. This MAY be desirable code generator behaviour, but only rarely.' 
+ }, + reportStats: { + full: 'info', + abbr: 'I', + flag: true, + default: false, + help: 'Report some statistics about the generated parser' + }, + moduleType: { + full: 'module-type', + abbr: 't', + default: 'commonjs', + metavar: 'TYPE', + choices: ['commonjs', 'amd', 'js', 'es'], + help: 'The type of module to generate (commonjs, amd, es, js)' + }, + moduleName: { + full: 'module-name', + abbr: 'n', + metavar: 'NAME', + help: 'The name of the generated parser object, namespace supported' + }, + main: { + full: 'main', + abbr: 'x', + flag: true, + default: false, + help: 'Include .main() entry point in generated commonjs module' + }, + moduleMain: { + full: 'module-main', + abbr: 'y', + metavar: 'NAME', + help: 'The main module function definition' + }, + version: { + abbr: 'V', + flag: true, + help: 'print version and exit', + callback: function () { + return version; + } + } + }).parse(); + + return opts; +} + +var cli = module.exports; + +cli.main = function cliMain(opts) { + 'use strict'; + + opts = RegExpLexer.mkStdOptions(opts); + + function isDirectory(fp) { + try { + return fs.lstatSync(fp).isDirectory(); + } catch (e) { + return false; + } + } + + function mkdirp(fp) { + if (!fp || fp === '.' || fp.length === 0) { + return false; + } + try { + fs.mkdirSync(fp); + return true; + } catch (e) { + if (e.code === 'ENOENT') { + var parent = path.dirname(fp); + // Did we hit the root directory by now? If so, abort! + // Else, create the parent; iff that fails, we fail too... + if (parent !== fp && mkdirp(parent)) { + try { + // Retry creating the original directory: it should succeed now + fs.mkdirSync(fp); + return true; + } catch (e) { + return false; + } + } + } + } + return false; + } + + function processInputFile() { + // getting raw files + var original_cwd = process.cwd(); + + var raw = fs.readFileSync(path.normalize(opts.file), 'utf8'); + + // making best guess at json mode + opts.json = path.extname(opts.file) === '.json' || opts.json; + + // When only the directory part of the output path was specified, then we + // do NOT have the target module name in there as well! + var outpath = opts.outfile; + if (/[\\\/]$/.test(outpath) || isDirectory(outpath)) { + opts.outfile = null; + outpath = outpath.replace(/[\\\/]$/, ''); + } + if (outpath && outpath.length > 0) { + outpath += '/'; + } else { + outpath = ''; + } + + // setting output file name and module name based on input file name + // if they aren't specified. + var name = path.basename(opts.outfile || opts.file); + + // get the base name (i.e. the file name without extension) + // i.e. 
strip off only the extension and keep any other dots in the filename + name = path.basename(name, path.extname(name)); + + opts.outfile = opts.outfile || (outpath + name + '.js'); + if (!opts.moduleName && name) { + opts.moduleName = opts.defaultModuleName = name.replace(/-\w/g, + function (match) { + return match.charAt(1).toUpperCase(); + }); + } + + // Change CWD to the directory where the source grammar resides: this helps us properly + // %include any files mentioned in the grammar with relative paths: + var new_cwd = path.dirname(path.normalize(opts.file)); + process.chdir(new_cwd); + + var lexer = cli.generateLexerString(raw, opts); + + // and change back to the CWD we started out with: + process.chdir(original_cwd); + + mkdirp(path.dirname(opts.outfile)); + fs.writeFileSync(opts.outfile, lexer); + console.log('JISON-LEX output for module [' + opts.moduleName + '] has been written to file:', opts.outfile); + } + + function readin(cb) { + var stdin = process.openStdin(), + data = ''; + + stdin.setEncoding('utf8'); + stdin.addListener('data', function (chunk) { + data += chunk; + }); + stdin.addListener('end', function () { + cb(data); + }); + } + + function processStdin() { + readin(function processStdinReadInCallback(raw) { + console.log(cli.generateLexerString(raw, opts)); + }); + } + + // if an input file wasn't given, assume input on stdin + if (opts.file) { + processInputFile(); + } else { + processStdin(); + } +}; + +cli.generateLexerString = function generateLexerString(lexerSpec, opts) { + 'use strict'; + + // var settings = RegExpLexer.mkStdOptions(opts); + var predefined_tokens = null; + + return RegExpLexer.generate(lexerSpec, predefined_tokens, opts); +}; + + +if (require.main === module) { + var opts = getCommandlineOptions(); + cli.main(opts); +} diff --git a/dist/cli-es6.js b/dist/cli-es6.js new file mode 100644 index 0000000..28527e4 --- /dev/null +++ b/dist/cli-es6.js @@ -0,0 +1,4278 @@ +#!/usr/bin/env node + + +import fs from 'fs'; +import path from 'path'; +import nomnom from '@gerhobbelt/nomnom'; +import XRegExp from '@gerhobbelt/xregexp'; +import json5 from '@gerhobbelt/json5'; +import lexParser from '@gerhobbelt/lex-parser'; +import assert from 'assert'; +import helpers from 'jison-helpers-lib'; + +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +const XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. 
`{Number}` +const CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; +const SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + +const UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + +// The expanded regex sets which are equivalent to the given `\\{c}` escapes: +// +// `/\s/`: +const WHITESPACE_SETSTR$1 = ' \f\n\r\t\v\u00a0\u1680\u180e\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff'; +// `/\d/`: +const DIGIT_SETSTR$1 = '0-9'; +// `/\w/`: +const WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + + + + + +// Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex +function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: // '[' + return '\\['; + + case 92: // '\\' + return '\\\\'; + + case 93: // ']' + return '\\]'; + + case 94: // ']' + return '\\^'; + } + if (i < 32 + || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || (i >= 0xD800 && i <= 0xDFFF) /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go... + x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); +} + + +// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating +// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a +// `\\p{NAME}` shorthand to represent [part of] the bitarray: +var Pcodes_bitarray_cache = {}; +var Pcodes_bitarray_cache_test_order = []; + +// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by +// a single regex 'escape', e.g. `\d` for digits 0-9. +var EscCode_bitarray_output_refs; + +// now initialize the EscCodes_... 
table above: +init_EscCode_lookup_table(); + +function init_EscCode_lookup_table() { + var s, bitarr, set2esc = {}, esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); +} + +function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. + // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as than this pcode + // won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... 
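+    //
+    // (Illustrative note, added for exposition -- not in the original source: right after
+    // `init_EscCode_lookup_table()` only the six escapes \s \S \d \D \w \W are cached, and
+    // every codepoint then belongs to exactly three of them, one from each complementary
+    // pair -- e.g. 'a' sits in \S, \D and \w. Hence the unique-marker scan below finds
+    // nothing at that point and the minimum-span fallback ends up choosing all the markers.)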
+ var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; +} + + + + + + +// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. +function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. \\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for characters set. Inversion means the complement set of character is st instead. + // This results in an OR operations when sets are joined/chained. 
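+        // (Worked example, added for exposition -- not in the original source: for input
+        // 'a-f\d' the loop below marks slots 0x61..0x66 through the range logic and merges
+        // the cached `\d` bitarray for 0x30..0x39; for an inverted set such as '^a-f' the
+        // marks land in a scratch array first and are flipped into `bitarr_orig` once the
+        // scan completes.)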
+
+        while (s.length) {
+            c1 = s.match(CHR_RE$1);
+            if (!c1) {
+                // hit an illegal escape sequence? cope anyway!
+                c1 = s[0];
+            } else {
+                c1 = c1[0];
+                // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those
+                // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit
+                // XRegExp support, but alas, we'll get there when we get there... ;-)
+                switch (c1) {
+                case '\\p':
+                    s = s.substr(c1.length);
+                    c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1);
+                    if (c2) {
+                        c2 = c2[0];
+                        s = s.substr(c2.length);
+                        // do we have this one cached already?
+                        var pex = c1 + c2;
+                        var ba4p = Pcodes_bitarray_cache[pex];
+                        if (!ba4p) {
+                            // expand escape:
+                            var xr = new XRegExp('[' + pex + ']');          // TODO: case-insensitive grammar???
+                            // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`:
+                            var xs = '' + xr;
+                            // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside:
+                            xs = xs.substr(1, xs.length - 2);
+
+                            ba4p = reduceRegexToSetBitArray(xs, pex, opts);
+
+                            Pcodes_bitarray_cache[pex] = ba4p;
+                            updatePcodesBitarrayCacheTestOrder(opts);
+                        }
+                        // merge bitarrays:
+                        add2bitarray(bitarr, ba4p);
+                        continue;
+                    }
+                    break;
+
+                case '\\S':
+                case '\\s':
+                case '\\W':
+                case '\\w':
+                case '\\d':
+                case '\\D':
+                    // these can't participate in a range, but need to be treated specially:
+                    s = s.substr(c1.length);
+                    // check for \S, \s, \D, \d, \W, \w and expand them:
+                    var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]];
+                    assert(ba4e);
+                    add2bitarray(bitarr, ba4e);
+                    continue;
+
+                case '\\b':
+                    // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace
+                    c1 = '\u0008';
+                    break;
+                }
+            }
+            var v1 = eval_escaped_code(c1);
+            // propagate deferred exceptions = error reports.
+            if (v1 instanceof Error) {
+                return v1;
+            }
+            v1 = v1.charCodeAt(0);
+            s = s.substr(c1.length);
+
+            if (s[0] === '-' && s.length >= 2) {
+                // we can expect a range like 'a-z':
+                s = s.substr(1);
+                c2 = s.match(CHR_RE$1);
+                if (!c2) {
+                    // hit an illegal escape sequence? cope anyway!
+                    c2 = s[0];
+                } else {
+                    c2 = c2[0];
+                }
+                var v2 = eval_escaped_code(c2);
+                // propagate deferred exceptions = error reports.
+                if (v2 instanceof Error) {
+                    return v2;
+                }
+                v2 = v2.charCodeAt(0);
+                s = s.substr(c2.length);
+
+                // legal ranges go UP, not DOWN!
+                if (v1 <= v2) {
+                    mark(v1, v2);
+                } else {
+                    console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 });
+                    mark(v1);
+                    mark('-'.charCodeAt(0));
+                    mark(v2);
+                }
+                continue;
+            }
+            mark(v1);
+        }
+
+        // When we have marked all slots, '^' NEGATES the set, hence we flip all slots.
+        //
+        // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK
+        // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire
+        // range then.
+        if (set_is_inverted) {
+            for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) {
+                if (!bitarr[i]) {
+                    bitarr_orig[i] = true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
+
+// convert a simple bitarray back into a regex set `[...]` content:
+function bitarray2set(l, output_inverted_variant, output_minimized) {
+    // construct the inverse(?)
set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } + else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... 
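+        // (Worked example, added for exposition -- not in the original source: with slots
+        // 0x30..0x39 and 0x5F marked, the run scanner further below emits ['0', '-9', '_'],
+        // i.e. the set text '0-9_'; only when the final text exactly equals the canonical
+        // form of a cached escape -- e.g. '0-9A-Z_a-z' for \w -- does the `set2esc` lookup
+        // at the end shorten it.)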
+ cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } + else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + j = UNICODE_BASE_PLANE_MAX_CP$1 + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; +} + + + + + +// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; +// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. +function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE$1); + if (!c1) { + // cope with illegal escape sequences too! 
+            return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+        } else {
+            c1 = c1[0];
+        }
+        s = s.substr(c1.length);
+
+        switch (c1) {
+        case '[':
+            // this is starting a set within the regex: scan until end of set!
+            var set_content = [];
+            while (s.length) {
+                var inner = s.match(SET_PART_RE$1);
+                if (!inner) {
+                    inner = s.match(CHR_RE$1);
+                    if (!inner) {
+                        // cope with illegal escape sequences too!
+                        return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+                    } else {
+                        inner = inner[0];
+                    }
+                    if (inner === ']') break;
+                } else {
+                    inner = inner[0];
+                }
+                set_content.push(inner);
+                s = s.substr(inner.length);
+            }
+
+            // ensure that we hit the terminating ']':
+            var c2 = s.match(CHR_RE$1);
+            if (!c2) {
+                // cope with illegal escape sequences too!
+                return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"');
+            } else {
+                c2 = c2[0];
+            }
+            if (c2 !== ']') {
+                return new Error('regex set expression is broken in regex: ' + orig);
+            }
+            s = s.substr(c2.length);
+
+            var se = set_content.join('');
+            if (!internal_state) {
+                derr = set2bitarray(l, se, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                // a set is used like a single character in a longer literal phrase, hence input `[abc]word[def]` would produce output `[abc]`:
+                internal_state = 1;
+            }
+            break;
+
+        // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into
+        // something ready for use inside a regex set, e.g. `\\r\\n`.
+        //
+        // > Of course, we realize that converting more complex piped constructs this way
+        // > will produce something you might not expect, e.g. `A|WORD2` which
+        // > would end up as the set `[AW]` which is something else than the input
+        // > entirely.
+        // >
+        // > However, we can only depend on the user (grammar writer) to realize this and
+        // > prevent this from happening by not creating such oddities in the input grammar.
+        case '|':
+            // a|b --> [ab]
+            internal_state = 0;
+            break;
+
+        case '(':
+            // (a) --> a
+            //
+            // TODO - right now we treat this as 'too complex':
+
+            // Strip off some possible outer wrappers which we know how to remove.
+            // We don't worry about 'damaging' the regex as any too-complex regex will be caught
+            // in the validation check at the end; our 'strippers' here would not damage useful
+            // regexes anyway, and it's fine if they damage the unacceptable ones.
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');         // (?:...) -> ... and (...) -> ...
+            s = s.replace(/^\^?(.*?)\$?$/, '$1');               // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: once before, once after this)
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');         // (?:...) -> ... and (...) -> ...
+
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '.':
+        case '*':
+        case '+':
+        case '?':
+            // wildcard
+            //
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '{':                        // range, e.g. `x{1,3}`, or macro?
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        default:
+            // literal character or word: take the first character only and ignore the rest, so that
+            // the constructed set for `word|noun` would be `[wn]`:
+            if (!internal_state) {
+                derr = set2bitarray(l, c1, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                internal_state = 2;
+            }
+            break;
+        }
+    }
+
+    s = bitarray2set(l);
+
+    // When this result is suitable for use in a set, then we should be able to compile
+    // it into a regex; that way we can easily validate whether macro X is fit to be used
+    // inside a regex set:
+    try {
+        var re;
+        assert(s);
+        assert(!(s instanceof Error));
+        re = new XRegExp('[' + s + ']');
+        re.test(s[0]);
+
+        // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`
+        // so we check for lingering UNESCAPED brackets in here as those cannot be valid:
+        if (/[^\\][\[\]]/.exec(s)) {
+            throw new Error('unescaped brackets in set data');
+        }
+    } catch (ex) {
+        // make sure we produce a set range expression which will fail badly when it is used
+        // in actual code:
+        s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+    }
+
+    assert(s);
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+    return l;
+}
+
+
+
+
+// Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]`
+// -- or in this example it can be further optimized to only `\d`!
+function produceOptimizedRegex4Set(bitarr) {
+    // First try to produce a minimum regex from the bitarray directly:
+    var s1 = bitarray2set(bitarr, false, true);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s1;
+    } else {
+        s1 = '[' + s1 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s2 = bitarray2set(bitarr, true, true);
+
+    if (s2[0] === '^') {
+        s2 = s2.substr(1);
+        if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s2;
+        }
+    } else {
+        s2 = '^' + s2;
+    }
+    s2 = '[' + s2 + ']';
+
+    // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end,
+    // we also check against the plain, unadulterated regex set expressions:
+    //
+    // First try to produce a minimum regex from the bitarray directly:
+    var s3 = bitarray2set(bitarr, false, false);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s3.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s3;
+    } else {
+        s3 = '[' + s3 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
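+    // (Editorial note, added for exposition -- not in the original source: at this point
+    // s1 holds the minimized direct form and s2 the minimized negated form; s3 above and
+    // s4 below are the non-minimized counterparts, kept because a pcode expansion can
+    // occasionally be *longer* than the plain set text. For a bitarray covering exactly
+    // 0x30..0x39 we would never get here: the s1 pass already returned '\d' on the spot.)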
+ var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; +} + + + + + + +var setmgmt = { + XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1, + CHR_RE: CHR_RE$1, + SET_PART_RE: SET_PART_RE$1, + NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1, + SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1, + + WHITESPACE_SETSTR: WHITESPACE_SETSTR$1, + DIGIT_SETSTR: DIGIT_SETSTR$1, + WORDCHAR_SETSTR: WORDCHAR_SETSTR$1, + + set2bitarray, + bitarray2set, + produceOptimizedRegex4Set, + reduceRegexToSetBitArray, +}; + +// Basic Lexer implemented using JavaScript regular expressions +// Zachary Carter +// MIT Licensed + +var rmCommonWS = helpers.rmCommonWS; +var camelCase = helpers.camelCase; +var code_exec = helpers.exec; +// import recast from '@gerhobbelt/recast'; +// import astUtils from '@gerhobbelt/ast-util'; +var version$1 = '0.6.1-205'; // require('./package.json').version; + + + + +const XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +const CHR_RE = setmgmt.CHR_RE; +const SET_PART_RE = setmgmt.SET_PART_RE; +const NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; +const UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + +// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) +// +// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! +const ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + + + + +// see also ./lib/cli.js +/** +@public +@nocollapse +*/ +const defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. start+end indexes in the input string + trackPosition: true, // track line+column position in the input string + caseInsensitive: false, + showSource: false, + exportSourceCode: false, + exportAST: false, + prettyCfg: true, + pre_lex: undefined, + post_lex: undefined, +}; + + +// Merge sets of options. +// +// Convert alternative jison option names to their base option. +// +// The *last* option set which overrides the default wins, where 'override' is +// defined as specifying a not-undefined value which is not equal to the +// default value. +// +// When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the +// default values avialable in Jison.defaultJisonOptions. +// +// Return a fresh set of options. 
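+//
+// Illustrative usage (editorial sketch, not part of the original source):
+//
+//     mkStdOptions({ 'case-insensitive': true, main: true });
+//     // --> all defaults, plus: caseInsensitive: true, noMain: false
+//
+// i.e. incoming option names are camelCased and the CLI-style `main` flag is
+// folded into its internal `noMain` counterpart before the merge.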
+/** @public */ +function mkStdOptions(/*...args*/) { + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; +} + +// set up export/output attributes of the `options` object instance +function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || typeof exportSourceCode !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; +} + +// Autodetect if the input lexer spec is in JSON or JISON +// format when the `options.json` flag is `true`. +// +// Produce the JSON lexer spec result when these are JSON formatted already as that +// would save us the trouble of doing this again, anywhere else in the JISON +// compiler/generator. +// +// Otherwise return the *parsed* lexer spec as it has +// been processed through LexParser. +function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! 
Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} + + +// expand macros and convert matchers to RegExp's +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, i, k, rule, action, conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. 
+ // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count, + }; +} + + + + + + + +// expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or +// elsewhere, which requires two different treatments to expand these macros. +function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. 
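+            // (Editorial example, not in the original source: both a leftover macro
+            // reference like `{DIGIT}` and an XRegExp escape like `\p{Number}` still
+            // contain '{' at this point, so either one keeps us from preferring the raw
+            // set text `se` over the optimized expansion `s1` computed above.)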
+ var has_expansions = (se.indexOf('{') >= 0); + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. + c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; +} + + +// expand macros within macros and cache the result +function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. 
+                    //
+                    // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}`
+                    // macros here:
+                    if (XRegExp._getUnicodeProperty(k)) {
+                        // Work-around so that you can use `\p{ascii}` for a XRegExp slug, a.k.a.
+                        // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories,
+                        // while using `\p{ASCII}` as a *macro expansion* of the `ASCII`
+                        // macro:
+                        if (k.toUpperCase() !== k) {
+                            m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.');
+                            break;
+                        }
+                    }
+
+                    a = m.split('{' + k + '}');
+                    if (a.length > 1) {
+                        var x = expandMacroInSet(k);
+                        assert(x);
+                        if (x instanceof Error) {
+                            m = x;
+                            break;
+                        }
+                        m = a.join(x);
+                    }
+                }
+            }
+        }
+
+        var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts);
+
+        var s1;
+
+        // propagate deferred exceptions = error reports.
+        if (mba instanceof Error) {
+            s1 = mba;
+        } else {
+            s1 = setmgmt.bitarray2set(mba, false);
+
+            m = s1;
+        }
+
+        macros[i] = {
+            in_set: s1,
+            elsewhere: null,
+            raw: dict_macros[i]
+        };
+    } else {
+        m = macros[i].in_set;
+
+        if (m instanceof Error) {
+            // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+            return new Error(m.message);
+        }
+
+        // detect definition loop:
+        if (m === false) {
+            return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+        }
+    }
+
+    return m;
+}
+
+function expandMacroElsewhere(i) {
+    var k, a, m;
+
+    if (macros[i].elsewhere == null) {
+        m = dict_macros[i];
+
+        // set up our own record so we can detect definition loops:
+        macros[i].elsewhere = false;
+
+        // the macro MAY contain other macros which MAY be inside a `[...]` set in this
+        // macro or elsewhere, hence we must parse the regex:
+        m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+        // propagate deferred exceptions = error reports.
+        if (m instanceof Error) {
+            return m;
+        }
+
+        macros[i].elsewhere = m;
+    } else {
+        m = macros[i].elsewhere;
+
+        if (m instanceof Error) {
+            // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+            return m;
+        }
+
+        // detect definition loop:
+        if (m === false) {
+            return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+        }
+    }
+
+    return m;
+}
+
+function expandAllMacrosInSet(s) {
+    var i, x;
+
+    // process *all* the macros inside [...] set:
+    if (s.indexOf('{') >= 0) {
+        for (i in macros) {
+            if (macros.hasOwnProperty(i)) {
+                var a = s.split('{' + i + '}');
+                if (a.length > 1) {
+                    x = expandMacroInSet(i);
+                    assert(x);
+                    if (x instanceof Error) {
+                        return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message);
+                    }
+                    s = a.join(x);
+                }
+
+                // stop the brute-force expansion attempt when we've done 'em all:
+                if (s.indexOf('{') === -1) {
+                    break;
+                }
+            }
+        }
+    }
+
+    return s;
+}
+
+function expandAllMacrosElsewhere(s) {
+    var i, x;
+
+    // When we process the remaining macro occurrences in the regex
+    // every macro used in a lexer rule will become its own capture group.
+ // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + // These are all submacro expansions, hence non-capturing grouping is applied: + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroElsewhere(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message); + } + s = a.join('(?:' + x + ')'); + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + + var m, i; + + if (opts.debug) console.log('\n############## RAW macros: ', dict_macros); + + // first we create the part of the dictionary which is targeting the use of macros + // *inside* `[...]` sets; once we have completed that half of the expansions work, + // we then go and expand the macros for when they are used elsewhere in a regex: + // iff we encounter submacros then which are used *inside* a set, we can use that + // first half dictionary to speed things up a bit as we can use those expansions + // straight away! + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroInSet(i); + } + } + + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroElsewhere(i); + } + } + + if (opts.debug) console.log('\n############### expanded macros: ', macros); + + return macros; +} + + + +// expand macros in a regex; expands them recursively +function expandMacros(src, macros, opts) { + var expansion_count = 0; + + // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already! + // Hence things should be easy in there: + + function expandAllMacrosInSet(s) { + var i, m, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = m.in_set; + + assert(x); + if (x instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + throw x; + } + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join(x); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, m, x; + + // When we process the main macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! 
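+        // (Worked example, added for exposition -- not in the original source: given the
+        // macro DIGIT defined as [0-9], a rule regex `{DIGIT}+` is split on '{DIGIT}' and
+        // re-joined below as `([0-9])+` -- a CAPTURING group -- whereas any submacros
+        // inside DIGIT's cached expansion were already folded in as `(?:...)`
+        // non-capturing groups.)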
+ if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + // These are all main macro expansions, hence CAPTURING grouping is applied: + x = m.elsewhere; + assert(x); + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join('(' + x + ')'); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + + // When we process the macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will have expanded any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (s2 instanceof Error) { + throw s2; + } + + // only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex() + // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on his own, + // as long as no macros are involved... + // + // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`, + // unless the `xregexp` output option has been enabled. + if (expansion_count > 0 || (src.indexOf('\\p{') >= 0 && !opts.options.xregexp)) { + src = s2; + } else { + // Check if the reduced regex is smaller in size; when it is, we still go with the new one! + if (s2.length < src.length) { + src = s2; + } + } + + return src; +} + +function prepareStartConditions(conditions) { + var sc, + hash = {}; + for (sc in conditions) { + if (conditions.hasOwnProperty(sc)) { + hash[sc] = {rules:[], inclusive: !conditions[sc]}; + } + } + return hash; +} + +function buildActions(dict, tokens, opts) { + var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;']; + var tok; + var toks = {}; + var caseHelper = []; + + // tokens: map/array of token numbers to token names + for (tok in tokens) { + var idx = parseInt(tok); + if (idx && idx > 0) { + toks[tokens[tok]] = idx; + } + } + + if (opts.options.flex && dict.rules) { + dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule! 
*/']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: `function lexer__performAction(yy, yyrulenumber, YY_START) { + var yy_ = this; + + ${fun} + }`, + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count, + }; +} + +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + +var prelude = `/** + * See also: + * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508 + * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility + * with userland code which might access the derived class in a 'classic' way. + * + * @public + * @constructor + * @nocollapse + */ +function JisonLexerError(msg, hash) { + Object.defineProperty(this, 'name', { + enumerable: false, + writable: false, + value: 'JisonLexerError' + }); + + if (msg == null) msg = '???'; + + Object.defineProperty(this, 'message', { + enumerable: false, + writable: true, + value: msg + }); + + this.hash = hash; + + var stacktrace; + if (hash && hash.exception instanceof Error) { + var ex2 = hash.exception; + this.message = ex2.message || msg; + stacktrace = ex2.stack; + } + if (!stacktrace) { + if (Error.hasOwnProperty('captureStackTrace')) { // V8 + Error.captureStackTrace(this, this.constructor); + } else { + stacktrace = (new Error(msg)).stack; + } + } + if (stacktrace) { + Object.defineProperty(this, 'stack', { + enumerable: false, + writable: false, + value: stacktrace + }); + } +} + +if (typeof Object.setPrototypeOf === 'function') { + Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype); +} else { + JisonLexerError.prototype = Object.create(Error.prototype); +} +JisonLexerError.prototype.constructor = JisonLexerError; +JisonLexerError.prototype.name = 'JisonLexerError';`; + + // --- END lexer error class --- + + return prelude; +} + + +const jisonLexerErrorDefinition = generateErrorClass(); + + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS` + var __hacky_counter__ = 0; + + /** + * @constructor + * @nocollapse + */ + function XRegExp(re, f) { + this.re = re; + this.flags = f; + this._getUnicodeProperty = function (k) {}; + var fake = /./; // WARNING: this exact 'fake' is also depended upon by the xregexp unit test! + __hacky_counter__++; + fake.__hacky_backy__ = __hacky_counter__; + return fake; + } + `; +} + + + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. 
+ // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... }; + // ``` + var testcode = [ + '// provide a local version for test purposes:', + jisonLexerErrorDefinition, + '', + generateFakeXRegExpClassSrcCode(), + '', + source, + '', + 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (typeof lexer.options !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, ((dict.rules && dict.rules.length > 0) ? + 'One or more of your lexer state names are possibly botched?' 
: + 'Your custom lexer is somehow botched.'), ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = (dict.rules ? dict.rules.length : 0); i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; +} + +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) +// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. 
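+// (Sanity check on that last number: (950 - 730) / 730 ~= 0.30, i.e. the
+// stripped-down lexer completes roughly 30% more lex() rounds per unit of
+// time on this particular test grammar.)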
+// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- +return `{ + EOF: 1, + ERROR: 2, + + // JisonLexerError: JisonLexerError, /// <-- injected by the code generator + + // options: {}, /// <-- injected by the code generator + + // yy: ..., /// <-- injected by setInput() + + __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state + + __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup + + __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been 'unfolded' completely and is now ready for use + + done: false, /// INTERNAL USE ONLY + _backtrack: false, /// INTERNAL USE ONLY + _input: '', /// INTERNAL USE ONLY + _more: false, /// INTERNAL USE ONLY + _signaled_error_token: false, /// INTERNAL USE ONLY + + conditionStack: [], /// INTERNAL USE ONLY; managed via \`pushState()\`, \`popState()\`, \`topState()\` and \`stateStackSize()\` + + match: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. \`match\` is identical to \`yytext\` except that this one still contains the matched input string after \`lexer.performAction()\` has been invoked, where userland code MAY have changed/replaced the \`yytext\` value entirely! + matched: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far + matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt + yytext: '', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the 'token value' when the parser consumes the lexer token produced through a call to the \`lex()\` API. + offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the 'cursor position' in the input string, i.e. the number of characters matched so far + yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (\`yytext\`) + yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: 'line number' at which the token under construction is located + yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction + + /** + * INTERNAL USE: construct a suitable error info hash object instance for \`parseError\`. 
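+     *
+     * Example (a sketch of the usual call pattern, as also employed by the
+     * \`yyerror()\` and \`reject()\` implementations further below):
+     *
+     *     var pei = this.constructLexErrorInfo('Lexical error: ...', false);
+     *     return (this.parseError(pei.errStr, pei, this.JisonLexerError) || this.ERROR);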
+ * + * @public + * @this {RegExpLexer} + */ + constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) { + msg = '' + msg; + + // heuristic to determine if the error message already contains a (partial) source code dump + // as produced by either \`showPosition()\` or \`prettyPrintRange()\`: + if (show_input_position == undefined) { + show_input_position = !(msg.indexOf('\\n') > 0 && msg.indexOf('^') > 0); + } + if (this.yylloc && show_input_position) { + if (typeof this.prettyPrintRange === 'function') { + var pretty_src = this.prettyPrintRange(this.yylloc); + + if (!/\\n\\s*$/.test(msg)) { + msg += '\\n'; + } + msg += '\\n Erroneous area:\\n' + this.prettyPrintRange(this.yylloc); + } else if (typeof this.showPosition === 'function') { + var pos_str = this.showPosition(); + if (pos_str) { + if (msg.length && msg[msg.length - 1] !== '\\n' && pos_str[0] !== '\\n') { + msg += '\\n' + pos_str; + } else { + msg += pos_str; + } + } + } + } + /** @constructor */ + var pei = { + errStr: msg, + recoverable: !!recoverable, + text: this.match, // This one MAY be empty; userland code should use the \`upcomingInput\` API to obtain more text which follows the 'lexer cursor position'... + token: null, + line: this.yylineno, + loc: this.yylloc, + yy: this.yy, + lexer: this, + + /** + * and make sure the error info doesn't stay due to potential + * ref cycle via userland code manipulations. + * These would otherwise all be memory leak opportunities! + * + * Note that only array and object references are nuked as those + * constitute the set of elements which can produce a cyclic ref. + * The rest of the members is kept intact as they are harmless. + * + * @public + * @this {LexErrorInfo} + */ + destroy: function destructLexErrorInfo() { + // remove cyclic references added to error info: + // info.yy = null; + // info.lexer = null; + // ... + var rec = !!this.recoverable; + for (var key in this) { + if (this.hasOwnProperty(key) && typeof key === 'object') { + this[key] = undefined; + } + } + this.recoverable = rec; + } + }; + // track this instance so we can \`destroy()\` it once we deem it superfluous and ready for garbage collection! + this.__error_infos.push(pei); + return pei; + }, + + /** + * handler which is invoked when a lexer error occurs. + * + * @public + * @this {RegExpLexer} + */ + parseError: function lexer_parseError(str, hash, ExceptionClass) { + if (!ExceptionClass) { + ExceptionClass = this.JisonLexerError; + } + if (this.yy) { + if (this.yy.parser && typeof this.yy.parser.parseError === 'function') { + return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } else if (typeof this.yy.parseError === 'function') { + return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } + } + throw new ExceptionClass(str, hash); + }, + + /** + * method which implements \`yyerror(str, ...args)\` functionality for use inside lexer actions. 
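+     *
+     * Example (a sketch; the extra argument ends up in the produced error
+     * info hash as the \`extra_error_attributes\` array):
+     *
+     *     // inside some lexer rule's action code:
+     *     yyerror('unsupported escape sequence', yytext);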
+ * + * @public + * @this {RegExpLexer} + */ + yyerror: function yyError(str /*, ...args */) { + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': ' + str, this.options.lexerErrorsAreRecoverable); + + // Add any extra args to the hash under the name \`extra_error_attributes\`: + var args = Array.prototype.slice.call(arguments, 1); + if (args.length) { + p.extra_error_attributes = args; + } + + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + }, + + /** + * final cleanup function for when we have completed lexing the input; + * make it an API so that external code can use this one once userland + * code has decided it's time to destroy any lingering lexer error + * hash object instances and the like: this function helps to clean + * up these constructs, which *may* carry cyclic references which would + * otherwise prevent the instances from being properly and timely + * garbage-collected, i.e. this function helps prevent memory leaks! + * + * @public + * @this {RegExpLexer} + */ + cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) { + // prevent lingering circular references from causing memory leaks: + this.setInput('', {}); + + // nuke the error hash info instances created during this run. + // Userland code must COPY any data/references + // in the error hash instance(s) it is more permanently interested in. + if (!do_not_nuke_errorinfos) { + for (var i = this.__error_infos.length - 1; i >= 0; i--) { + var el = this.__error_infos[i]; + if (el && typeof el.destroy === 'function') { + el.destroy(); + } + } + this.__error_infos.length = 0; + } + + return this; + }, + + /** + * clear the lexer token context; intended for internal use only + * + * @public + * @this {RegExpLexer} + */ + clear: function lexer_clear() { + this.yytext = ''; + this.yyleng = 0; + this.match = ''; + // - DO NOT reset \`this.matched\` + this.matches = false; + this._more = false; + this._backtrack = false; + + var col = (this.yylloc ? this.yylloc.last_column : 0); + this.yylloc = { + first_line: this.yylineno + 1, + first_column: col, + last_line: this.yylineno + 1, + last_column: col, + + range: [this.offset, this.offset] + }; + }, + + /** + * resets the lexer, sets new input + * + * @public + * @this {RegExpLexer} + */ + setInput: function lexer_setInput(input, yy) { + this.yy = yy || this.yy || {}; + + // also check if we've fully initialized the lexer instance, + // including expansion work to be done to go from a loaded + // lexer to a usable lexer: + if (!this.__decompressed) { + // step 1: decompress the regex list: + var rules = this.rules; + for (var i = 0, len = rules.length; i < len; i++) { + var rule_re = rules[i]; + + // compression: is the RE an xref to another RE slot in the rules[] table? + if (typeof rule_re === 'number') { + rules[i] = rules[rule_re]; + } + } + + // step 2: unfold the conditions[] set to make these ready for use: + var conditions = this.conditions; + for (var k in conditions) { + var spec = conditions[k]; + + var rule_ids = spec.rules; + + var len = rule_ids.length; + var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in \`lexer_next()\` fast and simple! 
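+                // E.g. (a sketch): a condition whose spec.rules list is [4, 7] decompresses to
+                //
+                //      spec.rules          = [undefined, 4, 7]                    // 1-based rule ids
+                //      spec.__rule_regexes = [undefined, rules[4], rules[7]]
+                //      spec.__rule_count   = 2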
+                var rule_new_ids = new Array(len + 1);
+
+                for (var i = 0; i < len; i++) {
+                    var idx = rule_ids[i];
+                    var rule_re = rules[idx];
+                    rule_regexes[i + 1] = rule_re;
+                    rule_new_ids[i + 1] = idx;
+                }
+
+                spec.rules = rule_new_ids;
+                spec.__rule_regexes = rule_regexes;
+                spec.__rule_count = len;
+            }
+
+            this.__decompressed = true;
+        }
+
+        this._input = input || '';
+        this.clear();
+        this._signaled_error_token = false;
+        this.done = false;
+        this.yylineno = 0;
+        this.matched = '';
+        this.conditionStack = ['INITIAL'];
+        this.__currentRuleSet__ = null;
+        this.yylloc = {
+            first_line: 1,
+            first_column: 0,
+            last_line: 1,
+            last_column: 0,
+
+            range: [0, 0]
+        };
+        this.offset = 0;
+        return this;
+    },
+
+    /**
+     * edit the remaining input via user-specified callback.
+     * This can be used to forward-adjust the input-to-parse,
+     * e.g. inserting macro expansions and the like in the
+     * input which has yet to be lexed.
+     * The behaviour of this API contrasts with the \`unput()\` et al.
+     * APIs, as those act on the *consumed* input, while this
+     * one allows one to manipulate the future, without impacting
+     * the current \`yyloc\` cursor location or any history.
+     *
+     * Use this API to help implement C-preprocessor-like
+     * \`#include\` statements, etc.
+     *
+     * The provided callback must be synchronous and is
+     * expected to return the edited input (string).
+     *
+     * The \`cpsArg\` argument value is passed to the callback
+     * as-is.
+     *
+     * \`callback\` interface:
+     * \`function callback(input, cpsArg)\`
+     *
+     * - \`input\` will carry the remaining-input-to-lex string
+     *   from the lexer.
+     * - \`cpsArg\` is the \`cpsArg\` value passed into this API.
+     *
+     * The \`this\` reference for the callback will be set to
+     * reference this lexer instance so that userland code
+     * in the callback can easily and quickly access any lexer
+     * API.
+     *
+     * When the callback returns a non-string-type falsey value,
+     * we assume the callback did not edit the input and we
+     * will use the input as-is.
+     *
+     * When the callback returns a non-string-type truthy value, it
+     * is converted to a string for lexing via the \`"" + retval\`
+     * operation. (See also why: http://2ality.com/2012/03/converting-to-string.html
+     * -- that way any returned object's \`valueOf()\` and \`toString()\`
+     * methods will be invoked in a proper/desirable order.)
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) {
+        var rv = callback.call(this, this._input, cpsArg);
+        if (typeof rv !== 'string') {
+            if (rv) {
+                this._input = '' + rv;
+            }
+            // else: keep \`this._input\` as is.
+        } else {
+            this._input = rv;
+        }
+        return this;
+    },
+
+    /**
+     * consumes and returns one char from the input
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    input: function lexer_input() {
+        if (!this._input) {
+            //this.done = true;    -- don't set \`done\` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <<EOF>> tokens and perform user action code for a <<EOF>> match, but only does so *once*)
+            return null;
+        }
+        var ch = this._input[0];
+        this.yytext += ch;
+        this.yyleng++;
+        this.offset++;
+        this.match += ch;
+        this.matched += ch;
+        // Count the linenumber up when we hit the LF (or a stand-alone CR).
+        // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo
+        // and we advance immediately past the LF as well, returning both together as if
+        // it was all a single 'character' only.
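+        // E.g. (a sketch): for remaining input '\\r\\nfoo', input() returns the
+        // two-character string '\\r\\n' as one 'character', increments \`yylineno\`
+        // once and advances the \`offset\` by 2.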
+ var slice_len = 1; + var lines = false; + if (ch === '\\n') { + lines = true; + } else if (ch === '\\r') { + lines = true; + var ch2 = this._input[1]; + if (ch2 === '\\n') { + slice_len++; + ch += ch2; + this.yytext += ch2; + this.yyleng++; + this.offset++; + this.match += ch2; + this.matched += ch2; + this.yylloc.range[1]++; + } + } + if (lines) { + this.yylineno++; + this.yylloc.last_line++; + this.yylloc.last_column = 0; + } else { + this.yylloc.last_column++; + } + this.yylloc.range[1]++; + + this._input = this._input.slice(slice_len); + return ch; + }, + + /** + * unshifts one char (or an entire string) into the input + * + * @public + * @this {RegExpLexer} + */ + unput: function lexer_unput(ch) { + var len = ch.length; + var lines = ch.split(/(?:\\r\\n?|\\n)/g); + + this._input = ch + this._input; + this.yytext = this.yytext.substr(0, this.yytext.length - len); + this.yyleng = this.yytext.length; + this.offset -= len; + this.match = this.match.substr(0, this.match.length - len); + this.matched = this.matched.substr(0, this.matched.length - len); + + if (lines.length > 1) { + this.yylineno -= lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + + // Get last entirely matched line into the \`pre_lines[]\` array's + // last index slot; we don't mind when other previously + // matched lines end up in the array too. + var pre = this.match; + var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + if (pre_lines.length === 1) { + pre = this.matched; + pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + } + this.yylloc.last_column = pre_lines[pre_lines.length - 1].length; + } else { + this.yylloc.last_column -= len; + } + + this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng; + + this.done = false; + return this; + }, + + /** + * cache matched text and append it on next action + * + * @public + * @this {RegExpLexer} + */ + more: function lexer_more() { + this._more = true; + return this; + }, + + /** + * signal the lexer that this rule fails to match the input, so the + * next matching rule (regex) should be tested instead. + * + * @public + * @this {RegExpLexer} + */ + reject: function lexer_reject() { + if (this.options.backtrack_lexer) { + this._backtrack = true; + } else { + // when the \`parseError()\` call returns, we MUST ensure that the error is registered. + // We accomplish this by signaling an 'error' token to be produced for the current + // \`.lex()\` run. + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).', false); + this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + return this; + }, + + /** + * retain first n characters of the match + * + * @public + * @this {RegExpLexer} + */ + less: function lexer_less(n) { + return this.unput(this.match.slice(n)); + }, + + /** + * return (part of the) already matched input, i.e. for error + * messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of + * input (default: 1). + * + * Negative limit values equal *unlimited*. 
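+     *
+     * Example (a sketch):
+     *
+     *     lexer.pastInput(10, 1);    // -> at most 10 chars of the last lexed line,
+     *                                //    '...'-prefixed when it had to be clipped
+     *     lexer.pastInput(-1, -1);   // -> everything lexed so far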
+ * + * @public + * @this {RegExpLexer} + */ + pastInput: function lexer_pastInput(maxSize, maxLines) { + var past = this.matched.substring(0, this.matched.length - this.match.length); + if (maxSize < 0) + maxSize = past.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = past.length; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substr\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + past = past.substr(-maxSize * 2 - 2); + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = past.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(-maxLines); + past = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis prefix... + if (past.length > maxSize) { + past = '...' + past.substr(-maxSize); + } + return past; + }, + + /** + * return (part of the) upcoming input, i.e. for error messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of input (default: 1). + * + * Negative limit values equal *unlimited*. + * + * > ### NOTE ### + * > + * > *"upcoming input"* is defined as the whole of the both + * > the *currently lexed* input, together with any remaining input + * > following that. *"currently lexed"* input is the input + * > already recognized by the lexer but not yet returned with + * > the lexer token. This happens when you are invoking this API + * > from inside any lexer rule action code block. + * > + * + * @public + * @this {RegExpLexer} + */ + upcomingInput: function lexer_upcomingInput(maxSize, maxLines) { + var next = this.match; + if (maxSize < 0) + maxSize = next.length + this._input.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = maxSize; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substring\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + if (next.length < maxSize * 2 + 2) { + next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8 + } + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = next.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(0, maxLines); + next = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis postfix... + if (next.length > maxSize) { + next = next.substring(0, maxSize) + '...'; + } + return next; + }, + + /** + * return a string which displays the character position where the + * lexing error occurred, i.e. 
for error messages.
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {
+        var pre = this.pastInput(maxPrefix).replace(/\\s/g, ' ');
+        var c = new Array(pre.length + 1).join('-');
+        return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, ' ') + '\\n' + c + '^';
+    },
+
+    /**
+     * return a string which displays the lines & columns of input which are referenced
+     * by the given location info range, plus a few lines of context.
+     *
+     * This function pretty-prints the indicated section of the input, with line numbers
+     * and everything!
+     *
+     * This function is very useful to provide highly readable error reports, while
+     * the location range may be specified in various flexible ways:
+     *
+     * - \`loc\` is the location info object which references the area which should be
+     *   displayed and 'marked up': these lines & columns of text are marked up by \`^\`
+     *   characters below each character in the entire input range.
+     *
+     * - \`context_loc\` is the *optional* location info object which instructs this
+     *   pretty-printer how much *leading* context should be displayed alongside
+     *   the area referenced by \`loc\`. This can help provide context for the displayed
+     *   error, etc.
+     *
+     *   When this location info is not provided, a default context of 3 lines is
+     *   used.
+     *
+     * - \`context_loc2\` is another *optional* location info object, which serves
+     *   a similar purpose to \`context_loc\`: it specifies the amount of *trailing*
+     *   context lines to display in the pretty-print output.
+     *
+     *   When this location info is not provided, a default context of 1 line only is
+     *   used.
+     *
+     * Special Notes:
+     *
+     * - when the \`loc\`-indicated range is very large (about 5 lines or more), then
+     *   only the first and last few lines of this block are printed while a
+     *   \`...continued...\` message will be printed between them.
+     *
+     *   This serves the purpose of not printing a huge amount of text when the \`loc\`
+     *   range happens to be huge: this way a manageable & readable output results
+     *   for arbitrarily large ranges.
+     *
+     * - this function can display lines of input which have not yet been lexed.
+     *   \`prettyPrintRange()\` can access the entire input!
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {
+        var error_size = loc.last_line - loc.first_line;
+        const CONTEXT = 3;
+        const CONTEXT_TAIL = 1;
+        const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;
+        var input = this.matched + this._input;
+        var lines = input.split('\\n');
+        //var show_context = (error_size < 5 || context_loc);
+        var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));
+        var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));
+        var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);
+        var ws_prefix = new Array(lineno_display_width).join(' ');
+        var nonempty_line_indexes = [];
+        var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {
+            var lno = index + l0;
+            var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);
+            var rv = lno_pfx + ': ' + line;
+            var errpfx = (new Array(lineno_display_width + 1)).join('^');
+            var offset = 2 + 1;
+            var len = 0;
+
+            if (lno === loc.first_line) {
+                offset += loc.first_column;
+
+                len = Math.max(
+                    2,
+                    ((lno === loc.last_line ?
loc.last_column : line.length)) - loc.first_column + 1 + ); + } else if (lno === loc.last_line) { + len = Math.max(2, loc.last_column + 1); + } else if (lno > loc.first_line && lno < loc.last_line) { + len = Math.max(2, line.length + 1); + } + + if (len) { + var lead = new Array(offset).join('.'); + var mark = new Array(len).join('^'); + rv += '\\n' + errpfx + lead + mark; + + if (line.trim().length > 0) { + nonempty_line_indexes.push(index); + } + } + + rv = rv.replace(/\\t/g, ' '); + return rv; + }); + + // now make sure we don't print an overly large amount of error area: limit it + // to the top and bottom line count: + if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) { + var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1; + var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1; + + var intermediate_line = (new Array(lineno_display_width + 1)).join(' ') + ' (...continued...)'; + intermediate_line += '\\n' + (new Array(lineno_display_width + 1)).join('-') + ' (---------------)'; + rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line); + } + return rv.join('\\n'); + }, + + /** + * helper function, used to produce a human readable description as a string, given + * the input \`yylloc\` location object. + * + * Set \`display_range_too\` to TRUE to include the string character index position(s) + * in the description if the \`yylloc.range\` is available. + * + * @public + * @this {RegExpLexer} + */ + describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) { + var l1 = yylloc.first_line; + var l2 = yylloc.last_line; + var c1 = yylloc.first_column; + var c2 = yylloc.last_column; + var dl = l2 - l1; + var dc = c2 - c1; + var rv; + if (dl === 0) { + rv = 'line ' + l1 + ', '; + if (dc <= 1) { + rv += 'column ' + c1; + } else { + rv += 'columns ' + c1 + ' .. ' + c2; + } + } else { + rv = 'lines ' + l1 + '(column ' + c1 + ') .. ' + l2 + '(column ' + c2 + ')'; + } + if (yylloc.range && display_range_too) { + var r1 = yylloc.range[0]; + var r2 = yylloc.range[1] - 1; + if (r2 <= r1) { + rv += ' {String Offset: ' + r1 + '}'; + } else { + rv += ' {String Offset range: ' + r1 + ' .. ' + r2 + '}'; + } + } + return rv; + }, + + /** + * test the lexed token: return FALSE when not a match, otherwise return token. + * + * \`match\` is supposed to be an array coming out of a regex match, i.e. \`match[0]\` + * contains the actually matched text string. 
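+     *
+     * E.g. (a sketch): for remaining input 'foo42' and a rule regex /^[a-z]+/,
+     * \`match\` is the array produced by that regex's match() call, so that
+     * \`match[0] === 'foo'\` is the text which gets appended to \`yytext\` below.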
+ * + * Also move the input cursor forward and update the match collectors: + * + * - \`yytext\` + * - \`yyleng\` + * - \`match\` + * - \`matches\` + * - \`yylloc\` + * - \`offset\` + * + * @public + * @this {RegExpLexer} + */ + test_match: function lexer_test_match(match, indexed_rule) { + var token, + lines, + backup, + match_str, + match_str_len; + + if (this.options.backtrack_lexer) { + // save context + backup = { + yylineno: this.yylineno, + yylloc: { + first_line: this.yylloc.first_line, + last_line: this.yylloc.last_line, + first_column: this.yylloc.first_column, + last_column: this.yylloc.last_column, + + range: this.yylloc.range.slice(0) + }, + yytext: this.yytext, + match: this.match, + matches: this.matches, + matched: this.matched, + yyleng: this.yyleng, + offset: this.offset, + _more: this._more, + _input: this._input, + //_signaled_error_token: this._signaled_error_token, + yy: this.yy, + conditionStack: this.conditionStack.slice(0), + done: this.done + }; + } + + match_str = match[0]; + match_str_len = match_str.length; + // if (match_str.indexOf('\\n') !== -1 || match_str.indexOf('\\r') !== -1) { + lines = match_str.split(/(?:\\r\\n?|\\n)/g); + if (lines.length > 1) { + this.yylineno += lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + this.yylloc.last_column = lines[lines.length - 1].length; + } else { + this.yylloc.last_column += match_str_len; + } + // } + this.yytext += match_str; + this.match += match_str; + this.matched += match_str; + this.matches = match; + this.yyleng = this.yytext.length; + this.yylloc.range[1] += match_str_len; + + // previous lex rules MAY have invoked the \`more()\` API rather than producing a token: + // those rules will already have moved this \`offset\` forward matching their match lengths, + // hence we must only add our own match length now: + this.offset += match_str_len; + this._more = false; + this._backtrack = false; + this._input = this._input.slice(match_str_len); + + // calling this method: + // + // function lexer__performAction(yy, yyrulenumber, YY_START) {...} + token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */); + // otherwise, when the action codes are all simple return token statements: + //token = this.simpleCaseActionClusters[indexed_rule]; + + if (this.done && this._input) { + this.done = false; + } + if (token) { + return token; + } else if (this._backtrack) { + // recover context + for (var k in backup) { + this[k] = backup[k]; + } + this.__currentRuleSet__ = null; + return false; // rule action called reject() implying the next rule should be tested instead. + } else if (this._signaled_error_token) { + // produce one 'error' token as \`.parseError()\` in \`reject()\` + // did not guarantee a failure signal by throwing an exception! + token = this._signaled_error_token; + this._signaled_error_token = false; + return token; + } + return false; + }, + + /** + * return next match in input + * + * @public + * @this {RegExpLexer} + */ + next: function lexer_next() { + if (this.done) { + this.clear(); + return this.EOF; + } + if (!this._input) { + this.done = true; + } + + var token, + match, + tempMatch, + index; + if (!this._more) { + this.clear(); + } + var spec = this.__currentRuleSet__; + if (!spec) { + // Update the ruleset cache as we apparently encountered a state change or just started lexing. 
+ // The cache is set up for fast lookup -- we assume a lexer will switch states much less often than it will + // invoke the \`lex()\` token-producing API and related APIs, hence caching the set for direct access helps + // speed up those activities a tiny bit. + spec = this.__currentRuleSet__ = this._currentRules(); + // Check whether a *sane* condition has been pushed before: this makes the lexer robust against + // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19 + if (!spec || !spec.rules) { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Internal lexer engine error' + lineno_msg + ': The lex grammar programmer pushed a non-existing condition name "' + this.topState() + '"; this is a fatal error and should be reported to the application programmer team!', false); + // produce one 'error' token until this situation has been resolved, most probably by parse termination! + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + } + + var rule_ids = spec.rules; + var regexes = spec.__rule_regexes; + var len = spec.__rule_count; + + // Note: the arrays are 1-based, while \`len\` itself is a valid index, + // hence the non-standard less-or-equal check in the next loop condition! + for (var i = 1; i <= len; i++) { + tempMatch = this._input.match(regexes[i]); + if (tempMatch && (!match || tempMatch[0].length > match[0].length)) { + match = tempMatch; + index = i; + if (this.options.backtrack_lexer) { + token = this.test_match(tempMatch, rule_ids[i]); + if (token !== false) { + return token; + } else if (this._backtrack) { + match = undefined; + continue; // rule action called reject() implying a rule MISmatch. + } else { + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + } else if (!this.options.flex) { + break; + } + } + } + if (match) { + token = this.test_match(match, rule_ids[index]); + if (token !== false) { + return token; + } + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + if (!this._input) { + this.done = true; + this.clear(); + return this.EOF; + } else { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': Unrecognized text.', this.options.lexerErrorsAreRecoverable); + + var pendingInput = this._input; + var activeCondition = this.topState(); + var conditionStackDepth = this.conditionStack.length; + + token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + if (token === this.ERROR) { + // we can try to recover from a lexer error that \`parseError()\` did not 'recover' for us + // by moving forward at least one character at a time IFF the (user-specified?) \`parseError()\` + // has not consumed/modified any pending input or changed state in the error handler: + if (!this.matches && + // and make sure the input has been modified/consumed ... + pendingInput === this._input && + // ...or the lexer state has been modified significantly enough + // to merit a non-consuming error handling action right now. 
+ activeCondition === this.topState() && + conditionStackDepth === this.conditionStack.length + ) { + this.input(); + } + } + return token; + } + }, + + /** + * return next match that has a token + * + * @public + * @this {RegExpLexer} + */ + lex: function lexer_lex() { + var r; + // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer: + if (typeof this.options.pre_lex === 'function') { + r = this.options.pre_lex.call(this); + } + + while (!r) { + r = this.next(); + } + + if (typeof this.options.post_lex === 'function') { + // (also account for a userdef function which does not return any value: keep the token as is) + r = this.options.post_lex.call(this, r) || r; + } + return r; + }, + + /** + * backwards compatible alias for \`pushState()\`; + * the latter is symmetrical with \`popState()\` and we advise to use + * those APIs in any modern lexer code, rather than \`begin()\`. + * + * @public + * @this {RegExpLexer} + */ + begin: function lexer_begin(condition) { + return this.pushState(condition); + }, + + /** + * activates a new lexer condition state (pushes the new lexer + * condition state onto the condition stack) + * + * @public + * @this {RegExpLexer} + */ + pushState: function lexer_pushState(condition) { + this.conditionStack.push(condition); + this.__currentRuleSet__ = null; + return this; + }, + + /** + * pop the previously active lexer condition state off the condition + * stack + * + * @public + * @this {RegExpLexer} + */ + popState: function lexer_popState() { + var n = this.conditionStack.length - 1; + if (n > 0) { + this.__currentRuleSet__ = null; + return this.conditionStack.pop(); + } else { + return this.conditionStack[0]; + } + }, + + /** + * return the currently active lexer condition state; when an index + * argument is provided it produces the N-th previous condition state, + * if available + * + * @public + * @this {RegExpLexer} + */ + topState: function lexer_topState(n) { + n = this.conditionStack.length - 1 - Math.abs(n || 0); + if (n >= 0) { + return this.conditionStack[n]; + } else { + return 'INITIAL'; + } + }, + + /** + * (internal) determine the lexer rule set which is active for the + * currently active lexer condition state + * + * @public + * @this {RegExpLexer} + */ + _currentRules: function lexer__currentRules() { + if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) { + return this.conditions[this.conditionStack[this.conditionStack.length - 1]]; + } else { + return this.conditions['INITIAL']; + } + }, + + /** + * return the number of states currently on the stack + * + * @public + * @this {RegExpLexer} + */ + stateStackSize: function lexer_stateStackSize() { + return this.conditionStack.length; + } +}`; + // --- END lexer kernel --- +} + +RegExpLexer.prototype = (new Function(rmCommonWS` + return ${getRegExpLexerPrototype()}; +`))(); + + +// The lexer code stripper, driven by optimization analysis settings and +// lexer options, which cannot be changed at run-time. +function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. 
${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + +new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS` + // Code Generator Information Report + // --------------------------------- + // + // Options: + // + // backtracking: .................... ${opt.options.backtrack_lexer} + // location.ranges: ................. ${opt.options.ranges} + // location line+column tracking: ... ${opt.options.trackPosition} + // + // + // Forwarded Parser Analysis flags: + // + // uses yyleng: ..................... ${opt.parseActionsUseYYLENG} + // uses yylineno: ................... ${opt.parseActionsUseYYLINENO} + // uses yytext: ..................... ${opt.parseActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.parseActionsUseYYLOC} + // uses lexer values: ............... ${opt.parseActionsUseValueTracking} / ${opt.parseActionsUseValueAssignment} + // location tracking: ............... ${opt.parseActionsUseLocationTracking} + // location assignment: ............. ${opt.parseActionsUseLocationAssignment} + // + // + // Lexer Analysis flags: + // + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses yyerror: .................... ${opt.lexerActionsUseYYERROR} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + // + // --------- END OF REPORT ----------- + + `); + + return new_src; +} + + + + + +// generate lexer source from a grammar +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
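+        // E.g. (a hypothetical call) a jison parser build might pass:
+        //
+        //      processGrammar(dict, tokens, {
+        //          parseActionsUseYYLENG: true,
+        //          parseActionsUseYYLOC: false,
+        //          // ...
+        //      });
+        //
+        // and any flag which is left undefined simply propagates as undefined here.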
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???', + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
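+    // E.g. a grammar-level '%options case-insensitive' arrives here under its
+    // dashed spelling and is normalized to 'caseInsensitive' (see the matching
+    // assert in generateModuleBody() further below).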
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = (dict.actionInclude || ''); + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; +} + +// Assemble the final source from the processed grammar +/** @public */ +function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; +} + +function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = (1 + Math.log10(a.length | 1) | 0); + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return `/* ${idx_str}: */ ${re}`; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return `/* ${idx_str}: */ new XRegExp("${re_src}", "${re.xregexp.flags}")`; + } else { + return `/* ${idx_str}: */ ${re}`; + } + }); + return b.join(',\n'); +} + +function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
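+            // Table semantics for the filter loop below: a truthy entry here means
+            // 'never copy this option into the generated lexer', while a 0 entry
+            // means the option IS copied whenever it carries a non-null, non-false value.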
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1, + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(` "(${ID_REGEX_BASE})": `, 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
+ var code = [rmCommonWS` + var lexer = { + `, '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc + .replace(/^[\s\r\n]*\{/, '') + .replace(/\s*\}[\s\r\n]*$/, '') + .trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS`, + JisonLexerError: JisonLexerError, + performAction: ${performActionCode}, + simpleCaseActionClusters: ${simpleCaseActionClustersCode}, + rules: [ + ${rulesCode} + ], + conditions: ${conditionsCode} + }; + `); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; +} + +function generateGenericHeaderComment() { + var out = rmCommonWS` + /* lexer generated by jison-lex ${version$1} */ + + /* + * Returns a Lexer object of the following structure: + * + * Lexer: { + * yy: {} The so-called "shared state" or rather the *source* of it; + * the real "shared state" \`yy\` passed around to + * the rule actions, etc. is a direct reference! + * + * This "shared context" object was passed to the lexer by way of + * the \`lexer.setInput(str, yy)\` API before you may use it. + * + * This "shared context" object is passed to the lexer action code in \`performAction()\` + * so userland code in the lexer actions may communicate with the outside world + * and/or other lexer rules' actions in more or less complex ways. + * + * } + * + * Lexer.prototype: { + * EOF: 1, + * ERROR: 2, + * + * yy: The overall "shared context" object reference. + * + * JisonLexerError: function(msg, hash), + * + * performAction: function lexer__performAction(yy, yyrulenumber, YY_START), + * + * The function parameters and \`this\` have the following value/meaning: + * - \`this\` : reference to the \`lexer\` instance. + * \`yy_\` is an alias for \`this\` lexer instance reference used internally. + * + * - \`yy\` : a reference to the \`yy\` "shared state" object which was passed to the lexer + * by way of the \`lexer.setInput(str, yy)\` API before. 
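+     *
+     *                 A typical set-up (a sketch; my_parser is whatever parser
+     *                 object you want the lexer actions to reach via \`yy\`):
+     *
+     *                     lexer.setInput('2 + 3', { parser: my_parser });
+     *                     var first_token = lexer.lex();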
+ * + * Note: + * The extra arguments you specified in the \`%parse-param\` statement in your + * **parser** grammar definition file are passed to the lexer via this object + * reference as member variables. + * + * - \`yyrulenumber\` : index of the matched lexer rule (regex), used internally. + * + * - \`YY_START\`: the current lexer "start condition" state. + * + * parseError: function(str, hash, ExceptionClass), + * + * constructLexErrorInfo: function(error_message, is_recoverable), + * Helper function. + * Produces a new errorInfo \'hash object\' which can be passed into \`parseError()\`. + * See it\'s use in this lexer kernel in many places; example usage: + * + * var infoObj = lexer.constructParseErrorInfo(\'fail!\', true); + * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError); + * + * options: { ... lexer %options ... }, + * + * lex: function(), + * Produce one token of lexed input, which was passed in earlier via the \`lexer.setInput()\` API. + * You MAY use the additional \`args...\` parameters as per \`%parse-param\` spec of the **lexer** grammar: + * these extra \`args...\` are added verbatim to the \`yy\` object reference as member variables. + * + * WARNING: + * Lexer's additional \`args...\` parameters (via lexer's \`%parse-param\`) MAY conflict with + * any attributes already added to \`yy\` by the **parser** or the jison run-time; + * when such a collision is detected an exception is thrown to prevent the generated run-time + * from silently accepting this confusing and potentially hazardous situation! + * + * cleanupAfterLex: function(do_not_nuke_errorinfos), + * Helper function. + * + * This helper API is invoked when the **parse process** has completed: it is the responsibility + * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. + * + * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected. 
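+     *
+     *                 Example (a sketch):
+     *
+     *                     var tok;
+     *                     do {
+     *                         tok = lexer.lex();
+     *                     } while (tok !== lexer.EOF && tok !== lexer.ERROR);
+     *                     lexer.cleanupAfterLex();   // dispose of lingering error info hashes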
+ * + * setInput: function(input, [yy]), + * + * + * input: function(), + * + * + * unput: function(str), + * + * + * more: function(), + * + * + * reject: function(), + * + * + * less: function(n), + * + * + * pastInput: function(n), + * + * + * upcomingInput: function(n), + * + * + * showPosition: function(), + * + * + * test_match: function(regex_match_array, rule_index), + * + * + * next: function(), + * + * + * begin: function(condition), + * + * + * pushState: function(condition), + * + * + * popState: function(), + * + * + * topState: function(), + * + * + * _currentRules: function(), + * + * + * stateStackSize: function(), + * + * + * performAction: function(yy, yy_, yyrulenumber, YY_START), + * + * + * rules: [...], + * + * + * conditions: {associative list: name ==> set}, + * } + * + * + * token location info (\`yylloc\`): { + * first_line: n, + * last_line: n, + * first_column: n, + * last_column: n, + * range: [start_number, end_number] + * (where the numbers are indexes into the input string, zero-based) + * } + * + * --- + * + * The \`parseError\` function receives a \'hash\' object with these members for lexer errors: + * + * { + * text: (matched text) + * token: (the produced terminal token, if any) + * token_id: (the produced terminal token numeric ID, if any) + * line: (yylineno) + * loc: (yylloc) + * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule + * available for this particular error) + * yy: (object: the current parser internal "shared state" \`yy\` + * as is also available in the rule actions; this can be used, + * for instance, for advanced error analysis and reporting) + * lexer: (reference to the current lexer instance used by the parser) + * } + * + * while \`this\` will reference the current lexer instance. + * + * When \`parseError\` is invoked by the lexer, the default implementation will + * attempt to invoke \`yy.parser.parseError()\`; when this callback is not provided + * it will try to invoke \`yy.parseError()\` instead. When that callback is also not + * provided, a \`JisonLexerError\` exception will be thrown containing the error + * message and \`hash\`, as constructed by the \`constructLexErrorInfo()\` API. + * + * Note that the lexer\'s \`JisonLexerError\` error class is passed via the + * \`ExceptionClass\` argument, which is invoked to construct the exception + * instance to be thrown, so technically \`parseError\` will throw the object + * produced by the \`new ExceptionClass(str, hash)\` JavaScript expression. + * + * --- + * + * You can specify lexer options by setting / modifying the \`.options\` object of your Lexer instance. + * These options are available: + * + * (Options are permanent.) + * + * yy: { + * parseError: function(str, hash, ExceptionClass) + * optional: overrides the default \`parseError\` function. + * } + * + * lexer.options: { + * pre_lex: function() + * optional: is invoked before the lexer is invoked to produce another token. + * \`this\` refers to the Lexer object. + * post_lex: function(token) { return token; } + * optional: is invoked when the lexer has produced a token \`token\`; + * this function can override the returned token value by returning another. + * When it does not return any (truthy) value, the lexer will return + * the original \`token\`. + * \`this\` refers to the Lexer object. + * + * WARNING: the next set of options are not meant to be changed. They echo the abilities of + * the lexer as per when it was compiled! 
+ * + * ranges: boolean + * optional: \`true\` ==> token location info will include a .range[] member. + * flex: boolean + * optional: \`true\` ==> flex-like lexing behaviour where the rules are tested + * exhaustively to find the longest match. + * backtrack_lexer: boolean + * optional: \`true\` ==> lexer regexes are tested in order and the action code is invoked for each match; + * the lexer terminates the scan when a token is returned by the action code. + * xregexp: boolean + * optional: \`true\` ==> lexer rule regexes are "extended regex format" requiring the + * \`XRegExp\` library. When this %option has not been specified at compile time, all lexer + * rule regexes have been written as standard JavaScript RegExp expressions. + * } + */ + `; + + return out; +} + +function prepareOptions(opt) { + opt = opt || {}; + + // check for illegal identifier + if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) { + if (opt.moduleName) { + var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.'; + if (typeof opt.warn_cb === 'function') { + opt.warn_cb(msg); + } else { + // do not treat as warning; barf hairball instead so that this oddity gets noticed right away! + throw new Error(msg); + } + } + opt.moduleName = 'lexer'; + } + + prepExportStructures(opt); + + return opt; +} + +function generateModule(opt) { + opt = prepareOptions(opt); + + var out = [ + generateGenericHeaderComment(), + '', + 'var ' + opt.moduleName + ' = (function () {', + jisonLexerErrorDefinition, + '', + generateModuleBody(opt), + '', + (opt.moduleInclude ? opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();' + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateAMDModule(opt) { + opt = prepareOptions(opt); + + var out = [ + generateGenericHeaderComment(), + '', + 'define([], function () {', + jisonLexerErrorDefinition, + '', + generateModuleBody(opt), + '', + (opt.moduleInclude ? opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '});' + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateESModule(opt) { + opt = prepareOptions(opt); + + var out = [ + generateGenericHeaderComment(), + '', + 'var lexer = (function () {', + jisonLexerErrorDefinition, + '', + generateModuleBody(opt), + '', + (opt.moduleInclude ? opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();', + '', + 'function yylex() {', + ' return lexer.lex.apply(lexer, arguments);', + '}', + rmCommonWS` + export { + lexer, + yylex as lex + }; + ` + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateCommonJSModule(opt) { + opt = prepareOptions(opt); + + var out = [ + generateGenericHeaderComment(), + '', + 'var ' + opt.moduleName + ' = (function () {', + jisonLexerErrorDefinition, + '', + generateModuleBody(opt), + '', + (opt.moduleInclude ? 
opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();', + '', + 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', + ' exports.lexer = ' + opt.moduleName + ';', + ' exports.lex = function () {', + ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', + ' };', + '}' + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +RegExpLexer.generate = generate; + +RegExpLexer.version = version$1; +RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; +RegExpLexer.mkStdOptions = mkStdOptions; +RegExpLexer.camelCase = camelCase; +RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + +var version = '0.6.1-205'; // require('./package.json').version; + + +function getCommandlineOptions() { + 'use strict'; + + var opts = nomnom + .script('jison-lex') + .unknownOptionTreatment(false) // do not accept unknown options! + .options({ + file: { + flag: true, + position: 0, + help: 'file containing a lexical grammar' + }, + json: { + abbr: 'j', + flag: true, + default: false, + help: 'jison will expect a grammar in either JSON/JSON5 or JISON format: the precise format is autodetected' + }, + outfile: { + abbr: 'o', + metavar: 'FILE', + help : 'Filepath and base module name of the generated parser;\nwhen terminated with a / (dir separator) it is treated as the destination directory where the generated output will be stored' + }, + debug: { + abbr: 'd', + flag: true, + default: false, + help: 'Debug mode' + }, + dumpSourceCodeOnFailure: { + full: 'dump-sourcecode-on-failure', + flag: true, + default: true, + help: 'Dump the generated source code to a special named file when the internal generator tests fail, i.e. when the generated source code does not compile in the JavaScript engine. Enabling this option helps you to diagnose/debug crashes (thrown exceptions) in the code generator due to various reasons: you can, for example, load the dumped sourcecode in another environment (e.g. NodeJS) to get more info on the precise location and cause of the compile failure.' + }, + throwErrorOnCompileFailure: { + full: 'throw-on-compile-failure', + flag: true, + default: true, + help: 'Throw an exception when the generated source code fails to compile in the JavaScript engine. **WARNING**: Turning this feature OFF permits the code generator to produce non-working source code and treat that as SUCCESS. This MAY be desirable code generator behaviour, but only rarely.' 
+ }, + reportStats: { + full: 'info', + abbr: 'I', + flag: true, + default: false, + help: 'Report some statistics about the generated parser' + }, + moduleType: { + full: 'module-type', + abbr: 't', + default: 'commonjs', + metavar: 'TYPE', + choices: ['commonjs', 'amd', 'js', 'es'], + help: 'The type of module to generate (commonjs, amd, es, js)' + }, + moduleName: { + full: 'module-name', + abbr: 'n', + metavar: 'NAME', + help: 'The name of the generated parser object, namespace supported' + }, + main: { + full: 'main', + abbr: 'x', + flag: true, + default: false, + help: 'Include .main() entry point in generated commonjs module' + }, + moduleMain: { + full: 'module-main', + abbr: 'y', + metavar: 'NAME', + help: 'The main module function definition' + }, + version: { + abbr: 'V', + flag: true, + help: 'print version and exit', + callback: function () { + return version; + } + } + }).parse(); + + return opts; +} + +var cli = module.exports; + +cli.main = function cliMain(opts) { + 'use strict'; + + opts = RegExpLexer.mkStdOptions(opts); + + function isDirectory(fp) { + try { + return fs.lstatSync(fp).isDirectory(); + } catch (e) { + return false; + } + } + + function mkdirp(fp) { + if (!fp || fp === '.' || fp.length === 0) { + return false; + } + try { + fs.mkdirSync(fp); + return true; + } catch (e) { + if (e.code === 'ENOENT') { + var parent = path.dirname(fp); + // Did we hit the root directory by now? If so, abort! + // Else, create the parent; iff that fails, we fail too... + if (parent !== fp && mkdirp(parent)) { + try { + // Retry creating the original directory: it should succeed now + fs.mkdirSync(fp); + return true; + } catch (e) { + return false; + } + } + } + } + return false; + } + + function processInputFile() { + // getting raw files + var original_cwd = process.cwd(); + + var raw = fs.readFileSync(path.normalize(opts.file), 'utf8'); + + // making best guess at json mode + opts.json = path.extname(opts.file) === '.json' || opts.json; + + // When only the directory part of the output path was specified, then we + // do NOT have the target module name in there as well! + var outpath = opts.outfile; + if (/[\\\/]$/.test(outpath) || isDirectory(outpath)) { + opts.outfile = null; + outpath = outpath.replace(/[\\\/]$/, ''); + } + if (outpath && outpath.length > 0) { + outpath += '/'; + } else { + outpath = ''; + } + + // setting output file name and module name based on input file name + // if they aren't specified. + var name = path.basename(opts.outfile || opts.file); + + // get the base name (i.e. the file name without extension) + // i.e. 
strip off only the extension and keep any other dots in the filename + name = path.basename(name, path.extname(name)); + + opts.outfile = opts.outfile || (outpath + name + '.js'); + if (!opts.moduleName && name) { + opts.moduleName = opts.defaultModuleName = name.replace(/-\w/g, + function (match) { + return match.charAt(1).toUpperCase(); + }); + } + + // Change CWD to the directory where the source grammar resides: this helps us properly + // %include any files mentioned in the grammar with relative paths: + var new_cwd = path.dirname(path.normalize(opts.file)); + process.chdir(new_cwd); + + var lexer = cli.generateLexerString(raw, opts); + + // and change back to the CWD we started out with: + process.chdir(original_cwd); + + mkdirp(path.dirname(opts.outfile)); + fs.writeFileSync(opts.outfile, lexer); + console.log('JISON-LEX output for module [' + opts.moduleName + '] has been written to file:', opts.outfile); + } + + function readin(cb) { + var stdin = process.openStdin(), + data = ''; + + stdin.setEncoding('utf8'); + stdin.addListener('data', function (chunk) { + data += chunk; + }); + stdin.addListener('end', function () { + cb(data); + }); + } + + function processStdin() { + readin(function processStdinReadInCallback(raw) { + console.log(cli.generateLexerString(raw, opts)); + }); + } + + // if an input file wasn't given, assume input on stdin + if (opts.file) { + processInputFile(); + } else { + processStdin(); + } +}; + +cli.generateLexerString = function generateLexerString(lexerSpec, opts) { + 'use strict'; + + // var settings = RegExpLexer.mkStdOptions(opts); + var predefined_tokens = null; + + return RegExpLexer.generate(lexerSpec, predefined_tokens, opts); +}; + + +if (require.main === module) { + var opts = getCommandlineOptions(); + cli.main(opts); +} diff --git a/dist/cli-umd-es5.js b/dist/cli-umd-es5.js new file mode 100644 index 0000000..1d19e70 --- /dev/null +++ b/dist/cli-umd-es5.js @@ -0,0 +1,2777 @@ +#!/usr/bin/env node + + +'use strict'; + +var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) { return typeof obj; } : function (obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }; + +var _templateObject = _taggedTemplateLiteral(['\n var __hacky_counter__ = 0;\n\n /**\n * @constructor\n * @nocollapse\n */\n function XRegExp(re, f) {\n this.re = re;\n this.flags = f;\n this._getUnicodeProperty = function (k) {};\n var fake = /./; // WARNING: this exact \'fake\' is also depended upon by the xregexp unit test!\n __hacky_counter__++;\n fake.__hacky_backy__ = __hacky_counter__;\n return fake;\n }\n '], ['\n var __hacky_counter__ = 0;\n\n /**\n * @constructor\n * @nocollapse\n */\n function XRegExp(re, f) {\n this.re = re;\n this.flags = f;\n this._getUnicodeProperty = function (k) {};\n var fake = /./; // WARNING: this exact \'fake\' is also depended upon by the xregexp unit test!\n __hacky_counter__++;\n fake.__hacky_backy__ = __hacky_counter__;\n return fake;\n }\n ']), + _templateObject2 = _taggedTemplateLiteral(['\n return ', ';\n'], ['\n return ', ';\n']), + _templateObject3 = _taggedTemplateLiteral(['\n // Code Generator Information Report\n // ---------------------------------\n //\n // Options:\n //\n // backtracking: .................... ', '\n // location.ranges: ................. ', '\n // location line+column tracking: ... ', '\n //\n //\n // Forwarded Parser Analysis flags:\n //\n // uses yyleng: ..................... 
', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses lexer values: ............... ', ' / ', '\n // location tracking: ............... ', '\n // location assignment: ............. ', '\n //\n //\n // Lexer Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses ParseError API: ............. ', '\n // uses yyerror: .................... ', '\n // uses location tracking & editing: ', '\n // uses more() API: ................. ', '\n // uses unput() API: ................ ', '\n // uses reject() API: ............... ', '\n // uses less() API: ................. ', '\n // uses display APIs pastInput(), upcomingInput(), showPosition():\n // ............................. ', '\n // uses describeYYLLOC() API: ....... ', '\n //\n // --------- END OF REPORT -----------\n\n '], ['\n // Code Generator Information Report\n // ---------------------------------\n //\n // Options:\n //\n // backtracking: .................... ', '\n // location.ranges: ................. ', '\n // location line+column tracking: ... ', '\n //\n //\n // Forwarded Parser Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses lexer values: ............... ', ' / ', '\n // location tracking: ............... ', '\n // location assignment: ............. ', '\n //\n //\n // Lexer Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses ParseError API: ............. ', '\n // uses yyerror: .................... ', '\n // uses location tracking & editing: ', '\n // uses more() API: ................. ', '\n // uses unput() API: ................ ', '\n // uses reject() API: ............... ', '\n // uses less() API: ................. ', '\n // uses display APIs pastInput(), upcomingInput(), showPosition():\n // ............................. ', '\n // uses describeYYLLOC() API: ....... ', '\n //\n // --------- END OF REPORT -----------\n\n ']), + _templateObject4 = _taggedTemplateLiteral(['\n var lexer = {\n '], ['\n var lexer = {\n ']), + _templateObject5 = _taggedTemplateLiteral([',\n JisonLexerError: JisonLexerError,\n performAction: ', ',\n simpleCaseActionClusters: ', ',\n rules: [\n ', '\n ],\n conditions: ', '\n };\n '], [',\n JisonLexerError: JisonLexerError,\n performAction: ', ',\n simpleCaseActionClusters: ', ',\n rules: [\n ', '\n ],\n conditions: ', '\n };\n ']), + _templateObject6 = _taggedTemplateLiteral(['\n /* lexer generated by jison-lex ', ' */\n\n /*\n * Returns a Lexer object of the following structure:\n *\n * Lexer: {\n * yy: {} The so-called "shared state" or rather the *source* of it;\n * the real "shared state" `yy` passed around to\n * the rule actions, etc. 
is a direct reference!\n *\n * This "shared context" object was passed to the lexer by way of \n * the `lexer.setInput(str, yy)` API before you may use it.\n *\n * This "shared context" object is passed to the lexer action code in `performAction()`\n * so userland code in the lexer actions may communicate with the outside world \n * and/or other lexer rules\' actions in more or less complex ways.\n *\n * }\n *\n * Lexer.prototype: {\n * EOF: 1,\n * ERROR: 2,\n *\n * yy: The overall "shared context" object reference.\n *\n * JisonLexerError: function(msg, hash),\n *\n * performAction: function lexer__performAction(yy, yyrulenumber, YY_START),\n *\n * The function parameters and `this` have the following value/meaning:\n * - `this` : reference to the `lexer` instance. \n * `yy_` is an alias for `this` lexer instance reference used internally.\n *\n * - `yy` : a reference to the `yy` "shared state" object which was passed to the lexer\n * by way of the `lexer.setInput(str, yy)` API before.\n *\n * Note:\n * The extra arguments you specified in the `%parse-param` statement in your\n * **parser** grammar definition file are passed to the lexer via this object\n * reference as member variables.\n *\n * - `yyrulenumber` : index of the matched lexer rule (regex), used internally.\n *\n * - `YY_START`: the current lexer "start condition" state.\n *\n * parseError: function(str, hash, ExceptionClass),\n *\n * constructLexErrorInfo: function(error_message, is_recoverable),\n * Helper function.\n * Produces a new errorInfo \'hash object\' which can be passed into `parseError()`.\n * See it\'s use in this lexer kernel in many places; example usage:\n *\n * var infoObj = lexer.constructParseErrorInfo(\'fail!\', true);\n * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);\n *\n * options: { ... lexer %options ... },\n *\n * lex: function(),\n * Produce one token of lexed input, which was passed in earlier via the `lexer.setInput()` API.\n * You MAY use the additional `args...` parameters as per `%parse-param` spec of the **lexer** grammar:\n * these extra `args...` are added verbatim to the `yy` object reference as member variables.\n *\n * WARNING:\n * Lexer\'s additional `args...` parameters (via lexer\'s `%parse-param`) MAY conflict with\n * any attributes already added to `yy` by the **parser** or the jison run-time; \n * when such a collision is detected an exception is thrown to prevent the generated run-time \n * from silently accepting this confusing and potentially hazardous situation! \n *\n * cleanupAfterLex: function(do_not_nuke_errorinfos),\n * Helper function.\n *\n * This helper API is invoked when the **parse process** has completed: it is the responsibility\n * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. 
\n *\n * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.\n *\n * setInput: function(input, [yy]),\n *\n *\n * input: function(),\n *\n *\n * unput: function(str),\n *\n *\n * more: function(),\n *\n *\n * reject: function(),\n *\n *\n * less: function(n),\n *\n *\n * pastInput: function(n),\n *\n *\n * upcomingInput: function(n),\n *\n *\n * showPosition: function(),\n *\n *\n * test_match: function(regex_match_array, rule_index),\n *\n *\n * next: function(),\n *\n *\n * begin: function(condition),\n *\n *\n * pushState: function(condition),\n *\n *\n * popState: function(),\n *\n *\n * topState: function(),\n *\n *\n * _currentRules: function(),\n *\n *\n * stateStackSize: function(),\n *\n *\n * performAction: function(yy, yy_, yyrulenumber, YY_START),\n *\n *\n * rules: [...],\n *\n *\n * conditions: {associative list: name ==> set},\n * }\n *\n *\n * token location info (`yylloc`): {\n * first_line: n,\n * last_line: n,\n * first_column: n,\n * last_column: n,\n * range: [start_number, end_number]\n * (where the numbers are indexes into the input string, zero-based)\n * }\n *\n * ---\n *\n * The `parseError` function receives a \'hash\' object with these members for lexer errors:\n *\n * {\n * text: (matched text)\n * token: (the produced terminal token, if any)\n * token_id: (the produced terminal token numeric ID, if any)\n * line: (yylineno)\n * loc: (yylloc)\n * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n * available for this particular error)\n * yy: (object: the current parser internal "shared state" `yy`\n * as is also available in the rule actions; this can be used,\n * for instance, for advanced error analysis and reporting)\n * lexer: (reference to the current lexer instance used by the parser)\n * }\n *\n * while `this` will reference the current lexer instance.\n *\n * When `parseError` is invoked by the lexer, the default implementation will\n * attempt to invoke `yy.parser.parseError()`; when this callback is not provided\n * it will try to invoke `yy.parseError()` instead. When that callback is also not\n * provided, a `JisonLexerError` exception will be thrown containing the error\n * message and `hash`, as constructed by the `constructLexErrorInfo()` API.\n *\n * Note that the lexer\'s `JisonLexerError` error class is passed via the\n * `ExceptionClass` argument, which is invoked to construct the exception\n * instance to be thrown, so technically `parseError` will throw the object\n * produced by the `new ExceptionClass(str, hash)` JavaScript expression.\n *\n * ---\n *\n * You can specify lexer options by setting / modifying the `.options` object of your Lexer instance.\n * These options are available:\n *\n * (Options are permanent.)\n * \n * yy: {\n * parseError: function(str, hash, ExceptionClass)\n * optional: overrides the default `parseError` function.\n * }\n *\n * lexer.options: {\n * pre_lex: function()\n * optional: is invoked before the lexer is invoked to produce another token.\n * `this` refers to the Lexer object.\n * post_lex: function(token) { return token; }\n * optional: is invoked when the lexer has produced a token `token`;\n * this function can override the returned token value by returning another.\n * When it does not return any (truthy) value, the lexer will return\n * the original `token`.\n * `this` refers to the Lexer object.\n *\n * WARNING: the next set of options are not meant to be changed. 
They echo the abilities of\n * the lexer as per when it was compiled!\n *\n * ranges: boolean\n * optional: `true` ==> token location info will include a .range[] member.\n * flex: boolean\n * optional: `true` ==> flex-like lexing behaviour where the rules are tested\n * exhaustively to find the longest match.\n * backtrack_lexer: boolean\n * optional: `true` ==> lexer regexes are tested in order and for invoked;\n * the lexer terminates the scan when a token is returned by the action code.\n * xregexp: boolean\n * optional: `true` ==> lexer rule regexes are "extended regex format" requiring the\n * `XRegExp` library. When this %option has not been specified at compile time, all lexer\n * rule regexes have been written as standard JavaScript RegExp expressions.\n * }\n */\n '], ['\n /* lexer generated by jison-lex ', ' */\n\n /*\n * Returns a Lexer object of the following structure:\n *\n * Lexer: {\n * yy: {} The so-called "shared state" or rather the *source* of it;\n * the real "shared state" \\`yy\\` passed around to\n * the rule actions, etc. is a direct reference!\n *\n * This "shared context" object was passed to the lexer by way of \n * the \\`lexer.setInput(str, yy)\\` API before you may use it.\n *\n * This "shared context" object is passed to the lexer action code in \\`performAction()\\`\n * so userland code in the lexer actions may communicate with the outside world \n * and/or other lexer rules\' actions in more or less complex ways.\n *\n * }\n *\n * Lexer.prototype: {\n * EOF: 1,\n * ERROR: 2,\n *\n * yy: The overall "shared context" object reference.\n *\n * JisonLexerError: function(msg, hash),\n *\n * performAction: function lexer__performAction(yy, yyrulenumber, YY_START),\n *\n * The function parameters and \\`this\\` have the following value/meaning:\n * - \\`this\\` : reference to the \\`lexer\\` instance. \n * \\`yy_\\` is an alias for \\`this\\` lexer instance reference used internally.\n *\n * - \\`yy\\` : a reference to the \\`yy\\` "shared state" object which was passed to the lexer\n * by way of the \\`lexer.setInput(str, yy)\\` API before.\n *\n * Note:\n * The extra arguments you specified in the \\`%parse-param\\` statement in your\n * **parser** grammar definition file are passed to the lexer via this object\n * reference as member variables.\n *\n * - \\`yyrulenumber\\` : index of the matched lexer rule (regex), used internally.\n *\n * - \\`YY_START\\`: the current lexer "start condition" state.\n *\n * parseError: function(str, hash, ExceptionClass),\n *\n * constructLexErrorInfo: function(error_message, is_recoverable),\n * Helper function.\n * Produces a new errorInfo \\\'hash object\\\' which can be passed into \\`parseError()\\`.\n * See it\\\'s use in this lexer kernel in many places; example usage:\n *\n * var infoObj = lexer.constructParseErrorInfo(\\\'fail!\\\', true);\n * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);\n *\n * options: { ... lexer %options ... 
},\n *\n * lex: function(),\n * Produce one token of lexed input, which was passed in earlier via the \\`lexer.setInput()\\` API.\n * You MAY use the additional \\`args...\\` parameters as per \\`%parse-param\\` spec of the **lexer** grammar:\n * these extra \\`args...\\` are added verbatim to the \\`yy\\` object reference as member variables.\n *\n * WARNING:\n * Lexer\'s additional \\`args...\\` parameters (via lexer\'s \\`%parse-param\\`) MAY conflict with\n * any attributes already added to \\`yy\\` by the **parser** or the jison run-time; \n * when such a collision is detected an exception is thrown to prevent the generated run-time \n * from silently accepting this confusing and potentially hazardous situation! \n *\n * cleanupAfterLex: function(do_not_nuke_errorinfos),\n * Helper function.\n *\n * This helper API is invoked when the **parse process** has completed: it is the responsibility\n * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. \n *\n * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.\n *\n * setInput: function(input, [yy]),\n *\n *\n * input: function(),\n *\n *\n * unput: function(str),\n *\n *\n * more: function(),\n *\n *\n * reject: function(),\n *\n *\n * less: function(n),\n *\n *\n * pastInput: function(n),\n *\n *\n * upcomingInput: function(n),\n *\n *\n * showPosition: function(),\n *\n *\n * test_match: function(regex_match_array, rule_index),\n *\n *\n * next: function(),\n *\n *\n * begin: function(condition),\n *\n *\n * pushState: function(condition),\n *\n *\n * popState: function(),\n *\n *\n * topState: function(),\n *\n *\n * _currentRules: function(),\n *\n *\n * stateStackSize: function(),\n *\n *\n * performAction: function(yy, yy_, yyrulenumber, YY_START),\n *\n *\n * rules: [...],\n *\n *\n * conditions: {associative list: name ==> set},\n * }\n *\n *\n * token location info (\\`yylloc\\`): {\n * first_line: n,\n * last_line: n,\n * first_column: n,\n * last_column: n,\n * range: [start_number, end_number]\n * (where the numbers are indexes into the input string, zero-based)\n * }\n *\n * ---\n *\n * The \\`parseError\\` function receives a \\\'hash\\\' object with these members for lexer errors:\n *\n * {\n * text: (matched text)\n * token: (the produced terminal token, if any)\n * token_id: (the produced terminal token numeric ID, if any)\n * line: (yylineno)\n * loc: (yylloc)\n * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n * available for this particular error)\n * yy: (object: the current parser internal "shared state" \\`yy\\`\n * as is also available in the rule actions; this can be used,\n * for instance, for advanced error analysis and reporting)\n * lexer: (reference to the current lexer instance used by the parser)\n * }\n *\n * while \\`this\\` will reference the current lexer instance.\n *\n * When \\`parseError\\` is invoked by the lexer, the default implementation will\n * attempt to invoke \\`yy.parser.parseError()\\`; when this callback is not provided\n * it will try to invoke \\`yy.parseError()\\` instead. 
When that callback is also not\n * provided, a \\`JisonLexerError\\` exception will be thrown containing the error\n * message and \\`hash\\`, as constructed by the \\`constructLexErrorInfo()\\` API.\n *\n * Note that the lexer\\\'s \\`JisonLexerError\\` error class is passed via the\n * \\`ExceptionClass\\` argument, which is invoked to construct the exception\n * instance to be thrown, so technically \\`parseError\\` will throw the object\n * produced by the \\`new ExceptionClass(str, hash)\\` JavaScript expression.\n *\n * ---\n *\n * You can specify lexer options by setting / modifying the \\`.options\\` object of your Lexer instance.\n * These options are available:\n *\n * (Options are permanent.)\n * \n * yy: {\n * parseError: function(str, hash, ExceptionClass)\n * optional: overrides the default \\`parseError\\` function.\n * }\n *\n * lexer.options: {\n * pre_lex: function()\n * optional: is invoked before the lexer is invoked to produce another token.\n * \\`this\\` refers to the Lexer object.\n * post_lex: function(token) { return token; }\n * optional: is invoked when the lexer has produced a token \\`token\\`;\n * this function can override the returned token value by returning another.\n * When it does not return any (truthy) value, the lexer will return\n * the original \\`token\\`.\n * \\`this\\` refers to the Lexer object.\n *\n * WARNING: the next set of options are not meant to be changed. They echo the abilities of\n * the lexer as per when it was compiled!\n *\n * ranges: boolean\n * optional: \\`true\\` ==> token location info will include a .range[] member.\n * flex: boolean\n * optional: \\`true\\` ==> flex-like lexing behaviour where the rules are tested\n * exhaustively to find the longest match.\n * backtrack_lexer: boolean\n * optional: \\`true\\` ==> lexer regexes are tested in order and for invoked;\n * the lexer terminates the scan when a token is returned by the action code.\n * xregexp: boolean\n * optional: \\`true\\` ==> lexer rule regexes are "extended regex format" requiring the\n * \\`XRegExp\\` library. When this %option has not been specified at compile time, all lexer\n * rule regexes have been written as standard JavaScript RegExp expressions.\n * }\n */\n ']), + _templateObject7 = _taggedTemplateLiteral(['\n export {\n lexer,\n yylex as lex\n };\n '], ['\n export {\n lexer,\n yylex as lex\n };\n ']); + +function _taggedTemplateLiteral(strings, raw) { return Object.freeze(Object.defineProperties(strings, { raw: { value: Object.freeze(raw) } })); } + +(function (global, factory) { + (typeof exports === 'undefined' ? 'undefined' : _typeof(exports)) === 'object' && typeof module !== 'undefined' ? factory(require('fs'), require('path'), require('@gerhobbelt/nomnom'), require('@gerhobbelt/xregexp'), require('@gerhobbelt/json5'), require('@gerhobbelt/lex-parser'), require('assert'), require('jison-helpers-lib')) : typeof define === 'function' && define.amd ? define(['fs', 'path', '@gerhobbelt/nomnom', '@gerhobbelt/xregexp', '@gerhobbelt/json5', '@gerhobbelt/lex-parser', 'assert', 'jison-helpers-lib'], factory) : factory(global.fs, global.path, global.nomnom, global.XRegExp, global.json5, global.lexParser, global.assert, global.helpers); +})(undefined, function (fs, path, nomnom, XRegExp, json5, lexParser, assert, helpers) { + 'use strict'; + + fs = fs && fs.hasOwnProperty('default') ? fs['default'] : fs; + path = path && path.hasOwnProperty('default') ? path['default'] : path; + nomnom = nomnom && nomnom.hasOwnProperty('default') ? 
nomnom['default'] : nomnom; + XRegExp = XRegExp && XRegExp.hasOwnProperty('default') ? XRegExp['default'] : XRegExp; + json5 = json5 && json5.hasOwnProperty('default') ? json5['default'] : json5; + lexParser = lexParser && lexParser.hasOwnProperty('default') ? lexParser['default'] : lexParser; + assert = assert && assert.hasOwnProperty('default') ? assert['default'] : assert; + helpers = helpers && helpers.hasOwnProperty('default') ? helpers['default'] : helpers; + + // + // Helper library for set definitions + // + // MIT Licensed + // + // + // This code is intended to help parse regex set expressions and mix them + // together, i.e. to answer questions like this: + // + // what is the resulting regex set expression when we mix the regex set + // `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any + // input which matches either input regex should match the resulting + // regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) + // + + 'use strict'; + + var XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` + var CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; + var SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; + var NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; + var SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + + var UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + + // The expanded regex sets which are equivalent to the given `\\{c}` escapes: + // + // `/\s/`: + var WHITESPACE_SETSTR$1 = ' \f\n\r\t\x0B\xA0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\uFEFF'; + // `/\d/`: + var DIGIT_SETSTR$1 = '0-9'; + // `/\w/`: + var WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + + // Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex + function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: + // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: + // '[' + return '\\['; + + case 92: + // '\\' + return '\\\\'; + + case 93: + // ']' + return '\\]'; + + case 94: + // '^' caret + return '\\^'; + } + if (i < 32 || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || i >= 0xD800 && i <= 0xDFFF /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go...
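 + // Worked example (illustrative only): for i === 0x2028 the branch below + // yields the 6-character escape string '\u2028', so the raw LINE SEPARATOR + // never lands in the generated source; an ordinary 'A' (i === 65) skips this + // branch entirely and is returned verbatim at the end of the function.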
+ x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); + } + + // Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating + // this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a + // `\\p{NAME}` shorthand to represent [part of] the bitarray: + var Pcodes_bitarray_cache = {}; + var Pcodes_bitarray_cache_test_order = []; + + // Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by + // a single regex 'escape', e.g. `\d` for digits 0-9. + var EscCode_bitarray_output_refs; + + // now initialize the EscCodes_... table above: + init_EscCode_lookup_table(); + + function init_EscCode_lookup_table() { + var s, + bitarr, + set2esc = {}, + esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); + } + + function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. 
+ If this assumption fails, nothing is lost, but our 'regex set + optimized representation' will be sub-optimal as this pcode + won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... + var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guesstimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; + } + + // 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. + function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. 
\\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for the characters in the set. Inversion means the complement set of characters is set instead. + // This results in an OR operation when sets are joined/chained. + + while (s.length) { + c1 = s.match(CHR_RE$1); + if (!c1) { + // hit an illegal escape sequence? cope anyway! + c1 = s[0]; + } else { + c1 = c1[0]; + // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those + // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit + // XRegExp support, but alas, we'll get there when we get there... ;-) + switch (c1) { + case '\\p': + s = s.substr(c1.length); + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + // do we have this one cached already? + var pex = c1 + c2; + var ba4p = Pcodes_bitarray_cache[pex]; + if (!ba4p) { + // expand escape: + var xr = new XRegExp('[' + pex + ']'); // TODO: case-insensitive grammar??? + // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`: + var xs = '' + xr; + // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside: + xs = xs.substr(1, xs.length - 2); + + ba4p = reduceRegexToSetBitArray(xs, pex, opts); + + Pcodes_bitarray_cache[pex] = ba4p; + updatePcodesBitarrayCacheTestOrder(opts); + } + // merge bitarrays: + add2bitarray(bitarr, ba4p); + continue; + } + break; + + case '\\S': + case '\\s': + case '\\W': + case '\\w': + case '\\d': + case '\\D': + // these can't participate in a range, but need to be treated special: + s = s.substr(c1.length); + // check for \S, \s, \D, \d, \W, \w and expand them: + var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]]; + assert(ba4e); + add2bitarray(bitarr, ba4e); + continue; + + case '\\b': + // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace + c1 = '\b'; + break; + } + } + var v1 = eval_escaped_code(c1); + // propagate deferred exceptions = error reports. + if (v1 instanceof Error) { + return v1; + } + v1 = v1.charCodeAt(0); + s = s.substr(c1.length); + + if (s[0] === '-' && s.length >= 2) { + // we can expect a range like 'a-z': + s = s.substr(1); + c2 = s.match(CHR_RE$1); + if (!c2) { + // hit an illegal escape sequence? cope anyway! + c2 = s[0]; + } else { + c2 = c2[0]; + } + var v2 = eval_escaped_code(c2); + // propagate deferred exceptions = error reports. + if (v2 instanceof Error) { + return v2; + } + v2 = v2.charCodeAt(0); + s = s.substr(c2.length); + + // legal ranges go UP, not DOWN!
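 + // Sketch of the two paths below (illustrative only): a well-formed range + // like 'a-f' marks the whole span U+0061..U+0066 in one go, while a reversed + // range like 'f-a' degrades to the three loose characters 'f', '-' and 'a', + // with a console warning to flag the suspect input.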
+ if (v1 <= v2) { + mark(v1, v2); + } else { + console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 }); + mark(v1); + mark('-'.charCodeAt(0)); + mark(v2); + } + continue; + } + mark(v1); + } + + // When we have marked all slots, '^' NEGATES the set, hence we flip all slots. + // + // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK + // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire + // range then. + if (set_is_inverted) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!bitarr[i]) { + bitarr_orig[i] = true; + } + } + } + } + return false; + } + + // convert a simple bitarray back into a regex set `[...]` content: + function bitarray2set(l, output_inverted_variant, output_minimized) { + // construct the inverse(?) set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are not visible outside this function, as + // `l` is an INPUT parameter (~ not modified)!
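 + // Copy-on-write, roughly: the first pcode folded in below clones `l` into + // `l2` so the caller's bitarray stays untouched; once cloned, later pcodes + // can safely OR their bits straight into the local copy.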
+ if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are not visible outside this function, as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + j = UNICODE_BASE_PLANE_MAX_CP$1 + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? 
'-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; + } + + // Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; + // ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. + function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE$1); + if (!c1) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"'); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + while (s.length) { + var inner = s.match(SET_PART_RE$1); + if (!inner) { + inner = s.match(CHR_RE$1); + if (!inner) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: ' + s + '" of regex "' + orig + '"'); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + var c2 = s.match(CHR_RE$1); + if (!c2) { + // cope with illegal escape sequences too! + return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error('regex set expression is broken in regex: ' + orig); + } + s = s.substr(c2.length); + + var se = set_content.join(''); + if (!internal_state) { + derr = set2bitarray(l, se, opts); + // propagate deferred exceptions = error reports. + if (derr instanceof Error) { + return derr; + } + + // a set is used like a single character in a longer literal phrase, hence input `[abc]word[def]` would produce output `[abc]`: + internal_state = 1; + } + break; + + // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into + // something ready for use inside a regex set, e.g. `\\r\\n`. + // + // > Of course, we realize that converting more complex piped constructs this way + // > will produce something you might not expect, e.g. `A|WORD2` which + // > would end up as the set `[AW]` which is entirely different from the input. + // > + // > However, we can only depend on the user (grammar writer) to realize this and + // > prevent this from happening by not creating such oddities in the input grammar. + case '|': + // a|b --> [ab] + internal_state = 0; + break; + + case '(': + // (a) --> a + // + // TODO - right now we treat this as 'too complex': + + // Strip off some possible outer wrappers which we know how to remove. + // We don't worry about 'damaging' the regex as any too-complex regex will be caught + // in the validation check at the end; our 'strippers' here would not damage useful + // regexes anyway, and it is fine if they damage the unacceptable ones. + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... 
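 + // e.g. (illustrative) '(?:[0-9])' has just been reduced to '[0-9]' above, and + // any '^...$' anchors are peeled off next; even so, this case currently still + // bails out with the 'too complex' Error below, as per the TODO above.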
+                s = s.replace(/^\^?(.*?)\$?$/, '$1');       // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: once before, once after this)
+                s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ...
+
+                return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+            case '.':
+            case '*':
+            case '+':
+            case '?':
+                // wildcard
+                //
+                // TODO - right now we treat this as 'too complex':
+                return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+            case '{':
+                // range, e.g. `x{1,3}`, or macro?
+                // TODO - right now we treat this as 'too complex':
+                return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+            default:
+                // literal character or word: take the first character only and ignore the rest, so that
+                // the constructed set for `word|noun` would be `[wn]`:
+                if (!internal_state) {
+                    derr = set2bitarray(l, c1, opts);
+                    // propagate deferred exceptions = error reports.
+                    if (derr instanceof Error) {
+                        return derr;
+                    }
+
+                    internal_state = 2;
+                }
+                break;
+            }
+        }
+
+        s = bitarray2set(l);
+
+        // When this result is suitable for use in a set, then we should be able to compile
+        // it into a regex; that way we can easily validate whether macro X is fit to be used
+        // inside a regex set:
+        try {
+            var re;
+            assert(s);
+            assert(!(s instanceof Error));
+            re = new XRegExp('[' + s + ']');
+            re.test(s[0]);
+
+            // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`,
+            // so we check for lingering UNESCAPED brackets in here as those cannot be correct:
+            if (/[^\\][\[\]]/.exec(s)) {
+                throw new Error('unescaped brackets in set data');
+            }
+        } catch (ex) {
+            // make sure we produce a set range expression which will fail badly when it is used
+            // in actual code:
+            s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+        }
+
+        assert(s);
+        // propagate deferred exceptions = error reports.
+        if (s instanceof Error) {
+            return s;
+        }
+        return l;
+    }
+
+    // Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]`
+    // -- or in this example it can be further optimized to only `\d`!
+    function produceOptimizedRegex4Set(bitarr) {
+        // First try to produce a minimum regex from the bitarray directly:
+        var s1 = bitarray2set(bitarr, false, true);
+
+        // and when the regex set turns out to match a single pcode/escape, then
+        // use that one as-is:
+        if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s1;
+        } else {
+            s1 = '[' + s1 + ']';
+        }
+
+        // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+        // Because we look at a negated bitset, there's no use looking for matches with
+        // special cases here.
+        var s2 = bitarray2set(bitarr, true, true);
+
+        if (s2[0] === '^') {
+            s2 = s2.substr(1);
+            if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+                // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+ return s2; + } + } else { + s2 = '^' + s2; + } + s2 = '[' + s2 + ']'; + + // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end, + // we also check against the plain, unadulterated regex set expressions: + // + // First try to produce a minimum regex from the bitarray directly: + var s3 = bitarray2set(bitarr, false, false); + + // and when the regex set turns out to match a single pcode/escape, then + // use that one as-is: + if (s3.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s3; + } else { + s3 = '[' + s3 + ']'; + } + + // Now try to produce a minimum regex from the *inverted* bitarray via negation: + // Because we look at a negated bitset, there's no use looking for matches with + // special cases here. + var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; + } + + var setmgmt = { + XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1, + CHR_RE: CHR_RE$1, + SET_PART_RE: SET_PART_RE$1, + NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1, + SET_IS_SINGLE_PCODE_RE: SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1, + + WHITESPACE_SETSTR: WHITESPACE_SETSTR$1, + DIGIT_SETSTR: DIGIT_SETSTR$1, + WORDCHAR_SETSTR: WORDCHAR_SETSTR$1, + + set2bitarray: set2bitarray, + bitarray2set: bitarray2set, + produceOptimizedRegex4Set: produceOptimizedRegex4Set, + reduceRegexToSetBitArray: reduceRegexToSetBitArray + }; + + // Basic Lexer implemented using JavaScript regular expressions + // Zachary Carter + // MIT Licensed + + var rmCommonWS = helpers.rmCommonWS; + var camelCase = helpers.camelCase; + var code_exec = helpers.exec; + // import recast from '@gerhobbelt/recast'; + // import astUtils from '@gerhobbelt/ast-util'; + var version$1 = '0.6.1-205'; // require('./package.json').version; + + + var XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` + var CHR_RE = setmgmt.CHR_RE; + var SET_PART_RE = setmgmt.SET_PART_RE; + var NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; + var UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + + // WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) + // + // This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! 
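+    //
+    // As a rough sketch of what it accepts (the exact character repertoire depends on the
+    // XRegExp Unicode tables loaded at runtime): an ID starts with a Unicode alphabetic
+    // character or '_' and continues with alphabetics, '_' or Unicode digits, e.g.
+    //
+    //     new XRegExp('^' + ID_REGEX_BASE + '$').test('fooBar_2');   // true
+    //     new XRegExp('^' + ID_REGEX_BASE + '$').test('_état');      // true: Unicode alphabetics are fine
+    //     new XRegExp('^' + ID_REGEX_BASE + '$').test('2fast');      // false: must not start with a digit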
+ var ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + + // see also ./lib/cli.js + /** + @public + @nocollapse + */ + var defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. start+end indexes in the input string + trackPosition: true, // track line+column position in the input string + caseInsensitive: false, + showSource: false, + exportSourceCode: false, + exportAST: false, + prettyCfg: true, + pre_lex: undefined, + post_lex: undefined + }; + + // Merge sets of options. + // + // Convert alternative jison option names to their base option. + // + // The *last* option set which overrides the default wins, where 'override' is + // defined as specifying a not-undefined value which is not equal to the + // default value. + // + // When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the + // default values avialable in Jison.defaultJisonOptions. + // + // Return a fresh set of options. + /** @public */ + function mkStdOptions() /*...args*/{ + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; + } + + // set up export/output attributes of the `options` object instance + function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || (typeof exportSourceCode === 'undefined' ? 'undefined' : _typeof(exportSourceCode)) !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; + } + + // Autodetect if the input lexer spec is in JSON or JISON + // format when the `options.json` flag is `true`. 
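+    //
+    // For example, the same trivial lexer spec may arrive here in either flavour:
+    //
+    //     { rules: [ [ '\\d+', "return 'NUMBER';" ] ] }     // JSON/JSON5 flavour
+    //
+    // versus the JISON-lex flavour, which is handed to lexParser.parse():
+    //
+    //     %%
+    //     \d+     return 'NUMBER';
+    //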
+ // + // Produce the JSON lexer spec result when these are JSON formatted already as that + // would save us the trouble of doing this again, anywhere else in the JISON + // compiler/generator. + // + // Otherwise return the *parsed* lexer spec as it has + // been processed through LexParser. + function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; + } + + // expand macros and convert matchers to RegExp's + function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, + i, + k, + rule, + action, + conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. 
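+        // For example: "rule /* see */ here" --> "rule /* see *\/ here", so the generated
+        // `/*! ... */` banner comments cannot be terminated prematurely by rule text.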
+ return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. + // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count + }; + } + + // expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or + // elsewhere, which requires two different treatments to expand these macros. 
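+    //
+    // For example, assuming a macro DIGIT defined as `[0-9]`: a use site *inside* a set,
+    // as in `[{DIGIT}a-f]`, needs the raw set content spliced in (yielding `[0-9a-f]`),
+    // while a use site elsewhere, as in `{DIGIT}+`, expands to a grouped regex chunk
+    // instead: `(?:[0-9])+` for submacro expansions, or capturing `([0-9])+` when the
+    // macro is expanded at the top level of a lexer rule.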
+ function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. + var has_expansions = se.indexOf('{') >= 0; + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. 
+ c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; + } + + // expand macros within macros and cache the result + function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. + // + // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}` + // macros here: + if (XRegExp._getUnicodeProperty(k)) { + // Work-around so that you can use `\p{ascii}` for a XRegExp slug, a.k.a. + // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories, + // while using `\p{ASCII}` as a *macro expansion* of the `ASCII` + // macro: + if (k.toUpperCase() !== k) { + m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.'); + break; + } + } + + a = m.split('{' + k + '}'); + if (a.length > 1) { + var x = expandMacroInSet(k); + assert(x); + if (x instanceof Error) { + m = x; + break; + } + m = a.join(x); + } + } + } + } + + var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts); + + var s1; + + // propagate deferred exceptions = error reports. + if (mba instanceof Error) { + s1 = mba; + } else { + s1 = setmgmt.bitarray2set(mba, false); + + m = s1; + } + + macros[i] = { + in_set: s1, + elsewhere: null, + raw: dict_macros[i] + }; + } else { + m = macros[i].in_set; + + if (m instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! 
+                    return new Error(m.message);
+                }
+
+                // detect definition loop:
+                if (m === false) {
+                    return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                }
+            }
+
+            return m;
+        }
+
+        function expandMacroElsewhere(i) {
+            var k, a, m;
+
+            if (macros[i].elsewhere == null) {
+                m = dict_macros[i];
+
+                // set up our own record so we can detect definition loops:
+                macros[i].elsewhere = false;
+
+                // the macro MAY contain other macros which MAY be inside a `[...]` set in this
+                // macro or elsewhere, hence we must parse the regex:
+                m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+                // propagate deferred exceptions = error reports.
+                if (m instanceof Error) {
+                    return m;
+                }
+
+                macros[i].elsewhere = m;
+            } else {
+                m = macros[i].elsewhere;
+
+                if (m instanceof Error) {
+                    // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                    return m;
+                }
+
+                // detect definition loop:
+                if (m === false) {
+                    return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                }
+            }
+
+            return m;
+        }
+
+        function expandAllMacrosInSet(s) {
+            var i, x;
+
+            // process *all* the macros inside [...] set:
+            if (s.indexOf('{') >= 0) {
+                for (i in macros) {
+                    if (macros.hasOwnProperty(i)) {
+                        var a = s.split('{' + i + '}');
+                        if (a.length > 1) {
+                            x = expandMacroInSet(i);
+                            assert(x);
+                            if (x instanceof Error) {
+                                return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message);
+                            }
+                            s = a.join(x);
+                        }
+
+                        // stop the brute-force expansion attempt when we've done 'em all:
+                        if (s.indexOf('{') === -1) {
+                            break;
+                        }
+                    }
+                }
+            }
+
+            return s;
+        }
+
+        function expandAllMacrosElsewhere(s) {
+            var i, x;
+
+            // When we process the remaining macro occurrences in the regex,
+            // every macro used in a lexer rule will become its own capture group.
+            //
+            // Meanwhile the cached expansion will expand any submacros into
+            // *NON*-capturing groups so that the backreference indexes remain as you'd
+            // expect and using macros doesn't require you to know exactly what your
+            // used macro will expand into, i.e. which and how many submacros it has.
+            //
+            // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+            if (s.indexOf('{') >= 0) {
+                for (i in macros) {
+                    if (macros.hasOwnProperty(i)) {
+                        // These are all submacro expansions, hence non-capturing grouping is applied:
+                        var a = s.split('{' + i + '}');
+                        if (a.length > 1) {
+                            x = expandMacroElsewhere(i);
+                            assert(x);
+                            if (x instanceof Error) {
+                                return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message);
+                            }
+                            s = a.join('(?:' + x + ')');
+                        }
+
+                        // stop the brute-force expansion attempt when we've done 'em all:
+                        if (s.indexOf('{') === -1) {
+                            break;
+                        }
+                    }
+                }
+            }
+
+            return s;
+        }
+
+        var m, i;
+
+        if (opts.debug) console.log('\n############## RAW macros: ', dict_macros);
+
+        // first we create the part of the dictionary which is targeting the use of macros
+        // *inside* `[...]` sets; once we have completed that half of the expansions work,
+        // we then go and expand the macros for when they are used elsewhere in a regex:
+        // if we then encounter submacros which are used *inside* a set, we can use that
+        // first-half dictionary to speed things up a bit as we can use those expansions
+        // straight away!
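+
+        // For example, given macros such as DIGIT = `[0-9]` and HEX = `[{DIGIT}a-fA-F]`,
+        // the first loop below caches the in-set expansion of DIGIT, so that expanding HEX
+        // (and, later on, all the 'elsewhere' expansions) can splice in the cached result
+        // straight away instead of re-deriving it from scratch.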
+ for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroInSet(i); + } + } + + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroElsewhere(i); + } + } + + if (opts.debug) console.log('\n############### expanded macros: ', macros); + + return macros; + } + + // expand macros in a regex; expands them recursively + function expandMacros(src, macros, opts) { + var expansion_count = 0; + + // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already! + // Hence things should be easy in there: + + function expandAllMacrosInSet(s) { + var i, m, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = m.in_set; + + assert(x); + if (x instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + throw x; + } + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join(x); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, m, x; + + // When we process the main macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + // These are all main macro expansions, hence CAPTURING grouping is applied: + x = m.elsewhere; + assert(x); + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join('(' + x + ')'); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + // When we process the macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will have expanded any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. 
+ if (s2 instanceof Error) { + throw s2; + } + + // only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex() + // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on his own, + // as long as no macros are involved... + // + // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`, + // unless the `xregexp` output option has been enabled. + if (expansion_count > 0 || src.indexOf('\\p{') >= 0 && !opts.options.xregexp) { + src = s2; + } else { + // Check if the reduced regex is smaller in size; when it is, we still go with the new one! + if (s2.length < src.length) { + src = s2; + } + } + + return src; + } + + function prepareStartConditions(conditions) { + var sc, + hash = {}; + for (sc in conditions) { + if (conditions.hasOwnProperty(sc)) { + hash[sc] = { rules: [], inclusive: !conditions[sc] }; + } + } + return hash; + } + + function buildActions(dict, tokens, opts) { + var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;']; + var tok; + var toks = {}; + var caseHelper = []; + + // tokens: map/array of token numbers to token names + for (tok in tokens) { + var idx = parseInt(tok); + if (idx && idx > 0) { + toks[tokens[tok]] = idx; + } + } + + if (opts.options.flex && dict.rules) { + dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule! */']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: 'function lexer__performAction(yy, yyrulenumber, YY_START) {\n var yy_ = this;\n\n ' + fun + '\n }', + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count + }; + } + + // + // NOTE: this is *almost* a copy of the JisonParserError producing code in + // jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass + // + function generateErrorClass() { + // --- START lexer error class --- + + var prelude = '/**\n * See also:\n * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508\n * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility\n * with userland code which might access the derived class in a \'classic\' way.\n *\n * @public\n * @constructor\n * @nocollapse\n */\nfunction JisonLexerError(msg, hash) {\n Object.defineProperty(this, \'name\', {\n enumerable: false,\n writable: false,\n value: \'JisonLexerError\'\n });\n\n if (msg == null) msg = \'???\';\n\n Object.defineProperty(this, \'message\', {\n enumerable: false,\n writable: true,\n value: msg\n });\n\n this.hash = hash;\n\n var stacktrace;\n if (hash && hash.exception instanceof Error) {\n var ex2 = hash.exception;\n this.message = ex2.message || msg;\n stacktrace = ex2.stack;\n }\n if (!stacktrace) {\n if (Error.hasOwnProperty(\'captureStackTrace\')) { // V8\n Error.captureStackTrace(this, this.constructor);\n } else {\n stacktrace = (new Error(msg)).stack;\n }\n }\n if (stacktrace) {\n Object.defineProperty(this, \'stack\', {\n enumerable: false,\n writable: false,\n value: 
stacktrace\n });\n }\n}\n\nif (typeof Object.setPrototypeOf === \'function\') {\n Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype);\n} else {\n JisonLexerError.prototype = Object.create(Error.prototype);\n}\nJisonLexerError.prototype.constructor = JisonLexerError;\nJisonLexerError.prototype.name = \'JisonLexerError\';'; + + // --- END lexer error class --- + + return prelude; + } + + var jisonLexerErrorDefinition = generateErrorClass(); + + function generateFakeXRegExpClassSrcCode() { + return rmCommonWS(_templateObject); + } + + /** @constructor */ + function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. + // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... }; + // ``` + var testcode = ['// provide a local version for test purposes:', jisonLexerErrorDefinition, '', generateFakeXRegExpClassSrcCode(), '', source, '', 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (_typeof(lexer.options) !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! 
+ if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, dict.rules && dict.rules.length > 0 ? 'One or more of your lexer state names are possibly botched?' : 'Your custom lexer is somehow botched.', ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = dict.rules ? 
dict.rules.length : 0; i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; + } + + // code stripping performance test for very simple grammar: + // + // - removing backtracking parser code branches: 730K -> 750K rounds + // - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds + // - no `yyleng`: 900K -> 905K rounds + // - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds + // - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds + // - lexers which have only return stmts, i.e. only a + // `simpleCaseActionClusters` lookup table to produce + // lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds + // - given all the above, you can *inline* what's left of + // `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) + // + // Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: + // + // 730 -> 950 ~ 30% performance gain. 
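+    //
+    // As a concrete sketch of the `simpleCaseActionClusters` bullets above: when every
+    // lexer rule action is a bare `return TOKEN;` statement, the generated action
+    // dispatch can skip the `switch` entirely and collapse into a plain table lookup --
+    //
+    //     // instead of:  switch (yyrulenumber) { case 0: return 12; /* ... */ }
+    //     token = this.simpleCaseActionClusters[yyrulenumber];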
+ // + + // As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk + // of code in a function so that we can easily get it including it comments, etc.: + /** + @public + @nocollapse + */ + function getRegExpLexerPrototype() { + // --- START lexer kernel --- + return '{\n EOF: 1,\n ERROR: 2,\n\n // JisonLexerError: JisonLexerError, /// <-- injected by the code generator\n\n // options: {}, /// <-- injected by the code generator\n\n // yy: ..., /// <-- injected by setInput()\n\n __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state\n\n __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup\n\n __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been \'unfolded\' completely and is now ready for use\n\n done: false, /// INTERNAL USE ONLY\n _backtrack: false, /// INTERNAL USE ONLY\n _input: \'\', /// INTERNAL USE ONLY\n _more: false, /// INTERNAL USE ONLY\n _signaled_error_token: false, /// INTERNAL USE ONLY\n\n conditionStack: [], /// INTERNAL USE ONLY; managed via `pushState()`, `popState()`, `topState()` and `stateStackSize()`\n\n match: \'\', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. `match` is identical to `yytext` except that this one still contains the matched input string after `lexer.performAction()` has been invoked, where userland code MAY have changed/replaced the `yytext` value entirely!\n matched: \'\', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far\n matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt\n yytext: \'\', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the \'token value\' when the parser consumes the lexer token produced through a call to the `lex()` API.\n offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the \'cursor position\' in the input string, i.e. 
the number of characters matched so far\n yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (`yytext`)\n yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: \'line number\' at which the token under construction is located\n yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction\n\n /**\n * INTERNAL USE: construct a suitable error info hash object instance for `parseError`.\n * \n * @public\n * @this {RegExpLexer}\n */\n constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) {\n msg = \'\' + msg;\n\n // heuristic to determine if the error message already contains a (partial) source code dump\n // as produced by either `showPosition()` or `prettyPrintRange()`:\n if (show_input_position == undefined) {\n show_input_position = !(msg.indexOf(\'\\n\') > 0 && msg.indexOf(\'^\') > 0);\n }\n if (this.yylloc && show_input_position) {\n if (typeof this.prettyPrintRange === \'function\') {\n var pretty_src = this.prettyPrintRange(this.yylloc);\n\n if (!/\\n\\s*$/.test(msg)) {\n msg += \'\\n\';\n }\n msg += \'\\n Erroneous area:\\n\' + this.prettyPrintRange(this.yylloc); \n } else if (typeof this.showPosition === \'function\') {\n var pos_str = this.showPosition();\n if (pos_str) {\n if (msg.length && msg[msg.length - 1] !== \'\\n\' && pos_str[0] !== \'\\n\') {\n msg += \'\\n\' + pos_str;\n } else {\n msg += pos_str;\n }\n }\n }\n }\n /** @constructor */\n var pei = {\n errStr: msg,\n recoverable: !!recoverable,\n text: this.match, // This one MAY be empty; userland code should use the `upcomingInput` API to obtain more text which follows the \'lexer cursor position\'...\n token: null,\n line: this.yylineno,\n loc: this.yylloc,\n yy: this.yy,\n lexer: this,\n\n /**\n * and make sure the error info doesn\'t stay due to potential\n * ref cycle via userland code manipulations.\n * These would otherwise all be memory leak opportunities!\n * \n * Note that only array and object references are nuked as those\n * constitute the set of elements which can produce a cyclic ref.\n * The rest of the members is kept intact as they are harmless.\n * \n * @public\n * @this {LexErrorInfo}\n */\n destroy: function destructLexErrorInfo() {\n // remove cyclic references added to error info:\n // info.yy = null;\n // info.lexer = null;\n // ...\n var rec = !!this.recoverable;\n for (var key in this) {\n if (this.hasOwnProperty(key) && typeof key === \'object\') {\n this[key] = undefined;\n }\n }\n this.recoverable = rec;\n }\n };\n // track this instance so we can `destroy()` it once we deem it superfluous and ready for garbage collection!\n this.__error_infos.push(pei);\n return pei;\n },\n\n /**\n * handler which is invoked when a lexer error occurs.\n * \n * @public\n * @this {RegExpLexer}\n */\n parseError: function lexer_parseError(str, hash, ExceptionClass) {\n if (!ExceptionClass) {\n ExceptionClass = this.JisonLexerError;\n }\n if (this.yy) {\n if (this.yy.parser && typeof this.yy.parser.parseError === \'function\') {\n return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;\n } else if (typeof this.yy.parseError === \'function\') {\n return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;\n } \n }\n throw new ExceptionClass(str, hash);\n },\n\n /**\n * method which implements `yyerror(str, ...args)` functionality for use inside lexer actions.\n * \n * 
@public\n * @this {RegExpLexer}\n */\n yyerror: function yyError(str /*, ...args */) {\n var lineno_msg = \'\';\n if (this.yylloc) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': \' + str, this.options.lexerErrorsAreRecoverable);\n\n // Add any extra args to the hash under the name `extra_error_attributes`:\n var args = Array.prototype.slice.call(arguments, 1);\n if (args.length) {\n p.extra_error_attributes = args;\n }\n\n return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n },\n\n /**\n * final cleanup function for when we have completed lexing the input;\n * make it an API so that external code can use this one once userland\n * code has decided it\'s time to destroy any lingering lexer error\n * hash object instances and the like: this function helps to clean\n * up these constructs, which *may* carry cyclic references which would\n * otherwise prevent the instances from being properly and timely\n * garbage-collected, i.e. this function helps prevent memory leaks!\n * \n * @public\n * @this {RegExpLexer}\n */\n cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) {\n // prevent lingering circular references from causing memory leaks:\n this.setInput(\'\', {});\n\n // nuke the error hash info instances created during this run.\n // Userland code must COPY any data/references\n // in the error hash instance(s) it is more permanently interested in.\n if (!do_not_nuke_errorinfos) {\n for (var i = this.__error_infos.length - 1; i >= 0; i--) {\n var el = this.__error_infos[i];\n if (el && typeof el.destroy === \'function\') {\n el.destroy();\n }\n }\n this.__error_infos.length = 0;\n }\n\n return this;\n },\n\n /**\n * clear the lexer token context; intended for internal use only\n * \n * @public\n * @this {RegExpLexer}\n */\n clear: function lexer_clear() {\n this.yytext = \'\';\n this.yyleng = 0;\n this.match = \'\';\n // - DO NOT reset `this.matched`\n this.matches = false;\n this._more = false;\n this._backtrack = false;\n\n var col = (this.yylloc ? 
this.yylloc.last_column : 0);\n this.yylloc = {\n first_line: this.yylineno + 1,\n first_column: col,\n last_line: this.yylineno + 1,\n last_column: col,\n\n range: [this.offset, this.offset]\n };\n },\n\n /**\n * resets the lexer, sets new input\n * \n * @public\n * @this {RegExpLexer}\n */\n setInput: function lexer_setInput(input, yy) {\n this.yy = yy || this.yy || {};\n\n // also check if we\'ve fully initialized the lexer instance,\n // including expansion work to be done to go from a loaded\n // lexer to a usable lexer:\n if (!this.__decompressed) {\n // step 1: decompress the regex list:\n var rules = this.rules;\n for (var i = 0, len = rules.length; i < len; i++) {\n var rule_re = rules[i];\n\n // compression: is the RE an xref to another RE slot in the rules[] table?\n if (typeof rule_re === \'number\') {\n rules[i] = rules[rule_re];\n }\n }\n\n // step 2: unfold the conditions[] set to make these ready for use:\n var conditions = this.conditions;\n for (var k in conditions) {\n var spec = conditions[k];\n\n var rule_ids = spec.rules;\n\n var len = rule_ids.length;\n var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in `lexer_next()` fast and simple!\n var rule_new_ids = new Array(len + 1);\n\n for (var i = 0; i < len; i++) {\n var idx = rule_ids[i];\n var rule_re = rules[idx];\n rule_regexes[i + 1] = rule_re;\n rule_new_ids[i + 1] = idx;\n }\n\n spec.rules = rule_new_ids;\n spec.__rule_regexes = rule_regexes;\n spec.__rule_count = len;\n }\n\n this.__decompressed = true;\n }\n\n this._input = input || \'\';\n this.clear();\n this._signaled_error_token = false;\n this.done = false;\n this.yylineno = 0;\n this.matched = \'\';\n this.conditionStack = [\'INITIAL\'];\n this.__currentRuleSet__ = null;\n this.yylloc = {\n first_line: 1,\n first_column: 0,\n last_line: 1,\n last_column: 0,\n\n range: [0, 0]\n };\n this.offset = 0;\n return this;\n },\n\n /**\n * edit the remaining input via user-specified callback.\n * This can be used to forward-adjust the input-to-parse, \n * e.g. inserting macro expansions and alike in the\n * input which has yet to be lexed.\n * The behaviour of this API contrasts the `unput()` et al\n * APIs as those act on the *consumed* input, while this\n * one allows one to manipulate the future, without impacting\n * the current `yyloc` cursor location or any history. \n * \n * Use this API to help implement C-preprocessor-like\n * `#include` statements, etc.\n * \n * The provided callback must be synchronous and is\n * expected to return the edited input (string).\n *\n * The `cpsArg` argument value is passed to the callback\n * as-is.\n *\n * `callback` interface: \n * `function callback(input, cpsArg)`\n * \n * - `input` will carry the remaining-input-to-lex string\n * from the lexer.\n * - `cpsArg` is `cpsArg` passed into this API.\n * \n * The `this` reference for the callback will be set to\n * reference this lexer instance so that userland code\n * in the callback can easily and quickly access any lexer\n * API. \n *\n * When the callback returns a non-string-type falsey value,\n * we assume the callback did not edit the input and we\n * will using the input as-is.\n *\n * When the callback returns a non-string-type value, it\n * is converted to a string for lexing via the `"" + retval`\n * operation. 
(See also why: http://2ality.com/2012/03/converting-to-string.html \n * -- that way any returned object\'s `toValue()` and `toString()`\n * methods will be invoked in a proper/desirable order.)\n * \n * @public\n * @this {RegExpLexer}\n */\n editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) {\n var rv = callback.call(this, this._input, cpsArg);\n if (typeof rv !== \'string\') {\n if (rv) {\n this._input = \'\' + rv; \n }\n // else: keep `this._input` as is. \n } else {\n this._input = rv; \n }\n return this;\n },\n\n /**\n * consumes and returns one char from the input\n * \n * @public\n * @this {RegExpLexer}\n */\n input: function lexer_input() {\n if (!this._input) {\n //this.done = true; -- don\'t set `done` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <> tokens and perform user action code for a <> match, but only does so *once*)\n return null;\n }\n var ch = this._input[0];\n this.yytext += ch;\n this.yyleng++;\n this.offset++;\n this.match += ch;\n this.matched += ch;\n // Count the linenumber up when we hit the LF (or a stand-alone CR).\n // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo\n // and we advance immediately past the LF as well, returning both together as if\n // it was all a single \'character\' only.\n var slice_len = 1;\n var lines = false;\n if (ch === \'\\n\') {\n lines = true;\n } else if (ch === \'\\r\') {\n lines = true;\n var ch2 = this._input[1];\n if (ch2 === \'\\n\') {\n slice_len++;\n ch += ch2;\n this.yytext += ch2;\n this.yyleng++;\n this.offset++;\n this.match += ch2;\n this.matched += ch2;\n this.yylloc.range[1]++;\n }\n }\n if (lines) {\n this.yylineno++;\n this.yylloc.last_line++;\n this.yylloc.last_column = 0;\n } else {\n this.yylloc.last_column++;\n }\n this.yylloc.range[1]++;\n\n this._input = this._input.slice(slice_len);\n return ch;\n },\n\n /**\n * unshifts one char (or an entire string) into the input\n * \n * @public\n * @this {RegExpLexer}\n */\n unput: function lexer_unput(ch) {\n var len = ch.length;\n var lines = ch.split(/(?:\\r\\n?|\\n)/g);\n\n this._input = ch + this._input;\n this.yytext = this.yytext.substr(0, this.yytext.length - len);\n this.yyleng = this.yytext.length;\n this.offset -= len;\n this.match = this.match.substr(0, this.match.length - len);\n this.matched = this.matched.substr(0, this.matched.length - len);\n\n if (lines.length > 1) {\n this.yylineno -= lines.length - 1;\n\n this.yylloc.last_line = this.yylineno + 1;\n\n // Get last entirely matched line into the `pre_lines[]` array\'s\n // last index slot; we don\'t mind when other previously \n // matched lines end up in the array too. 
\n var pre = this.match;\n var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g);\n if (pre_lines.length === 1) {\n pre = this.matched;\n pre_lines = pre.split(/(?:\\r\\n?|\\n)/g);\n }\n this.yylloc.last_column = pre_lines[pre_lines.length - 1].length;\n } else {\n this.yylloc.last_column -= len;\n }\n\n this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng;\n\n this.done = false;\n return this;\n },\n\n /**\n * cache matched text and append it on next action\n * \n * @public\n * @this {RegExpLexer}\n */\n more: function lexer_more() {\n this._more = true;\n return this;\n },\n\n /**\n * signal the lexer that this rule fails to match the input, so the\n * next matching rule (regex) should be tested instead.\n * \n * @public\n * @this {RegExpLexer}\n */\n reject: function lexer_reject() {\n if (this.options.backtrack_lexer) {\n this._backtrack = true;\n } else {\n // when the `parseError()` call returns, we MUST ensure that the error is registered.\n // We accomplish this by signaling an \'error\' token to be produced for the current\n // `.lex()` run.\n var lineno_msg = \'\';\n if (this.yylloc) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).\', false);\n this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n }\n return this;\n },\n\n /**\n * retain first n characters of the match\n * \n * @public\n * @this {RegExpLexer}\n */\n less: function lexer_less(n) {\n return this.unput(this.match.slice(n));\n },\n\n /**\n * return (part of the) already matched input, i.e. for error\n * messages.\n * \n * Limit the returned string length to `maxSize` (default: 20).\n * \n * Limit the returned string to the `maxLines` number of lines of\n * input (default: 1).\n * \n * Negative limit values equal *unlimited*.\n * \n * @public\n * @this {RegExpLexer}\n */\n pastInput: function lexer_pastInput(maxSize, maxLines) {\n var past = this.matched.substring(0, this.matched.length - this.match.length);\n if (maxSize < 0)\n maxSize = past.length;\n else if (!maxSize)\n maxSize = 20;\n if (maxLines < 0)\n maxLines = past.length; // can\'t ever have more input lines than this!\n else if (!maxLines)\n maxLines = 1;\n // `substr` anticipation: treat \\r\\n as a single character and take a little\n // more than necessary so that we can still properly check against maxSize\n // after we\'ve transformed and limited the newLines in here:\n past = past.substr(-maxSize * 2 - 2);\n // now that we have a significantly reduced string to process, transform the newlines\n // and chop them, then limit them:\n var a = past.replace(/\\r\\n|\\r/g, \'\\n\').split(\'\\n\');\n a = a.slice(-maxLines);\n past = a.join(\'\\n\');\n // When, after limiting to maxLines, we still have too much to return,\n // do add an ellipsis prefix...\n if (past.length > maxSize) {\n past = \'...\' + past.substr(-maxSize);\n }\n return past;\n },\n\n /**\n * return (part of the) upcoming input, i.e. for error messages.\n * \n * Limit the returned string length to `maxSize` (default: 20).\n * \n * Limit the returned string to the `maxLines` number of lines of input (default: 1).\n * \n * Negative limit values equal *unlimited*.\n *\n * > ### NOTE ###\n * >\n * > *"upcoming input"* is defined as the whole of the both\n * > the *currently lexed* input, together with any remaining input\n * > following that. 
*"currently lexed"* input is the input \n * > already recognized by the lexer but not yet returned with\n * > the lexer token. This happens when you are invoking this API\n * > from inside any lexer rule action code block. \n * >\n * \n * @public\n * @this {RegExpLexer}\n */\n upcomingInput: function lexer_upcomingInput(maxSize, maxLines) {\n var next = this.match;\n if (maxSize < 0)\n maxSize = next.length + this._input.length;\n else if (!maxSize)\n maxSize = 20;\n if (maxLines < 0)\n maxLines = maxSize; // can\'t ever have more input lines than this!\n else if (!maxLines)\n maxLines = 1;\n // `substring` anticipation: treat \\r\\n as a single character and take a little\n // more than necessary so that we can still properly check against maxSize\n // after we\'ve transformed and limited the newLines in here:\n if (next.length < maxSize * 2 + 2) {\n next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8\n }\n // now that we have a significantly reduced string to process, transform the newlines\n // and chop them, then limit them:\n var a = next.replace(/\\r\\n|\\r/g, \'\\n\').split(\'\\n\');\n a = a.slice(0, maxLines);\n next = a.join(\'\\n\');\n // When, after limiting to maxLines, we still have too much to return,\n // do add an ellipsis postfix...\n if (next.length > maxSize) {\n next = next.substring(0, maxSize) + \'...\';\n }\n return next;\n },\n\n /**\n * return a string which displays the character position where the\n * lexing error occurred, i.e. for error messages\n * \n * @public\n * @this {RegExpLexer}\n */\n showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {\n var pre = this.pastInput(maxPrefix).replace(/\\s/g, \' \');\n var c = new Array(pre.length + 1).join(\'-\');\n return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, \' \') + \'\\n\' + c + \'^\';\n },\n\n /**\n * return a string which displays the lines & columns of input which are referenced \n * by the given location info range, plus a few lines of context.\n * \n * This function pretty-prints the indicated section of the input, with line numbers \n * and everything!\n * \n * This function is very useful to provide highly readable error reports, while\n * the location range may be specified in various flexible ways:\n * \n * - `loc` is the location info object which references the area which should be\n * displayed and \'marked up\': these lines & columns of text are marked up by `^`\n * characters below each character in the entire input range.\n * \n * - `context_loc` is the *optional* location info object which instructs this\n * pretty-printer how much *leading* context should be displayed alongside\n * the area referenced by `loc`. 
This can help provide context for the displayed\n * error, etc.\n * \n * When this location info is not provided, a default context of 3 lines is\n * used.\n * \n * - `context_loc2` is another *optional* location info object, which serves\n * a similar purpose to `context_loc`: it specifies the amount of *trailing*\n * context lines to display in the pretty-print output.\n * \n * When this location info is not provided, a default context of 1 line only is\n * used.\n * \n * Special Notes:\n * \n * - when the `loc`-indicated range is very large (about 5 lines or more), then\n * only the first and last few lines of this block are printed while a\n * `...continued...` message will be printed between them.\n * \n * This serves the purpose of not printing a huge amount of text when the `loc`\n * range happens to be huge: this way a manageable & readable output results\n * for arbitrarily large ranges.\n * \n * - this function can display lines of input which have not yet been lexed.\n * `prettyPrintRange()` can access the entire input!\n * \n * @public\n * @this {RegExpLexer}\n */\n prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {\n var error_size = loc.last_line - loc.first_line;\n const CONTEXT = 3;\n const CONTEXT_TAIL = 1;\n const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;\n var input = this.matched + this._input;\n var lines = input.split(\'\\n\');\n //var show_context = (error_size < 5 || context_loc);\n var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));\n var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));\n var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);\n var ws_prefix = new Array(lineno_display_width).join(\' \');\n var nonempty_line_indexes = [];\n var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {\n var lno = index + l0;\n var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);\n var rv = lno_pfx + \': \' + line;\n var errpfx = (new Array(lineno_display_width + 1)).join(\'^\');\n var offset = 2 + 1;\n var len = 0;\n\n if (lno === loc.first_line) {\n offset += loc.first_column;\n\n len = Math.max(\n 2,\n ((lno === loc.last_line ? 
loc.last_column : line.length)) - loc.first_column + 1\n );\n } else if (lno === loc.last_line) {\n len = Math.max(2, loc.last_column + 1);\n } else if (lno > loc.first_line && lno < loc.last_line) {\n len = Math.max(2, line.length + 1);\n }\n\n if (len) {\n var lead = new Array(offset).join(\'.\');\n var mark = new Array(len).join(\'^\');\n rv += \'\\n\' + errpfx + lead + mark;\n\n if (line.trim().length > 0) {\n nonempty_line_indexes.push(index);\n }\n }\n\n rv = rv.replace(/\\t/g, \' \');\n return rv;\n });\n\n // now make sure we don\'t print an overly large amount of error area: limit it \n // to the top and bottom line count:\n if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) {\n var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1;\n var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1;\n\n var intermediate_line = (new Array(lineno_display_width + 1)).join(\' \') + \' (...continued...)\';\n intermediate_line += \'\\n\' + (new Array(lineno_display_width + 1)).join(\'-\') + \' (---------------)\';\n rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line);\n }\n return rv.join(\'\\n\');\n },\n\n /**\n * helper function, used to produce a human readable description as a string, given\n * the input `yylloc` location object.\n * \n * Set `display_range_too` to TRUE to include the string character index position(s)\n * in the description if the `yylloc.range` is available.\n * \n * @public\n * @this {RegExpLexer}\n */\n describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) {\n var l1 = yylloc.first_line;\n var l2 = yylloc.last_line;\n var c1 = yylloc.first_column;\n var c2 = yylloc.last_column;\n var dl = l2 - l1;\n var dc = c2 - c1;\n var rv;\n if (dl === 0) {\n rv = \'line \' + l1 + \', \';\n if (dc <= 1) {\n rv += \'column \' + c1;\n } else {\n rv += \'columns \' + c1 + \' .. \' + c2;\n }\n } else {\n rv = \'lines \' + l1 + \'(column \' + c1 + \') .. \' + l2 + \'(column \' + c2 + \')\';\n }\n if (yylloc.range && display_range_too) {\n var r1 = yylloc.range[0];\n var r2 = yylloc.range[1] - 1;\n if (r2 <= r1) {\n rv += \' {String Offset: \' + r1 + \'}\';\n } else {\n rv += \' {String Offset range: \' + r1 + \' .. \' + r2 + \'}\';\n }\n }\n return rv;\n },\n\n /**\n * test the lexed token: return FALSE when not a match, otherwise return token.\n * \n * `match` is supposed to be an array coming out of a regex match, i.e. 
`match[0]`\n * contains the actually matched text string.\n * \n * Also move the input cursor forward and update the match collectors:\n * \n * - `yytext`\n * - `yyleng`\n * - `match`\n * - `matches`\n * - `yylloc`\n * - `offset`\n * \n * @public\n * @this {RegExpLexer}\n */\n test_match: function lexer_test_match(match, indexed_rule) {\n var token,\n lines,\n backup,\n match_str,\n match_str_len;\n\n if (this.options.backtrack_lexer) {\n // save context\n backup = {\n yylineno: this.yylineno,\n yylloc: {\n first_line: this.yylloc.first_line,\n last_line: this.yylloc.last_line,\n first_column: this.yylloc.first_column,\n last_column: this.yylloc.last_column,\n\n range: this.yylloc.range.slice(0)\n },\n yytext: this.yytext,\n match: this.match,\n matches: this.matches,\n matched: this.matched,\n yyleng: this.yyleng,\n offset: this.offset,\n _more: this._more,\n _input: this._input,\n //_signaled_error_token: this._signaled_error_token,\n yy: this.yy,\n conditionStack: this.conditionStack.slice(0),\n done: this.done\n };\n }\n\n match_str = match[0];\n match_str_len = match_str.length;\n // if (match_str.indexOf(\'\\n\') !== -1 || match_str.indexOf(\'\\r\') !== -1) {\n lines = match_str.split(/(?:\\r\\n?|\\n)/g);\n if (lines.length > 1) {\n this.yylineno += lines.length - 1;\n\n this.yylloc.last_line = this.yylineno + 1;\n this.yylloc.last_column = lines[lines.length - 1].length;\n } else {\n this.yylloc.last_column += match_str_len;\n }\n // }\n this.yytext += match_str;\n this.match += match_str;\n this.matched += match_str;\n this.matches = match;\n this.yyleng = this.yytext.length;\n this.yylloc.range[1] += match_str_len;\n\n // previous lex rules MAY have invoked the `more()` API rather than producing a token:\n // those rules will already have moved this `offset` forward matching their match lengths,\n // hence we must only add our own match length now:\n this.offset += match_str_len;\n this._more = false;\n this._backtrack = false;\n this._input = this._input.slice(match_str_len);\n\n // calling this method:\n //\n // function lexer__performAction(yy, yyrulenumber, YY_START) {...}\n token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */);\n // otherwise, when the action codes are all simple return token statements:\n //token = this.simpleCaseActionClusters[indexed_rule];\n\n if (this.done && this._input) {\n this.done = false;\n }\n if (token) {\n return token;\n } else if (this._backtrack) {\n // recover context\n for (var k in backup) {\n this[k] = backup[k];\n }\n this.__currentRuleSet__ = null;\n return false; // rule action called reject() implying the next rule should be tested instead.\n } else if (this._signaled_error_token) {\n // produce one \'error\' token as `.parseError()` in `reject()`\n // did not guarantee a failure signal by throwing an exception!\n token = this._signaled_error_token;\n this._signaled_error_token = false;\n return token;\n }\n return false;\n },\n\n /**\n * return next match in input\n * \n * @public\n * @this {RegExpLexer}\n */\n next: function lexer_next() {\n if (this.done) {\n this.clear();\n return this.EOF;\n }\n if (!this._input) {\n this.done = true;\n }\n\n var token,\n match,\n tempMatch,\n index;\n if (!this._more) {\n this.clear();\n }\n var spec = this.__currentRuleSet__;\n if (!spec) {\n // Update the ruleset cache as we apparently encountered a state change or just started lexing.\n // The cache is set up for fast lookup -- we assume a lexer will switch states 
much less often than it will\n // invoke the `lex()` token-producing API and related APIs, hence caching the set for direct access helps\n // speed up those activities a tiny bit.\n spec = this.__currentRuleSet__ = this._currentRules();\n // Check whether a *sane* condition has been pushed before: this makes the lexer robust against\n // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19\n if (!spec || !spec.rules) {\n var lineno_msg = \'\';\n if (this.options.trackPosition) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Internal lexer engine error\' + lineno_msg + \': The lex grammar programmer pushed a non-existing condition name "\' + this.topState() + \'"; this is a fatal error and should be reported to the application programmer team!\', false);\n // produce one \'error\' token until this situation has been resolved, most probably by parse termination!\n return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n }\n }\n\n var rule_ids = spec.rules;\n var regexes = spec.__rule_regexes;\n var len = spec.__rule_count;\n\n // Note: the arrays are 1-based, while `len` itself is a valid index,\n // hence the non-standard less-or-equal check in the next loop condition!\n for (var i = 1; i <= len; i++) {\n tempMatch = this._input.match(regexes[i]);\n if (tempMatch && (!match || tempMatch[0].length > match[0].length)) {\n match = tempMatch;\n index = i;\n if (this.options.backtrack_lexer) {\n token = this.test_match(tempMatch, rule_ids[i]);\n if (token !== false) {\n return token;\n } else if (this._backtrack) {\n match = undefined;\n continue; // rule action called reject() implying a rule MISmatch.\n } else {\n // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)\n return false;\n }\n } else if (!this.options.flex) {\n break;\n }\n }\n }\n if (match) {\n token = this.test_match(match, rule_ids[index]);\n if (token !== false) {\n return token;\n }\n // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)\n return false;\n }\n if (!this._input) {\n this.done = true;\n this.clear();\n return this.EOF;\n } else {\n var lineno_msg = \'\';\n if (this.options.trackPosition) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': Unrecognized text.\', this.options.lexerErrorsAreRecoverable);\n\n var pendingInput = this._input;\n var activeCondition = this.topState();\n var conditionStackDepth = this.conditionStack.length;\n\n token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n if (token === this.ERROR) {\n // we can try to recover from a lexer error that `parseError()` did not \'recover\' for us\n // by moving forward at least one character at a time IFF the (user-specified?) 
`parseError()`\n // has not consumed/modified any pending input or changed state in the error handler:\n if (!this.matches && \n // and make sure the input has NOT been modified/consumed ...\n pendingInput === this._input &&\n // ...nor has the lexer state been modified significantly enough\n // to merit a non-consuming error handling action right now.\n activeCondition === this.topState() && \n conditionStackDepth === this.conditionStack.length\n ) {\n this.input();\n }\n }\n return token;\n }\n },\n\n /**\n * return next match that has a token\n * \n * @public\n * @this {RegExpLexer}\n */\n lex: function lexer_lex() {\n var r;\n // allow the PRE/POST handlers to set/modify the return token for maximum flexibility of the generated lexer:\n if (typeof this.options.pre_lex === \'function\') {\n r = this.options.pre_lex.call(this);\n }\n\n while (!r) {\n r = this.next();\n }\n\n if (typeof this.options.post_lex === \'function\') {\n // (also account for a userdef function which does not return any value: keep the token as is)\n r = this.options.post_lex.call(this, r) || r;\n }\n return r;\n },\n\n /**\n * backwards compatible alias for `pushState()`;\n * the latter is symmetrical with `popState()` and we advise to use\n * those APIs in any modern lexer code, rather than `begin()`.\n * \n * @public\n * @this {RegExpLexer}\n */\n begin: function lexer_begin(condition) {\n return this.pushState(condition);\n },\n\n /**\n * activates a new lexer condition state (pushes the new lexer\n * condition state onto the condition stack)\n * \n * @public\n * @this {RegExpLexer}\n */\n pushState: function lexer_pushState(condition) {\n this.conditionStack.push(condition);\n this.__currentRuleSet__ = null;\n return this;\n },\n\n /**\n * pop the previously active lexer condition state off the condition\n * stack\n * \n * @public\n * @this {RegExpLexer}\n */\n popState: function lexer_popState() {\n var n = this.conditionStack.length - 1;\n if (n > 0) {\n this.__currentRuleSet__ = null; \n return this.conditionStack.pop();\n } else {\n return this.conditionStack[0];\n }\n },\n\n /**\n * return the currently active lexer condition state; when an index\n * argument is provided it produces the N-th previous condition state,\n * if available\n * \n * @public\n * @this {RegExpLexer}\n */\n topState: function lexer_topState(n) {\n n = this.conditionStack.length - 1 - Math.abs(n || 0);\n if (n >= 0) {\n return this.conditionStack[n];\n } else {\n return \'INITIAL\';\n }\n },\n\n /**\n * (internal) determine the lexer rule set which is active for the\n * currently active lexer condition state\n * \n * @public\n * @this {RegExpLexer}\n */\n _currentRules: function lexer__currentRules() {\n if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) {\n return this.conditions[this.conditionStack[this.conditionStack.length - 1]];\n } else {\n return this.conditions[\'INITIAL\'];\n }\n },\n\n /**\n * return the number of states currently on the stack\n * \n * @public\n * @this {RegExpLexer}\n */\n stateStackSize: function lexer_stateStackSize() {\n return this.conditionStack.length;\n }\n}'; + // --- END lexer kernel --- + } + + RegExpLexer.prototype = new Function(rmCommonWS(_templateObject2, getRegExpLexerPrototype()))(); + + // The lexer code stripper, driven by optimization analysis settings and + // lexer options, which cannot be changed at run-time. + function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... 
${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + + new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS(_templateObject3, opt.options.backtrack_lexer, opt.options.ranges, opt.options.trackPosition, opt.parseActionsUseYYLENG, opt.parseActionsUseYYLINENO, opt.parseActionsUseYYTEXT, opt.parseActionsUseYYLOC, opt.parseActionsUseValueTracking, opt.parseActionsUseValueAssignment, opt.parseActionsUseLocationTracking, opt.parseActionsUseLocationAssignment, opt.lexerActionsUseYYLENG, opt.lexerActionsUseYYLINENO, opt.lexerActionsUseYYTEXT, opt.lexerActionsUseYYLOC, opt.lexerActionsUseParseError, opt.lexerActionsUseYYERROR, opt.lexerActionsUseLocationTracking, opt.lexerActionsUseMore, opt.lexerActionsUseUnput, opt.lexerActionsUseReject, opt.lexerActionsUseLess, opt.lexerActionsUseDisplayAPIs, opt.lexerActionsUseDescribeYYLOC)); + + return new_src; + } + + // generate lexer source from a grammar + /** @public */ + function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); + } + + // process the grammar and build final data structures and functions + /** @public */ + function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
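+ // (Illustrative sketch only: a jison parser generator might pass, say, + // `{ parseActionsUseYYTEXT: true, parseActionsUseYYLOC: false, ... }` -- these flags + // are echoed into the analytics report and guide `stripUnusedLexerCode()` above.)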
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???' + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
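+ // (For example: a grammar-level `%options case-insensitive` arrives here as the option + // key `case-insensitive` and comes out of the `mkStdOptions()` call below as `caseInsensitive`.)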
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = dict.actionInclude || ''; + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; + } + + // Assemble the final source from the processed grammar + /** @public */ + function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; + } + + function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = 1 + Math.log10(a.length | 1) | 0; + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we make do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return '/* ' + idx_str + ': */ ' + re; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return '/* ' + idx_str + ': */ new XRegExp("' + re_src + '", "' + re.xregexp.flags + '")'; + } else { + return '/* ' + idx_str + ': */ ' + re; + } + }); + return b.join(',\n'); + } + + function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
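+ // (Reading note: entries flagged 1 below are generator/CLI-side settings which are filtered + // out of the emitted lexer options, while the few entries flagged 0 are runtime-relevant + // lexer options which are deliberately passed through into the generated source.)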
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1 + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(' "(' + ID_REGEX_BASE + ')": ', 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
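+ // (Rough shape of what gets assembled below -- an illustrative outline only, as the actual + // boilerplate lives in the `_templateObject4`/`_templateObject5` templates: a `var lexer = {...}` + // object literal carrying the kernel prototype members, the `options:` produced by + // `produceOptions()`, plus the `performAction`, `simpleCaseActionClusters`, `rules` and + // `conditions` tables.)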
+ var code = [rmCommonWS(_templateObject4), '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc.replace(/^[\s\r\n]*\{/, '').replace(/\s*\}[\s\r\n]*$/, '').trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS(_templateObject5, performActionCode, simpleCaseActionClustersCode, rulesCode, conditionsCode)); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; + } + + function generateGenericHeaderComment() { + var out = rmCommonWS(_templateObject6, version$1); + + return out; + } + + function prepareOptions(opt) { + opt = opt || {}; + + // check for illegal identifier + if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) { + if (opt.moduleName) { + var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.'; + if (typeof opt.warn_cb === 'function') { + opt.warn_cb(msg); + } else { + // do not treat as warning; barf hairball instead so that this oddity gets noticed right away! + throw new Error(msg); + } + } + opt.moduleName = 'lexer'; + } + + prepExportStructures(opt); + + return opt; + } + + function generateModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var ' + opt.moduleName + ' = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; + } + + function generateAMDModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'define([], function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? 
opt.moduleInclude + ';' : '', '', 'return lexer;', '});']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; + } + + function generateESModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var lexer = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();', '', 'function yylex() {', ' return lexer.lex.apply(lexer, arguments);', '}', rmCommonWS(_templateObject7)]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; + } + + function generateCommonJSModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var ' + opt.moduleName + ' = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();', '', 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', ' exports.lexer = ' + opt.moduleName + ';', ' exports.lex = function () {', ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', ' };', '}']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; + } + + RegExpLexer.generate = generate; + + RegExpLexer.version = version$1; + RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; + RegExpLexer.mkStdOptions = mkStdOptions; + RegExpLexer.camelCase = camelCase; + RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + + var version = '0.6.1-205'; // require('./package.json').version; + + + function getCommandlineOptions() { + 'use strict'; + + var opts = nomnom.script('jison-lex').unknownOptionTreatment(false) // do not accept unknown options! + .options({ + file: { + flag: true, + position: 0, + help: 'file containing a lexical grammar' + }, + json: { + abbr: 'j', + flag: true, + default: false, + help: 'jison will expect a grammar in either JSON/JSON5 or JISON format: the precise format is autodetected' + }, + outfile: { + abbr: 'o', + metavar: 'FILE', + help: 'Filepath and base module name of the generated parser;\nwhen terminated with a / (dir separator) it is treated as the destination directory where the generated output will be stored' + }, + debug: { + abbr: 'd', + flag: true, + default: false, + help: 'Debug mode' + }, + dumpSourceCodeOnFailure: { + full: 'dump-sourcecode-on-failure', + flag: true, + default: true, + help: 'Dump the generated source code to a special named file when the internal generator tests fail, i.e. when the generated source code does not compile in the JavaScript engine. Enabling this option helps you to diagnose/debug crashes (thrown exceptions) in the code generator due to various reasons: you can, for example, load the dumped sourcecode in another environment (e.g. NodeJS) to get more info on the precise location and cause of the compile failure.' + }, + throwErrorOnCompileFailure: { + full: 'throw-on-compile-failure', + flag: true, + default: true, + help: 'Throw an exception when the generated source code fails to compile in the JavaScript engine. **WARNING**: Turning this feature OFF permits the code generator to produce non-working source code and treat that as SUCCESS. This MAY be desirable code generator behaviour, but only rarely.' 
+ }, + reportStats: { + full: 'info', + abbr: 'I', + flag: true, + default: false, + help: 'Report some statistics about the generated parser' + }, + moduleType: { + full: 'module-type', + abbr: 't', + default: 'commonjs', + metavar: 'TYPE', + choices: ['commonjs', 'amd', 'js', 'es'], + help: 'The type of module to generate (commonjs, amd, es, js)' + }, + moduleName: { + full: 'module-name', + abbr: 'n', + metavar: 'NAME', + help: 'The name of the generated parser object, namespace supported' + }, + main: { + full: 'main', + abbr: 'x', + flag: true, + default: false, + help: 'Include .main() entry point in generated commonjs module' + }, + moduleMain: { + full: 'module-main', + abbr: 'y', + metavar: 'NAME', + help: 'The main module function definition' + }, + version: { + abbr: 'V', + flag: true, + help: 'print version and exit', + callback: function callback() { + return version; + } + } + }).parse(); + + return opts; + } + + var cli = module.exports; + + cli.main = function cliMain(opts) { + 'use strict'; + + opts = RegExpLexer.mkStdOptions(opts); + + function isDirectory(fp) { + try { + return fs.lstatSync(fp).isDirectory(); + } catch (e) { + return false; + } + } + + function mkdirp(fp) { + if (!fp || fp === '.' || fp.length === 0) { + return false; + } + try { + fs.mkdirSync(fp); + return true; + } catch (e) { + if (e.code === 'ENOENT') { + var parent = path.dirname(fp); + // Did we hit the root directory by now? If so, abort! + // Else, create the parent; iff that fails, we fail too... + if (parent !== fp && mkdirp(parent)) { + try { + // Retry creating the original directory: it should succeed now + fs.mkdirSync(fp); + return true; + } catch (e) { + return false; + } + } + } + } + return false; + } + + function processInputFile() { + // getting raw files + var original_cwd = process.cwd(); + + var raw = fs.readFileSync(path.normalize(opts.file), 'utf8'); + + // making best guess at json mode + opts.json = path.extname(opts.file) === '.json' || opts.json; + + // When only the directory part of the output path was specified, then we + // do NOT have the target module name in there as well! + var outpath = opts.outfile; + if (/[\\\/]$/.test(outpath) || isDirectory(outpath)) { + opts.outfile = null; + outpath = outpath.replace(/[\\\/]$/, ''); + } + if (outpath && outpath.length > 0) { + outpath += '/'; + } else { + outpath = ''; + } + + // setting output file name and module name based on input file name + // if they aren't specified. + var name = path.basename(opts.outfile || opts.file); + + // get the base name (i.e. the file name without extension) + // i.e. 
strip off only the extension and keep any other dots in the filename + name = path.basename(name, path.extname(name)); + + opts.outfile = opts.outfile || outpath + name + '.js'; + if (!opts.moduleName && name) { + opts.moduleName = opts.defaultModuleName = name.replace(/-\w/g, function (match) { + return match.charAt(1).toUpperCase(); + }); + } + + // Change CWD to the directory where the source grammar resides: this helps us properly + // %include any files mentioned in the grammar with relative paths: + var new_cwd = path.dirname(path.normalize(opts.file)); + process.chdir(new_cwd); + + var lexer = cli.generateLexerString(raw, opts); + + // and change back to the CWD we started out with: + process.chdir(original_cwd); + + mkdirp(path.dirname(opts.outfile)); + fs.writeFileSync(opts.outfile, lexer); + console.log('JISON-LEX output for module [' + opts.moduleName + '] has been written to file:', opts.outfile); + } + + function readin(cb) { + var stdin = process.openStdin(), + data = ''; + + stdin.setEncoding('utf8'); + stdin.addListener('data', function (chunk) { + data += chunk; + }); + stdin.addListener('end', function () { + cb(data); + }); + } + + function processStdin() { + readin(function processStdinReadInCallback(raw) { + console.log(cli.generateLexerString(raw, opts)); + }); + } + + // if an input file wasn't given, assume input on stdin + if (opts.file) { + processInputFile(); + } else { + processStdin(); + } + }; + + cli.generateLexerString = function generateLexerString(lexerSpec, opts) { + 'use strict'; + + // var settings = RegExpLexer.mkStdOptions(opts); + + var predefined_tokens = null; + + return RegExpLexer.generate(lexerSpec, predefined_tokens, opts); + }; + + if (require.main === module) { + var opts = getCommandlineOptions(); + cli.main(opts); + } +}); diff --git a/dist/cli-umd.js b/dist/cli-umd.js new file mode 100644 index 0000000..64f8baa --- /dev/null +++ b/dist/cli-umd.js @@ -0,0 +1,4286 @@ +#!/usr/bin/env node + + +(function (global, factory) { + typeof exports === 'object' && typeof module !== 'undefined' ? factory(require('fs'), require('path'), require('@gerhobbelt/nomnom'), require('@gerhobbelt/xregexp'), require('@gerhobbelt/json5'), require('@gerhobbelt/lex-parser'), require('assert'), require('jison-helpers-lib')) : + typeof define === 'function' && define.amd ? define(['fs', 'path', '@gerhobbelt/nomnom', '@gerhobbelt/xregexp', '@gerhobbelt/json5', '@gerhobbelt/lex-parser', 'assert', 'jison-helpers-lib'], factory) : + (factory(global.fs,global.path,global.nomnom,global.XRegExp,global.json5,global.lexParser,global.assert,global.helpers)); +}(this, (function (fs,path,nomnom,XRegExp,json5,lexParser,assert,helpers) { 'use strict'; + +fs = fs && fs.hasOwnProperty('default') ? fs['default'] : fs; +path = path && path.hasOwnProperty('default') ? path['default'] : path; +nomnom = nomnom && nomnom.hasOwnProperty('default') ? nomnom['default'] : nomnom; +XRegExp = XRegExp && XRegExp.hasOwnProperty('default') ? XRegExp['default'] : XRegExp; +json5 = json5 && json5.hasOwnProperty('default') ? json5['default'] : json5; +lexParser = lexParser && lexParser.hasOwnProperty('default') ? lexParser['default'] : lexParser; +assert = assert && assert.hasOwnProperty('default') ? assert['default'] : assert; +helpers = helpers && helpers.hasOwnProperty('default') ? helpers['default'] : helpers; + +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. 
to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +const XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +const CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; +const SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + +const UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + +// The expanded regex sets which are equivalent to the given `\\{c}` escapes: +// +// `/\s/`: +const WHITESPACE_SETSTR$1 = ' \f\n\r\t\v\u00a0\u1680\u180e\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff'; +// `/\d/`: +const DIGIT_SETSTR$1 = '0-9'; +// `/\w/`: +const WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + + + + + +// Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex +function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: // '[' + return '\\['; + + case 92: // '\\' + return '\\\\'; + + case 93: // ']' + return '\\]'; + + case 94: // '^' + return '\\^'; + } + if (i < 32 + || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || (i >= 0xD800 && i <= 0xDFFF) /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go... + x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); +} + + +// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating +// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a +// `\\p{NAME}` shorthand to represent [part of] the bitarray: +var Pcodes_bitarray_cache = {}; +var Pcodes_bitarray_cache_test_order = []; + +// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by +// a single regex 'escape', e.g. `\d` for digits 0-9. 
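+// (Round-trip illustration: when `bitarray2set()` reconstructs the plain set string '0-9', the +// `set2esc` table initialized below maps it straight back to 'd', so the emitted set becomes `\d`.)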
+var EscCode_bitarray_output_refs; + +// now initialize the EscCodes_... table above: +init_EscCode_lookup_table(); + +function init_EscCode_lookup_table() { + var s, bitarr, set2esc = {}, esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); +} + +function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. + // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as then this pcode + // won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... 
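+ // (Observation, for illustration: with only the six built-in escapes cached, every character + // sits in exactly three of them -- one of \S/\s, one of \D/\d, one of \W/\w -- so no character + // is unique to a single pcode and the minimum-span fallback below does all the work; unique + // markers can only appear once XRegExp `\p{...}` pcodes enter the cache.)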
+ var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; +} + + + + + + +// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. +function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. \\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for the characters in the set. Inversion means the complement set of characters is set instead. + // This results in an OR operation when sets are joined/chained. 
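+ // (Worked example: calling `set2bitarray(ba, 'a-c')` and then `set2bitarray(ba, 'b-e')` on the + // same `ba` marks the union of both ranges, so a subsequent `bitarray2set(ba)` yields 'a-e'.)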
+ + while (s.length) { + c1 = s.match(CHR_RE$1); + if (!c1) { + // hit an illegal escape sequence? cope anyway! + c1 = s[0]; + } else { + c1 = c1[0]; + // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those + // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit + // XRegExp support, but alas, we'll get there when we get there... ;-) + switch (c1) { + case '\\p': + s = s.substr(c1.length); + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + // do we have this one cached already? + var pex = c1 + c2; + var ba4p = Pcodes_bitarray_cache[pex]; + if (!ba4p) { + // expand escape: + var xr = new XRegExp('[' + pex + ']'); // TODO: case-insensitive grammar??? + // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`: + var xs = '' + xr; + // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside: + xs = xs.substr(1, xs.length - 2); + + ba4p = reduceRegexToSetBitArray(xs, pex, opts); + + Pcodes_bitarray_cache[pex] = ba4p; + updatePcodesBitarrayCacheTestOrder(opts); + } + // merge bitarrays: + add2bitarray(bitarr, ba4p); + continue; + } + break; + + case '\\S': + case '\\s': + case '\\W': + case '\\w': + case '\\d': + case '\\D': + // these can't participate in a range, but need to be treated special: + s = s.substr(c1.length); + // check for \S, \s, \D, \d, \W, \w and expand them: + var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]]; + assert(ba4e); + add2bitarray(bitarr, ba4e); + continue; + + case '\\b': + // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace + c1 = '\u0008'; + break; + } + } + var v1 = eval_escaped_code(c1); + // propagate deferred exceptions = error reports. + if (v1 instanceof Error) { + return v1; + } + v1 = v1.charCodeAt(0); + s = s.substr(c1.length); + + if (s[0] === '-' && s.length >= 2) { + // we can expect a range like 'a-z': + s = s.substr(1); + c2 = s.match(CHR_RE$1); + if (!c2) { + // hit an illegal escape sequence? cope anyway! + c2 = s[0]; + } else { + c2 = c2[0]; + } + var v2 = eval_escaped_code(c2); + // propagate deferred exceptions = error reports. + if (v2 instanceof Error) { + return v2; + } + v2 = v2.charCodeAt(0); + s = s.substr(c2.length); + + // legal ranges go UP, not DOWN! + if (v1 <= v2) { + mark(v1, v2); + } else { + console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 }); + mark(v1); + mark('-'.charCodeAt(0)); + mark(v2); + } + continue; + } + mark(v1); + } + + // When we have marked all slots, '^' NEGATES the set, hence we flip all slots. + // + // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK + // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire + // range then. + if (set_is_inverted) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!bitarr[i]) { + bitarr_orig[i] = true; + } + } + } + } + return false; +} + + +// convert a simple bitarray back into a regex set `[...]` content: +function bitarray2set(l, output_inverted_variant, output_minimized) { + // construct the inverse(?) 
set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } + else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... 
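+ // (i.e. where the inverted branch above tested `!l[i]`, this branch tests `l[i]`, and vice versa.)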
+ cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } + else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + j = UNICODE_BASE_PLANE_MAX_CP$1 + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; +} + + + + + +// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; +// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. +function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE$1); + if (!c1) { + // cope with illegal escape sequences too! 
+            return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+        } else {
+            c1 = c1[0];
+        }
+        s = s.substr(c1.length);
+
+        switch (c1) {
+        case '[':
+            // this is starting a set within the regex: scan until end of set!
+            var set_content = [];
+            while (s.length) {
+                var inner = s.match(SET_PART_RE$1);
+                if (!inner) {
+                    inner = s.match(CHR_RE$1);
+                    if (!inner) {
+                        // cope with illegal escape sequences too!
+                        return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+                    } else {
+                        inner = inner[0];
+                    }
+                    if (inner === ']') break;
+                } else {
+                    inner = inner[0];
+                }
+                set_content.push(inner);
+                s = s.substr(inner.length);
+            }
+
+            // ensure that we hit the terminating ']':
+            var c2 = s.match(CHR_RE$1);
+            if (!c2) {
+                // cope with illegal escape sequences too!
+                return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"');
+            } else {
+                c2 = c2[0];
+            }
+            if (c2 !== ']') {
+                return new Error('regex set expression is broken in regex: ' + orig);
+            }
+            s = s.substr(c2.length);
+
+            var se = set_content.join('');
+            if (!internal_state) {
+                derr = set2bitarray(l, se, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                // a set is to be used like a single character in a longer literal phrase;
+                // hence input `[abc]word[def]` would produce the output `[abc]`:
+                internal_state = 1;
+            }
+            break;
+
+        // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into
+        // something ready for use inside a regex set, e.g. `\\r\\n`.
+        //
+        // > Of course, we realize that converting more complex piped constructs this way
+        // > will produce something you might not expect, e.g. `A|WORD2` which
+        // > would end up as the set `[AW]`, which is something else entirely than the
+        // > original input.
+        // >
+        // > However, we can only depend on the user (grammar writer) to realize this and
+        // > prevent this from happening by not creating such oddities in the input grammar.
+        case '|':
+            // a|b --> [ab]
+            internal_state = 0;
+            break;
+
+        case '(':
+            // (a) --> a
+            //
+            // TODO - right now we treat this as 'too complex':
+
+            // Strip off some possible outer wrappers which we know how to remove.
+            // We don't worry about 'damaging' the regex as any too-complex regex will be caught
+            // in the validation check at the end; our 'strippers' here would not damage useful
+            // regexes anyway and them damaging the unacceptable ones is fine.
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');         // (?:...) -> ... and (...) -> ...
+            s = s.replace(/^\^?(.*?)\$?$/, '$1');               // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: once before, once after this)
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');         // (?:...) -> ... and (...) -> ...
+
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '.':
+        case '*':
+        case '+':
+        case '?':
+            // wildcard
+            //
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '{':                        // range, e.g. `x{1,3}`, or macro?
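+            // Illustration (hypothetical inputs, not from this file): both a
+            // repetition range and a macro reference land in this case, e.g.
+            //
+            //      reduceRegexToSetBitArray('x{1,3}', 'M', opts)    // rejected below
+            //      reduceRegexToSetBitArray('{DIGIT}', 'M', opts)   // rejected below too
+            //
+            // hence macros must have been expanded *before* a regex is offered to this
+            // set-reduction routine (see prepareMacros() further below).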
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        default:
+            // literal character or word: take the first character only and ignore the rest, so that
+            // the constructed set for `word|noun` would be `[wn]`:
+            if (!internal_state) {
+                derr = set2bitarray(l, c1, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                internal_state = 2;
+            }
+            break;
+        }
+    }
+
+    s = bitarray2set(l);
+
+    // When this result is suitable for use in a set, then we should be able to compile
+    // it into a regex; that way we can easily validate whether macro X is fit to be used
+    // inside a regex set:
+    try {
+        var re;
+        assert(s);
+        assert(!(s instanceof Error));
+        re = new XRegExp('[' + s + ']');
+        re.test(s[0]);
+
+        // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`
+        // so we check for lingering UNESCAPED brackets in here as those cannot be valid:
+        if (/[^\\][\[\]]/.exec(s)) {
+            throw new Error('unescaped brackets in set data');
+        }
+    } catch (ex) {
+        // make sure we produce a set range expression which will fail badly when it is used
+        // in actual code:
+        s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+    }
+
+    assert(s);
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+    return l;
+}
+
+
+
+
+// Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]`
+// -- or in this example it can be further optimized to only `\d`!
+function produceOptimizedRegex4Set(bitarr) {
+    // First try to produce a minimum regex from the bitarray directly:
+    var s1 = bitarray2set(bitarr, false, true);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s1;
+    } else {
+        s1 = '[' + s1 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s2 = bitarray2set(bitarr, true, true);
+
+    if (s2[0] === '^') {
+        s2 = s2.substr(1);
+        if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s2;
+        }
+    } else {
+        s2 = '^' + s2;
+    }
+    s2 = '[' + s2 + ']';
+
+    // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end,
+    // we also check against the plain, unadulterated regex set expressions:
+    //
+    // First try to produce a minimum regex from the bitarray directly:
+    var s3 = bitarray2set(bitarr, false, false);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s3.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s3;
+    } else {
+        s3 = '[' + s3 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
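+    //
+    // (Illustrative note: for a digits-only bitarray this point is never reached --
+    // `s1` already matched SET_IS_SINGLE_PCODE_RE as `\d` above and was returned
+    // early. The inverted candidates below only pay off for irregular sets where
+    // the complement happens to have the shorter spelling.)
+    //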
+ var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; +} + + + + + + +var setmgmt = { + XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1, + CHR_RE: CHR_RE$1, + SET_PART_RE: SET_PART_RE$1, + NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1, + SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1, + + WHITESPACE_SETSTR: WHITESPACE_SETSTR$1, + DIGIT_SETSTR: DIGIT_SETSTR$1, + WORDCHAR_SETSTR: WORDCHAR_SETSTR$1, + + set2bitarray, + bitarray2set, + produceOptimizedRegex4Set, + reduceRegexToSetBitArray, +}; + +// Basic Lexer implemented using JavaScript regular expressions +// Zachary Carter +// MIT Licensed + +var rmCommonWS = helpers.rmCommonWS; +var camelCase = helpers.camelCase; +var code_exec = helpers.exec; +// import recast from '@gerhobbelt/recast'; +// import astUtils from '@gerhobbelt/ast-util'; +var version$1 = '0.6.1-205'; // require('./package.json').version; + + + + +const XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +const CHR_RE = setmgmt.CHR_RE; +const SET_PART_RE = setmgmt.SET_PART_RE; +const NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; +const UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + +// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) +// +// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! +const ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + + + + +// see also ./lib/cli.js +/** +@public +@nocollapse +*/ +const defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. start+end indexes in the input string + trackPosition: true, // track line+column position in the input string + caseInsensitive: false, + showSource: false, + exportSourceCode: false, + exportAST: false, + prettyCfg: true, + pre_lex: undefined, + post_lex: undefined, +}; + + +// Merge sets of options. +// +// Convert alternative jison option names to their base option. +// +// The *last* option set which overrides the default wins, where 'override' is +// defined as specifying a not-undefined value which is not equal to the +// default value. +// +// When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the +// default values avialable in Jison.defaultJisonOptions. +// +// Return a fresh set of options. 
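+//
+// Usage sketch (illustrative only; option names taken from the defaults above):
+//
+//      mkStdOptions({ 'case-insensitive': true }, { moduleName: 'calc' });
+//      // --> roughly the defaults (camelCased, minus the undefined ones), with
+//      //     caseInsensitive: true, moduleName: 'calc' and noMain: true mixed in
+//
+//      mkStdOptions("NODEFAULT", { xregexp: true });
+//      // --> { xregexp: true }      (no defaults mixed in)
+//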
+/** @public */ +function mkStdOptions(/*...args*/) { + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; +} + +// set up export/output attributes of the `options` object instance +function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || typeof exportSourceCode !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; +} + +// Autodetect if the input lexer spec is in JSON or JISON +// format when the `options.json` flag is `true`. +// +// Produce the JSON lexer spec result when these are JSON formatted already as that +// would save us the trouble of doing this again, anywhere else in the JISON +// compiler/generator. +// +// Otherwise return the *parsed* lexer spec as it has +// been processed through LexParser. +function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! 
Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} + + +// expand macros and convert matchers to RegExp's +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, i, k, rule, action, conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. 
+ // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count, + }; +} + + + + + + + +// expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or +// elsewhere, which requires two different treatments to expand these macros. +function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. 
+ var has_expansions = (se.indexOf('{') >= 0); + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. + c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; +} + + +// expand macros within macros and cache the result +function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. 
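+                //
+                // (Illustration, hypothetical definitions: with
+                //      ALPHA   [a-z]
+                //      ALNUM   [{ALPHA}0-9]
+                // expanding ALNUM for set context first splices in ALPHA's raw set
+                // body, giving '[a-z0-9]', and then reduces that to the set body
+                // '0-9a-z', ready for embedding in any other [...] set.)
+                //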
+ // + // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}` + // macros here: + if (XRegExp._getUnicodeProperty(k)) { + // Work-around so that you can use `\p{ascii}` for a XRegExp slug, a.k.a. + // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories, + // while using `\p{ASCII}` as a *macro expansion* of the `ASCII` + // macro: + if (k.toUpperCase() !== k) { + m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.'); + break; + } + } + + a = m.split('{' + k + '}'); + if (a.length > 1) { + var x = expandMacroInSet(k); + assert(x); + if (x instanceof Error) { + m = x; + break; + } + m = a.join(x); + } + } + } + } + + var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts); + + var s1; + + // propagate deferred exceptions = error reports. + if (mba instanceof Error) { + s1 = mba; + } else { + s1 = setmgmt.bitarray2set(mba, false); + + m = s1; + } + + macros[i] = { + in_set: s1, + elsewhere: null, + raw: dict_macros[i] + }; + } else { + m = macros[i].in_set; + + if (m instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + return new Error(m.message); + } + + // detect definition loop: + if (m === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + } + + return m; + } + + function expandMacroElsewhere(i) { + var k, a, m; + + if (macros[i].elsewhere == null) { + m = dict_macros[i]; + + // set up our own record so we can detect definition loops: + macros[i].elsewhere = false; + + // the macro MAY contain other macros which MAY be inside a `[...]` set in this + // macro or elsewhere, hence we must parse the regex: + m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (m instanceof Error) { + return m; + } + + macros[i].elsewhere = m; + } else { + m = macros[i].elsewhere; + + if (m instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + return m; + } + + // detect definition loop: + if (m === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + } + + return m; + } + + function expandAllMacrosInSet(s) { + var i, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroInSet(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message); + } + s = a.join(x); + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, x; + + // When we process the remaining macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. 
+ // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + // These are all submacro expansions, hence non-capturing grouping is applied: + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroElsewhere(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message); + } + s = a.join('(?:' + x + ')'); + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + + var m, i; + + if (opts.debug) console.log('\n############## RAW macros: ', dict_macros); + + // first we create the part of the dictionary which is targeting the use of macros + // *inside* `[...]` sets; once we have completed that half of the expansions work, + // we then go and expand the macros for when they are used elsewhere in a regex: + // iff we encounter submacros then which are used *inside* a set, we can use that + // first half dictionary to speed things up a bit as we can use those expansions + // straight away! + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroInSet(i); + } + } + + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroElsewhere(i); + } + } + + if (opts.debug) console.log('\n############### expanded macros: ', macros); + + return macros; +} + + + +// expand macros in a regex; expands them recursively +function expandMacros(src, macros, opts) { + var expansion_count = 0; + + // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already! + // Hence things should be easy in there: + + function expandAllMacrosInSet(s) { + var i, m, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = m.in_set; + + assert(x); + if (x instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + throw x; + } + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join(x); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, m, x; + + // When we process the main macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! 
+ if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + // These are all main macro expansions, hence CAPTURING grouping is applied: + x = m.elsewhere; + assert(x); + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join('(' + x + ')'); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + + // When we process the macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will have expanded any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (s2 instanceof Error) { + throw s2; + } + + // only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex() + // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on his own, + // as long as no macros are involved... + // + // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`, + // unless the `xregexp` output option has been enabled. + if (expansion_count > 0 || (src.indexOf('\\p{') >= 0 && !opts.options.xregexp)) { + src = s2; + } else { + // Check if the reduced regex is smaller in size; when it is, we still go with the new one! + if (s2.length < src.length) { + src = s2; + } + } + + return src; +} + +function prepareStartConditions(conditions) { + var sc, + hash = {}; + for (sc in conditions) { + if (conditions.hasOwnProperty(sc)) { + hash[sc] = {rules:[], inclusive: !conditions[sc]}; + } + } + return hash; +} + +function buildActions(dict, tokens, opts) { + var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;']; + var tok; + var toks = {}; + var caseHelper = []; + + // tokens: map/array of token numbers to token names + for (tok in tokens) { + var idx = parseInt(tok); + if (idx && idx > 0) { + toks[tokens[tok]] = idx; + } + } + + if (opts.options.flex && dict.rules) { + dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule! 
*/']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: `function lexer__performAction(yy, yyrulenumber, YY_START) { + var yy_ = this; + + ${fun} + }`, + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count, + }; +} + +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + +var prelude = `/** + * See also: + * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508 + * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility + * with userland code which might access the derived class in a 'classic' way. + * + * @public + * @constructor + * @nocollapse + */ +function JisonLexerError(msg, hash) { + Object.defineProperty(this, 'name', { + enumerable: false, + writable: false, + value: 'JisonLexerError' + }); + + if (msg == null) msg = '???'; + + Object.defineProperty(this, 'message', { + enumerable: false, + writable: true, + value: msg + }); + + this.hash = hash; + + var stacktrace; + if (hash && hash.exception instanceof Error) { + var ex2 = hash.exception; + this.message = ex2.message || msg; + stacktrace = ex2.stack; + } + if (!stacktrace) { + if (Error.hasOwnProperty('captureStackTrace')) { // V8 + Error.captureStackTrace(this, this.constructor); + } else { + stacktrace = (new Error(msg)).stack; + } + } + if (stacktrace) { + Object.defineProperty(this, 'stack', { + enumerable: false, + writable: false, + value: stacktrace + }); + } +} + +if (typeof Object.setPrototypeOf === 'function') { + Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype); +} else { + JisonLexerError.prototype = Object.create(Error.prototype); +} +JisonLexerError.prototype.constructor = JisonLexerError; +JisonLexerError.prototype.name = 'JisonLexerError';`; + + // --- END lexer error class --- + + return prelude; +} + + +const jisonLexerErrorDefinition = generateErrorClass(); + + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS` + var __hacky_counter__ = 0; + + /** + * @constructor + * @nocollapse + */ + function XRegExp(re, f) { + this.re = re; + this.flags = f; + this._getUnicodeProperty = function (k) {}; + var fake = /./; // WARNING: this exact 'fake' is also depended upon by the xregexp unit test! + __hacky_counter__++; + fake.__hacky_backy__ = __hacky_counter__; + return fake; + } + `; +} + + + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. 
+ // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... }; + // ``` + var testcode = [ + '// provide a local version for test purposes:', + jisonLexerErrorDefinition, + '', + generateFakeXRegExpClassSrcCode(), + '', + source, + '', + 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (typeof lexer.options !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, ((dict.rules && dict.rules.length > 0) ? + 'One or more of your lexer state names are possibly botched?' 
: + 'Your custom lexer is somehow botched.'), ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = (dict.rules ? dict.rules.length : 0); i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; +} + +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) +// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. 
+// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- +return `{ + EOF: 1, + ERROR: 2, + + // JisonLexerError: JisonLexerError, /// <-- injected by the code generator + + // options: {}, /// <-- injected by the code generator + + // yy: ..., /// <-- injected by setInput() + + __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state + + __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup + + __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been 'unfolded' completely and is now ready for use + + done: false, /// INTERNAL USE ONLY + _backtrack: false, /// INTERNAL USE ONLY + _input: '', /// INTERNAL USE ONLY + _more: false, /// INTERNAL USE ONLY + _signaled_error_token: false, /// INTERNAL USE ONLY + + conditionStack: [], /// INTERNAL USE ONLY; managed via \`pushState()\`, \`popState()\`, \`topState()\` and \`stateStackSize()\` + + match: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. \`match\` is identical to \`yytext\` except that this one still contains the matched input string after \`lexer.performAction()\` has been invoked, where userland code MAY have changed/replaced the \`yytext\` value entirely! + matched: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far + matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt + yytext: '', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the 'token value' when the parser consumes the lexer token produced through a call to the \`lex()\` API. + offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the 'cursor position' in the input string, i.e. the number of characters matched so far + yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (\`yytext\`) + yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: 'line number' at which the token under construction is located + yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction + + /** + * INTERNAL USE: construct a suitable error info hash object instance for \`parseError\`. 
+ * + * @public + * @this {RegExpLexer} + */ + constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) { + msg = '' + msg; + + // heuristic to determine if the error message already contains a (partial) source code dump + // as produced by either \`showPosition()\` or \`prettyPrintRange()\`: + if (show_input_position == undefined) { + show_input_position = !(msg.indexOf('\\n') > 0 && msg.indexOf('^') > 0); + } + if (this.yylloc && show_input_position) { + if (typeof this.prettyPrintRange === 'function') { + var pretty_src = this.prettyPrintRange(this.yylloc); + + if (!/\\n\\s*$/.test(msg)) { + msg += '\\n'; + } + msg += '\\n Erroneous area:\\n' + this.prettyPrintRange(this.yylloc); + } else if (typeof this.showPosition === 'function') { + var pos_str = this.showPosition(); + if (pos_str) { + if (msg.length && msg[msg.length - 1] !== '\\n' && pos_str[0] !== '\\n') { + msg += '\\n' + pos_str; + } else { + msg += pos_str; + } + } + } + } + /** @constructor */ + var pei = { + errStr: msg, + recoverable: !!recoverable, + text: this.match, // This one MAY be empty; userland code should use the \`upcomingInput\` API to obtain more text which follows the 'lexer cursor position'... + token: null, + line: this.yylineno, + loc: this.yylloc, + yy: this.yy, + lexer: this, + + /** + * and make sure the error info doesn't stay due to potential + * ref cycle via userland code manipulations. + * These would otherwise all be memory leak opportunities! + * + * Note that only array and object references are nuked as those + * constitute the set of elements which can produce a cyclic ref. + * The rest of the members is kept intact as they are harmless. + * + * @public + * @this {LexErrorInfo} + */ + destroy: function destructLexErrorInfo() { + // remove cyclic references added to error info: + // info.yy = null; + // info.lexer = null; + // ... + var rec = !!this.recoverable; + for (var key in this) { + if (this.hasOwnProperty(key) && typeof key === 'object') { + this[key] = undefined; + } + } + this.recoverable = rec; + } + }; + // track this instance so we can \`destroy()\` it once we deem it superfluous and ready for garbage collection! + this.__error_infos.push(pei); + return pei; + }, + + /** + * handler which is invoked when a lexer error occurs. + * + * @public + * @this {RegExpLexer} + */ + parseError: function lexer_parseError(str, hash, ExceptionClass) { + if (!ExceptionClass) { + ExceptionClass = this.JisonLexerError; + } + if (this.yy) { + if (this.yy.parser && typeof this.yy.parser.parseError === 'function') { + return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } else if (typeof this.yy.parseError === 'function') { + return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } + } + throw new ExceptionClass(str, hash); + }, + + /** + * method which implements \`yyerror(str, ...args)\` functionality for use inside lexer actions. 
+ * + * @public + * @this {RegExpLexer} + */ + yyerror: function yyError(str /*, ...args */) { + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': ' + str, this.options.lexerErrorsAreRecoverable); + + // Add any extra args to the hash under the name \`extra_error_attributes\`: + var args = Array.prototype.slice.call(arguments, 1); + if (args.length) { + p.extra_error_attributes = args; + } + + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + }, + + /** + * final cleanup function for when we have completed lexing the input; + * make it an API so that external code can use this one once userland + * code has decided it's time to destroy any lingering lexer error + * hash object instances and the like: this function helps to clean + * up these constructs, which *may* carry cyclic references which would + * otherwise prevent the instances from being properly and timely + * garbage-collected, i.e. this function helps prevent memory leaks! + * + * @public + * @this {RegExpLexer} + */ + cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) { + // prevent lingering circular references from causing memory leaks: + this.setInput('', {}); + + // nuke the error hash info instances created during this run. + // Userland code must COPY any data/references + // in the error hash instance(s) it is more permanently interested in. + if (!do_not_nuke_errorinfos) { + for (var i = this.__error_infos.length - 1; i >= 0; i--) { + var el = this.__error_infos[i]; + if (el && typeof el.destroy === 'function') { + el.destroy(); + } + } + this.__error_infos.length = 0; + } + + return this; + }, + + /** + * clear the lexer token context; intended for internal use only + * + * @public + * @this {RegExpLexer} + */ + clear: function lexer_clear() { + this.yytext = ''; + this.yyleng = 0; + this.match = ''; + // - DO NOT reset \`this.matched\` + this.matches = false; + this._more = false; + this._backtrack = false; + + var col = (this.yylloc ? this.yylloc.last_column : 0); + this.yylloc = { + first_line: this.yylineno + 1, + first_column: col, + last_line: this.yylineno + 1, + last_column: col, + + range: [this.offset, this.offset] + }; + }, + + /** + * resets the lexer, sets new input + * + * @public + * @this {RegExpLexer} + */ + setInput: function lexer_setInput(input, yy) { + this.yy = yy || this.yy || {}; + + // also check if we've fully initialized the lexer instance, + // including expansion work to be done to go from a loaded + // lexer to a usable lexer: + if (!this.__decompressed) { + // step 1: decompress the regex list: + var rules = this.rules; + for (var i = 0, len = rules.length; i < len; i++) { + var rule_re = rules[i]; + + // compression: is the RE an xref to another RE slot in the rules[] table? + if (typeof rule_re === 'number') { + rules[i] = rules[rule_re]; + } + } + + // step 2: unfold the conditions[] set to make these ready for use: + var conditions = this.conditions; + for (var k in conditions) { + var spec = conditions[k]; + + var rule_ids = spec.rules; + + var len = rule_ids.length; + var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in \`lexer_next()\` fast and simple! 
+ var rule_new_ids = new Array(len + 1); + + for (var i = 0; i < len; i++) { + var idx = rule_ids[i]; + var rule_re = rules[idx]; + rule_regexes[i + 1] = rule_re; + rule_new_ids[i + 1] = idx; + } + + spec.rules = rule_new_ids; + spec.__rule_regexes = rule_regexes; + spec.__rule_count = len; + } + + this.__decompressed = true; + } + + this._input = input || ''; + this.clear(); + this._signaled_error_token = false; + this.done = false; + this.yylineno = 0; + this.matched = ''; + this.conditionStack = ['INITIAL']; + this.__currentRuleSet__ = null; + this.yylloc = { + first_line: 1, + first_column: 0, + last_line: 1, + last_column: 0, + + range: [0, 0] + }; + this.offset = 0; + return this; + }, + + /** + * edit the remaining input via user-specified callback. + * This can be used to forward-adjust the input-to-parse, + * e.g. inserting macro expansions and alike in the + * input which has yet to be lexed. + * The behaviour of this API contrasts the \`unput()\` et al + * APIs as those act on the *consumed* input, while this + * one allows one to manipulate the future, without impacting + * the current \`yyloc\` cursor location or any history. + * + * Use this API to help implement C-preprocessor-like + * \`#include\` statements, etc. + * + * The provided callback must be synchronous and is + * expected to return the edited input (string). + * + * The \`cpsArg\` argument value is passed to the callback + * as-is. + * + * \`callback\` interface: + * \`function callback(input, cpsArg)\` + * + * - \`input\` will carry the remaining-input-to-lex string + * from the lexer. + * - \`cpsArg\` is \`cpsArg\` passed into this API. + * + * The \`this\` reference for the callback will be set to + * reference this lexer instance so that userland code + * in the callback can easily and quickly access any lexer + * API. + * + * When the callback returns a non-string-type falsey value, + * we assume the callback did not edit the input and we + * will using the input as-is. + * + * When the callback returns a non-string-type value, it + * is converted to a string for lexing via the \`"" + retval\` + * operation. (See also why: http://2ality.com/2012/03/converting-to-string.html + * -- that way any returned object's \`toValue()\` and \`toString()\` + * methods will be invoked in a proper/desirable order.) + * + * @public + * @this {RegExpLexer} + */ + editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) { + var rv = callback.call(this, this._input, cpsArg); + if (typeof rv !== 'string') { + if (rv) { + this._input = '' + rv; + } + // else: keep \`this._input\` as is. + } else { + this._input = rv; + } + return this; + }, + + /** + * consumes and returns one char from the input + * + * @public + * @this {RegExpLexer} + */ + input: function lexer_input() { + if (!this._input) { + //this.done = true; -- don't set \`done\` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <> tokens and perform user action code for a <> match, but only does so *once*) + return null; + } + var ch = this._input[0]; + this.yytext += ch; + this.yyleng++; + this.offset++; + this.match += ch; + this.matched += ch; + // Count the linenumber up when we hit the LF (or a stand-alone CR). + // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo + // and we advance immediately past the LF as well, returning both together as if + // it was all a single 'character' only. 
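+        //
+        // (Illustration: with the remaining _input being 'a\\r\\nb', the call which
+        // consumes 'a' is followed by one input() call that returns the two-character
+        // string '\\r\\n', bumps yylineno once and leaves the cursor at 'b'; a lone
+        // '\\r' or '\\n' advances by a single character instead.)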
+ var slice_len = 1; + var lines = false; + if (ch === '\\n') { + lines = true; + } else if (ch === '\\r') { + lines = true; + var ch2 = this._input[1]; + if (ch2 === '\\n') { + slice_len++; + ch += ch2; + this.yytext += ch2; + this.yyleng++; + this.offset++; + this.match += ch2; + this.matched += ch2; + this.yylloc.range[1]++; + } + } + if (lines) { + this.yylineno++; + this.yylloc.last_line++; + this.yylloc.last_column = 0; + } else { + this.yylloc.last_column++; + } + this.yylloc.range[1]++; + + this._input = this._input.slice(slice_len); + return ch; + }, + + /** + * unshifts one char (or an entire string) into the input + * + * @public + * @this {RegExpLexer} + */ + unput: function lexer_unput(ch) { + var len = ch.length; + var lines = ch.split(/(?:\\r\\n?|\\n)/g); + + this._input = ch + this._input; + this.yytext = this.yytext.substr(0, this.yytext.length - len); + this.yyleng = this.yytext.length; + this.offset -= len; + this.match = this.match.substr(0, this.match.length - len); + this.matched = this.matched.substr(0, this.matched.length - len); + + if (lines.length > 1) { + this.yylineno -= lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + + // Get last entirely matched line into the \`pre_lines[]\` array's + // last index slot; we don't mind when other previously + // matched lines end up in the array too. + var pre = this.match; + var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + if (pre_lines.length === 1) { + pre = this.matched; + pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + } + this.yylloc.last_column = pre_lines[pre_lines.length - 1].length; + } else { + this.yylloc.last_column -= len; + } + + this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng; + + this.done = false; + return this; + }, + + /** + * cache matched text and append it on next action + * + * @public + * @this {RegExpLexer} + */ + more: function lexer_more() { + this._more = true; + return this; + }, + + /** + * signal the lexer that this rule fails to match the input, so the + * next matching rule (regex) should be tested instead. + * + * @public + * @this {RegExpLexer} + */ + reject: function lexer_reject() { + if (this.options.backtrack_lexer) { + this._backtrack = true; + } else { + // when the \`parseError()\` call returns, we MUST ensure that the error is registered. + // We accomplish this by signaling an 'error' token to be produced for the current + // \`.lex()\` run. + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).', false); + this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + return this; + }, + + /** + * retain first n characters of the match + * + * @public + * @this {RegExpLexer} + */ + less: function lexer_less(n) { + return this.unput(this.match.slice(n)); + }, + + /** + * return (part of the) already matched input, i.e. for error + * messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of + * input (default: 1). + * + * Negative limit values equal *unlimited*. 
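+ *
+ * Example (illustrative): when the lexer has already consumed 'alpha ' and is
+ * currently matching 'beta', \`pastInput()\` returns 'alpha ', while
+ * \`pastInput(3)\` returns '...ha ', i.e. the last three characters with an
+ * ellipsis prefix.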
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ pastInput: function lexer_pastInput(maxSize, maxLines) {
+ var past = this.matched.substring(0, this.matched.length - this.match.length);
+ if (maxSize < 0)
+ maxSize = past.length;
+ else if (!maxSize)
+ maxSize = 20;
+ if (maxLines < 0)
+ maxLines = past.length; // can't ever have more input lines than this!
+ else if (!maxLines)
+ maxLines = 1;
+ // \`substr\` anticipation: treat \\r\\n as a single character and take a little
+ // more than necessary so that we can still properly check against maxSize
+ // after we've transformed and limited the newLines in here:
+ past = past.substr(-maxSize * 2 - 2);
+ // now that we have a significantly reduced string to process, transform the newlines
+ // and chop them, then limit them:
+ var a = past.replace(/\\r\\n|\\r/g, '\\n').split('\\n');
+ a = a.slice(-maxLines);
+ past = a.join('\\n');
+ // When, after limiting to maxLines, we still have too much to return,
+ // do add an ellipsis prefix...
+ if (past.length > maxSize) {
+ past = '...' + past.substr(-maxSize);
+ }
+ return past;
+ },
+
+ /**
+ * return (part of the) upcoming input, i.e. for error messages.
+ *
+ * Limit the returned string length to \`maxSize\` (default: 20).
+ *
+ * Limit the returned string to the \`maxLines\` number of lines of input (default: 1).
+ *
+ * Negative limit values equal *unlimited*.
+ *
+ * > ### NOTE ###
+ * >
+ * > *"upcoming input"* is defined as the whole of both
+ * > the *currently lexed* input and any remaining input
+ * > following that. *"currently lexed"* input is the input
+ * > already recognized by the lexer but not yet returned with
+ * > the lexer token. This happens when you are invoking this API
+ * > from inside any lexer rule action code block.
+ * >
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ upcomingInput: function lexer_upcomingInput(maxSize, maxLines) {
+ var next = this.match;
+ if (maxSize < 0)
+ maxSize = next.length + this._input.length;
+ else if (!maxSize)
+ maxSize = 20;
+ if (maxLines < 0)
+ maxLines = maxSize; // can't ever have more input lines than this!
+ else if (!maxLines)
+ maxLines = 1;
+ // \`substring\` anticipation: treat \\r\\n as a single character and take a little
+ // more than necessary so that we can still properly check against maxSize
+ // after we've transformed and limited the newLines in here:
+ if (next.length < maxSize * 2 + 2) {
+ next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8
+ }
+ // now that we have a significantly reduced string to process, transform the newlines
+ // and chop them, then limit them:
+ var a = next.replace(/\\r\\n|\\r/g, '\\n').split('\\n');
+ a = a.slice(0, maxLines);
+ next = a.join('\\n');
+ // When, after limiting to maxLines, we still have too much to return,
+ // do add an ellipsis postfix...
+ if (next.length > maxSize) {
+ next = next.substring(0, maxSize) + '...';
+ }
+ return next;
+ },
+
+ /**
+ * return a string which displays the character position where the
+ * lexing error occurred, i.e.
for error messages
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {
+ var pre = this.pastInput(maxPrefix).replace(/\\s/g, ' ');
+ var c = new Array(pre.length + 1).join('-');
+ return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, ' ') + '\\n' + c + '^';
+ },
+
+ /**
+ * return a string which displays the lines & columns of input which are referenced
+ * by the given location info range, plus a few lines of context.
+ *
+ * This function pretty-prints the indicated section of the input, with line numbers
+ * and everything!
+ *
+ * This function is very useful for providing highly readable error reports, while
+ * the location range may be specified in various flexible ways:
+ *
+ * - \`loc\` is the location info object which references the area which should be
+ * displayed and 'marked up': these lines & columns of text are marked up by \`^\`
+ * characters below each character in the entire input range.
+ *
+ * - \`context_loc\` is the *optional* location info object which instructs this
+ * pretty-printer how much *leading* context should be displayed alongside
+ * the area referenced by \`loc\`. This can help provide context for the displayed
+ * error, etc.
+ *
+ * When this location info is not provided, a default context of 3 lines is
+ * used.
+ *
+ * - \`context_loc2\` is another *optional* location info object, which serves
+ * a similar purpose to \`context_loc\`: it specifies the amount of *trailing*
+ * context lines to display in the pretty-print output.
+ *
+ * When this location info is not provided, a default context of 1 line only is
+ * used.
+ *
+ * Special Notes:
+ *
+ * - when the \`loc\`-indicated range is very large (about 5 lines or more), then
+ * only the first and last few lines of this block are printed while a
+ * \`...continued...\` message will be printed between them.
+ *
+ * This serves the purpose of not printing a huge amount of text when the \`loc\`
+ * range happens to be huge: this way a manageable & readable output results
+ * for arbitrarily large ranges.
+ *
+ * - this function can display lines of input which have not yet been lexed.
+ * \`prettyPrintRange()\` can access the entire input!
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {
+ var error_size = loc.last_line - loc.first_line;
+ const CONTEXT = 3;
+ const CONTEXT_TAIL = 1;
+ const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;
+ var input = this.matched + this._input;
+ var lines = input.split('\\n');
+ //var show_context = (error_size < 5 || context_loc);
+ var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));
+ var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));
+ var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);
+ var ws_prefix = new Array(lineno_display_width).join(' ');
+ var nonempty_line_indexes = [];
+ var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {
+ var lno = index + l0;
+ var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);
+ var rv = lno_pfx + ': ' + line;
+ var errpfx = (new Array(lineno_display_width + 1)).join('^');
+ var offset = 2 + 1;
+ var len = 0;
+
+ if (lno === loc.first_line) {
+ offset += loc.first_column;
+
+ len = Math.max(
+ 2,
+ ((lno === loc.last_line ?
loc.last_column : line.length)) - loc.first_column + 1 + ); + } else if (lno === loc.last_line) { + len = Math.max(2, loc.last_column + 1); + } else if (lno > loc.first_line && lno < loc.last_line) { + len = Math.max(2, line.length + 1); + } + + if (len) { + var lead = new Array(offset).join('.'); + var mark = new Array(len).join('^'); + rv += '\\n' + errpfx + lead + mark; + + if (line.trim().length > 0) { + nonempty_line_indexes.push(index); + } + } + + rv = rv.replace(/\\t/g, ' '); + return rv; + }); + + // now make sure we don't print an overly large amount of error area: limit it + // to the top and bottom line count: + if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) { + var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1; + var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1; + + var intermediate_line = (new Array(lineno_display_width + 1)).join(' ') + ' (...continued...)'; + intermediate_line += '\\n' + (new Array(lineno_display_width + 1)).join('-') + ' (---------------)'; + rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line); + } + return rv.join('\\n'); + }, + + /** + * helper function, used to produce a human readable description as a string, given + * the input \`yylloc\` location object. + * + * Set \`display_range_too\` to TRUE to include the string character index position(s) + * in the description if the \`yylloc.range\` is available. + * + * @public + * @this {RegExpLexer} + */ + describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) { + var l1 = yylloc.first_line; + var l2 = yylloc.last_line; + var c1 = yylloc.first_column; + var c2 = yylloc.last_column; + var dl = l2 - l1; + var dc = c2 - c1; + var rv; + if (dl === 0) { + rv = 'line ' + l1 + ', '; + if (dc <= 1) { + rv += 'column ' + c1; + } else { + rv += 'columns ' + c1 + ' .. ' + c2; + } + } else { + rv = 'lines ' + l1 + '(column ' + c1 + ') .. ' + l2 + '(column ' + c2 + ')'; + } + if (yylloc.range && display_range_too) { + var r1 = yylloc.range[0]; + var r2 = yylloc.range[1] - 1; + if (r2 <= r1) { + rv += ' {String Offset: ' + r1 + '}'; + } else { + rv += ' {String Offset range: ' + r1 + ' .. ' + r2 + '}'; + } + } + return rv; + }, + + /** + * test the lexed token: return FALSE when not a match, otherwise return token. + * + * \`match\` is supposed to be an array coming out of a regex match, i.e. \`match[0]\` + * contains the actually matched text string. 
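+ *
+ * (Illustrative note: in this kernel \`match\` is the result of the
+ * \`this._input.match(regexes[i])\` call performed by \`lexer_next()\`, so
+ * \`match[0]\` is the full matched text and any capture groups follow it.)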
+ * + * Also move the input cursor forward and update the match collectors: + * + * - \`yytext\` + * - \`yyleng\` + * - \`match\` + * - \`matches\` + * - \`yylloc\` + * - \`offset\` + * + * @public + * @this {RegExpLexer} + */ + test_match: function lexer_test_match(match, indexed_rule) { + var token, + lines, + backup, + match_str, + match_str_len; + + if (this.options.backtrack_lexer) { + // save context + backup = { + yylineno: this.yylineno, + yylloc: { + first_line: this.yylloc.first_line, + last_line: this.yylloc.last_line, + first_column: this.yylloc.first_column, + last_column: this.yylloc.last_column, + + range: this.yylloc.range.slice(0) + }, + yytext: this.yytext, + match: this.match, + matches: this.matches, + matched: this.matched, + yyleng: this.yyleng, + offset: this.offset, + _more: this._more, + _input: this._input, + //_signaled_error_token: this._signaled_error_token, + yy: this.yy, + conditionStack: this.conditionStack.slice(0), + done: this.done + }; + } + + match_str = match[0]; + match_str_len = match_str.length; + // if (match_str.indexOf('\\n') !== -1 || match_str.indexOf('\\r') !== -1) { + lines = match_str.split(/(?:\\r\\n?|\\n)/g); + if (lines.length > 1) { + this.yylineno += lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + this.yylloc.last_column = lines[lines.length - 1].length; + } else { + this.yylloc.last_column += match_str_len; + } + // } + this.yytext += match_str; + this.match += match_str; + this.matched += match_str; + this.matches = match; + this.yyleng = this.yytext.length; + this.yylloc.range[1] += match_str_len; + + // previous lex rules MAY have invoked the \`more()\` API rather than producing a token: + // those rules will already have moved this \`offset\` forward matching their match lengths, + // hence we must only add our own match length now: + this.offset += match_str_len; + this._more = false; + this._backtrack = false; + this._input = this._input.slice(match_str_len); + + // calling this method: + // + // function lexer__performAction(yy, yyrulenumber, YY_START) {...} + token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */); + // otherwise, when the action codes are all simple return token statements: + //token = this.simpleCaseActionClusters[indexed_rule]; + + if (this.done && this._input) { + this.done = false; + } + if (token) { + return token; + } else if (this._backtrack) { + // recover context + for (var k in backup) { + this[k] = backup[k]; + } + this.__currentRuleSet__ = null; + return false; // rule action called reject() implying the next rule should be tested instead. + } else if (this._signaled_error_token) { + // produce one 'error' token as \`.parseError()\` in \`reject()\` + // did not guarantee a failure signal by throwing an exception! + token = this._signaled_error_token; + this._signaled_error_token = false; + return token; + } + return false; + }, + + /** + * return next match in input + * + * @public + * @this {RegExpLexer} + */ + next: function lexer_next() { + if (this.done) { + this.clear(); + return this.EOF; + } + if (!this._input) { + this.done = true; + } + + var token, + match, + tempMatch, + index; + if (!this._more) { + this.clear(); + } + var spec = this.__currentRuleSet__; + if (!spec) { + // Update the ruleset cache as we apparently encountered a state change or just started lexing. 
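+ // (Aside: \`pushState()\`, \`popState()\` and \`setInput()\` all reset
+ // \`__currentRuleSet__\` to null, which is what lands us in this branch.)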
+ // The cache is set up for fast lookup -- we assume a lexer will switch states much less often than it will
+ // invoke the \`lex()\` token-producing API and related APIs, hence caching the set for direct access helps
+ // speed up those activities a tiny bit.
+ spec = this.__currentRuleSet__ = this._currentRules();
+ // Check whether a *sane* condition has been pushed before: this makes the lexer robust against
+ // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19
+ if (!spec || !spec.rules) {
+ var lineno_msg = '';
+ if (this.options.trackPosition) {
+ lineno_msg = ' on line ' + (this.yylineno + 1);
+ }
+ var p = this.constructLexErrorInfo('Internal lexer engine error' + lineno_msg + ': The lex grammar programmer pushed a non-existing condition name "' + this.topState() + '"; this is a fatal error and should be reported to the application programmer team!', false);
+ // produce one 'error' token until this situation has been resolved, most probably by parse termination!
+ return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);
+ }
+ }
+
+ var rule_ids = spec.rules;
+ var regexes = spec.__rule_regexes;
+ var len = spec.__rule_count;
+
+ // Note: the arrays are 1-based, while \`len\` itself is a valid index,
+ // hence the non-standard less-or-equal check in the next loop condition!
+ for (var i = 1; i <= len; i++) {
+ tempMatch = this._input.match(regexes[i]);
+ if (tempMatch && (!match || tempMatch[0].length > match[0].length)) {
+ match = tempMatch;
+ index = i;
+ if (this.options.backtrack_lexer) {
+ token = this.test_match(tempMatch, rule_ids[i]);
+ if (token !== false) {
+ return token;
+ } else if (this._backtrack) {
+ match = undefined;
+ continue; // rule action called reject() implying a rule MISmatch.
+ } else {
+ // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)
+ return false;
+ }
+ } else if (!this.options.flex) {
+ break;
+ }
+ }
+ }
+ if (match) {
+ token = this.test_match(match, rule_ids[index]);
+ if (token !== false) {
+ return token;
+ }
+ // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)
+ return false;
+ }
+ if (!this._input) {
+ this.done = true;
+ this.clear();
+ return this.EOF;
+ } else {
+ var lineno_msg = '';
+ if (this.options.trackPosition) {
+ lineno_msg = ' on line ' + (this.yylineno + 1);
+ }
+ var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': Unrecognized text.', this.options.lexerErrorsAreRecoverable);
+
+ var pendingInput = this._input;
+ var activeCondition = this.topState();
+ var conditionStackDepth = this.conditionStack.length;
+
+ token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);
+ if (token === this.ERROR) {
+ // we can try to recover from a lexer error that \`parseError()\` did not 'recover' for us
+ // by moving forward at least one character at a time IFF the (user-specified?) \`parseError()\`
+ // has not consumed/modified any pending input or changed state in the error handler:
+ if (!this.matches &&
+ // and the error handler did not consume/modify any pending input ...
+ pendingInput === this._input &&
+ // ...nor changed the lexer condition state: either of those would count
+ // as a non-consuming error handling action having been taken already.
+ activeCondition === this.topState() &&
+ conditionStackDepth === this.conditionStack.length
+ ) {
+ this.input();
+ }
+ }
+ return token;
+ }
+ },
+
+ /**
+ * return next match that has a token
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ lex: function lexer_lex() {
+ var r;
+ // allow the PRE/POST handlers to set/modify the return token for maximum flexibility of the generated lexer:
+ if (typeof this.options.pre_lex === 'function') {
+ r = this.options.pre_lex.call(this);
+ }
+
+ while (!r) {
+ r = this.next();
+ }
+
+ if (typeof this.options.post_lex === 'function') {
+ // (also account for a userdef function which does not return any value: keep the token as is)
+ r = this.options.post_lex.call(this, r) || r;
+ }
+ return r;
+ },
+
+ /**
+ * backwards compatible alias for \`pushState()\`;
+ * the latter is symmetrical with \`popState()\` and we advise using
+ * those APIs in any modern lexer code, rather than \`begin()\`.
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ begin: function lexer_begin(condition) {
+ return this.pushState(condition);
+ },
+
+ /**
+ * activates a new lexer condition state (pushes the new lexer
+ * condition state onto the condition stack)
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ pushState: function lexer_pushState(condition) {
+ this.conditionStack.push(condition);
+ this.__currentRuleSet__ = null;
+ return this;
+ },
+
+ /**
+ * pop the previously active lexer condition state off the condition
+ * stack
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ popState: function lexer_popState() {
+ var n = this.conditionStack.length - 1;
+ if (n > 0) {
+ this.__currentRuleSet__ = null;
+ return this.conditionStack.pop();
+ } else {
+ return this.conditionStack[0];
+ }
+ },
+
+ /**
+ * return the currently active lexer condition state; when an index
+ * argument is provided it produces the N-th previous condition state,
+ * if available
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ topState: function lexer_topState(n) {
+ n = this.conditionStack.length - 1 - Math.abs(n || 0);
+ if (n >= 0) {
+ return this.conditionStack[n];
+ } else {
+ return 'INITIAL';
+ }
+ },
+
+ /**
+ * (internal) determine the lexer rule set which is active for the
+ * currently active lexer condition state
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ _currentRules: function lexer__currentRules() {
+ if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) {
+ return this.conditions[this.conditionStack[this.conditionStack.length - 1]];
+ } else {
+ return this.conditions['INITIAL'];
+ }
+ },
+
+ /**
+ * return the number of states currently on the stack
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ stateStackSize: function lexer_stateStackSize() {
+ return this.conditionStack.length;
+ }
+}`;
+ // --- END lexer kernel ---
+}
+
+RegExpLexer.prototype = (new Function(rmCommonWS`
+ return ${getRegExpLexerPrototype()};
+`))();
+
+
+// The lexer code stripper, driven by optimization analysis settings and
+// lexer options, which cannot be changed at run-time.
+function stripUnusedLexerCode(src, opt) {
+ // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG}
+ // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO}
+ // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT}
+ // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC}
+ // uses ParseError API: ............. ${opt.lexerActionsUseParseError}
+ // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking}
+ // uses more() API: .................
${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + +new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS` + // Code Generator Information Report + // --------------------------------- + // + // Options: + // + // backtracking: .................... ${opt.options.backtrack_lexer} + // location.ranges: ................. ${opt.options.ranges} + // location line+column tracking: ... ${opt.options.trackPosition} + // + // + // Forwarded Parser Analysis flags: + // + // uses yyleng: ..................... ${opt.parseActionsUseYYLENG} + // uses yylineno: ................... ${opt.parseActionsUseYYLINENO} + // uses yytext: ..................... ${opt.parseActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.parseActionsUseYYLOC} + // uses lexer values: ............... ${opt.parseActionsUseValueTracking} / ${opt.parseActionsUseValueAssignment} + // location tracking: ............... ${opt.parseActionsUseLocationTracking} + // location assignment: ............. ${opt.parseActionsUseLocationAssignment} + // + // + // Lexer Analysis flags: + // + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses yyerror: .................... ${opt.lexerActionsUseYYERROR} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + // + // --------- END OF REPORT ----------- + + `); + + return new_src; +} + + + + + +// generate lexer source from a grammar +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
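+ // (Illustrative assumption, not normative: a jison build would pass flags along
+ // the lines of { parseActionsUseYYLENG: true, parseActionsUseYYTEXT: true, ... };
+ // the lexer-side flags below start out as '???' placeholders until the lexer's
+ // own action code analysis fills them in.)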
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???', + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
+ // Make sure to camelCase all options:
+ opts.options = mkStdOptions(build_options, dict.options);
+
+ opts.moduleType = opts.options.moduleType;
+ opts.moduleName = opts.options.moduleName;
+
+ opts.conditions = prepareStartConditions(dict.startConditions);
+ opts.conditions.INITIAL = {
+ rules: [],
+ inclusive: true
+ };
+
+ var code = buildActions(dict, tokens, opts);
+ opts.performAction = code.actions;
+ opts.caseHelperInclude = code.caseHelperInclude;
+ opts.rules = code.rules;
+ opts.macros = code.macros;
+
+ opts.regular_rule_count = code.regular_rule_count;
+ opts.simple_rule_count = code.simple_rule_count;
+
+ opts.conditionStack = ['INITIAL'];
+
+ opts.actionInclude = (dict.actionInclude || '');
+ opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim();
+
+ return opts;
+}
+
+// Assemble the final source from the processed grammar
+/** @public */
+function generateFromOpts(opt) {
+ var code = '';
+
+ switch (opt.moduleType) {
+ case 'js':
+ code = generateModule(opt);
+ break;
+ case 'amd':
+ code = generateAMDModule(opt);
+ break;
+ case 'es':
+ code = generateESModule(opt);
+ break;
+ case 'commonjs':
+ default:
+ code = generateCommonJSModule(opt);
+ break;
+ }
+
+ return code;
+}
+
+function generateRegexesInitTableCode(opt) {
+ var a = opt.rules;
+ var print_xregexp = opt.options && opt.options.xregexp;
+ var id_display_width = (1 + Math.log10(a.length | 1) | 0);
+ var ws_prefix = new Array(id_display_width).join(' ');
+ var b = a.map(function generateXRegExpInitCode(re, idx) {
+ var idx_str = (ws_prefix + idx).substr(-id_display_width);
+
+ if (re instanceof XRegExp) {
+ // When we don't need the special XRegExp sauce at run-time, we can make do with the original
+ // JavaScript RegExp instance a.k.a. 'native regex':
+ if (re.xregexp.isNative || !print_xregexp) {
+ return `/* ${idx_str}: */ ${re}`;
+ }
+ // And make sure to escape the regex to make it suitable for placement inside a *string*
+ // as it is passed as a string argument to the XRegExp constructor here.
+ var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&');
+ return `/* ${idx_str}: */ new XRegExp("${re_src}", "${re.xregexp.flags}")`;
+ } else {
+ return `/* ${idx_str}: */ ${re}`;
+ }
+ });
+ return b.join(',\n');
+}
+
+function generateModuleBody(opt) {
+ // make the JSON output look more like JavaScript:
+ function cleanupJSON(str) {
+ str = str.replace(/ "rules": \[/g, ' rules: [');
+ str = str.replace(/ "inclusive": /g, ' inclusive: ');
+ return str;
+ }
+
+ function produceOptions(opts) {
+ var obj = {};
+ var do_not_pass = {
+ debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway!
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1, + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(` "(${ID_REGEX_BASE})": `, 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
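+ //
+ // (Illustrative sketch of the assembled shape -- an assumption for the reader,
+ // not normative output: the pieces below concatenate into roughly
+ //
+ //     var lexer = {
+ //         /*JISON-LEX-ANALYTICS-REPORT*/    // <-- replaced by the report later
+ //         ...kernel prototype members...,
+ //         options: { ... },
+ //         performAction: function (yy, yyrulenumber, YY_START) { ... },
+ //         rules: [ ... ],
+ //         conditions: { ... }
+ //     };
+ //
+ // which, as noted at the end of this function, must remain eval()-able.)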
+ var code = [rmCommonWS` + var lexer = { + `, '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc + .replace(/^[\s\r\n]*\{/, '') + .replace(/\s*\}[\s\r\n]*$/, '') + .trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS`, + JisonLexerError: JisonLexerError, + performAction: ${performActionCode}, + simpleCaseActionClusters: ${simpleCaseActionClustersCode}, + rules: [ + ${rulesCode} + ], + conditions: ${conditionsCode} + }; + `); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; +} + +function generateGenericHeaderComment() { + var out = rmCommonWS` + /* lexer generated by jison-lex ${version$1} */ + + /* + * Returns a Lexer object of the following structure: + * + * Lexer: { + * yy: {} The so-called "shared state" or rather the *source* of it; + * the real "shared state" \`yy\` passed around to + * the rule actions, etc. is a direct reference! + * + * This "shared context" object was passed to the lexer by way of + * the \`lexer.setInput(str, yy)\` API before you may use it. + * + * This "shared context" object is passed to the lexer action code in \`performAction()\` + * so userland code in the lexer actions may communicate with the outside world + * and/or other lexer rules' actions in more or less complex ways. + * + * } + * + * Lexer.prototype: { + * EOF: 1, + * ERROR: 2, + * + * yy: The overall "shared context" object reference. + * + * JisonLexerError: function(msg, hash), + * + * performAction: function lexer__performAction(yy, yyrulenumber, YY_START), + * + * The function parameters and \`this\` have the following value/meaning: + * - \`this\` : reference to the \`lexer\` instance. + * \`yy_\` is an alias for \`this\` lexer instance reference used internally. + * + * - \`yy\` : a reference to the \`yy\` "shared state" object which was passed to the lexer + * by way of the \`lexer.setInput(str, yy)\` API before. 
+ *
+ * Note:
+ * The extra arguments you specified in the \`%parse-param\` statement in your
+ * **parser** grammar definition file are passed to the lexer via this object
+ * reference as member variables.
+ *
+ * - \`yyrulenumber\` : index of the matched lexer rule (regex), used internally.
+ *
+ * - \`YY_START\`: the current lexer "start condition" state.
+ *
+ * parseError: function(str, hash, ExceptionClass),
+ *
+ * constructLexErrorInfo: function(error_message, is_recoverable),
+ * Helper function.
+ * Produces a new errorInfo \'hash object\' which can be passed into \`parseError()\`.
+ * See its use in this lexer kernel in many places; example usage:
+ *
+ * var infoObj = lexer.constructParseErrorInfo(\'fail!\', true);
+ * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);
+ *
+ * options: { ... lexer %options ... },
+ *
+ * lex: function(),
+ * Produce one token of lexed input, which was passed in earlier via the \`lexer.setInput()\` API.
+ * You MAY use the additional \`args...\` parameters as per \`%parse-param\` spec of the **lexer** grammar:
+ * these extra \`args...\` are added verbatim to the \`yy\` object reference as member variables.
+ *
+ * WARNING:
+ * Lexer's additional \`args...\` parameters (via lexer's \`%parse-param\`) MAY conflict with
+ * any attributes already added to \`yy\` by the **parser** or the jison run-time;
+ * when such a collision is detected an exception is thrown to prevent the generated run-time
+ * from silently accepting this confusing and potentially hazardous situation!
+ *
+ * cleanupAfterLex: function(do_not_nuke_errorinfos),
+ * Helper function.
+ *
+ * This helper API is invoked when the **parse process** has completed: it is the responsibility
+ * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired.
+ *
+ * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.
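+ *
+ * Example (illustrative):
+ *
+ *     var tok;
+ *     while ((tok = lexer.lex()) !== lexer.EOF) { ... }
+ *     lexer.cleanupAfterLex();     // drop the per-run errorInfo objects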
+ * + * setInput: function(input, [yy]), + * + * + * input: function(), + * + * + * unput: function(str), + * + * + * more: function(), + * + * + * reject: function(), + * + * + * less: function(n), + * + * + * pastInput: function(n), + * + * + * upcomingInput: function(n), + * + * + * showPosition: function(), + * + * + * test_match: function(regex_match_array, rule_index), + * + * + * next: function(), + * + * + * begin: function(condition), + * + * + * pushState: function(condition), + * + * + * popState: function(), + * + * + * topState: function(), + * + * + * _currentRules: function(), + * + * + * stateStackSize: function(), + * + * + * performAction: function(yy, yy_, yyrulenumber, YY_START), + * + * + * rules: [...], + * + * + * conditions: {associative list: name ==> set}, + * } + * + * + * token location info (\`yylloc\`): { + * first_line: n, + * last_line: n, + * first_column: n, + * last_column: n, + * range: [start_number, end_number] + * (where the numbers are indexes into the input string, zero-based) + * } + * + * --- + * + * The \`parseError\` function receives a \'hash\' object with these members for lexer errors: + * + * { + * text: (matched text) + * token: (the produced terminal token, if any) + * token_id: (the produced terminal token numeric ID, if any) + * line: (yylineno) + * loc: (yylloc) + * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule + * available for this particular error) + * yy: (object: the current parser internal "shared state" \`yy\` + * as is also available in the rule actions; this can be used, + * for instance, for advanced error analysis and reporting) + * lexer: (reference to the current lexer instance used by the parser) + * } + * + * while \`this\` will reference the current lexer instance. + * + * When \`parseError\` is invoked by the lexer, the default implementation will + * attempt to invoke \`yy.parser.parseError()\`; when this callback is not provided + * it will try to invoke \`yy.parseError()\` instead. When that callback is also not + * provided, a \`JisonLexerError\` exception will be thrown containing the error + * message and \`hash\`, as constructed by the \`constructLexErrorInfo()\` API. + * + * Note that the lexer\'s \`JisonLexerError\` error class is passed via the + * \`ExceptionClass\` argument, which is invoked to construct the exception + * instance to be thrown, so technically \`parseError\` will throw the object + * produced by the \`new ExceptionClass(str, hash)\` JavaScript expression. + * + * --- + * + * You can specify lexer options by setting / modifying the \`.options\` object of your Lexer instance. + * These options are available: + * + * (Options are permanent.) + * + * yy: { + * parseError: function(str, hash, ExceptionClass) + * optional: overrides the default \`parseError\` function. + * } + * + * lexer.options: { + * pre_lex: function() + * optional: is invoked before the lexer is invoked to produce another token. + * \`this\` refers to the Lexer object. + * post_lex: function(token) { return token; } + * optional: is invoked when the lexer has produced a token \`token\`; + * this function can override the returned token value by returning another. + * When it does not return any (truthy) value, the lexer will return + * the original \`token\`. + * \`this\` refers to the Lexer object. + * + * WARNING: the next set of options are not meant to be changed. They echo the abilities of + * the lexer as per when it was compiled! 
+ *
+ * ranges: boolean
+ * optional: \`true\` ==> token location info will include a .range[] member.
+ * flex: boolean
+ * optional: \`true\` ==> flex-like lexing behaviour where the rules are tested
+ * exhaustively to find the longest match.
+ * backtrack_lexer: boolean
+ * optional: \`true\` ==> lexer regexes are tested in order and for each matching
+ * regex the action code is invoked; the lexer terminates the scan
+ * when a token is returned by the action code.
+ * xregexp: boolean
+ * optional: \`true\` ==> lexer rule regexes are "extended regex format" requiring the
+ * \`XRegExp\` library. When this %option has not been specified at compile time, all lexer
+ * rule regexes have been written as standard JavaScript RegExp expressions.
+ * }
+ */
+ `;
+
+ return out;
+}
+
+function prepareOptions(opt) {
+ opt = opt || {};
+
+ // check for illegal identifier
+ if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) {
+ if (opt.moduleName) {
+ var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.';
+ if (typeof opt.warn_cb === 'function') {
+ opt.warn_cb(msg);
+ } else {
+ // do not treat as warning; barf hairball instead so that this oddity gets noticed right away!
+ throw new Error(msg);
+ }
+ }
+ opt.moduleName = 'lexer';
+ }
+
+ prepExportStructures(opt);
+
+ return opt;
+}
+
+function generateModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var ' + opt.moduleName + ' = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '})();'
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateAMDModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'define([], function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '});'
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateESModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var lexer = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '})();',
+ '',
+ 'function yylex() {',
+ ' return lexer.lex.apply(lexer, arguments);',
+ '}',
+ rmCommonWS`
+ export {
+ lexer,
+ yylex as lex
+ };
+ `
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateCommonJSModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var ' + opt.moduleName + ' = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ?
opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();', + '', + 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', + ' exports.lexer = ' + opt.moduleName + ';', + ' exports.lex = function () {', + ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', + ' };', + '}' + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +RegExpLexer.generate = generate; + +RegExpLexer.version = version$1; +RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; +RegExpLexer.mkStdOptions = mkStdOptions; +RegExpLexer.camelCase = camelCase; +RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + +var version = '0.6.1-205'; // require('./package.json').version; + + +function getCommandlineOptions() { + 'use strict'; + + var opts = nomnom + .script('jison-lex') + .unknownOptionTreatment(false) // do not accept unknown options! + .options({ + file: { + flag: true, + position: 0, + help: 'file containing a lexical grammar' + }, + json: { + abbr: 'j', + flag: true, + default: false, + help: 'jison will expect a grammar in either JSON/JSON5 or JISON format: the precise format is autodetected' + }, + outfile: { + abbr: 'o', + metavar: 'FILE', + help : 'Filepath and base module name of the generated parser;\nwhen terminated with a / (dir separator) it is treated as the destination directory where the generated output will be stored' + }, + debug: { + abbr: 'd', + flag: true, + default: false, + help: 'Debug mode' + }, + dumpSourceCodeOnFailure: { + full: 'dump-sourcecode-on-failure', + flag: true, + default: true, + help: 'Dump the generated source code to a special named file when the internal generator tests fail, i.e. when the generated source code does not compile in the JavaScript engine. Enabling this option helps you to diagnose/debug crashes (thrown exceptions) in the code generator due to various reasons: you can, for example, load the dumped sourcecode in another environment (e.g. NodeJS) to get more info on the precise location and cause of the compile failure.' + }, + throwErrorOnCompileFailure: { + full: 'throw-on-compile-failure', + flag: true, + default: true, + help: 'Throw an exception when the generated source code fails to compile in the JavaScript engine. **WARNING**: Turning this feature OFF permits the code generator to produce non-working source code and treat that as SUCCESS. This MAY be desirable code generator behaviour, but only rarely.' 
+ }, + reportStats: { + full: 'info', + abbr: 'I', + flag: true, + default: false, + help: 'Report some statistics about the generated parser' + }, + moduleType: { + full: 'module-type', + abbr: 't', + default: 'commonjs', + metavar: 'TYPE', + choices: ['commonjs', 'amd', 'js', 'es'], + help: 'The type of module to generate (commonjs, amd, es, js)' + }, + moduleName: { + full: 'module-name', + abbr: 'n', + metavar: 'NAME', + help: 'The name of the generated parser object, namespace supported' + }, + main: { + full: 'main', + abbr: 'x', + flag: true, + default: false, + help: 'Include .main() entry point in generated commonjs module' + }, + moduleMain: { + full: 'module-main', + abbr: 'y', + metavar: 'NAME', + help: 'The main module function definition' + }, + version: { + abbr: 'V', + flag: true, + help: 'print version and exit', + callback: function () { + return version; + } + } + }).parse(); + + return opts; +} + +var cli = module.exports; + +cli.main = function cliMain(opts) { + 'use strict'; + + opts = RegExpLexer.mkStdOptions(opts); + + function isDirectory(fp) { + try { + return fs.lstatSync(fp).isDirectory(); + } catch (e) { + return false; + } + } + + function mkdirp(fp) { + if (!fp || fp === '.' || fp.length === 0) { + return false; + } + try { + fs.mkdirSync(fp); + return true; + } catch (e) { + if (e.code === 'ENOENT') { + var parent = path.dirname(fp); + // Did we hit the root directory by now? If so, abort! + // Else, create the parent; iff that fails, we fail too... + if (parent !== fp && mkdirp(parent)) { + try { + // Retry creating the original directory: it should succeed now + fs.mkdirSync(fp); + return true; + } catch (e) { + return false; + } + } + } + } + return false; + } + + function processInputFile() { + // getting raw files + var original_cwd = process.cwd(); + + var raw = fs.readFileSync(path.normalize(opts.file), 'utf8'); + + // making best guess at json mode + opts.json = path.extname(opts.file) === '.json' || opts.json; + + // When only the directory part of the output path was specified, then we + // do NOT have the target module name in there as well! + var outpath = opts.outfile; + if (/[\\\/]$/.test(outpath) || isDirectory(outpath)) { + opts.outfile = null; + outpath = outpath.replace(/[\\\/]$/, ''); + } + if (outpath && outpath.length > 0) { + outpath += '/'; + } else { + outpath = ''; + } + + // setting output file name and module name based on input file name + // if they aren't specified. + var name = path.basename(opts.outfile || opts.file); + + // get the base name (i.e. the file name without extension) + // i.e. 
strip off only the extension and keep any other dots in the filename + name = path.basename(name, path.extname(name)); + + opts.outfile = opts.outfile || (outpath + name + '.js'); + if (!opts.moduleName && name) { + opts.moduleName = opts.defaultModuleName = name.replace(/-\w/g, + function (match) { + return match.charAt(1).toUpperCase(); + }); + } + + // Change CWD to the directory where the source grammar resides: this helps us properly + // %include any files mentioned in the grammar with relative paths: + var new_cwd = path.dirname(path.normalize(opts.file)); + process.chdir(new_cwd); + + var lexer = cli.generateLexerString(raw, opts); + + // and change back to the CWD we started out with: + process.chdir(original_cwd); + + mkdirp(path.dirname(opts.outfile)); + fs.writeFileSync(opts.outfile, lexer); + console.log('JISON-LEX output for module [' + opts.moduleName + '] has been written to file:', opts.outfile); + } + + function readin(cb) { + var stdin = process.openStdin(), + data = ''; + + stdin.setEncoding('utf8'); + stdin.addListener('data', function (chunk) { + data += chunk; + }); + stdin.addListener('end', function () { + cb(data); + }); + } + + function processStdin() { + readin(function processStdinReadInCallback(raw) { + console.log(cli.generateLexerString(raw, opts)); + }); + } + + // if an input file wasn't given, assume input on stdin + if (opts.file) { + processInputFile(); + } else { + processStdin(); + } +}; + +cli.generateLexerString = function generateLexerString(lexerSpec, opts) { + 'use strict'; + + // var settings = RegExpLexer.mkStdOptions(opts); + var predefined_tokens = null; + + return RegExpLexer.generate(lexerSpec, predefined_tokens, opts); +}; + + +if (require.main === module) { + var opts = getCommandlineOptions(); + cli.main(opts); +} + +}))); diff --git a/dist/regexp-lexer-cjs-es5.js b/dist/regexp-lexer-cjs-es5.js new file mode 100644 index 0000000..c7e575b --- /dev/null +++ b/dist/regexp-lexer-cjs-es5.js @@ -0,0 +1,2547 @@ +'use strict'; + +var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) { return typeof obj; } : function (obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }; + +var _templateObject = _taggedTemplateLiteral(['\n var __hacky_counter__ = 0;\n\n /**\n * @constructor\n * @nocollapse\n */\n function XRegExp(re, f) {\n this.re = re;\n this.flags = f;\n this._getUnicodeProperty = function (k) {};\n var fake = /./; // WARNING: this exact \'fake\' is also depended upon by the xregexp unit test!\n __hacky_counter__++;\n fake.__hacky_backy__ = __hacky_counter__;\n return fake;\n }\n '], ['\n var __hacky_counter__ = 0;\n\n /**\n * @constructor\n * @nocollapse\n */\n function XRegExp(re, f) {\n this.re = re;\n this.flags = f;\n this._getUnicodeProperty = function (k) {};\n var fake = /./; // WARNING: this exact \'fake\' is also depended upon by the xregexp unit test!\n __hacky_counter__++;\n fake.__hacky_backy__ = __hacky_counter__;\n return fake;\n }\n ']), + _templateObject2 = _taggedTemplateLiteral(['\n return ', ';\n'], ['\n return ', ';\n']), + _templateObject3 = _taggedTemplateLiteral(['\n // Code Generator Information Report\n // ---------------------------------\n //\n // Options:\n //\n // backtracking: .................... ', '\n // location.ranges: ................. ', '\n // location line+column tracking: ... ', '\n //\n //\n // Forwarded Parser Analysis flags:\n //\n // uses yyleng: ..................... 
', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses lexer values: ............... ', ' / ', '\n // location tracking: ............... ', '\n // location assignment: ............. ', '\n //\n //\n // Lexer Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses ParseError API: ............. ', '\n // uses yyerror: .................... ', '\n // uses location tracking & editing: ', '\n // uses more() API: ................. ', '\n // uses unput() API: ................ ', '\n // uses reject() API: ............... ', '\n // uses less() API: ................. ', '\n // uses display APIs pastInput(), upcomingInput(), showPosition():\n // ............................. ', '\n // uses describeYYLLOC() API: ....... ', '\n //\n // --------- END OF REPORT -----------\n\n '], ['\n // Code Generator Information Report\n // ---------------------------------\n //\n // Options:\n //\n // backtracking: .................... ', '\n // location.ranges: ................. ', '\n // location line+column tracking: ... ', '\n //\n //\n // Forwarded Parser Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses lexer values: ............... ', ' / ', '\n // location tracking: ............... ', '\n // location assignment: ............. ', '\n //\n //\n // Lexer Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses ParseError API: ............. ', '\n // uses yyerror: .................... ', '\n // uses location tracking & editing: ', '\n // uses more() API: ................. ', '\n // uses unput() API: ................ ', '\n // uses reject() API: ............... ', '\n // uses less() API: ................. ', '\n // uses display APIs pastInput(), upcomingInput(), showPosition():\n // ............................. ', '\n // uses describeYYLLOC() API: ....... ', '\n //\n // --------- END OF REPORT -----------\n\n ']), + _templateObject4 = _taggedTemplateLiteral(['\n var lexer = {\n '], ['\n var lexer = {\n ']), + _templateObject5 = _taggedTemplateLiteral([',\n JisonLexerError: JisonLexerError,\n performAction: ', ',\n simpleCaseActionClusters: ', ',\n rules: [\n ', '\n ],\n conditions: ', '\n };\n '], [',\n JisonLexerError: JisonLexerError,\n performAction: ', ',\n simpleCaseActionClusters: ', ',\n rules: [\n ', '\n ],\n conditions: ', '\n };\n ']), + _templateObject6 = _taggedTemplateLiteral(['\n /* lexer generated by jison-lex ', ' */\n\n /*\n * Returns a Lexer object of the following structure:\n *\n * Lexer: {\n * yy: {} The so-called "shared state" or rather the *source* of it;\n * the real "shared state" `yy` passed around to\n * the rule actions, etc. 
is a direct reference!\n *\n * This "shared context" object was passed to the lexer by way of \n * the `lexer.setInput(str, yy)` API before you may use it.\n *\n * This "shared context" object is passed to the lexer action code in `performAction()`\n * so userland code in the lexer actions may communicate with the outside world \n * and/or other lexer rules\' actions in more or less complex ways.\n *\n * }\n *\n * Lexer.prototype: {\n * EOF: 1,\n * ERROR: 2,\n *\n * yy: The overall "shared context" object reference.\n *\n * JisonLexerError: function(msg, hash),\n *\n * performAction: function lexer__performAction(yy, yyrulenumber, YY_START),\n *\n * The function parameters and `this` have the following value/meaning:\n * - `this` : reference to the `lexer` instance. \n * `yy_` is an alias for `this` lexer instance reference used internally.\n *\n * - `yy` : a reference to the `yy` "shared state" object which was passed to the lexer\n * by way of the `lexer.setInput(str, yy)` API before.\n *\n * Note:\n * The extra arguments you specified in the `%parse-param` statement in your\n * **parser** grammar definition file are passed to the lexer via this object\n * reference as member variables.\n *\n * - `yyrulenumber` : index of the matched lexer rule (regex), used internally.\n *\n * - `YY_START`: the current lexer "start condition" state.\n *\n * parseError: function(str, hash, ExceptionClass),\n *\n * constructLexErrorInfo: function(error_message, is_recoverable),\n * Helper function.\n * Produces a new errorInfo \'hash object\' which can be passed into `parseError()`.\n * See it\'s use in this lexer kernel in many places; example usage:\n *\n * var infoObj = lexer.constructParseErrorInfo(\'fail!\', true);\n * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);\n *\n * options: { ... lexer %options ... },\n *\n * lex: function(),\n * Produce one token of lexed input, which was passed in earlier via the `lexer.setInput()` API.\n * You MAY use the additional `args...` parameters as per `%parse-param` spec of the **lexer** grammar:\n * these extra `args...` are added verbatim to the `yy` object reference as member variables.\n *\n * WARNING:\n * Lexer\'s additional `args...` parameters (via lexer\'s `%parse-param`) MAY conflict with\n * any attributes already added to `yy` by the **parser** or the jison run-time; \n * when such a collision is detected an exception is thrown to prevent the generated run-time \n * from silently accepting this confusing and potentially hazardous situation! \n *\n * cleanupAfterLex: function(do_not_nuke_errorinfos),\n * Helper function.\n *\n * This helper API is invoked when the **parse process** has completed: it is the responsibility\n * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. 
\n *\n * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.\n *\n * setInput: function(input, [yy]),\n *\n *\n * input: function(),\n *\n *\n * unput: function(str),\n *\n *\n * more: function(),\n *\n *\n * reject: function(),\n *\n *\n * less: function(n),\n *\n *\n * pastInput: function(n),\n *\n *\n * upcomingInput: function(n),\n *\n *\n * showPosition: function(),\n *\n *\n * test_match: function(regex_match_array, rule_index),\n *\n *\n * next: function(),\n *\n *\n * begin: function(condition),\n *\n *\n * pushState: function(condition),\n *\n *\n * popState: function(),\n *\n *\n * topState: function(),\n *\n *\n * _currentRules: function(),\n *\n *\n * stateStackSize: function(),\n *\n *\n * performAction: function(yy, yy_, yyrulenumber, YY_START),\n *\n *\n * rules: [...],\n *\n *\n * conditions: {associative list: name ==> set},\n * }\n *\n *\n * token location info (`yylloc`): {\n * first_line: n,\n * last_line: n,\n * first_column: n,\n * last_column: n,\n * range: [start_number, end_number]\n * (where the numbers are indexes into the input string, zero-based)\n * }\n *\n * ---\n *\n * The `parseError` function receives a \'hash\' object with these members for lexer errors:\n *\n * {\n * text: (matched text)\n * token: (the produced terminal token, if any)\n * token_id: (the produced terminal token numeric ID, if any)\n * line: (yylineno)\n * loc: (yylloc)\n * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n * available for this particular error)\n * yy: (object: the current parser internal "shared state" `yy`\n * as is also available in the rule actions; this can be used,\n * for instance, for advanced error analysis and reporting)\n * lexer: (reference to the current lexer instance used by the parser)\n * }\n *\n * while `this` will reference the current lexer instance.\n *\n * When `parseError` is invoked by the lexer, the default implementation will\n * attempt to invoke `yy.parser.parseError()`; when this callback is not provided\n * it will try to invoke `yy.parseError()` instead. When that callback is also not\n * provided, a `JisonLexerError` exception will be thrown containing the error\n * message and `hash`, as constructed by the `constructLexErrorInfo()` API.\n *\n * Note that the lexer\'s `JisonLexerError` error class is passed via the\n * `ExceptionClass` argument, which is invoked to construct the exception\n * instance to be thrown, so technically `parseError` will throw the object\n * produced by the `new ExceptionClass(str, hash)` JavaScript expression.\n *\n * ---\n *\n * You can specify lexer options by setting / modifying the `.options` object of your Lexer instance.\n * These options are available:\n *\n * (Options are permanent.)\n * \n * yy: {\n * parseError: function(str, hash, ExceptionClass)\n * optional: overrides the default `parseError` function.\n * }\n *\n * lexer.options: {\n * pre_lex: function()\n * optional: is invoked before the lexer is invoked to produce another token.\n * `this` refers to the Lexer object.\n * post_lex: function(token) { return token; }\n * optional: is invoked when the lexer has produced a token `token`;\n * this function can override the returned token value by returning another.\n * When it does not return any (truthy) value, the lexer will return\n * the original `token`.\n * `this` refers to the Lexer object.\n *\n * WARNING: the next set of options are not meant to be changed. 
They echo the abilities of\n * the lexer as per when it was compiled!\n *\n * ranges: boolean\n * optional: `true` ==> token location info will include a .range[] member.\n * flex: boolean\n * optional: `true` ==> flex-like lexing behaviour where the rules are tested\n * exhaustively to find the longest match.\n * backtrack_lexer: boolean\n * optional: `true` ==> lexer regexes are tested in order and for invoked;\n * the lexer terminates the scan when a token is returned by the action code.\n * xregexp: boolean\n * optional: `true` ==> lexer rule regexes are "extended regex format" requiring the\n * `XRegExp` library. When this %option has not been specified at compile time, all lexer\n * rule regexes have been written as standard JavaScript RegExp expressions.\n * }\n */\n '], ['\n /* lexer generated by jison-lex ', ' */\n\n /*\n * Returns a Lexer object of the following structure:\n *\n * Lexer: {\n * yy: {} The so-called "shared state" or rather the *source* of it;\n * the real "shared state" \\`yy\\` passed around to\n * the rule actions, etc. is a direct reference!\n *\n * This "shared context" object was passed to the lexer by way of \n * the \\`lexer.setInput(str, yy)\\` API before you may use it.\n *\n * This "shared context" object is passed to the lexer action code in \\`performAction()\\`\n * so userland code in the lexer actions may communicate with the outside world \n * and/or other lexer rules\' actions in more or less complex ways.\n *\n * }\n *\n * Lexer.prototype: {\n * EOF: 1,\n * ERROR: 2,\n *\n * yy: The overall "shared context" object reference.\n *\n * JisonLexerError: function(msg, hash),\n *\n * performAction: function lexer__performAction(yy, yyrulenumber, YY_START),\n *\n * The function parameters and \\`this\\` have the following value/meaning:\n * - \\`this\\` : reference to the \\`lexer\\` instance. \n * \\`yy_\\` is an alias for \\`this\\` lexer instance reference used internally.\n *\n * - \\`yy\\` : a reference to the \\`yy\\` "shared state" object which was passed to the lexer\n * by way of the \\`lexer.setInput(str, yy)\\` API before.\n *\n * Note:\n * The extra arguments you specified in the \\`%parse-param\\` statement in your\n * **parser** grammar definition file are passed to the lexer via this object\n * reference as member variables.\n *\n * - \\`yyrulenumber\\` : index of the matched lexer rule (regex), used internally.\n *\n * - \\`YY_START\\`: the current lexer "start condition" state.\n *\n * parseError: function(str, hash, ExceptionClass),\n *\n * constructLexErrorInfo: function(error_message, is_recoverable),\n * Helper function.\n * Produces a new errorInfo \\\'hash object\\\' which can be passed into \\`parseError()\\`.\n * See it\\\'s use in this lexer kernel in many places; example usage:\n *\n * var infoObj = lexer.constructParseErrorInfo(\\\'fail!\\\', true);\n * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);\n *\n * options: { ... lexer %options ... 
},\n *\n * lex: function(),\n * Produce one token of lexed input, which was passed in earlier via the \\`lexer.setInput()\\` API.\n * You MAY use the additional \\`args...\\` parameters as per \\`%parse-param\\` spec of the **lexer** grammar:\n * these extra \\`args...\\` are added verbatim to the \\`yy\\` object reference as member variables.\n *\n * WARNING:\n * Lexer\'s additional \\`args...\\` parameters (via lexer\'s \\`%parse-param\\`) MAY conflict with\n * any attributes already added to \\`yy\\` by the **parser** or the jison run-time; \n * when such a collision is detected an exception is thrown to prevent the generated run-time \n * from silently accepting this confusing and potentially hazardous situation! \n *\n * cleanupAfterLex: function(do_not_nuke_errorinfos),\n * Helper function.\n *\n * This helper API is invoked when the **parse process** has completed: it is the responsibility\n * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. \n *\n * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.\n *\n * setInput: function(input, [yy]),\n *\n *\n * input: function(),\n *\n *\n * unput: function(str),\n *\n *\n * more: function(),\n *\n *\n * reject: function(),\n *\n *\n * less: function(n),\n *\n *\n * pastInput: function(n),\n *\n *\n * upcomingInput: function(n),\n *\n *\n * showPosition: function(),\n *\n *\n * test_match: function(regex_match_array, rule_index),\n *\n *\n * next: function(),\n *\n *\n * begin: function(condition),\n *\n *\n * pushState: function(condition),\n *\n *\n * popState: function(),\n *\n *\n * topState: function(),\n *\n *\n * _currentRules: function(),\n *\n *\n * stateStackSize: function(),\n *\n *\n * performAction: function(yy, yy_, yyrulenumber, YY_START),\n *\n *\n * rules: [...],\n *\n *\n * conditions: {associative list: name ==> set},\n * }\n *\n *\n * token location info (\\`yylloc\\`): {\n * first_line: n,\n * last_line: n,\n * first_column: n,\n * last_column: n,\n * range: [start_number, end_number]\n * (where the numbers are indexes into the input string, zero-based)\n * }\n *\n * ---\n *\n * The \\`parseError\\` function receives a \\\'hash\\\' object with these members for lexer errors:\n *\n * {\n * text: (matched text)\n * token: (the produced terminal token, if any)\n * token_id: (the produced terminal token numeric ID, if any)\n * line: (yylineno)\n * loc: (yylloc)\n * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n * available for this particular error)\n * yy: (object: the current parser internal "shared state" \\`yy\\`\n * as is also available in the rule actions; this can be used,\n * for instance, for advanced error analysis and reporting)\n * lexer: (reference to the current lexer instance used by the parser)\n * }\n *\n * while \\`this\\` will reference the current lexer instance.\n *\n * When \\`parseError\\` is invoked by the lexer, the default implementation will\n * attempt to invoke \\`yy.parser.parseError()\\`; when this callback is not provided\n * it will try to invoke \\`yy.parseError()\\` instead. 
When that callback is also not\n * provided, a \\`JisonLexerError\\` exception will be thrown containing the error\n * message and \\`hash\\`, as constructed by the \\`constructLexErrorInfo()\\` API.\n *\n * Note that the lexer\\\'s \\`JisonLexerError\\` error class is passed via the\n * \\`ExceptionClass\\` argument, which is invoked to construct the exception\n * instance to be thrown, so technically \\`parseError\\` will throw the object\n * produced by the \\`new ExceptionClass(str, hash)\\` JavaScript expression.\n *\n * ---\n *\n * You can specify lexer options by setting / modifying the \\`.options\\` object of your Lexer instance.\n * These options are available:\n *\n * (Options are permanent.)\n * \n * yy: {\n * parseError: function(str, hash, ExceptionClass)\n * optional: overrides the default \\`parseError\\` function.\n * }\n *\n * lexer.options: {\n * pre_lex: function()\n * optional: is invoked before the lexer is invoked to produce another token.\n * \\`this\\` refers to the Lexer object.\n * post_lex: function(token) { return token; }\n * optional: is invoked when the lexer has produced a token \\`token\\`;\n * this function can override the returned token value by returning another.\n * When it does not return any (truthy) value, the lexer will return\n * the original \\`token\\`.\n * \\`this\\` refers to the Lexer object.\n *\n * WARNING: the next set of options are not meant to be changed. They echo the abilities of\n * the lexer as per when it was compiled!\n *\n * ranges: boolean\n * optional: \\`true\\` ==> token location info will include a .range[] member.\n * flex: boolean\n * optional: \\`true\\` ==> flex-like lexing behaviour where the rules are tested\n * exhaustively to find the longest match.\n * backtrack_lexer: boolean\n * optional: \\`true\\` ==> lexer regexes are tested in order and for invoked;\n * the lexer terminates the scan when a token is returned by the action code.\n * xregexp: boolean\n * optional: \\`true\\` ==> lexer rule regexes are "extended regex format" requiring the\n * \\`XRegExp\\` library. When this %option has not been specified at compile time, all lexer\n * rule regexes have been written as standard JavaScript RegExp expressions.\n * }\n */\n ']), + _templateObject7 = _taggedTemplateLiteral(['\n export {\n lexer,\n yylex as lex\n };\n '], ['\n export {\n lexer,\n yylex as lex\n };\n ']); + +function _taggedTemplateLiteral(strings, raw) { return Object.freeze(Object.defineProperties(strings, { raw: { value: Object.freeze(raw) } })); } + +function _interopDefault(ex) { + return ex && (typeof ex === 'undefined' ? 'undefined' : _typeof(ex)) === 'object' && 'default' in ex ? ex['default'] : ex; +} + +var XRegExp = _interopDefault(require('@gerhobbelt/xregexp')); +var json5 = _interopDefault(require('@gerhobbelt/json5')); +var lexParser = _interopDefault(require('@gerhobbelt/lex-parser')); +var assert = _interopDefault(require('assert')); +var helpers = _interopDefault(require('jison-helpers-lib')); + +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. 
Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +var XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +var CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; +var SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +var NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +var SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + +var UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + +// The expanded regex sets which are equivalent to the given `\\{c}` escapes: +// +// `/\s/`: +var WHITESPACE_SETSTR$1 = ' \f\n\r\t\x0B\xA0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\uFEFF'; +// `/\d/`: +var DIGIT_SETSTR$1 = '0-9'; +// `/\w/`: +var WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + +// Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex +function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: + // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: + // '[' + return '\\['; + + case 92: + // '\\' + return '\\\\'; + + case 93: + // ']' + return '\\]'; + + case 94: + // ']' + return '\\^'; + } + if (i < 32 || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || i >= 0xD800 && i <= 0xDFFF /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go... + x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); +} + +// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating +// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a +// `\\p{NAME}` shorthand to represent [part of] the bitarray: +var Pcodes_bitarray_cache = {}; +var Pcodes_bitarray_cache_test_order = []; + +// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by +// a single regex 'escape', e.g. `\d` for digits 0-9. +var EscCode_bitarray_output_refs; + +// now initialize the EscCodes_... 
table above: +init_EscCode_lookup_table(); + +function init_EscCode_lookup_table() { + var s, + bitarr, + set2esc = {}, + esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S/`: + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s/`: + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D/`: + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d/`: + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W/`: + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w/`: + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); +} + +function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: we only need one marker per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. + // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as this pcode + // then won't be tested during optimization. + // + // Now that would be a pity, so the assumption had better hold... + // Turns out the assumption already fails for /\S/ + /\D/, + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available...
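+ // NB (added illustration): with only the six `\d`/`\D`/`\w`/`\W`/`\s`/`\S` escapes cached, every character is matched by exactly three of them (one from each complementary pair), so the uniqueness test below never fires and every escape receives its marker from the minimum-overlap fallback loop instead.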
+ var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; +} + +// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. +function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. \\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for the characters in the set. Inversion means the complement set of characters is set instead. + // This results in an OR operation when sets are joined/chained.
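+ // Worked example (illustration only): for the input set 'a-e' the loop below marks codepoints 97..101 in `bitarr`; for the inverted input '^a-e' the same marks land in the scratch array and are flipped into the caller's `bitarr_orig` once scanning completes.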
+ + while (s.length) { + c1 = s.match(CHR_RE$1); + if (!c1) { + // hit an illegal escape sequence? cope anyway! + c1 = s[0]; + } else { + c1 = c1[0]; + // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those + // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit + // XRegExp support, but alas, we'll get there when we get there... ;-) + switch (c1) { + case '\\p': + s = s.substr(c1.length); + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + // do we have this one cached already? + var pex = c1 + c2; + var ba4p = Pcodes_bitarray_cache[pex]; + if (!ba4p) { + // expand escape: + var xr = new XRegExp('[' + pex + ']'); // TODO: case-insensitive grammar??? + // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`: + var xs = '' + xr; + // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside: + xs = xs.substr(1, xs.length - 2); + + ba4p = reduceRegexToSetBitArray(xs, pex, opts); + + Pcodes_bitarray_cache[pex] = ba4p; + updatePcodesBitarrayCacheTestOrder(opts); + } + // merge bitarrays: + add2bitarray(bitarr, ba4p); + continue; + } + break; + + case '\\S': + case '\\s': + case '\\W': + case '\\w': + case '\\d': + case '\\D': + // these can't participate in a range, but need to be treated specially: + s = s.substr(c1.length); + // check for \S, \s, \D, \d, \W, \w and expand them: + var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]]; + assert(ba4e); + add2bitarray(bitarr, ba4e); + continue; + + case '\\b': + // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace + c1 = '\b'; + break; + } + } + var v1 = eval_escaped_code(c1); + // propagate deferred exceptions = error reports. + if (v1 instanceof Error) { + return v1; + } + v1 = v1.charCodeAt(0); + s = s.substr(c1.length); + + if (s[0] === '-' && s.length >= 2) { + // we can expect a range like 'a-z': + s = s.substr(1); + c2 = s.match(CHR_RE$1); + if (!c2) { + // hit an illegal escape sequence? cope anyway! + c2 = s[0]; + } else { + c2 = c2[0]; + } + var v2 = eval_escaped_code(c2); + // propagate deferred exceptions = error reports. + if (v2 instanceof Error) { + return v2; + } + v2 = v2.charCodeAt(0); + s = s.substr(c2.length); + + // legal ranges go UP, not DOWN! + if (v1 <= v2) { + mark(v1, v2); + } else { + console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 }); + mark(v1); + mark('-'.charCodeAt(0)); + mark(v2); + } + continue; + } + mark(v1); + } + + // When we have marked all slots, '^' NEGATES the set, hence we flip all slots. + // + // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK + // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire + // range then. + if (set_is_inverted) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!bitarr[i]) { + bitarr_orig[i] = true; + } + } + } + } + return false; +} + +// convert a simple bitarray back into a regex set `[...]` content: +function bitarray2set(l, output_inverted_variant, output_minimized) { + // construct the inverse(?)
set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... 
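+ // Example (illustrative, added for clarity): a bitarray holding exactly codepoints 48..57 yields the set text '0-9' here when minification is off, which the `set2esc` lookup at the bottom of this function then collapses to the shorter escape '\d'.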
+ cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + j = UNICODE_BASE_PLANE_MAX_CP$1 + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; +} + +// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; +// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. +function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE$1); + if (!c1) { + // cope with illegal escape sequences too! 
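+ // (e.g. a regex fragment ending in a lone, unescaped backslash fails the CHR_RE$1 match above and is reported here as an illegal escape sequence)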
+ return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"'); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + while (s.length) { + var inner = s.match(SET_PART_RE$1); + if (!inner) { + inner = s.match(CHR_RE$1); + if (!inner) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: ' + s + '" of regex "' + orig + '"'); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + var c2 = s.match(CHR_RE$1); + if (!c2) { + // cope with illegal escape sequences too! + return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error('regex set expression is broken in regex: ' + orig); + } + s = s.substr(c2.length); + + var se = set_content.join(''); + if (!internal_state) { + derr = set2bitarray(l, se, opts); + // propagate deferred exceptions = error reports. + if (derr instanceof Error) { + return derr; + } + + // a set is to use like a single character in a longer literal phrase, hence input `[abc]word[def]` would thus produce output `[abc]`: + internal_state = 1; + } + break; + + // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into + // something ready for use inside a regex set, e.g. `\\r\\n`. + // + // > Of course, we realize that converting more complex piped constructs this way + // > will produce something you might not expect, e.g. `A|WORD2` which + // > would end up as the set `[AW]` which is something else than the input + // > entirely. + // > + // > However, we can only depend on the user (grammar writer) to realize this and + // > prevent this from happening by not creating such oddities in the input grammar. + case '|': + // a|b --> [ab] + internal_state = 0; + break; + + case '(': + // (a) --> a + // + // TODO - right now we treat this as 'too complex': + + // Strip off some possible outer wrappers which we know how to remove. + // We don't worry about 'damaging' the regex as any too-complex regex will be caught + // in the validation check at the end; our 'strippers' here would not damage useful + // regexes anyway and them damaging the unacceptable ones is fine. + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... + s = s.replace(/^\^?(.*?)\$?$/, '$1'); // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: one before, once after this) + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... + + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + case '.': + case '*': + case '+': + case '?': + // wildcard + // + // TODO - right now we treat this as 'too complex': + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + case '{': + // range, e.g. `x{1,3}`, or macro? 
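+ // (illustration: both the repetition 'x{1,3}' and a '{NAME}' macro reference land in this case; either way the macro is declared unsuitable for in-set use below)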
+ // TODO - right now we treat this as 'too complex': + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + default: + // literal character or word: take the first character only and ignore the rest, so that + // the constructed set for `word|noun` would be `[wb]`: + if (!internal_state) { + derr = set2bitarray(l, c1, opts); + // propagate deferred exceptions = error reports. + if (derr instanceof Error) { + return derr; + } + + internal_state = 2; + } + break; + } + } + + s = bitarray2set(l); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + assert(s); + assert(!(s instanceof Error)); + re = new XRegExp('[' + s + ']'); + re.test(s[0]); + + // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]` + // so we check for lingering UNESCAPED brackets in here as those cannot be: + if (/[^\\][\[\]]/.exec(s)) { + throw new Error('unescaped brackets in set data'); + } + } catch (ex) { + // make sure we produce a set range expression which will fail badly when it is used + // in actual code: + s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message); + } + + assert(s); + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + return l; +} + +// Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]` +// -- or in this example it can be further optimized to only `\d`! +function produceOptimizedRegex4Set(bitarr) { + // First try to produce a minimum regex from the bitarray directly: + var s1 = bitarray2set(bitarr, false, true); + + // and when the regex set turns out to match a single pcode/escape, then + // use that one as-is: + if (s1.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s1; + } else { + s1 = '[' + s1 + ']'; + } + + // Now try to produce a minimum regex from the *inverted* bitarray via negation: + // Because we look at a negated bitset, there's no use looking for matches with + // special cases here. + var s2 = bitarray2set(bitarr, true, true); + + if (s2[0] === '^') { + s2 = s2.substr(1); + if (s2.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s2; + } + } else { + s2 = '^' + s2; + } + s2 = '[' + s2 + ']'; + + // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end, + // we also check against the plain, unadulterated regex set expressions: + // + // First try to produce a minimum regex from the bitarray directly: + var s3 = bitarray2set(bitarr, false, false); + + // and when the regex set turns out to match a single pcode/escape, then + // use that one as-is: + if (s3.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s3; + } else { + s3 = '[' + s3 + ']'; + } + + // Now try to produce a minimum regex from the *inverted* bitarray via negation: + // Because we look at a negated bitset, there's no use looking for matches with + // special cases here. 
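+ // Illustration (added): for a digits-only bitarray the first candidate s1 already comes back as '\d' and wins at the early return above; the inverted spellings (s2 above, s4 below) only pay off for sets such as [^a-z] where the negated notation is the shorter one.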
+ var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; +} + +var setmgmt = { + XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1, + CHR_RE: CHR_RE$1, + SET_PART_RE: SET_PART_RE$1, + NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1, + SET_IS_SINGLE_PCODE_RE: SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1, + + WHITESPACE_SETSTR: WHITESPACE_SETSTR$1, + DIGIT_SETSTR: DIGIT_SETSTR$1, + WORDCHAR_SETSTR: WORDCHAR_SETSTR$1, + + set2bitarray: set2bitarray, + bitarray2set: bitarray2set, + produceOptimizedRegex4Set: produceOptimizedRegex4Set, + reduceRegexToSetBitArray: reduceRegexToSetBitArray +}; + +// Basic Lexer implemented using JavaScript regular expressions +// Zachary Carter +// MIT Licensed + +var rmCommonWS = helpers.rmCommonWS; +var camelCase = helpers.camelCase; +var code_exec = helpers.exec; +// import recast from '@gerhobbelt/recast'; +// import astUtils from '@gerhobbelt/ast-util'; +var version = '0.6.1-205'; // require('./package.json').version; + + +var XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +var CHR_RE = setmgmt.CHR_RE; +var SET_PART_RE = setmgmt.SET_PART_RE; +var NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; +var UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + +// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) +// +// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! +var ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + +// see also ./lib/cli.js +/** +@public +@nocollapse +*/ +var defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. start+end indexes in the input string + trackPosition: true, // track line+column position in the input string + caseInsensitive: false, + showSource: false, + exportSourceCode: false, + exportAST: false, + prettyCfg: true, + pre_lex: undefined, + post_lex: undefined +}; + +// Merge sets of options. +// +// Convert alternative jison option names to their base option. +// +// The *last* option set which overrides the default wins, where 'override' is +// defined as specifying a not-undefined value which is not equal to the +// default value. +// +// When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the +// default values avialable in Jison.defaultJisonOptions. +// +// Return a fresh set of options. 
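+// +// Hypothetical usage sketch (added for illustration; not part of the generated build output): +// +// var opts = mkStdOptions({ xregexp: true }, { 'case-insensitive': true }); +// // => a fresh options object carrying the defaults plus the camelCased overrides, +// // i.e. opts.xregexp === true and opts.caseInsensitive === true.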
+/** @public */ +function mkStdOptions() /*...args*/{ + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; +} + +// set up export/output attributes of the `options` object instance +function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || (typeof exportSourceCode === 'undefined' ? 'undefined' : _typeof(exportSourceCode)) !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; +} + +// Autodetect if the input lexer spec is in JSON or JISON +// format when the `options.json` flag is `true`. +// +// Produce the JSON lexer spec result when these are JSON formatted already as that +// would save us the trouble of doing this again, anywhere else in the JISON +// compiler/generator. +// +// Otherwise return the *parsed* lexer spec as it has +// been processed through LexParser. +function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! 
Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} + +// expand macros and convert matchers to RegExp's +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, + i, + k, + rule, + action, + conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. 
+ // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count + }; +} + +// expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or +// elsewhere, which requires two different treatments to expand these macros. +function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. 
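+ // (e.g. a set body of '\p{Number}' still contains '{', so `has_expansions` stays true below and the expanded, optimized spelling `s1` is preferred even when the raw source spelling would happen to be shorter)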
+ var has_expansions = se.indexOf('{') >= 0; + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. + c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; +} + +// expand macros within macros and cache the result +function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. 
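+ // (illustration: with DIGIT defined as 0-9, a macro ALNUM defined as {DIGIT}a-zA-Z is expanded in this loop to 0-9a-zA-Z before being reduced to a set bitarray)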
+                        //
+                        // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}`
+                        // macros here:
+                        if (XRegExp._getUnicodeProperty(k)) {
+                            // Work-around so that you can use `\p{ascii}` for an XRegExp slug, a.k.a.
+                            // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories,
+                            // while using `\p{ASCII}` as a *macro expansion* of the `ASCII`
+                            // macro:
+                            if (k.toUpperCase() !== k) {
+                                m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.');
+                                break;
+                            }
+                        }
+
+                        a = m.split('{' + k + '}');
+                        if (a.length > 1) {
+                            var x = expandMacroInSet(k);
+                            assert(x);
+                            if (x instanceof Error) {
+                                m = x;
+                                break;
+                            }
+                            m = a.join(x);
+                        }
+                    }
+                }
+            }
+
+            var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts);
+
+            var s1;
+
+            // propagate deferred exceptions = error reports.
+            if (mba instanceof Error) {
+                s1 = mba;
+            } else {
+                s1 = setmgmt.bitarray2set(mba, false);
+
+                m = s1;
+            }
+
+            macros[i] = {
+                in_set: s1,
+                elsewhere: null,
+                raw: dict_macros[i]
+            };
+        } else {
+            m = macros[i].in_set;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return new Error(m.message);
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandMacroElsewhere(i) {
+        var k, a, m;
+
+        if (macros[i].elsewhere == null) {
+            m = dict_macros[i];
+
+            // set up our own record so we can detect definition loops:
+            macros[i].elsewhere = false;
+
+            // the macro MAY contain other macros which MAY be inside a `[...]` set in this
+            // macro or elsewhere, hence we must parse the regex:
+            m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+            // propagate deferred exceptions = error reports.
+            if (m instanceof Error) {
+                return m;
+            }
+
+            macros[i].elsewhere = m;
+        } else {
+            m = macros[i].elsewhere;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return m;
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandAllMacrosInSet(s) {
+        var i, x;
+
+        // process *all* the macros inside [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroInSet(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message);
+                        }
+                        s = a.join(x);
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, x;
+
+        // When we process the remaining macro occurrences in the regex,
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    // These are all submacro expansions, hence non-capturing grouping is applied:
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroElsewhere(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message);
+                        }
+                        s = a.join('(?:' + x + ')');
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    var m, i;
+
+    if (opts.debug) console.log('\n############## RAW macros: ', dict_macros);
+
+    // first we create the part of the dictionary which is targeting the use of macros
+    // *inside* `[...]` sets; once we have completed that half of the expansions work,
+    // we then go and expand the macros for when they are used elsewhere in a regex:
+    // if we then encounter submacros which are used *inside* a set, we can use that
+    // first-half dictionary to speed things up a bit, as those expansions can be used
+    // straight away!
+    for (i in dict_macros) {
+        if (dict_macros.hasOwnProperty(i)) {
+            expandMacroInSet(i);
+        }
+    }
+
+    for (i in dict_macros) {
+        if (dict_macros.hasOwnProperty(i)) {
+            expandMacroElsewhere(i);
+        }
+    }
+
+    if (opts.debug) console.log('\n############### expanded macros: ', macros);
+
+    return macros;
+}
+
+// expand macros in a regex; expands them recursively
+function expandMacros(src, macros, opts) {
+    var expansion_count = 0;
+
+    // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already!
+    // Hence things should be easy in here:
+
+    function expandAllMacrosInSet(s) {
+        var i, m, x;
+
+        // process *all* the macros inside [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    m = macros[i];
+
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = m.in_set;
+
+                        assert(x);
+                        if (x instanceof Error) {
+                            // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                            throw x;
+                        }
+
+                        // detect definition loop:
+                        if (x === false) {
+                            return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                        }
+
+                        s = a.join(x);
+                        expansion_count++;
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, m, x;
+
+        // When we process the main macro occurrences in the regex,
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
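+        //
+        // A quick sketch of the grouping behaviour described above (hypothetical
+        // macros, for illustration only):
+        //
+        // ```
+        // ALPHA   [a-zA-Z]
+        // WORD    {ALPHA}+
+        //
+        // %%
+        // {WORD}   return 'WORD';
+        // ```
+        //
+        // The rule's `{WORD}` reference becomes a *capturing* group, while the
+        // `{ALPHA}` submacro was cached as a *non*-capturing group, so the rule
+        // regex comes out along the lines of `((?:[a-zA-Z])+)`: backreference
+        // indexes stay predictable no matter how deeply the macros nest.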
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    m = macros[i];
+
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        // These are all main macro expansions, hence CAPTURING grouping is applied:
+                        x = m.elsewhere;
+                        assert(x);
+
+                        // detect definition loop:
+                        if (x === false) {
+                            return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                        }
+
+                        s = a.join('(' + x + ')');
+                        expansion_count++;
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    // When we process the macro occurrences in the regex,
+    // every macro used in a lexer rule will become its own capture group.
+    //
+    // Meanwhile the cached expansion will have expanded any submacros into
+    // *NON*-capturing groups so that the backreference indexes remain as you'd
+    // expect and using macros doesn't require you to know exactly what your
+    // used macro will expand into, i.e. which and how many submacros it has.
+    //
+    // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+    var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+    // propagate deferred exceptions = error reports.
+    if (s2 instanceof Error) {
+        throw s2;
+    }
+
+    // Only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex(),
+    // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on their own,
+    // as long as no macros are involved...
+    //
+    // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`,
+    // unless the `xregexp` output option has been enabled.
+    if (expansion_count > 0 || src.indexOf('\\p{') >= 0 && !opts.options.xregexp) {
+        src = s2;
+    } else {
+        // Check if the reduced regex is smaller in size; when it is, we still go with the new one!
+        if (s2.length < src.length) {
+            src = s2;
+        }
+    }
+
+    return src;
+}
+
+function prepareStartConditions(conditions) {
+    var sc,
+        hash = {};
+    for (sc in conditions) {
+        if (conditions.hasOwnProperty(sc)) {
+            hash[sc] = { rules: [], inclusive: !conditions[sc] };
+        }
+    }
+    return hash;
+}
+
+function buildActions(dict, tokens, opts) {
+    var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;'];
+    var tok;
+    var toks = {};
+    var caseHelper = [];
+
+    // tokens: map/array of token numbers to token names
+    for (tok in tokens) {
+        var idx = parseInt(tok);
+        if (idx && idx > 0) {
+            toks[tokens[tok]] = idx;
+        }
+    }
+
+    if (opts.options.flex && dict.rules) {
+        dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule!
*/']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: 'function lexer__performAction(yy, yyrulenumber, YY_START) {\n var yy_ = this;\n\n ' + fun + '\n }', + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count + }; +} + +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + + var prelude = '/**\n * See also:\n * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508\n * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility\n * with userland code which might access the derived class in a \'classic\' way.\n *\n * @public\n * @constructor\n * @nocollapse\n */\nfunction JisonLexerError(msg, hash) {\n Object.defineProperty(this, \'name\', {\n enumerable: false,\n writable: false,\n value: \'JisonLexerError\'\n });\n\n if (msg == null) msg = \'???\';\n\n Object.defineProperty(this, \'message\', {\n enumerable: false,\n writable: true,\n value: msg\n });\n\n this.hash = hash;\n\n var stacktrace;\n if (hash && hash.exception instanceof Error) {\n var ex2 = hash.exception;\n this.message = ex2.message || msg;\n stacktrace = ex2.stack;\n }\n if (!stacktrace) {\n if (Error.hasOwnProperty(\'captureStackTrace\')) { // V8\n Error.captureStackTrace(this, this.constructor);\n } else {\n stacktrace = (new Error(msg)).stack;\n }\n }\n if (stacktrace) {\n Object.defineProperty(this, \'stack\', {\n enumerable: false,\n writable: false,\n value: stacktrace\n });\n }\n}\n\nif (typeof Object.setPrototypeOf === \'function\') {\n Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype);\n} else {\n JisonLexerError.prototype = Object.create(Error.prototype);\n}\nJisonLexerError.prototype.constructor = JisonLexerError;\nJisonLexerError.prototype.name = \'JisonLexerError\';'; + + // --- END lexer error class --- + + return prelude; +} + +var jisonLexerErrorDefinition = generateErrorClass(); + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS(_templateObject); +} + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. + // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... 
}; + // ``` + var testcode = ['// provide a local version for test purposes:', jisonLexerErrorDefinition, '', generateFakeXRegExpClassSrcCode(), '', source, '', 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (_typeof(lexer.options) !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, dict.rules && dict.rules.length > 0 ? 'One or more of your lexer state names are possibly botched?' 
: 'Your custom lexer is somehow botched.', ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = dict.rules ? dict.rules.length : 0; i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; +} + +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) +// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. 
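+//
+// (A sketch of the function-to-source-code trick mentioned below -- `getSource()`
+// is a hypothetical helper, not part of this module: any JavaScript engine can
+// reproduce a function's source, comments included, through its `toString()`:)
+//
+// ```
+// function getSource(f) {
+//     var src = String(f);
+//     // keep only the function *body*:
+//     return src.slice(src.indexOf('{') + 1, src.lastIndexOf('}'));
+// }
+// ```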
+// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- + return '{\n EOF: 1,\n ERROR: 2,\n\n // JisonLexerError: JisonLexerError, /// <-- injected by the code generator\n\n // options: {}, /// <-- injected by the code generator\n\n // yy: ..., /// <-- injected by setInput()\n\n __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state\n\n __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup\n\n __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been \'unfolded\' completely and is now ready for use\n\n done: false, /// INTERNAL USE ONLY\n _backtrack: false, /// INTERNAL USE ONLY\n _input: \'\', /// INTERNAL USE ONLY\n _more: false, /// INTERNAL USE ONLY\n _signaled_error_token: false, /// INTERNAL USE ONLY\n\n conditionStack: [], /// INTERNAL USE ONLY; managed via `pushState()`, `popState()`, `topState()` and `stateStackSize()`\n\n match: \'\', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. `match` is identical to `yytext` except that this one still contains the matched input string after `lexer.performAction()` has been invoked, where userland code MAY have changed/replaced the `yytext` value entirely!\n matched: \'\', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far\n matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt\n yytext: \'\', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the \'token value\' when the parser consumes the lexer token produced through a call to the `lex()` API.\n offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the \'cursor position\' in the input string, i.e. 
the number of characters matched so far\n yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (`yytext`)\n yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: \'line number\' at which the token under construction is located\n yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction\n\n /**\n * INTERNAL USE: construct a suitable error info hash object instance for `parseError`.\n * \n * @public\n * @this {RegExpLexer}\n */\n constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) {\n msg = \'\' + msg;\n\n // heuristic to determine if the error message already contains a (partial) source code dump\n // as produced by either `showPosition()` or `prettyPrintRange()`:\n if (show_input_position == undefined) {\n show_input_position = !(msg.indexOf(\'\\n\') > 0 && msg.indexOf(\'^\') > 0);\n }\n if (this.yylloc && show_input_position) {\n if (typeof this.prettyPrintRange === \'function\') {\n var pretty_src = this.prettyPrintRange(this.yylloc);\n\n if (!/\\n\\s*$/.test(msg)) {\n msg += \'\\n\';\n }\n msg += \'\\n Erroneous area:\\n\' + this.prettyPrintRange(this.yylloc); \n } else if (typeof this.showPosition === \'function\') {\n var pos_str = this.showPosition();\n if (pos_str) {\n if (msg.length && msg[msg.length - 1] !== \'\\n\' && pos_str[0] !== \'\\n\') {\n msg += \'\\n\' + pos_str;\n } else {\n msg += pos_str;\n }\n }\n }\n }\n /** @constructor */\n var pei = {\n errStr: msg,\n recoverable: !!recoverable,\n text: this.match, // This one MAY be empty; userland code should use the `upcomingInput` API to obtain more text which follows the \'lexer cursor position\'...\n token: null,\n line: this.yylineno,\n loc: this.yylloc,\n yy: this.yy,\n lexer: this,\n\n /**\n * and make sure the error info doesn\'t stay due to potential\n * ref cycle via userland code manipulations.\n * These would otherwise all be memory leak opportunities!\n * \n * Note that only array and object references are nuked as those\n * constitute the set of elements which can produce a cyclic ref.\n * The rest of the members is kept intact as they are harmless.\n * \n * @public\n * @this {LexErrorInfo}\n */\n destroy: function destructLexErrorInfo() {\n // remove cyclic references added to error info:\n // info.yy = null;\n // info.lexer = null;\n // ...\n var rec = !!this.recoverable;\n for (var key in this) {\n if (this.hasOwnProperty(key) && typeof key === \'object\') {\n this[key] = undefined;\n }\n }\n this.recoverable = rec;\n }\n };\n // track this instance so we can `destroy()` it once we deem it superfluous and ready for garbage collection!\n this.__error_infos.push(pei);\n return pei;\n },\n\n /**\n * handler which is invoked when a lexer error occurs.\n * \n * @public\n * @this {RegExpLexer}\n */\n parseError: function lexer_parseError(str, hash, ExceptionClass) {\n if (!ExceptionClass) {\n ExceptionClass = this.JisonLexerError;\n }\n if (this.yy) {\n if (this.yy.parser && typeof this.yy.parser.parseError === \'function\') {\n return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;\n } else if (typeof this.yy.parseError === \'function\') {\n return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;\n } \n }\n throw new ExceptionClass(str, hash);\n },\n\n /**\n * method which implements `yyerror(str, ...args)` functionality for use inside lexer actions.\n * \n * 
@public\n * @this {RegExpLexer}\n */\n yyerror: function yyError(str /*, ...args */) {\n var lineno_msg = \'\';\n if (this.yylloc) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': \' + str, this.options.lexerErrorsAreRecoverable);\n\n // Add any extra args to the hash under the name `extra_error_attributes`:\n var args = Array.prototype.slice.call(arguments, 1);\n if (args.length) {\n p.extra_error_attributes = args;\n }\n\n return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n },\n\n /**\n * final cleanup function for when we have completed lexing the input;\n * make it an API so that external code can use this one once userland\n * code has decided it\'s time to destroy any lingering lexer error\n * hash object instances and the like: this function helps to clean\n * up these constructs, which *may* carry cyclic references which would\n * otherwise prevent the instances from being properly and timely\n * garbage-collected, i.e. this function helps prevent memory leaks!\n * \n * @public\n * @this {RegExpLexer}\n */\n cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) {\n // prevent lingering circular references from causing memory leaks:\n this.setInput(\'\', {});\n\n // nuke the error hash info instances created during this run.\n // Userland code must COPY any data/references\n // in the error hash instance(s) it is more permanently interested in.\n if (!do_not_nuke_errorinfos) {\n for (var i = this.__error_infos.length - 1; i >= 0; i--) {\n var el = this.__error_infos[i];\n if (el && typeof el.destroy === \'function\') {\n el.destroy();\n }\n }\n this.__error_infos.length = 0;\n }\n\n return this;\n },\n\n /**\n * clear the lexer token context; intended for internal use only\n * \n * @public\n * @this {RegExpLexer}\n */\n clear: function lexer_clear() {\n this.yytext = \'\';\n this.yyleng = 0;\n this.match = \'\';\n // - DO NOT reset `this.matched`\n this.matches = false;\n this._more = false;\n this._backtrack = false;\n\n var col = (this.yylloc ? 
this.yylloc.last_column : 0);\n this.yylloc = {\n first_line: this.yylineno + 1,\n first_column: col,\n last_line: this.yylineno + 1,\n last_column: col,\n\n range: [this.offset, this.offset]\n };\n },\n\n /**\n * resets the lexer, sets new input\n * \n * @public\n * @this {RegExpLexer}\n */\n setInput: function lexer_setInput(input, yy) {\n this.yy = yy || this.yy || {};\n\n // also check if we\'ve fully initialized the lexer instance,\n // including expansion work to be done to go from a loaded\n // lexer to a usable lexer:\n if (!this.__decompressed) {\n // step 1: decompress the regex list:\n var rules = this.rules;\n for (var i = 0, len = rules.length; i < len; i++) {\n var rule_re = rules[i];\n\n // compression: is the RE an xref to another RE slot in the rules[] table?\n if (typeof rule_re === \'number\') {\n rules[i] = rules[rule_re];\n }\n }\n\n // step 2: unfold the conditions[] set to make these ready for use:\n var conditions = this.conditions;\n for (var k in conditions) {\n var spec = conditions[k];\n\n var rule_ids = spec.rules;\n\n var len = rule_ids.length;\n var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in `lexer_next()` fast and simple!\n var rule_new_ids = new Array(len + 1);\n\n for (var i = 0; i < len; i++) {\n var idx = rule_ids[i];\n var rule_re = rules[idx];\n rule_regexes[i + 1] = rule_re;\n rule_new_ids[i + 1] = idx;\n }\n\n spec.rules = rule_new_ids;\n spec.__rule_regexes = rule_regexes;\n spec.__rule_count = len;\n }\n\n this.__decompressed = true;\n }\n\n this._input = input || \'\';\n this.clear();\n this._signaled_error_token = false;\n this.done = false;\n this.yylineno = 0;\n this.matched = \'\';\n this.conditionStack = [\'INITIAL\'];\n this.__currentRuleSet__ = null;\n this.yylloc = {\n first_line: 1,\n first_column: 0,\n last_line: 1,\n last_column: 0,\n\n range: [0, 0]\n };\n this.offset = 0;\n return this;\n },\n\n /**\n * edit the remaining input via user-specified callback.\n * This can be used to forward-adjust the input-to-parse, \n * e.g. inserting macro expansions and alike in the\n * input which has yet to be lexed.\n * The behaviour of this API contrasts the `unput()` et al\n * APIs as those act on the *consumed* input, while this\n * one allows one to manipulate the future, without impacting\n * the current `yyloc` cursor location or any history. \n * \n * Use this API to help implement C-preprocessor-like\n * `#include` statements, etc.\n * \n * The provided callback must be synchronous and is\n * expected to return the edited input (string).\n *\n * The `cpsArg` argument value is passed to the callback\n * as-is.\n *\n * `callback` interface: \n * `function callback(input, cpsArg)`\n * \n * - `input` will carry the remaining-input-to-lex string\n * from the lexer.\n * - `cpsArg` is `cpsArg` passed into this API.\n * \n * The `this` reference for the callback will be set to\n * reference this lexer instance so that userland code\n * in the callback can easily and quickly access any lexer\n * API. \n *\n * When the callback returns a non-string-type falsey value,\n * we assume the callback did not edit the input and we\n * will using the input as-is.\n *\n * When the callback returns a non-string-type value, it\n * is converted to a string for lexing via the `"" + retval`\n * operation. 
(See also why: http://2ality.com/2012/03/converting-to-string.html \n * -- that way any returned object\'s `toValue()` and `toString()`\n * methods will be invoked in a proper/desirable order.)\n * \n * @public\n * @this {RegExpLexer}\n */\n editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) {\n var rv = callback.call(this, this._input, cpsArg);\n if (typeof rv !== \'string\') {\n if (rv) {\n this._input = \'\' + rv; \n }\n // else: keep `this._input` as is. \n } else {\n this._input = rv; \n }\n return this;\n },\n\n /**\n * consumes and returns one char from the input\n * \n * @public\n * @this {RegExpLexer}\n */\n input: function lexer_input() {\n if (!this._input) {\n //this.done = true; -- don\'t set `done` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <> tokens and perform user action code for a <> match, but only does so *once*)\n return null;\n }\n var ch = this._input[0];\n this.yytext += ch;\n this.yyleng++;\n this.offset++;\n this.match += ch;\n this.matched += ch;\n // Count the linenumber up when we hit the LF (or a stand-alone CR).\n // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo\n // and we advance immediately past the LF as well, returning both together as if\n // it was all a single \'character\' only.\n var slice_len = 1;\n var lines = false;\n if (ch === \'\\n\') {\n lines = true;\n } else if (ch === \'\\r\') {\n lines = true;\n var ch2 = this._input[1];\n if (ch2 === \'\\n\') {\n slice_len++;\n ch += ch2;\n this.yytext += ch2;\n this.yyleng++;\n this.offset++;\n this.match += ch2;\n this.matched += ch2;\n this.yylloc.range[1]++;\n }\n }\n if (lines) {\n this.yylineno++;\n this.yylloc.last_line++;\n this.yylloc.last_column = 0;\n } else {\n this.yylloc.last_column++;\n }\n this.yylloc.range[1]++;\n\n this._input = this._input.slice(slice_len);\n return ch;\n },\n\n /**\n * unshifts one char (or an entire string) into the input\n * \n * @public\n * @this {RegExpLexer}\n */\n unput: function lexer_unput(ch) {\n var len = ch.length;\n var lines = ch.split(/(?:\\r\\n?|\\n)/g);\n\n this._input = ch + this._input;\n this.yytext = this.yytext.substr(0, this.yytext.length - len);\n this.yyleng = this.yytext.length;\n this.offset -= len;\n this.match = this.match.substr(0, this.match.length - len);\n this.matched = this.matched.substr(0, this.matched.length - len);\n\n if (lines.length > 1) {\n this.yylineno -= lines.length - 1;\n\n this.yylloc.last_line = this.yylineno + 1;\n\n // Get last entirely matched line into the `pre_lines[]` array\'s\n // last index slot; we don\'t mind when other previously \n // matched lines end up in the array too. 
\n var pre = this.match;\n var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g);\n if (pre_lines.length === 1) {\n pre = this.matched;\n pre_lines = pre.split(/(?:\\r\\n?|\\n)/g);\n }\n this.yylloc.last_column = pre_lines[pre_lines.length - 1].length;\n } else {\n this.yylloc.last_column -= len;\n }\n\n this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng;\n\n this.done = false;\n return this;\n },\n\n /**\n * cache matched text and append it on next action\n * \n * @public\n * @this {RegExpLexer}\n */\n more: function lexer_more() {\n this._more = true;\n return this;\n },\n\n /**\n * signal the lexer that this rule fails to match the input, so the\n * next matching rule (regex) should be tested instead.\n * \n * @public\n * @this {RegExpLexer}\n */\n reject: function lexer_reject() {\n if (this.options.backtrack_lexer) {\n this._backtrack = true;\n } else {\n // when the `parseError()` call returns, we MUST ensure that the error is registered.\n // We accomplish this by signaling an \'error\' token to be produced for the current\n // `.lex()` run.\n var lineno_msg = \'\';\n if (this.yylloc) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).\', false);\n this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n }\n return this;\n },\n\n /**\n * retain first n characters of the match\n * \n * @public\n * @this {RegExpLexer}\n */\n less: function lexer_less(n) {\n return this.unput(this.match.slice(n));\n },\n\n /**\n * return (part of the) already matched input, i.e. for error\n * messages.\n * \n * Limit the returned string length to `maxSize` (default: 20).\n * \n * Limit the returned string to the `maxLines` number of lines of\n * input (default: 1).\n * \n * Negative limit values equal *unlimited*.\n * \n * @public\n * @this {RegExpLexer}\n */\n pastInput: function lexer_pastInput(maxSize, maxLines) {\n var past = this.matched.substring(0, this.matched.length - this.match.length);\n if (maxSize < 0)\n maxSize = past.length;\n else if (!maxSize)\n maxSize = 20;\n if (maxLines < 0)\n maxLines = past.length; // can\'t ever have more input lines than this!\n else if (!maxLines)\n maxLines = 1;\n // `substr` anticipation: treat \\r\\n as a single character and take a little\n // more than necessary so that we can still properly check against maxSize\n // after we\'ve transformed and limited the newLines in here:\n past = past.substr(-maxSize * 2 - 2);\n // now that we have a significantly reduced string to process, transform the newlines\n // and chop them, then limit them:\n var a = past.replace(/\\r\\n|\\r/g, \'\\n\').split(\'\\n\');\n a = a.slice(-maxLines);\n past = a.join(\'\\n\');\n // When, after limiting to maxLines, we still have too much to return,\n // do add an ellipsis prefix...\n if (past.length > maxSize) {\n past = \'...\' + past.substr(-maxSize);\n }\n return past;\n },\n\n /**\n * return (part of the) upcoming input, i.e. for error messages.\n * \n * Limit the returned string length to `maxSize` (default: 20).\n * \n * Limit the returned string to the `maxLines` number of lines of input (default: 1).\n * \n * Negative limit values equal *unlimited*.\n *\n * > ### NOTE ###\n * >\n * > *"upcoming input"* is defined as the whole of the both\n * > the *currently lexed* input, together with any remaining input\n * > following that. 
*"currently lexed"* input is the input \n * > already recognized by the lexer but not yet returned with\n * > the lexer token. This happens when you are invoking this API\n * > from inside any lexer rule action code block. \n * >\n * \n * @public\n * @this {RegExpLexer}\n */\n upcomingInput: function lexer_upcomingInput(maxSize, maxLines) {\n var next = this.match;\n if (maxSize < 0)\n maxSize = next.length + this._input.length;\n else if (!maxSize)\n maxSize = 20;\n if (maxLines < 0)\n maxLines = maxSize; // can\'t ever have more input lines than this!\n else if (!maxLines)\n maxLines = 1;\n // `substring` anticipation: treat \\r\\n as a single character and take a little\n // more than necessary so that we can still properly check against maxSize\n // after we\'ve transformed and limited the newLines in here:\n if (next.length < maxSize * 2 + 2) {\n next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8\n }\n // now that we have a significantly reduced string to process, transform the newlines\n // and chop them, then limit them:\n var a = next.replace(/\\r\\n|\\r/g, \'\\n\').split(\'\\n\');\n a = a.slice(0, maxLines);\n next = a.join(\'\\n\');\n // When, after limiting to maxLines, we still have too much to return,\n // do add an ellipsis postfix...\n if (next.length > maxSize) {\n next = next.substring(0, maxSize) + \'...\';\n }\n return next;\n },\n\n /**\n * return a string which displays the character position where the\n * lexing error occurred, i.e. for error messages\n * \n * @public\n * @this {RegExpLexer}\n */\n showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {\n var pre = this.pastInput(maxPrefix).replace(/\\s/g, \' \');\n var c = new Array(pre.length + 1).join(\'-\');\n return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, \' \') + \'\\n\' + c + \'^\';\n },\n\n /**\n * return a string which displays the lines & columns of input which are referenced \n * by the given location info range, plus a few lines of context.\n * \n * This function pretty-prints the indicated section of the input, with line numbers \n * and everything!\n * \n * This function is very useful to provide highly readable error reports, while\n * the location range may be specified in various flexible ways:\n * \n * - `loc` is the location info object which references the area which should be\n * displayed and \'marked up\': these lines & columns of text are marked up by `^`\n * characters below each character in the entire input range.\n * \n * - `context_loc` is the *optional* location info object which instructs this\n * pretty-printer how much *leading* context should be displayed alongside\n * the area referenced by `loc`. 
This can help provide context for the displayed\n * error, etc.\n * \n * When this location info is not provided, a default context of 3 lines is\n * used.\n * \n * - `context_loc2` is another *optional* location info object, which serves\n * a similar purpose to `context_loc`: it specifies the amount of *trailing*\n * context lines to display in the pretty-print output.\n * \n * When this location info is not provided, a default context of 1 line only is\n * used.\n * \n * Special Notes:\n * \n * - when the `loc`-indicated range is very large (about 5 lines or more), then\n * only the first and last few lines of this block are printed while a\n * `...continued...` message will be printed between them.\n * \n * This serves the purpose of not printing a huge amount of text when the `loc`\n * range happens to be huge: this way a manageable & readable output results\n * for arbitrary large ranges.\n * \n * - this function can display lines of input which whave not yet been lexed.\n * `prettyPrintRange()` can access the entire input!\n * \n * @public\n * @this {RegExpLexer}\n */\n prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {\n var error_size = loc.last_line - loc.first_line;\n const CONTEXT = 3;\n const CONTEXT_TAIL = 1;\n const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;\n var input = this.matched + this._input;\n var lines = input.split(\'\\n\');\n //var show_context = (error_size < 5 || context_loc);\n var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));\n var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));\n var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);\n var ws_prefix = new Array(lineno_display_width).join(\' \');\n var nonempty_line_indexes = [];\n var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {\n var lno = index + l0;\n var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);\n var rv = lno_pfx + \': \' + line;\n var errpfx = (new Array(lineno_display_width + 1)).join(\'^\');\n var offset = 2 + 1;\n var len = 0;\n\n if (lno === loc.first_line) {\n offset += loc.first_column;\n\n len = Math.max(\n 2,\n ((lno === loc.last_line ? 
loc.last_column : line.length)) - loc.first_column + 1\n );\n } else if (lno === loc.last_line) {\n len = Math.max(2, loc.last_column + 1);\n } else if (lno > loc.first_line && lno < loc.last_line) {\n len = Math.max(2, line.length + 1);\n }\n\n if (len) {\n var lead = new Array(offset).join(\'.\');\n var mark = new Array(len).join(\'^\');\n rv += \'\\n\' + errpfx + lead + mark;\n\n if (line.trim().length > 0) {\n nonempty_line_indexes.push(index);\n }\n }\n\n rv = rv.replace(/\\t/g, \' \');\n return rv;\n });\n\n // now make sure we don\'t print an overly large amount of error area: limit it \n // to the top and bottom line count:\n if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) {\n var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1;\n var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1;\n\n var intermediate_line = (new Array(lineno_display_width + 1)).join(\' \') + \' (...continued...)\';\n intermediate_line += \'\\n\' + (new Array(lineno_display_width + 1)).join(\'-\') + \' (---------------)\';\n rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line);\n }\n return rv.join(\'\\n\');\n },\n\n /**\n * helper function, used to produce a human readable description as a string, given\n * the input `yylloc` location object.\n * \n * Set `display_range_too` to TRUE to include the string character index position(s)\n * in the description if the `yylloc.range` is available.\n * \n * @public\n * @this {RegExpLexer}\n */\n describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) {\n var l1 = yylloc.first_line;\n var l2 = yylloc.last_line;\n var c1 = yylloc.first_column;\n var c2 = yylloc.last_column;\n var dl = l2 - l1;\n var dc = c2 - c1;\n var rv;\n if (dl === 0) {\n rv = \'line \' + l1 + \', \';\n if (dc <= 1) {\n rv += \'column \' + c1;\n } else {\n rv += \'columns \' + c1 + \' .. \' + c2;\n }\n } else {\n rv = \'lines \' + l1 + \'(column \' + c1 + \') .. \' + l2 + \'(column \' + c2 + \')\';\n }\n if (yylloc.range && display_range_too) {\n var r1 = yylloc.range[0];\n var r2 = yylloc.range[1] - 1;\n if (r2 <= r1) {\n rv += \' {String Offset: \' + r1 + \'}\';\n } else {\n rv += \' {String Offset range: \' + r1 + \' .. \' + r2 + \'}\';\n }\n }\n return rv;\n },\n\n /**\n * test the lexed token: return FALSE when not a match, otherwise return token.\n * \n * `match` is supposed to be an array coming out of a regex match, i.e. 
`match[0]`\n * contains the actually matched text string.\n * \n * Also move the input cursor forward and update the match collectors:\n * \n * - `yytext`\n * - `yyleng`\n * - `match`\n * - `matches`\n * - `yylloc`\n * - `offset`\n * \n * @public\n * @this {RegExpLexer}\n */\n test_match: function lexer_test_match(match, indexed_rule) {\n var token,\n lines,\n backup,\n match_str,\n match_str_len;\n\n if (this.options.backtrack_lexer) {\n // save context\n backup = {\n yylineno: this.yylineno,\n yylloc: {\n first_line: this.yylloc.first_line,\n last_line: this.yylloc.last_line,\n first_column: this.yylloc.first_column,\n last_column: this.yylloc.last_column,\n\n range: this.yylloc.range.slice(0)\n },\n yytext: this.yytext,\n match: this.match,\n matches: this.matches,\n matched: this.matched,\n yyleng: this.yyleng,\n offset: this.offset,\n _more: this._more,\n _input: this._input,\n //_signaled_error_token: this._signaled_error_token,\n yy: this.yy,\n conditionStack: this.conditionStack.slice(0),\n done: this.done\n };\n }\n\n match_str = match[0];\n match_str_len = match_str.length;\n // if (match_str.indexOf(\'\\n\') !== -1 || match_str.indexOf(\'\\r\') !== -1) {\n lines = match_str.split(/(?:\\r\\n?|\\n)/g);\n if (lines.length > 1) {\n this.yylineno += lines.length - 1;\n\n this.yylloc.last_line = this.yylineno + 1;\n this.yylloc.last_column = lines[lines.length - 1].length;\n } else {\n this.yylloc.last_column += match_str_len;\n }\n // }\n this.yytext += match_str;\n this.match += match_str;\n this.matched += match_str;\n this.matches = match;\n this.yyleng = this.yytext.length;\n this.yylloc.range[1] += match_str_len;\n\n // previous lex rules MAY have invoked the `more()` API rather than producing a token:\n // those rules will already have moved this `offset` forward matching their match lengths,\n // hence we must only add our own match length now:\n this.offset += match_str_len;\n this._more = false;\n this._backtrack = false;\n this._input = this._input.slice(match_str_len);\n\n // calling this method:\n //\n // function lexer__performAction(yy, yyrulenumber, YY_START) {...}\n token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */);\n // otherwise, when the action codes are all simple return token statements:\n //token = this.simpleCaseActionClusters[indexed_rule];\n\n if (this.done && this._input) {\n this.done = false;\n }\n if (token) {\n return token;\n } else if (this._backtrack) {\n // recover context\n for (var k in backup) {\n this[k] = backup[k];\n }\n this.__currentRuleSet__ = null;\n return false; // rule action called reject() implying the next rule should be tested instead.\n } else if (this._signaled_error_token) {\n // produce one \'error\' token as `.parseError()` in `reject()`\n // did not guarantee a failure signal by throwing an exception!\n token = this._signaled_error_token;\n this._signaled_error_token = false;\n return token;\n }\n return false;\n },\n\n /**\n * return next match in input\n * \n * @public\n * @this {RegExpLexer}\n */\n next: function lexer_next() {\n if (this.done) {\n this.clear();\n return this.EOF;\n }\n if (!this._input) {\n this.done = true;\n }\n\n var token,\n match,\n tempMatch,\n index;\n if (!this._more) {\n this.clear();\n }\n var spec = this.__currentRuleSet__;\n if (!spec) {\n // Update the ruleset cache as we apparently encountered a state change or just started lexing.\n // The cache is set up for fast lookup -- we assume a lexer will switch states 
much less often than it will\n // invoke the `lex()` token-producing API and related APIs, hence caching the set for direct access helps\n // speed up those activities a tiny bit.\n spec = this.__currentRuleSet__ = this._currentRules();\n // Check whether a *sane* condition has been pushed before: this makes the lexer robust against\n // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19\n if (!spec || !spec.rules) {\n var lineno_msg = \'\';\n if (this.options.trackPosition) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Internal lexer engine error\' + lineno_msg + \': The lex grammar programmer pushed a non-existing condition name "\' + this.topState() + \'"; this is a fatal error and should be reported to the application programmer team!\', false);\n // produce one \'error\' token until this situation has been resolved, most probably by parse termination!\n return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n }\n }\n\n var rule_ids = spec.rules;\n var regexes = spec.__rule_regexes;\n var len = spec.__rule_count;\n\n // Note: the arrays are 1-based, while `len` itself is a valid index,\n // hence the non-standard less-or-equal check in the next loop condition!\n for (var i = 1; i <= len; i++) {\n tempMatch = this._input.match(regexes[i]);\n if (tempMatch && (!match || tempMatch[0].length > match[0].length)) {\n match = tempMatch;\n index = i;\n if (this.options.backtrack_lexer) {\n token = this.test_match(tempMatch, rule_ids[i]);\n if (token !== false) {\n return token;\n } else if (this._backtrack) {\n match = undefined;\n continue; // rule action called reject() implying a rule MISmatch.\n } else {\n // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)\n return false;\n }\n } else if (!this.options.flex) {\n break;\n }\n }\n }\n if (match) {\n token = this.test_match(match, rule_ids[index]);\n if (token !== false) {\n return token;\n }\n // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)\n return false;\n }\n if (!this._input) {\n this.done = true;\n this.clear();\n return this.EOF;\n } else {\n var lineno_msg = \'\';\n if (this.options.trackPosition) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': Unrecognized text.\', this.options.lexerErrorsAreRecoverable);\n\n var pendingInput = this._input;\n var activeCondition = this.topState();\n var conditionStackDepth = this.conditionStack.length;\n\n token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n if (token === this.ERROR) {\n // we can try to recover from a lexer error that `parseError()` did not \'recover\' for us\n // by moving forward at least one character at a time IFF the (user-specified?) 
`parseError()`\n // has not consumed/modified any pending input or changed state in the error handler:\n if (!this.matches && \n // and make sure the input has been modified/consumed ...\n pendingInput === this._input &&\n // ...or the lexer state has been modified significantly enough\n // to merit a non-consuming error handling action right now.\n activeCondition === this.topState() && \n conditionStackDepth === this.conditionStack.length\n ) {\n this.input();\n }\n }\n return token;\n }\n },\n\n /**\n * return next match that has a token\n * \n * @public\n * @this {RegExpLexer}\n */\n lex: function lexer_lex() {\n var r;\n // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer:\n if (typeof this.options.pre_lex === \'function\') {\n r = this.options.pre_lex.call(this);\n }\n\n while (!r) {\n r = this.next();\n }\n\n if (typeof this.options.post_lex === \'function\') {\n // (also account for a userdef function which does not return any value: keep the token as is)\n r = this.options.post_lex.call(this, r) || r;\n }\n return r;\n },\n\n /**\n * backwards compatible alias for `pushState()`;\n * the latter is symmetrical with `popState()` and we advise to use\n * those APIs in any modern lexer code, rather than `begin()`.\n * \n * @public\n * @this {RegExpLexer}\n */\n begin: function lexer_begin(condition) {\n return this.pushState(condition);\n },\n\n /**\n * activates a new lexer condition state (pushes the new lexer\n * condition state onto the condition stack)\n * \n * @public\n * @this {RegExpLexer}\n */\n pushState: function lexer_pushState(condition) {\n this.conditionStack.push(condition);\n this.__currentRuleSet__ = null;\n return this;\n },\n\n /**\n * pop the previously active lexer condition state off the condition\n * stack\n * \n * @public\n * @this {RegExpLexer}\n */\n popState: function lexer_popState() {\n var n = this.conditionStack.length - 1;\n if (n > 0) {\n this.__currentRuleSet__ = null; \n return this.conditionStack.pop();\n } else {\n return this.conditionStack[0];\n }\n },\n\n /**\n * return the currently active lexer condition state; when an index\n * argument is provided it produces the N-th previous condition state,\n * if available\n * \n * @public\n * @this {RegExpLexer}\n */\n topState: function lexer_topState(n) {\n n = this.conditionStack.length - 1 - Math.abs(n || 0);\n if (n >= 0) {\n return this.conditionStack[n];\n } else {\n return \'INITIAL\';\n }\n },\n\n /**\n * (internal) determine the lexer rule set which is active for the\n * currently active lexer condition state\n * \n * @public\n * @this {RegExpLexer}\n */\n _currentRules: function lexer__currentRules() {\n if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) {\n return this.conditions[this.conditionStack[this.conditionStack.length - 1]];\n } else {\n return this.conditions[\'INITIAL\'];\n }\n },\n\n /**\n * return the number of states currently on the stack\n * \n * @public\n * @this {RegExpLexer}\n */\n stateStackSize: function lexer_stateStackSize() {\n return this.conditionStack.length;\n }\n}'; + // --- END lexer kernel --- +} + +RegExpLexer.prototype = new Function(rmCommonWS(_templateObject2, getRegExpLexerPrototype()))(); + +// The lexer code stripper, driven by optimization analysis settings and +// lexer options, which cannot be changed at run-time. +function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... 
${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + + new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS(_templateObject3, opt.options.backtrack_lexer, opt.options.ranges, opt.options.trackPosition, opt.parseActionsUseYYLENG, opt.parseActionsUseYYLINENO, opt.parseActionsUseYYTEXT, opt.parseActionsUseYYLOC, opt.parseActionsUseValueTracking, opt.parseActionsUseValueAssignment, opt.parseActionsUseLocationTracking, opt.parseActionsUseLocationAssignment, opt.lexerActionsUseYYLENG, opt.lexerActionsUseYYLINENO, opt.lexerActionsUseYYTEXT, opt.lexerActionsUseYYLOC, opt.lexerActionsUseParseError, opt.lexerActionsUseYYERROR, opt.lexerActionsUseLocationTracking, opt.lexerActionsUseMore, opt.lexerActionsUseUnput, opt.lexerActionsUseReject, opt.lexerActionsUseLess, opt.lexerActionsUseDisplayAPIs, opt.lexerActionsUseDescribeYYLOC)); + + return new_src; +} + +// generate lexer source from a grammar +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
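+        //
+        // (A hypothetical `build_options` flag set, as a jison parser generator
+        // might hand it to us -- for illustration only:)
+        //
+        // ```
+        // {
+        //     parseActionsUseYYTEXT: true,     // parser actions read `yytext`
+        //     parseActionsUseYYLOC: false,     // parser never touches location info
+        //     parserHasErrorRecovery: false    // grammar carries no `error` rules
+        // }
+        // ```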
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???' + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
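+    // (e.g. a grammar-level `%options case-insensitive` arrives as the key
+    // `case-insensitive`, which is unusable as a JavaScript identifier until it
+    // has been normalized to `caseInsensitive` -- hence the camelCasing below.)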
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = dict.actionInclude || ''; + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; +} + +// Assemble the final source from the processed grammar +/** @public */ +function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; +} + +function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = 1 + Math.log10(a.length | 1) | 0; + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return '/* ' + idx_str + ': */ ' + re; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return '/* ' + idx_str + ': */ new XRegExp("' + re_src + '", "' + re.xregexp.flags + '")'; + } else { + return '/* ' + idx_str + ': */ ' + re; + } + }); + return b.join(',\n'); +} + +function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
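+            // Editor's note (illustrative, not part of the original source): a truthy
+            // entry in this table means "never copy this key into the generated lexer's
+            // `options`"; the `0` entries merely document known keys -- those are still
+            // copied below whenever they carry a non-null, non-false value.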
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1 + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(' "(' + ID_REGEX_BASE + ')": ', 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
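+        // Editor's sketch (hypothetical option values, not part of the original source):
+        // given input options such as
+        //
+        //     { debug: false, xregexp: true, moduleName: 'calc', pre_lex: hookFn }
+        //
+        // `produceOptions()` above would emit roughly
+        //
+        //     { xregexp: true, pre_lex: hookFn }
+        //
+        // i.e. falsy flags and build-only keys are dropped, while the `pre_lex`/`post_lex`
+        // callbacks are re-injected verbatim since JSON cannot encode functions.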
+ var code = [rmCommonWS(_templateObject4), '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc.replace(/^[\s\r\n]*\{/, '').replace(/\s*\}[\s\r\n]*$/, '').trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS(_templateObject5, performActionCode, simpleCaseActionClustersCode, rulesCode, conditionsCode)); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; +} + +function generateGenericHeaderComment() { + var out = rmCommonWS(_templateObject6, version); + + return out; +} + +function prepareOptions(opt) { + opt = opt || {}; + + // check for illegal identifier + if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) { + if (opt.moduleName) { + var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.'; + if (typeof opt.warn_cb === 'function') { + opt.warn_cb(msg); + } else { + // do not treat as warning; barf hairball instead so that this oddity gets noticed right away! + throw new Error(msg); + } + } + opt.moduleName = 'lexer'; + } + + prepExportStructures(opt); + + return opt; +} + +function generateModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var ' + opt.moduleName + ' = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateAMDModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'define([], function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? 
opt.moduleInclude + ';' : '', '', 'return lexer;', '});']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateESModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var lexer = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();', '', 'function yylex() {', ' return lexer.lex.apply(lexer, arguments);', '}', rmCommonWS(_templateObject7)]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateCommonJSModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var ' + opt.moduleName + ' = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();', '', 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', ' exports.lexer = ' + opt.moduleName + ';', ' exports.lex = function () {', ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', ' };', '}']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +RegExpLexer.generate = generate; + +RegExpLexer.version = version; +RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; +RegExpLexer.mkStdOptions = mkStdOptions; +RegExpLexer.camelCase = camelCase; +RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + +module.exports = RegExpLexer; diff --git a/dist/regexp-lexer-cjs.js b/dist/regexp-lexer-cjs.js new file mode 100644 index 0000000..55ba938 --- /dev/null +++ b/dist/regexp-lexer-cjs.js @@ -0,0 +1,4051 @@ +'use strict'; + +function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'default' in ex) ? ex['default'] : ex; } + +var XRegExp = _interopDefault(require('@gerhobbelt/xregexp')); +var json5 = _interopDefault(require('@gerhobbelt/json5')); +var lexParser = _interopDefault(require('@gerhobbelt/lex-parser')); +var assert = _interopDefault(require('assert')); +var helpers = _interopDefault(require('jison-helpers-lib')); + +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +const XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. 
`{Number}`
+const CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/;
+const SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/;
+const NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/;
+const SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/;
+
+const UNICODE_BASE_PLANE_MAX_CP$1 = 65535;
+
+// The expanded regex sets which are equivalent to the given `\\{c}` escapes:
+//
+// `/\s/`:
+const WHITESPACE_SETSTR$1 = ' \f\n\r\t\v\u00a0\u1680\u180e\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff';
+// `/\d/`:
+const DIGIT_SETSTR$1 = '0-9';
+// `/\w/`:
+const WORDCHAR_SETSTR$1 = 'A-Za-z0-9_';
+
+
+
+
+// Helper for `bitarray2set()`: convert a character code to a representation string suitable for use in a regex
+function i2c(i) {
+    var c, x;
+
+    switch (i) {
+    case 10:
+        return '\\n';
+
+    case 13:
+        return '\\r';
+
+    case 9:
+        return '\\t';
+
+    case 8:
+        return '\\b';
+
+    case 12:
+        return '\\f';
+
+    case 11:
+        return '\\v';
+
+    case 45:        // ASCII/Unicode for '-' dash
+        return '\\-';
+
+    case 91:        // '['
+        return '\\[';
+
+    case 92:        // '\\'
+        return '\\\\';
+
+    case 93:        // ']'
+        return '\\]';
+
+    case 94:        // '^'
+        return '\\^';
+    }
+    if (i < 32
+            || i > 0xFFF0 /* Unicode Specials, also in UTF16 */
+            || (i >= 0xD800 && i <= 0xDFFF) /* Unicode surrogate range, used to encode the Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */
+            || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */
+    ) {
+        // Detail about a detail:
+        // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript
+        // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report
+        // a b0rked generated parser, as the generated code would include this regex right here.
+        // Hence we MUST escape these buggers everywhere we go...
+        x = i.toString(16);
+        if (x.length >= 1 && i <= 0xFFFF) {
+            c = '0000' + x;
+            return '\\u' + c.substr(c.length - 4);
+        } else {
+            return '\\u{' + x + '}';
+        }
+    }
+    return String.fromCharCode(i);
+}
+
+
+// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating
+// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a
+// `\\p{NAME}` shorthand to represent [part of] the bitarray:
+var Pcodes_bitarray_cache = {};
+var Pcodes_bitarray_cache_test_order = [];
+
+// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by
+// a single regex 'escape', e.g. `\d` for digits 0-9.
+var EscCode_bitarray_output_refs;
+
+// now initialize the EscCodes_...
table above: +init_EscCode_lookup_table(); + +function init_EscCode_lookup_table() { + var s, bitarr, set2esc = {}, esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); +} + +function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. + // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as than this pcode + // won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... 
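+    // Editor's note (illustrative, not part of the original source): the `lut`
+    // built below is a probe list of `[codepoint, pcode]` pairs, e.g.
+    //
+    //     [ [cpA, '\\D'], [cpB, '\\S'], ..., [cpZ, '\\d'] ]     // large sets first
+    //
+    // (the `cpX` codepoints are placeholders here) so that `bitarray2set()` can test
+    // one representative codepoint cheaply before committing to a full coverage scan.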
+ var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; +} + + + + + + +// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. +function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. \\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for characters set. Inversion means the complement set of character is st instead. + // This results in an OR operations when sets are joined/chained. 
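+        // Editor's sketch (illustrative, not part of the original source): a round-trip
+        // through the helpers in this file --
+        //
+        //     var ba = [];
+        //     set2bitarray(ba, 'a-cx');       // flags codepoints 97..99 and 120
+        //     bitarray2set(ba);               // --> 'a-cx'
+        //     produceOptimizedRegex4Set(ba);  // --> '[a-cx]'
+        //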
+
+        while (s.length) {
+            c1 = s.match(CHR_RE$1);
+            if (!c1) {
+                // hit an illegal escape sequence? cope anyway!
+                c1 = s[0];
+            } else {
+                c1 = c1[0];
+                // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those
+                // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit
+                // XRegExp support, but alas, we'll get there when we get there... ;-)
+                switch (c1) {
+                case '\\p':
+                    s = s.substr(c1.length);
+                    c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1);
+                    if (c2) {
+                        c2 = c2[0];
+                        s = s.substr(c2.length);
+                        // do we have this one cached already?
+                        var pex = c1 + c2;
+                        var ba4p = Pcodes_bitarray_cache[pex];
+                        if (!ba4p) {
+                            // expand escape:
+                            var xr = new XRegExp('[' + pex + ']');          // TODO: case-insensitive grammar???
+                            // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`:
+                            var xs = '' + xr;
+                            // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside:
+                            xs = xs.substr(1, xs.length - 2);
+
+                            ba4p = reduceRegexToSetBitArray(xs, pex, opts);
+
+                            Pcodes_bitarray_cache[pex] = ba4p;
+                            updatePcodesBitarrayCacheTestOrder(opts);
+                        }
+                        // merge bitarrays:
+                        add2bitarray(bitarr, ba4p);
+                        continue;
+                    }
+                    break;
+
+                case '\\S':
+                case '\\s':
+                case '\\W':
+                case '\\w':
+                case '\\d':
+                case '\\D':
+                    // these can't participate in a range, but need to be treated specially:
+                    s = s.substr(c1.length);
+                    // check for \S, \s, \D, \d, \W, \w and expand them:
+                    var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]];
+                    assert(ba4e);
+                    add2bitarray(bitarr, ba4e);
+                    continue;
+
+                case '\\b':
+                    // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace
+                    c1 = '\u0008';
+                    break;
+                }
+            }
+            var v1 = eval_escaped_code(c1);
+            // propagate deferred exceptions = error reports.
+            if (v1 instanceof Error) {
+                return v1;
+            }
+            v1 = v1.charCodeAt(0);
+            s = s.substr(c1.length);
+
+            if (s[0] === '-' && s.length >= 2) {
+                // we can expect a range like 'a-z':
+                s = s.substr(1);
+                c2 = s.match(CHR_RE$1);
+                if (!c2) {
+                    // hit an illegal escape sequence? cope anyway!
+                    c2 = s[0];
+                } else {
+                    c2 = c2[0];
+                }
+                var v2 = eval_escaped_code(c2);
+                // propagate deferred exceptions = error reports.
+                // (Editor's fix: this originally returned `v1`, a plain character code,
+                // which would silently swallow the error object carried by `v2`.)
+                if (v2 instanceof Error) {
+                    return v2;
+                }
+                v2 = v2.charCodeAt(0);
+                s = s.substr(c2.length);
+
+                // legal ranges go UP, not DOWN!
+                if (v1 <= v2) {
+                    mark(v1, v2);
+                } else {
+                    console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 });
+                    mark(v1);
+                    mark('-'.charCodeAt(0));
+                    mark(v2);
+                }
+                continue;
+            }
+            mark(v1);
+        }
+
+        // When we have marked all slots, '^' NEGATES the set, hence we flip all slots.
+        //
+        // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK
+        // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire
+        // range then.
+        if (set_is_inverted) {
+            for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) {
+                if (!bitarr[i]) {
+                    bitarr_orig[i] = true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
+
+// convert a simple bitarray back into a regex set `[...]` content:
+function bitarray2set(l, output_inverted_variant, output_minimized) {
+    // construct the inverse(?)
set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } + else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... 
+ cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } + else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + j = UNICODE_BASE_PLANE_MAX_CP$1 + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; +} + + + + + +// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; +// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. +function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE$1); + if (!c1) { + // cope with illegal escape sequences too! 
+ return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"'); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + while (s.length) { + var inner = s.match(SET_PART_RE$1); + if (!inner) { + inner = s.match(CHR_RE$1); + if (!inner) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: ' + s + '" of regex "' + orig + '"'); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + var c2 = s.match(CHR_RE$1); + if (!c2) { + // cope with illegal escape sequences too! + return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error('regex set expression is broken in regex: ' + orig); + } + s = s.substr(c2.length); + + var se = set_content.join(''); + if (!internal_state) { + derr = set2bitarray(l, se, opts); + // propagate deferred exceptions = error reports. + if (derr instanceof Error) { + return derr; + } + + // a set is to use like a single character in a longer literal phrase, hence input `[abc]word[def]` would thus produce output `[abc]`: + internal_state = 1; + } + break; + + // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into + // something ready for use inside a regex set, e.g. `\\r\\n`. + // + // > Of course, we realize that converting more complex piped constructs this way + // > will produce something you might not expect, e.g. `A|WORD2` which + // > would end up as the set `[AW]` which is something else than the input + // > entirely. + // > + // > However, we can only depend on the user (grammar writer) to realize this and + // > prevent this from happening by not creating such oddities in the input grammar. + case '|': + // a|b --> [ab] + internal_state = 0; + break; + + case '(': + // (a) --> a + // + // TODO - right now we treat this as 'too complex': + + // Strip off some possible outer wrappers which we know how to remove. + // We don't worry about 'damaging' the regex as any too-complex regex will be caught + // in the validation check at the end; our 'strippers' here would not damage useful + // regexes anyway and them damaging the unacceptable ones is fine. + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... + s = s.replace(/^\^?(.*?)\$?$/, '$1'); // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: one before, once after this) + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... + + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + case '.': + case '*': + case '+': + case '?': + // wildcard + // + // TODO - right now we treat this as 'too complex': + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + case '{': // range, e.g. `x{1,3}`, or macro? 
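+            // Editor's note (illustrative, not part of the original source): an input
+            // such as `x{2,3}` lands here, so e.g.
+            //
+            //     reduceRegexToSetBitArray('x{2,3}', 'MACRO_NAME', opts)
+            //
+            // yields an Error object instead of a bitarray, while a set-friendly input
+            // such as '[a-z]|[0-9]' yields the bitarray equivalent of `[0-9a-z]`.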
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        default:
+            // literal character or word: take the first character only and ignore the rest, so that
+            // the constructed set for `word|noun` would be `[wn]`:
+            if (!internal_state) {
+                derr = set2bitarray(l, c1, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                internal_state = 2;
+            }
+            break;
+        }
+    }
+
+    s = bitarray2set(l);
+
+    // When this result is suitable for use in a set, then we should be able to compile
+    // it into a regex; that way we can easily validate whether macro X is fit to be used
+    // inside a regex set:
+    try {
+        var re;
+        assert(s);
+        assert(!(s instanceof Error));
+        re = new XRegExp('[' + s + ']');
+        re.test(s[0]);
+
+        // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`,
+        // so we check for lingering UNESCAPED brackets in here as those cannot be legal:
+        if (/[^\\][\[\]]/.exec(s)) {
+            throw new Error('unescaped brackets in set data');
+        }
+    } catch (ex) {
+        // make sure we produce a set range expression which will fail badly when it is used
+        // in actual code:
+        s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+    }
+
+    assert(s);
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+    return l;
+}
+
+
+
+// Convert a bitarray representing, for example, `'0-9'` to the regex string `[0-9]`
+// -- or in this example it can be further optimized to only `\d`!
+function produceOptimizedRegex4Set(bitarr) {
+    // First try to produce a minimum regex from the bitarray directly:
+    var s1 = bitarray2set(bitarr, false, true);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s1;
+    } else {
+        s1 = '[' + s1 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s2 = bitarray2set(bitarr, true, true);
+
+    if (s2[0] === '^') {
+        s2 = s2.substr(1);
+        if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s2;
+        }
+    } else {
+        s2 = '^' + s2;
+    }
+    s2 = '[' + s2 + ']';
+
+    // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end,
+    // we also check against the plain, unadulterated regex set expressions:
+    //
+    // First try to produce a minimum regex from the bitarray directly:
+    var s3 = bitarray2set(bitarr, false, false);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s3.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s3;
+    } else {
+        s3 = '[' + s3 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+ var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; +} + + + + + + +var setmgmt = { + XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1, + CHR_RE: CHR_RE$1, + SET_PART_RE: SET_PART_RE$1, + NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1, + SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1, + + WHITESPACE_SETSTR: WHITESPACE_SETSTR$1, + DIGIT_SETSTR: DIGIT_SETSTR$1, + WORDCHAR_SETSTR: WORDCHAR_SETSTR$1, + + set2bitarray, + bitarray2set, + produceOptimizedRegex4Set, + reduceRegexToSetBitArray, +}; + +// Basic Lexer implemented using JavaScript regular expressions +// Zachary Carter +// MIT Licensed + +var rmCommonWS = helpers.rmCommonWS; +var camelCase = helpers.camelCase; +var code_exec = helpers.exec; +// import recast from '@gerhobbelt/recast'; +// import astUtils from '@gerhobbelt/ast-util'; +var version = '0.6.1-205'; // require('./package.json').version; + + + + +const XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +const CHR_RE = setmgmt.CHR_RE; +const SET_PART_RE = setmgmt.SET_PART_RE; +const NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; +const UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + +// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) +// +// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! +const ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + + + + +// see also ./lib/cli.js +/** +@public +@nocollapse +*/ +const defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. start+end indexes in the input string + trackPosition: true, // track line+column position in the input string + caseInsensitive: false, + showSource: false, + exportSourceCode: false, + exportAST: false, + prettyCfg: true, + pre_lex: undefined, + post_lex: undefined, +}; + + +// Merge sets of options. +// +// Convert alternative jison option names to their base option. +// +// The *last* option set which overrides the default wins, where 'override' is +// defined as specifying a not-undefined value which is not equal to the +// default value. +// +// When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the +// default values avialable in Jison.defaultJisonOptions. +// +// Return a fresh set of options. 
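+// Editor's example (illustrative, not part of the original source):
+//
+//     mkStdOptions({ 'case-insensitive': true }, { moduleType: 'es' })
+//     // --> all defaults, plus { caseInsensitive: true, moduleType: 'es' }
+//
+//     mkStdOptions("NODEFAULT", { xregexp: true })
+//     // --> { xregexp: true } only: the defaults are NOT mixed in
+//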
+/** @public */ +function mkStdOptions(/*...args*/) { + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; +} + +// set up export/output attributes of the `options` object instance +function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || typeof exportSourceCode !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; +} + +// Autodetect if the input lexer spec is in JSON or JISON +// format when the `options.json` flag is `true`. +// +// Produce the JSON lexer spec result when these are JSON formatted already as that +// would save us the trouble of doing this again, anywhere else in the JISON +// compiler/generator. +// +// Otherwise return the *parsed* lexer spec as it has +// been processed through LexParser. +function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! 
Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} + + +// expand macros and convert matchers to RegExp's +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, i, k, rule, action, conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. 
+ // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count, + }; +} + + + + + + + +// expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or +// elsewhere, which requires two different treatments to expand these macros. +function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. 
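+                // Editor's note (illustrative, not part of the original source): after
+                // macro expansion only a `\p{...}` XRegExp escape (or an unexpanded macro
+                // name) can still carry a `{`: for `[\p{Number}]` the bitarray-optimized
+                // `s1` is thus forced, while for a plain `[0-9a-f]` the shorter of the
+                // two spellings wins below.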
+ var has_expansions = (se.indexOf('{') >= 0); + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. + c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; +} + + +// expand macros within macros and cache the result +function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. 
+                        //
+                        // Note: make sure we don't try to expand any XRegExp `\\p{...}` or `\\P{...}`
+                        // macros here:
+                        if (XRegExp._getUnicodeProperty(k)) {
+                            // Work-around so that you can use `\p{ascii}` for an XRegExp slug, a.k.a.
+                            // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories,
+                            // while using `\p{ASCII}` as a *macro expansion* of the `ASCII`
+                            // macro:
+                            if (k.toUpperCase() !== k) {
+                                m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.');
+                                break;
+                            }
+                        }
+
+                        a = m.split('{' + k + '}');
+                        if (a.length > 1) {
+                            var x = expandMacroInSet(k);
+                            assert(x);
+                            if (x instanceof Error) {
+                                m = x;
+                                break;
+                            }
+                            m = a.join(x);
+                        }
+                    }
+                }
+            }
+
+            var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts);
+
+            var s1;
+
+            // propagate deferred exceptions = error reports.
+            if (mba instanceof Error) {
+                s1 = mba;
+            } else {
+                s1 = setmgmt.bitarray2set(mba, false);
+
+                m = s1;
+            }
+
+            macros[i] = {
+                in_set: s1,
+                elsewhere: null,
+                raw: dict_macros[i]
+            };
+        } else {
+            m = macros[i].in_set;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return new Error(m.message);
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandMacroElsewhere(i) {
+        var k, a, m;
+
+        if (macros[i].elsewhere == null) {
+            m = dict_macros[i];
+
+            // set up our own record so we can detect definition loops:
+            macros[i].elsewhere = false;
+
+            // the macro MAY contain other macros which MAY be inside a `[...]` set in this
+            // macro or elsewhere, hence we must parse the regex:
+            m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+            // propagate deferred exceptions = error reports.
+            if (m instanceof Error) {
+                return m;
+            }
+
+            macros[i].elsewhere = m;
+        } else {
+            m = macros[i].elsewhere;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return m;
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandAllMacrosInSet(s) {
+        var i, x;
+
+        // process *all* the macros inside [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroInSet(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message);
+                        }
+                        s = a.join(x);
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, x;
+
+        // When we process the remaining macro occurrences in the regex,
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    // These are all submacro expansions, hence non-capturing grouping is applied:
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroElsewhere(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message);
+                        }
+                        s = a.join('(?:' + x + ')');
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+
+    var m, i;
+
+    if (opts.debug) console.log('\n############## RAW macros: ', dict_macros);
+
+    // first we create the part of the dictionary which is targeting the use of macros
+    // *inside* `[...]` sets; once we have completed that half of the expansions work,
+    // we then go and expand the macros for when they are used elsewhere in a regex:
+    // if we then encounter submacros which are used *inside* a set, we can use that
+    // first-half dictionary to speed things up a bit as we can use those expansions
+    // straight away!
+    for (i in dict_macros) {
+        if (dict_macros.hasOwnProperty(i)) {
+            expandMacroInSet(i);
+        }
+    }
+
+    for (i in dict_macros) {
+        if (dict_macros.hasOwnProperty(i)) {
+            expandMacroElsewhere(i);
+        }
+    }
+
+    if (opts.debug) console.log('\n############### expanded macros: ', macros);
+
+    return macros;
+}
+
+
+
+// expand macros in a regex; expands them recursively
+function expandMacros(src, macros, opts) {
+    var expansion_count = 0;
+
+    // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already!
+    // Hence things should be easy in here:
+
+    function expandAllMacrosInSet(s) {
+        var i, m, x;
+
+        // process *all* the macros inside [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    m = macros[i];
+
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = m.in_set;
+
+                        assert(x);
+                        if (x instanceof Error) {
+                            // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                            throw x;
+                        }
+
+                        // detect definition loop:
+                        if (x === false) {
+                            return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                        }
+
+                        s = a.join(x);
+                        expansion_count++;
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, m, x;
+
+        // When we process the main macro occurrences in the regex,
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
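+        //
+        // A hypothetical illustration (macro names invented for this example):
+        // given the definitions
+        //
+        //     DIGIT     [0-9]
+        //     NUMBER    {DIGIT}{DIGIT}*
+        //
+        // a rule regex `{NUMBER}` becomes a *capturing* group wrapping the cached
+        // NUMBER expansion, in which the inner {DIGIT} submacros were already
+        // expanded as non-capturing groups, i.e. roughly:
+        //
+        //     ((?:[0-9])(?:[0-9])*)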
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    m = macros[i];
+
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        // These are all main macro expansions, hence CAPTURING grouping is applied:
+                        x = m.elsewhere;
+                        assert(x);
+
+                        // detect definition loop:
+                        if (x === false) {
+                            return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                        }
+
+                        s = a.join('(' + x + ')');
+                        expansion_count++;
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+
+    // When we process the macro occurrences in the regex,
+    // every macro used in a lexer rule will become its own capture group.
+    //
+    // Meanwhile the cached expansion will have expanded any submacros into
+    // *NON*-capturing groups so that the backreference indexes remain as you'd
+    // expect and using macros doesn't require you to know exactly what your
+    // used macro will expand into, i.e. which and how many submacros it has.
+    //
+    // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+    var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+    // propagate deferred exceptions = error reports.
+    if (s2 instanceof Error) {
+        throw s2;
+    }
+
+    // Only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated
+    // regex from reduceRegex(), in order to keep our test cases simple and the rules recognizable.
+    // This assumes the user can code good regexes on their own, as long as no macros are involved...
+    //
+    // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original,
+    // e.g. `\\p{Number}`, unless the `xregexp` output option has been enabled.
+    if (expansion_count > 0 || (src.indexOf('\\p{') >= 0 && !opts.options.xregexp)) {
+        src = s2;
+    } else {
+        // Check if the reduced regex is smaller in size; when it is, we still go with the new one!
+        if (s2.length < src.length) {
+            src = s2;
+        }
+    }
+
+    return src;
+}
+
+function prepareStartConditions(conditions) {
+    var sc,
+        hash = {};
+    for (sc in conditions) {
+        if (conditions.hasOwnProperty(sc)) {
+            hash[sc] = { rules: [], inclusive: !conditions[sc] };
+        }
+    }
+    return hash;
+}
+
+function buildActions(dict, tokens, opts) {
+    var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;'];
+    var tok;
+    var toks = {};
+    var caseHelper = [];
+
+    // tokens: map/array of token numbers to token names
+    for (tok in tokens) {
+        var idx = parseInt(tok, 10);
+        if (idx && idx > 0) {
+            toks[tokens[tok]] = idx;
+        }
+    }
+
+    if (opts.options.flex && dict.rules) {
+        dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule!
*/']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: `function lexer__performAction(yy, yyrulenumber, YY_START) { + var yy_ = this; + + ${fun} + }`, + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count, + }; +} + +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + +var prelude = `/** + * See also: + * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508 + * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility + * with userland code which might access the derived class in a 'classic' way. + * + * @public + * @constructor + * @nocollapse + */ +function JisonLexerError(msg, hash) { + Object.defineProperty(this, 'name', { + enumerable: false, + writable: false, + value: 'JisonLexerError' + }); + + if (msg == null) msg = '???'; + + Object.defineProperty(this, 'message', { + enumerable: false, + writable: true, + value: msg + }); + + this.hash = hash; + + var stacktrace; + if (hash && hash.exception instanceof Error) { + var ex2 = hash.exception; + this.message = ex2.message || msg; + stacktrace = ex2.stack; + } + if (!stacktrace) { + if (Error.hasOwnProperty('captureStackTrace')) { // V8 + Error.captureStackTrace(this, this.constructor); + } else { + stacktrace = (new Error(msg)).stack; + } + } + if (stacktrace) { + Object.defineProperty(this, 'stack', { + enumerable: false, + writable: false, + value: stacktrace + }); + } +} + +if (typeof Object.setPrototypeOf === 'function') { + Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype); +} else { + JisonLexerError.prototype = Object.create(Error.prototype); +} +JisonLexerError.prototype.constructor = JisonLexerError; +JisonLexerError.prototype.name = 'JisonLexerError';`; + + // --- END lexer error class --- + + return prelude; +} + + +const jisonLexerErrorDefinition = generateErrorClass(); + + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS` + var __hacky_counter__ = 0; + + /** + * @constructor + * @nocollapse + */ + function XRegExp(re, f) { + this.re = re; + this.flags = f; + this._getUnicodeProperty = function (k) {}; + var fake = /./; // WARNING: this exact 'fake' is also depended upon by the xregexp unit test! + __hacky_counter__++; + fake.__hacky_backy__ = __hacky_counter__; + return fake; + } + `; +} + + + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. 
+ // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... }; + // ``` + var testcode = [ + '// provide a local version for test purposes:', + jisonLexerErrorDefinition, + '', + generateFakeXRegExpClassSrcCode(), + '', + source, + '', + 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (typeof lexer.options !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, ((dict.rules && dict.rules.length > 0) ? + 'One or more of your lexer state names are possibly botched?' 
: + 'Your custom lexer is somehow botched.'), ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = (dict.rules ? dict.rules.length : 0); i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; +} + +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) +// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. 
+// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- +return `{ + EOF: 1, + ERROR: 2, + + // JisonLexerError: JisonLexerError, /// <-- injected by the code generator + + // options: {}, /// <-- injected by the code generator + + // yy: ..., /// <-- injected by setInput() + + __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state + + __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup + + __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been 'unfolded' completely and is now ready for use + + done: false, /// INTERNAL USE ONLY + _backtrack: false, /// INTERNAL USE ONLY + _input: '', /// INTERNAL USE ONLY + _more: false, /// INTERNAL USE ONLY + _signaled_error_token: false, /// INTERNAL USE ONLY + + conditionStack: [], /// INTERNAL USE ONLY; managed via \`pushState()\`, \`popState()\`, \`topState()\` and \`stateStackSize()\` + + match: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. \`match\` is identical to \`yytext\` except that this one still contains the matched input string after \`lexer.performAction()\` has been invoked, where userland code MAY have changed/replaced the \`yytext\` value entirely! + matched: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far + matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt + yytext: '', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the 'token value' when the parser consumes the lexer token produced through a call to the \`lex()\` API. + offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the 'cursor position' in the input string, i.e. the number of characters matched so far + yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (\`yytext\`) + yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: 'line number' at which the token under construction is located + yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction + + /** + * INTERNAL USE: construct a suitable error info hash object instance for \`parseError\`. 
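+     *
+     * The produced hash (the \`pei\` object constructed below) carries,
+     * among others: \`errStr\`, \`recoverable\`, \`text\`, \`token\`, \`line\`,
+     * \`loc\`, \`yy\`, \`lexer\` and a \`destroy()\` helper.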
+ * + * @public + * @this {RegExpLexer} + */ + constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) { + msg = '' + msg; + + // heuristic to determine if the error message already contains a (partial) source code dump + // as produced by either \`showPosition()\` or \`prettyPrintRange()\`: + if (show_input_position == undefined) { + show_input_position = !(msg.indexOf('\\n') > 0 && msg.indexOf('^') > 0); + } + if (this.yylloc && show_input_position) { + if (typeof this.prettyPrintRange === 'function') { + var pretty_src = this.prettyPrintRange(this.yylloc); + + if (!/\\n\\s*$/.test(msg)) { + msg += '\\n'; + } + msg += '\\n Erroneous area:\\n' + this.prettyPrintRange(this.yylloc); + } else if (typeof this.showPosition === 'function') { + var pos_str = this.showPosition(); + if (pos_str) { + if (msg.length && msg[msg.length - 1] !== '\\n' && pos_str[0] !== '\\n') { + msg += '\\n' + pos_str; + } else { + msg += pos_str; + } + } + } + } + /** @constructor */ + var pei = { + errStr: msg, + recoverable: !!recoverable, + text: this.match, // This one MAY be empty; userland code should use the \`upcomingInput\` API to obtain more text which follows the 'lexer cursor position'... + token: null, + line: this.yylineno, + loc: this.yylloc, + yy: this.yy, + lexer: this, + + /** + * and make sure the error info doesn't stay due to potential + * ref cycle via userland code manipulations. + * These would otherwise all be memory leak opportunities! + * + * Note that only array and object references are nuked as those + * constitute the set of elements which can produce a cyclic ref. + * The rest of the members is kept intact as they are harmless. + * + * @public + * @this {LexErrorInfo} + */ + destroy: function destructLexErrorInfo() { + // remove cyclic references added to error info: + // info.yy = null; + // info.lexer = null; + // ... + var rec = !!this.recoverable; + for (var key in this) { + if (this.hasOwnProperty(key) && typeof key === 'object') { + this[key] = undefined; + } + } + this.recoverable = rec; + } + }; + // track this instance so we can \`destroy()\` it once we deem it superfluous and ready for garbage collection! + this.__error_infos.push(pei); + return pei; + }, + + /** + * handler which is invoked when a lexer error occurs. + * + * @public + * @this {RegExpLexer} + */ + parseError: function lexer_parseError(str, hash, ExceptionClass) { + if (!ExceptionClass) { + ExceptionClass = this.JisonLexerError; + } + if (this.yy) { + if (this.yy.parser && typeof this.yy.parser.parseError === 'function') { + return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } else if (typeof this.yy.parseError === 'function') { + return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } + } + throw new ExceptionClass(str, hash); + }, + + /** + * method which implements \`yyerror(str, ...args)\` functionality for use inside lexer actions. 
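+     *
+     * A usage sketch (inside a lexer rule's action code; any extra
+     * arguments are collected into the error hash as
+     * \`extra_error_attributes\`):
+     *
+     *     yyerror('unsupported escape sequence', yytext);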
+ * + * @public + * @this {RegExpLexer} + */ + yyerror: function yyError(str /*, ...args */) { + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': ' + str, this.options.lexerErrorsAreRecoverable); + + // Add any extra args to the hash under the name \`extra_error_attributes\`: + var args = Array.prototype.slice.call(arguments, 1); + if (args.length) { + p.extra_error_attributes = args; + } + + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + }, + + /** + * final cleanup function for when we have completed lexing the input; + * make it an API so that external code can use this one once userland + * code has decided it's time to destroy any lingering lexer error + * hash object instances and the like: this function helps to clean + * up these constructs, which *may* carry cyclic references which would + * otherwise prevent the instances from being properly and timely + * garbage-collected, i.e. this function helps prevent memory leaks! + * + * @public + * @this {RegExpLexer} + */ + cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) { + // prevent lingering circular references from causing memory leaks: + this.setInput('', {}); + + // nuke the error hash info instances created during this run. + // Userland code must COPY any data/references + // in the error hash instance(s) it is more permanently interested in. + if (!do_not_nuke_errorinfos) { + for (var i = this.__error_infos.length - 1; i >= 0; i--) { + var el = this.__error_infos[i]; + if (el && typeof el.destroy === 'function') { + el.destroy(); + } + } + this.__error_infos.length = 0; + } + + return this; + }, + + /** + * clear the lexer token context; intended for internal use only + * + * @public + * @this {RegExpLexer} + */ + clear: function lexer_clear() { + this.yytext = ''; + this.yyleng = 0; + this.match = ''; + // - DO NOT reset \`this.matched\` + this.matches = false; + this._more = false; + this._backtrack = false; + + var col = (this.yylloc ? this.yylloc.last_column : 0); + this.yylloc = { + first_line: this.yylineno + 1, + first_column: col, + last_line: this.yylineno + 1, + last_column: col, + + range: [this.offset, this.offset] + }; + }, + + /** + * resets the lexer, sets new input + * + * @public + * @this {RegExpLexer} + */ + setInput: function lexer_setInput(input, yy) { + this.yy = yy || this.yy || {}; + + // also check if we've fully initialized the lexer instance, + // including expansion work to be done to go from a loaded + // lexer to a usable lexer: + if (!this.__decompressed) { + // step 1: decompress the regex list: + var rules = this.rules; + for (var i = 0, len = rules.length; i < len; i++) { + var rule_re = rules[i]; + + // compression: is the RE an xref to another RE slot in the rules[] table? + if (typeof rule_re === 'number') { + rules[i] = rules[rule_re]; + } + } + + // step 2: unfold the conditions[] set to make these ready for use: + var conditions = this.conditions; + for (var k in conditions) { + var spec = conditions[k]; + + var rule_ids = spec.rules; + + var len = rule_ids.length; + var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in \`lexer_next()\` fast and simple! 
+ var rule_new_ids = new Array(len + 1); + + for (var i = 0; i < len; i++) { + var idx = rule_ids[i]; + var rule_re = rules[idx]; + rule_regexes[i + 1] = rule_re; + rule_new_ids[i + 1] = idx; + } + + spec.rules = rule_new_ids; + spec.__rule_regexes = rule_regexes; + spec.__rule_count = len; + } + + this.__decompressed = true; + } + + this._input = input || ''; + this.clear(); + this._signaled_error_token = false; + this.done = false; + this.yylineno = 0; + this.matched = ''; + this.conditionStack = ['INITIAL']; + this.__currentRuleSet__ = null; + this.yylloc = { + first_line: 1, + first_column: 0, + last_line: 1, + last_column: 0, + + range: [0, 0] + }; + this.offset = 0; + return this; + }, + + /** + * edit the remaining input via user-specified callback. + * This can be used to forward-adjust the input-to-parse, + * e.g. inserting macro expansions and alike in the + * input which has yet to be lexed. + * The behaviour of this API contrasts the \`unput()\` et al + * APIs as those act on the *consumed* input, while this + * one allows one to manipulate the future, without impacting + * the current \`yyloc\` cursor location or any history. + * + * Use this API to help implement C-preprocessor-like + * \`#include\` statements, etc. + * + * The provided callback must be synchronous and is + * expected to return the edited input (string). + * + * The \`cpsArg\` argument value is passed to the callback + * as-is. + * + * \`callback\` interface: + * \`function callback(input, cpsArg)\` + * + * - \`input\` will carry the remaining-input-to-lex string + * from the lexer. + * - \`cpsArg\` is \`cpsArg\` passed into this API. + * + * The \`this\` reference for the callback will be set to + * reference this lexer instance so that userland code + * in the callback can easily and quickly access any lexer + * API. + * + * When the callback returns a non-string-type falsey value, + * we assume the callback did not edit the input and we + * will using the input as-is. + * + * When the callback returns a non-string-type value, it + * is converted to a string for lexing via the \`"" + retval\` + * operation. (See also why: http://2ality.com/2012/03/converting-to-string.html + * -- that way any returned object's \`toValue()\` and \`toString()\` + * methods will be invoked in a proper/desirable order.) + * + * @public + * @this {RegExpLexer} + */ + editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) { + var rv = callback.call(this, this._input, cpsArg); + if (typeof rv !== 'string') { + if (rv) { + this._input = '' + rv; + } + // else: keep \`this._input\` as is. + } else { + this._input = rv; + } + return this; + }, + + /** + * consumes and returns one char from the input + * + * @public + * @this {RegExpLexer} + */ + input: function lexer_input() { + if (!this._input) { + //this.done = true; -- don't set \`done\` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <> tokens and perform user action code for a <> match, but only does so *once*) + return null; + } + var ch = this._input[0]; + this.yytext += ch; + this.yyleng++; + this.offset++; + this.match += ch; + this.matched += ch; + // Count the linenumber up when we hit the LF (or a stand-alone CR). + // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo + // and we advance immediately past the LF as well, returning both together as if + // it was all a single 'character' only. 
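+            //
+            // Example (illustration only): for the input 'a\\r\\nb', three successive
+            // input() calls return 'a', then '\\r\\n' (CR+LF delivered as a single
+            // unit), then 'b', while yylineno is incremented exactly once.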
+ var slice_len = 1; + var lines = false; + if (ch === '\\n') { + lines = true; + } else if (ch === '\\r') { + lines = true; + var ch2 = this._input[1]; + if (ch2 === '\\n') { + slice_len++; + ch += ch2; + this.yytext += ch2; + this.yyleng++; + this.offset++; + this.match += ch2; + this.matched += ch2; + this.yylloc.range[1]++; + } + } + if (lines) { + this.yylineno++; + this.yylloc.last_line++; + this.yylloc.last_column = 0; + } else { + this.yylloc.last_column++; + } + this.yylloc.range[1]++; + + this._input = this._input.slice(slice_len); + return ch; + }, + + /** + * unshifts one char (or an entire string) into the input + * + * @public + * @this {RegExpLexer} + */ + unput: function lexer_unput(ch) { + var len = ch.length; + var lines = ch.split(/(?:\\r\\n?|\\n)/g); + + this._input = ch + this._input; + this.yytext = this.yytext.substr(0, this.yytext.length - len); + this.yyleng = this.yytext.length; + this.offset -= len; + this.match = this.match.substr(0, this.match.length - len); + this.matched = this.matched.substr(0, this.matched.length - len); + + if (lines.length > 1) { + this.yylineno -= lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + + // Get last entirely matched line into the \`pre_lines[]\` array's + // last index slot; we don't mind when other previously + // matched lines end up in the array too. + var pre = this.match; + var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + if (pre_lines.length === 1) { + pre = this.matched; + pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + } + this.yylloc.last_column = pre_lines[pre_lines.length - 1].length; + } else { + this.yylloc.last_column -= len; + } + + this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng; + + this.done = false; + return this; + }, + + /** + * cache matched text and append it on next action + * + * @public + * @this {RegExpLexer} + */ + more: function lexer_more() { + this._more = true; + return this; + }, + + /** + * signal the lexer that this rule fails to match the input, so the + * next matching rule (regex) should be tested instead. + * + * @public + * @this {RegExpLexer} + */ + reject: function lexer_reject() { + if (this.options.backtrack_lexer) { + this._backtrack = true; + } else { + // when the \`parseError()\` call returns, we MUST ensure that the error is registered. + // We accomplish this by signaling an 'error' token to be produced for the current + // \`.lex()\` run. + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).', false); + this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + return this; + }, + + /** + * retain first n characters of the match + * + * @public + * @this {RegExpLexer} + */ + less: function lexer_less(n) { + return this.unput(this.match.slice(n)); + }, + + /** + * return (part of the) already matched input, i.e. for error + * messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of + * input (default: 1). + * + * Negative limit values equal *unlimited*. 
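+     *
+     * Example (illustrative): \`pastInput(10, 1)\` yields at most the last 10
+     * characters of the most recently consumed line, prefixed with '...'
+     * when the available input was longer and had to be clipped.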
+ * + * @public + * @this {RegExpLexer} + */ + pastInput: function lexer_pastInput(maxSize, maxLines) { + var past = this.matched.substring(0, this.matched.length - this.match.length); + if (maxSize < 0) + maxSize = past.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = past.length; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substr\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + past = past.substr(-maxSize * 2 - 2); + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = past.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(-maxLines); + past = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis prefix... + if (past.length > maxSize) { + past = '...' + past.substr(-maxSize); + } + return past; + }, + + /** + * return (part of the) upcoming input, i.e. for error messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of input (default: 1). + * + * Negative limit values equal *unlimited*. + * + * > ### NOTE ### + * > + * > *"upcoming input"* is defined as the whole of the both + * > the *currently lexed* input, together with any remaining input + * > following that. *"currently lexed"* input is the input + * > already recognized by the lexer but not yet returned with + * > the lexer token. This happens when you are invoking this API + * > from inside any lexer rule action code block. + * > + * + * @public + * @this {RegExpLexer} + */ + upcomingInput: function lexer_upcomingInput(maxSize, maxLines) { + var next = this.match; + if (maxSize < 0) + maxSize = next.length + this._input.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = maxSize; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substring\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + if (next.length < maxSize * 2 + 2) { + next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8 + } + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = next.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(0, maxLines); + next = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis postfix... + if (next.length > maxSize) { + next = next.substring(0, maxSize) + '...'; + } + return next; + }, + + /** + * return a string which displays the character position where the + * lexing error occurred, i.e. 
for error messages + * + * @public + * @this {RegExpLexer} + */ + showPosition: function lexer_showPosition(maxPrefix, maxPostfix) { + var pre = this.pastInput(maxPrefix).replace(/\\s/g, ' '); + var c = new Array(pre.length + 1).join('-'); + return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, ' ') + '\\n' + c + '^'; + }, + + /** + * return a string which displays the lines & columns of input which are referenced + * by the given location info range, plus a few lines of context. + * + * This function pretty-prints the indicated section of the input, with line numbers + * and everything! + * + * This function is very useful to provide highly readable error reports, while + * the location range may be specified in various flexible ways: + * + * - \`loc\` is the location info object which references the area which should be + * displayed and 'marked up': these lines & columns of text are marked up by \`^\` + * characters below each character in the entire input range. + * + * - \`context_loc\` is the *optional* location info object which instructs this + * pretty-printer how much *leading* context should be displayed alongside + * the area referenced by \`loc\`. This can help provide context for the displayed + * error, etc. + * + * When this location info is not provided, a default context of 3 lines is + * used. + * + * - \`context_loc2\` is another *optional* location info object, which serves + * a similar purpose to \`context_loc\`: it specifies the amount of *trailing* + * context lines to display in the pretty-print output. + * + * When this location info is not provided, a default context of 1 line only is + * used. + * + * Special Notes: + * + * - when the \`loc\`-indicated range is very large (about 5 lines or more), then + * only the first and last few lines of this block are printed while a + * \`...continued...\` message will be printed between them. + * + * This serves the purpose of not printing a huge amount of text when the \`loc\` + * range happens to be huge: this way a manageable & readable output results + * for arbitrary large ranges. + * + * - this function can display lines of input which whave not yet been lexed. + * \`prettyPrintRange()\` can access the entire input! + * + * @public + * @this {RegExpLexer} + */ + prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) { + var error_size = loc.last_line - loc.first_line; + const CONTEXT = 3; + const CONTEXT_TAIL = 1; + const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2; + var input = this.matched + this._input; + var lines = input.split('\\n'); + //var show_context = (error_size < 5 || context_loc); + var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT)); + var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL)); + var lineno_display_width = (1 + Math.log10(l1 | 1) | 0); + var ws_prefix = new Array(lineno_display_width).join(' '); + var nonempty_line_indexes = []; + var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) { + var lno = index + l0; + var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width); + var rv = lno_pfx + ': ' + line; + var errpfx = (new Array(lineno_display_width + 1)).join('^'); + var offset = 2 + 1; + var len = 0; + + if (lno === loc.first_line) { + offset += loc.first_column; + + len = Math.max( + 2, + ((lno === loc.last_line ? 
loc.last_column : line.length)) - loc.first_column + 1 + ); + } else if (lno === loc.last_line) { + len = Math.max(2, loc.last_column + 1); + } else if (lno > loc.first_line && lno < loc.last_line) { + len = Math.max(2, line.length + 1); + } + + if (len) { + var lead = new Array(offset).join('.'); + var mark = new Array(len).join('^'); + rv += '\\n' + errpfx + lead + mark; + + if (line.trim().length > 0) { + nonempty_line_indexes.push(index); + } + } + + rv = rv.replace(/\\t/g, ' '); + return rv; + }); + + // now make sure we don't print an overly large amount of error area: limit it + // to the top and bottom line count: + if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) { + var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1; + var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1; + + var intermediate_line = (new Array(lineno_display_width + 1)).join(' ') + ' (...continued...)'; + intermediate_line += '\\n' + (new Array(lineno_display_width + 1)).join('-') + ' (---------------)'; + rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line); + } + return rv.join('\\n'); + }, + + /** + * helper function, used to produce a human readable description as a string, given + * the input \`yylloc\` location object. + * + * Set \`display_range_too\` to TRUE to include the string character index position(s) + * in the description if the \`yylloc.range\` is available. + * + * @public + * @this {RegExpLexer} + */ + describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) { + var l1 = yylloc.first_line; + var l2 = yylloc.last_line; + var c1 = yylloc.first_column; + var c2 = yylloc.last_column; + var dl = l2 - l1; + var dc = c2 - c1; + var rv; + if (dl === 0) { + rv = 'line ' + l1 + ', '; + if (dc <= 1) { + rv += 'column ' + c1; + } else { + rv += 'columns ' + c1 + ' .. ' + c2; + } + } else { + rv = 'lines ' + l1 + '(column ' + c1 + ') .. ' + l2 + '(column ' + c2 + ')'; + } + if (yylloc.range && display_range_too) { + var r1 = yylloc.range[0]; + var r2 = yylloc.range[1] - 1; + if (r2 <= r1) { + rv += ' {String Offset: ' + r1 + '}'; + } else { + rv += ' {String Offset range: ' + r1 + ' .. ' + r2 + '}'; + } + } + return rv; + }, + + /** + * test the lexed token: return FALSE when not a match, otherwise return token. + * + * \`match\` is supposed to be an array coming out of a regex match, i.e. \`match[0]\` + * contains the actually matched text string. 
+ * + * Also move the input cursor forward and update the match collectors: + * + * - \`yytext\` + * - \`yyleng\` + * - \`match\` + * - \`matches\` + * - \`yylloc\` + * - \`offset\` + * + * @public + * @this {RegExpLexer} + */ + test_match: function lexer_test_match(match, indexed_rule) { + var token, + lines, + backup, + match_str, + match_str_len; + + if (this.options.backtrack_lexer) { + // save context + backup = { + yylineno: this.yylineno, + yylloc: { + first_line: this.yylloc.first_line, + last_line: this.yylloc.last_line, + first_column: this.yylloc.first_column, + last_column: this.yylloc.last_column, + + range: this.yylloc.range.slice(0) + }, + yytext: this.yytext, + match: this.match, + matches: this.matches, + matched: this.matched, + yyleng: this.yyleng, + offset: this.offset, + _more: this._more, + _input: this._input, + //_signaled_error_token: this._signaled_error_token, + yy: this.yy, + conditionStack: this.conditionStack.slice(0), + done: this.done + }; + } + + match_str = match[0]; + match_str_len = match_str.length; + // if (match_str.indexOf('\\n') !== -1 || match_str.indexOf('\\r') !== -1) { + lines = match_str.split(/(?:\\r\\n?|\\n)/g); + if (lines.length > 1) { + this.yylineno += lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + this.yylloc.last_column = lines[lines.length - 1].length; + } else { + this.yylloc.last_column += match_str_len; + } + // } + this.yytext += match_str; + this.match += match_str; + this.matched += match_str; + this.matches = match; + this.yyleng = this.yytext.length; + this.yylloc.range[1] += match_str_len; + + // previous lex rules MAY have invoked the \`more()\` API rather than producing a token: + // those rules will already have moved this \`offset\` forward matching their match lengths, + // hence we must only add our own match length now: + this.offset += match_str_len; + this._more = false; + this._backtrack = false; + this._input = this._input.slice(match_str_len); + + // calling this method: + // + // function lexer__performAction(yy, yyrulenumber, YY_START) {...} + token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */); + // otherwise, when the action codes are all simple return token statements: + //token = this.simpleCaseActionClusters[indexed_rule]; + + if (this.done && this._input) { + this.done = false; + } + if (token) { + return token; + } else if (this._backtrack) { + // recover context + for (var k in backup) { + this[k] = backup[k]; + } + this.__currentRuleSet__ = null; + return false; // rule action called reject() implying the next rule should be tested instead. + } else if (this._signaled_error_token) { + // produce one 'error' token as \`.parseError()\` in \`reject()\` + // did not guarantee a failure signal by throwing an exception! + token = this._signaled_error_token; + this._signaled_error_token = false; + return token; + } + return false; + }, + + /** + * return next match in input + * + * @public + * @this {RegExpLexer} + */ + next: function lexer_next() { + if (this.done) { + this.clear(); + return this.EOF; + } + if (!this._input) { + this.done = true; + } + + var token, + match, + tempMatch, + index; + if (!this._more) { + this.clear(); + } + var spec = this.__currentRuleSet__; + if (!spec) { + // Update the ruleset cache as we apparently encountered a state change or just started lexing. 
+ // The cache is set up for fast lookup -- we assume a lexer will switch states much less often than it will + // invoke the \`lex()\` token-producing API and related APIs, hence caching the set for direct access helps + // speed up those activities a tiny bit. + spec = this.__currentRuleSet__ = this._currentRules(); + // Check whether a *sane* condition has been pushed before: this makes the lexer robust against + // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19 + if (!spec || !spec.rules) { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Internal lexer engine error' + lineno_msg + ': The lex grammar programmer pushed a non-existing condition name "' + this.topState() + '"; this is a fatal error and should be reported to the application programmer team!', false); + // produce one 'error' token until this situation has been resolved, most probably by parse termination! + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + } + + var rule_ids = spec.rules; + var regexes = spec.__rule_regexes; + var len = spec.__rule_count; + + // Note: the arrays are 1-based, while \`len\` itself is a valid index, + // hence the non-standard less-or-equal check in the next loop condition! + for (var i = 1; i <= len; i++) { + tempMatch = this._input.match(regexes[i]); + if (tempMatch && (!match || tempMatch[0].length > match[0].length)) { + match = tempMatch; + index = i; + if (this.options.backtrack_lexer) { + token = this.test_match(tempMatch, rule_ids[i]); + if (token !== false) { + return token; + } else if (this._backtrack) { + match = undefined; + continue; // rule action called reject() implying a rule MISmatch. + } else { + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + } else if (!this.options.flex) { + break; + } + } + } + if (match) { + token = this.test_match(match, rule_ids[index]); + if (token !== false) { + return token; + } + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + if (!this._input) { + this.done = true; + this.clear(); + return this.EOF; + } else { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': Unrecognized text.', this.options.lexerErrorsAreRecoverable); + + var pendingInput = this._input; + var activeCondition = this.topState(); + var conditionStackDepth = this.conditionStack.length; + + token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + if (token === this.ERROR) { + // we can try to recover from a lexer error that \`parseError()\` did not 'recover' for us + // by moving forward at least one character at a time IFF the (user-specified?) \`parseError()\` + // has not consumed/modified any pending input or changed state in the error handler: + if (!this.matches && + // and make sure the input has been modified/consumed ... + pendingInput === this._input && + // ...or the lexer state has been modified significantly enough + // to merit a non-consuming error handling action right now. 
+ activeCondition === this.topState() && + conditionStackDepth === this.conditionStack.length + ) { + this.input(); + } + } + return token; + } + }, + + /** + * return next match that has a token + * + * @public + * @this {RegExpLexer} + */ + lex: function lexer_lex() { + var r; + // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer: + if (typeof this.options.pre_lex === 'function') { + r = this.options.pre_lex.call(this); + } + + while (!r) { + r = this.next(); + } + + if (typeof this.options.post_lex === 'function') { + // (also account for a userdef function which does not return any value: keep the token as is) + r = this.options.post_lex.call(this, r) || r; + } + return r; + }, + + /** + * backwards compatible alias for \`pushState()\`; + * the latter is symmetrical with \`popState()\` and we advise to use + * those APIs in any modern lexer code, rather than \`begin()\`. + * + * @public + * @this {RegExpLexer} + */ + begin: function lexer_begin(condition) { + return this.pushState(condition); + }, + + /** + * activates a new lexer condition state (pushes the new lexer + * condition state onto the condition stack) + * + * @public + * @this {RegExpLexer} + */ + pushState: function lexer_pushState(condition) { + this.conditionStack.push(condition); + this.__currentRuleSet__ = null; + return this; + }, + + /** + * pop the previously active lexer condition state off the condition + * stack + * + * @public + * @this {RegExpLexer} + */ + popState: function lexer_popState() { + var n = this.conditionStack.length - 1; + if (n > 0) { + this.__currentRuleSet__ = null; + return this.conditionStack.pop(); + } else { + return this.conditionStack[0]; + } + }, + + /** + * return the currently active lexer condition state; when an index + * argument is provided it produces the N-th previous condition state, + * if available + * + * @public + * @this {RegExpLexer} + */ + topState: function lexer_topState(n) { + n = this.conditionStack.length - 1 - Math.abs(n || 0); + if (n >= 0) { + return this.conditionStack[n]; + } else { + return 'INITIAL'; + } + }, + + /** + * (internal) determine the lexer rule set which is active for the + * currently active lexer condition state + * + * @public + * @this {RegExpLexer} + */ + _currentRules: function lexer__currentRules() { + if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) { + return this.conditions[this.conditionStack[this.conditionStack.length - 1]]; + } else { + return this.conditions['INITIAL']; + } + }, + + /** + * return the number of states currently on the stack + * + * @public + * @this {RegExpLexer} + */ + stateStackSize: function lexer_stateStackSize() { + return this.conditionStack.length; + } +}`; + // --- END lexer kernel --- +} + +RegExpLexer.prototype = (new Function(rmCommonWS` + return ${getRegExpLexerPrototype()}; +`))(); + + +// The lexer code stripper, driven by optimization analysis settings and +// lexer options, which cannot be changed at run-time. +function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. 
${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + +new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS` + // Code Generator Information Report + // --------------------------------- + // + // Options: + // + // backtracking: .................... ${opt.options.backtrack_lexer} + // location.ranges: ................. ${opt.options.ranges} + // location line+column tracking: ... ${opt.options.trackPosition} + // + // + // Forwarded Parser Analysis flags: + // + // uses yyleng: ..................... ${opt.parseActionsUseYYLENG} + // uses yylineno: ................... ${opt.parseActionsUseYYLINENO} + // uses yytext: ..................... ${opt.parseActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.parseActionsUseYYLOC} + // uses lexer values: ............... ${opt.parseActionsUseValueTracking} / ${opt.parseActionsUseValueAssignment} + // location tracking: ............... ${opt.parseActionsUseLocationTracking} + // location assignment: ............. ${opt.parseActionsUseLocationAssignment} + // + // + // Lexer Analysis flags: + // + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses yyerror: .................... ${opt.lexerActionsUseYYERROR} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + // + // --------- END OF REPORT ----------- + + `); + + return new_src; +} + + + + + +// generate lexer source from a grammar +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???', + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = (dict.actionInclude || ''); + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; +} + +// Assemble the final source from the processed grammar +/** @public */ +function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; +} + +function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = (1 + Math.log10(a.length | 1) | 0); + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return `/* ${idx_str}: */ ${re}`; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return `/* ${idx_str}: */ new XRegExp("${re_src}", "${re.xregexp.flags}")`; + } else { + return `/* ${idx_str}: */ ${re}`; + } + }); + return b.join(',\n'); +} + +function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1, + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(` "(${ID_REGEX_BASE})": `, 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
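+        // The chunks pushed onto `code` below assemble into (roughly) this shape:
+        //
+        //     var lexer = {
+        //         /*JISON-LEX-ANALYTICS-REPORT*/       // slot #1: filled in later
+        //         ...RegExpLexer.prototype members...,
+        //         options: { ... },
+        //         JisonLexerError: JisonLexerError,
+        //         performAction: function (...) { ... },
+        //         simpleCaseActionClusters: { ... },
+        //         rules: [ ... ],
+        //         conditions: { ... }
+        //     };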
+ var code = [rmCommonWS` + var lexer = { + `, '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc + .replace(/^[\s\r\n]*\{/, '') + .replace(/\s*\}[\s\r\n]*$/, '') + .trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS`, + JisonLexerError: JisonLexerError, + performAction: ${performActionCode}, + simpleCaseActionClusters: ${simpleCaseActionClustersCode}, + rules: [ + ${rulesCode} + ], + conditions: ${conditionsCode} + }; + `); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; +} + +function generateGenericHeaderComment() { + var out = rmCommonWS` + /* lexer generated by jison-lex ${version} */ + + /* + * Returns a Lexer object of the following structure: + * + * Lexer: { + * yy: {} The so-called "shared state" or rather the *source* of it; + * the real "shared state" \`yy\` passed around to + * the rule actions, etc. is a direct reference! + * + * This "shared context" object was passed to the lexer by way of + * the \`lexer.setInput(str, yy)\` API before you may use it. + * + * This "shared context" object is passed to the lexer action code in \`performAction()\` + * so userland code in the lexer actions may communicate with the outside world + * and/or other lexer rules' actions in more or less complex ways. + * + * } + * + * Lexer.prototype: { + * EOF: 1, + * ERROR: 2, + * + * yy: The overall "shared context" object reference. + * + * JisonLexerError: function(msg, hash), + * + * performAction: function lexer__performAction(yy, yyrulenumber, YY_START), + * + * The function parameters and \`this\` have the following value/meaning: + * - \`this\` : reference to the \`lexer\` instance. + * \`yy_\` is an alias for \`this\` lexer instance reference used internally. + * + * - \`yy\` : a reference to the \`yy\` "shared state" object which was passed to the lexer + * by way of the \`lexer.setInput(str, yy)\` API before. 
+ *
+ *               Note:
+ *               The extra arguments you specified in the \`%parse-param\` statement in your
+ *               **parser** grammar definition file are passed to the lexer via this object
+ *               reference as member variables.
+ *
+ *           - \`yyrulenumber\`   : index of the matched lexer rule (regex), used internally.
+ *
+ *           - \`YY_START\`: the current lexer "start condition" state.
+ *
+ *    parseError: function(str, hash, ExceptionClass),
+ *
+ *    constructLexErrorInfo: function(error_message, is_recoverable),
+ *        Helper function.
+ *        Produces a new errorInfo \'hash object\' which can be passed into \`parseError()\`.
+ *        See its use in this lexer kernel in many places; example usage:
+ *
+ *            var infoObj = lexer.constructLexErrorInfo(\'fail!\', true);
+ *            var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);
+ *
+ *    options: { ... lexer %options ... },
+ *
+ *    lex: function(),
+ *        Produce one token of lexed input, which was passed in earlier via the \`lexer.setInput()\` API.
+ *        You MAY use the additional \`args...\` parameters as per \`%parse-param\` spec of the **lexer** grammar:
+ *        these extra \`args...\` are added verbatim to the \`yy\` object reference as member variables.
+ *
+ *        WARNING:
+ *        Lexer's additional \`args...\` parameters (via lexer's \`%parse-param\`) MAY conflict with
+ *        any attributes already added to \`yy\` by the **parser** or the jison run-time;
+ *        when such a collision is detected an exception is thrown to prevent the generated run-time
+ *        from silently accepting this confusing and potentially hazardous situation!
+ *
+ *    cleanupAfterLex: function(do_not_nuke_errorinfos),
+ *        Helper function.
+ *
+ *        This helper API is invoked when the **parse process** has completed: it is the responsibility
+ *        of the **parser** (or the calling userland code) to invoke this method once cleanup is desired.
+ *
+ *        This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.
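+ *
+ *        A minimal usage sketch (illustrative only; \`yy\` as set up by the caller):
+ *
+ *            lexer.setInput(\'2 + 3\', yy);
+ *            var token;
+ *            while ((token = lexer.lex()) !== lexer.EOF) {
+ *                // ... consume \`token\` ...
+ *            }
+ *            lexer.cleanupAfterLex();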
+ * + * setInput: function(input, [yy]), + * + * + * input: function(), + * + * + * unput: function(str), + * + * + * more: function(), + * + * + * reject: function(), + * + * + * less: function(n), + * + * + * pastInput: function(n), + * + * + * upcomingInput: function(n), + * + * + * showPosition: function(), + * + * + * test_match: function(regex_match_array, rule_index), + * + * + * next: function(), + * + * + * begin: function(condition), + * + * + * pushState: function(condition), + * + * + * popState: function(), + * + * + * topState: function(), + * + * + * _currentRules: function(), + * + * + * stateStackSize: function(), + * + * + * performAction: function(yy, yy_, yyrulenumber, YY_START), + * + * + * rules: [...], + * + * + * conditions: {associative list: name ==> set}, + * } + * + * + * token location info (\`yylloc\`): { + * first_line: n, + * last_line: n, + * first_column: n, + * last_column: n, + * range: [start_number, end_number] + * (where the numbers are indexes into the input string, zero-based) + * } + * + * --- + * + * The \`parseError\` function receives a \'hash\' object with these members for lexer errors: + * + * { + * text: (matched text) + * token: (the produced terminal token, if any) + * token_id: (the produced terminal token numeric ID, if any) + * line: (yylineno) + * loc: (yylloc) + * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule + * available for this particular error) + * yy: (object: the current parser internal "shared state" \`yy\` + * as is also available in the rule actions; this can be used, + * for instance, for advanced error analysis and reporting) + * lexer: (reference to the current lexer instance used by the parser) + * } + * + * while \`this\` will reference the current lexer instance. + * + * When \`parseError\` is invoked by the lexer, the default implementation will + * attempt to invoke \`yy.parser.parseError()\`; when this callback is not provided + * it will try to invoke \`yy.parseError()\` instead. When that callback is also not + * provided, a \`JisonLexerError\` exception will be thrown containing the error + * message and \`hash\`, as constructed by the \`constructLexErrorInfo()\` API. + * + * Note that the lexer\'s \`JisonLexerError\` error class is passed via the + * \`ExceptionClass\` argument, which is invoked to construct the exception + * instance to be thrown, so technically \`parseError\` will throw the object + * produced by the \`new ExceptionClass(str, hash)\` JavaScript expression. + * + * --- + * + * You can specify lexer options by setting / modifying the \`.options\` object of your Lexer instance. + * These options are available: + * + * (Options are permanent.) + * + * yy: { + * parseError: function(str, hash, ExceptionClass) + * optional: overrides the default \`parseError\` function. + * } + * + * lexer.options: { + * pre_lex: function() + * optional: is invoked before the lexer is invoked to produce another token. + * \`this\` refers to the Lexer object. + * post_lex: function(token) { return token; } + * optional: is invoked when the lexer has produced a token \`token\`; + * this function can override the returned token value by returning another. + * When it does not return any (truthy) value, the lexer will return + * the original \`token\`. + * \`this\` refers to the Lexer object. + * + * WARNING: the next set of options are not meant to be changed. They echo the abilities of + * the lexer as per when it was compiled! 
+ *
+ *      ranges: boolean
+ *          optional: \`true\` ==> token location info will include a .range[] member.
+ *      flex: boolean
+ *          optional: \`true\` ==> flex-like lexing behaviour where the rules are tested
+ *          exhaustively to find the longest match.
+ *      backtrack_lexer: boolean
+ *          optional: \`true\` ==> lexer regexes are tested in order and for each matched
+ *          regex the action code is invoked; the lexer terminates the scan when a token is returned by the action code.
+ *      xregexp: boolean
+ *          optional: \`true\` ==> lexer rule regexes are "extended regex format" requiring the
+ *          \`XRegExp\` library. When this %option has not been specified at compile time, all lexer
+ *          rule regexes have been written as standard JavaScript RegExp expressions.
+ *  }
+ */
+    `;
+
+    return out;
+}
+
+function prepareOptions(opt) {
+    opt = opt || {};
+
+    // check for illegal identifier
+    if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) {
+        if (opt.moduleName) {
+            var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.';
+            if (typeof opt.warn_cb === 'function') {
+                opt.warn_cb(msg);
+            } else {
+                // do not treat as warning; barf hairball instead so that this oddity gets noticed right away!
+                throw new Error(msg);
+            }
+        }
+        opt.moduleName = 'lexer';
+    }
+
+    prepExportStructures(opt);
+
+    return opt;
+}
+
+function generateModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var ' + opt.moduleName + ' = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '})();'
+    ];
+
+    var src = out.join('\n') + '\n';
+    src = stripUnusedLexerCode(src, opt);
+    opt.exportSourceCode.all = src;
+    return src;
+}
+
+function generateAMDModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'define([], function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '});'
+    ];
+
+    var src = out.join('\n') + '\n';
+    src = stripUnusedLexerCode(src, opt);
+    opt.exportSourceCode.all = src;
+    return src;
+}
+
+function generateESModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var lexer = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '})();',
+        '',
+        'function yylex() {',
+        '    return lexer.lex.apply(lexer, arguments);',
+        '}',
+        rmCommonWS`
+            export {
+                lexer,
+                yylex as lex
+            };
+        `
+    ];
+
+    var src = out.join('\n') + '\n';
+    src = stripUnusedLexerCode(src, opt);
+    opt.exportSourceCode.all = src;
+    return src;
+}
+
+function generateCommonJSModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var ' + opt.moduleName + ' = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ?
opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();', + '', + 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', + ' exports.lexer = ' + opt.moduleName + ';', + ' exports.lex = function () {', + ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', + ' };', + '}' + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +RegExpLexer.generate = generate; + +RegExpLexer.version = version; +RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; +RegExpLexer.mkStdOptions = mkStdOptions; +RegExpLexer.camelCase = camelCase; +RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + +module.exports = RegExpLexer; diff --git a/dist/regexp-lexer-es6.js b/dist/regexp-lexer-es6.js new file mode 100644 index 0000000..8ba9c07 --- /dev/null +++ b/dist/regexp-lexer-es6.js @@ -0,0 +1,4047 @@ +import XRegExp from '@gerhobbelt/xregexp'; +import json5 from '@gerhobbelt/json5'; +import lexParser from '@gerhobbelt/lex-parser'; +import assert from 'assert'; +import helpers from 'jison-helpers-lib'; + +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +const XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +const CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; +const SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + +const UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + +// The expanded regex sets which are equivalent to the given `\\{c}` escapes: +// +// `/\s/`: +const WHITESPACE_SETSTR$1 = ' \f\n\r\t\v\u00a0\u1680\u180e\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff'; +// `/\d/`: +const DIGIT_SETSTR$1 = '0-9'; +// `/\w/`: +const WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + + + + + +// Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex +function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: // '[' + return '\\['; + + case 92: // '\\' + return '\\\\'; + + case 93: // ']' + return '\\]'; + + case 94: // ']' + return '\\^'; + } + if (i < 32 + || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || (i >= 0xD800 && i <= 0xDFFF) /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! 
*/ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go... + x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); +} + + +// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating +// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a +// `\\p{NAME}` shorthand to represent [part of] the bitarray: +var Pcodes_bitarray_cache = {}; +var Pcodes_bitarray_cache_test_order = []; + +// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by +// a single regex 'escape', e.g. `\d` for digits 0-9. +var EscCode_bitarray_output_refs; + +// now initialize the EscCodes_... table above: +init_EscCode_lookup_table(); + +function init_EscCode_lookup_table() { + var s, bitarr, set2esc = {}, esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); +} + +function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which 
regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. + // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as than this pcode + // won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... + var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; +} + + + + + + +// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. +function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. 
CodePoints beyond U+FFFF) in regex set expressions, e.g. \\u{' + s + '}');
+                }
+                return String.fromCharCode(c);
+
+            case '\\0':
+            case '\\1':
+            case '\\2':
+            case '\\3':
+            case '\\4':
+            case '\\5':
+            case '\\6':
+            case '\\7':
+                s = s.substr(1);
+                c = parseInt(s, 8);
+                return String.fromCharCode(c);
+
+            case '\\r':
+                return '\r';
+
+            case '\\n':
+                return '\n';
+
+            case '\\v':
+                return '\v';
+
+            case '\\f':
+                return '\f';
+
+            case '\\t':
+                return '\t';
+
+            case '\\b':
+                return '\b';
+
+            default:
+                // just the character itself:
+                return s.substr(1);
+            }
+        } else {
+            return s;
+        }
+    }
+
+    if (s && s.length) {
+        var c1, c2;
+
+        // inverted set?
+        if (s[0] === '^') {
+            set_is_inverted = true;
+            s = s.substr(1);
+            bitarr_orig = bitarr;
+            bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1);
+        }
+
+        // BITARR collects flags for the characters in the set. Inversion means the complement set of characters is set instead.
+        // This results in an OR operation when sets are joined/chained.
+
+        while (s.length) {
+            c1 = s.match(CHR_RE$1);
+            if (!c1) {
+                // hit an illegal escape sequence? cope anyway!
+                c1 = s[0];
+            } else {
+                c1 = c1[0];
+                // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those
+                // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit
+                // XRegExp support, but alas, we'll get there when we get there... ;-)
+                switch (c1) {
+                case '\\p':
+                    s = s.substr(c1.length);
+                    c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1);
+                    if (c2) {
+                        c2 = c2[0];
+                        s = s.substr(c2.length);
+                        // do we have this one cached already?
+                        var pex = c1 + c2;
+                        var ba4p = Pcodes_bitarray_cache[pex];
+                        if (!ba4p) {
+                            // expand escape:
+                            var xr = new XRegExp('[' + pex + ']');          // TODO: case-insensitive grammar???
+                            // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`:
+                            var xs = '' + xr;
+                            // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside:
+                            xs = xs.substr(1, xs.length - 2);
+
+                            ba4p = reduceRegexToSetBitArray(xs, pex, opts);
+
+                            Pcodes_bitarray_cache[pex] = ba4p;
+                            updatePcodesBitarrayCacheTestOrder(opts);
+                        }
+                        // merge bitarrays:
+                        add2bitarray(bitarr, ba4p);
+                        continue;
+                    }
+                    break;
+
+                case '\\S':
+                case '\\s':
+                case '\\W':
+                case '\\w':
+                case '\\d':
+                case '\\D':
+                    // these can't participate in a range, but need to be treated special:
+                    s = s.substr(c1.length);
+                    // check for \S, \s, \D, \d, \W, \w and expand them:
+                    var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]];
+                    assert(ba4e);
+                    add2bitarray(bitarr, ba4e);
+                    continue;
+
+                case '\\b':
+                    // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace
+                    c1 = '\u0008';
+                    break;
+                }
+            }
+            var v1 = eval_escaped_code(c1);
+            // propagate deferred exceptions = error reports.
+            if (v1 instanceof Error) {
+                return v1;
+            }
+            v1 = v1.charCodeAt(0);
+            s = s.substr(c1.length);
+
+            if (s[0] === '-' && s.length >= 2) {
+                // we can expect a range like 'a-z':
+                s = s.substr(1);
+                c2 = s.match(CHR_RE$1);
+                if (!c2) {
+                    // hit an illegal escape sequence? cope anyway!
+                    c2 = s[0];
+                } else {
+                    c2 = c2[0];
+                }
+                var v2 = eval_escaped_code(c2);
+                // propagate deferred exceptions = error reports.
+                if (v2 instanceof Error) {
+                    return v2;
+                }
+                v2 = v2.charCodeAt(0);
+                s = s.substr(c2.length);
+
+                // legal ranges go UP, not DOWN!
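+                // (e.g. `a-z` is a legal range while `z-a` is not: for a reversed
+                // range we mark the two endpoints plus a literal dash instead, as
+                // that is the closest sensible reading of the (broken) input.)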
+ if (v1 <= v2) { + mark(v1, v2); + } else { + console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 }); + mark(v1); + mark('-'.charCodeAt(0)); + mark(v2); + } + continue; + } + mark(v1); + } + + // When we have marked all slots, '^' NEGATES the set, hence we flip all slots. + // + // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK + // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire + // range then. + if (set_is_inverted) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!bitarr[i]) { + bitarr_orig[i] = true; + } + } + } + } + return false; +} + + +// convert a simple bitarray back into a regex set `[...]` content: +function bitarray2set(l, output_inverted_variant, output_minimized) { + // construct the inverse(?) set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } + else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! 
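+                        // Copy-on-write: clone the bitarray the first time we need to
+                        // edit it, so the caller's input array stays untouched; any
+                        // subsequent edits can then happen in-place on the clone.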
+ if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } + else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + j = UNICODE_BASE_PLANE_MAX_CP$1 + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? 
'-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; +} + + + + + +// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; +// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. +function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE$1); + if (!c1) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"'); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + while (s.length) { + var inner = s.match(SET_PART_RE$1); + if (!inner) { + inner = s.match(CHR_RE$1); + if (!inner) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: ' + s + '" of regex "' + orig + '"'); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + var c2 = s.match(CHR_RE$1); + if (!c2) { + // cope with illegal escape sequences too! + return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error('regex set expression is broken in regex: ' + orig); + } + s = s.substr(c2.length); + + var se = set_content.join(''); + if (!internal_state) { + derr = set2bitarray(l, se, opts); + // propagate deferred exceptions = error reports. + if (derr instanceof Error) { + return derr; + } + + // a set is to use like a single character in a longer literal phrase, hence input `[abc]word[def]` would thus produce output `[abc]`: + internal_state = 1; + } + break; + + // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into + // something ready for use inside a regex set, e.g. `\\r\\n`. + // + // > Of course, we realize that converting more complex piped constructs this way + // > will produce something you might not expect, e.g. `A|WORD2` which + // > would end up as the set `[AW]` which is something else than the input + // > entirely. + // > + // > However, we can only depend on the user (grammar writer) to realize this and + // > prevent this from happening by not creating such oddities in the input grammar. + case '|': + // a|b --> [ab] + internal_state = 0; + break; + + case '(': + // (a) --> a + // + // TODO - right now we treat this as 'too complex': + + // Strip off some possible outer wrappers which we know how to remove. + // We don't worry about 'damaging' the regex as any too-complex regex will be caught + // in the validation check at the end; our 'strippers' here would not damage useful + // regexes anyway and them damaging the unacceptable ones is fine. + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... 
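+            // (The strippers above and below are deliberately applied in
+            // ( -> ^$ -> ( order so that both `(^...$)` and `^(...)$` shapes get
+            // peeled; anything still too complex is rejected right after.)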
+            s = s.replace(/^\^?(.*?)\$?$/, '$1');          // ^...$ --> ...  (catch these both inside and outside the outer grouping, hence do the ungrouping twice: once before, once after this)
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');    // (?:...) -> ... and (...) -> ...
+
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '.':
+        case '*':
+        case '+':
+        case '?':
+            // wildcard
+            //
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '{':            // range, e.g. `x{1,3}`, or macro?
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        default:
+            // literal character or word: take the first character only and ignore the rest, so that
+            // the constructed set for `word|noun` would be `[wn]`:
+            if (!internal_state) {
+                derr = set2bitarray(l, c1, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                internal_state = 2;
+            }
+            break;
+        }
+    }
+
+    s = bitarray2set(l);
+
+    // When this result is suitable for use in a set, then we should be able to compile
+    // it in a regex; that way we can easily validate whether macro X is fit to be used
+    // inside a regex set:
+    try {
+        var re;
+        assert(s);
+        assert(!(s instanceof Error));
+        re = new XRegExp('[' + s + ']');
+        re.test(s[0]);
+
+        // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`
+        // so we check for lingering UNESCAPED brackets in here as those cannot be:
+        if (/[^\\][\[\]]/.exec(s)) {
+            throw new Error('unescaped brackets in set data');
+        }
+    } catch (ex) {
+        // make sure we produce a set range expression which will fail badly when it is used
+        // in actual code:
+        s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+    }
+
+    assert(s);
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+    return l;
+}
+
+
+
+
+// Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]`
+// -- or in this example it can be further optimized to only `\d`!
+function produceOptimizedRegex4Set(bitarr) {
+    // First try to produce a minimum regex from the bitarray directly:
+    var s1 = bitarray2set(bitarr, false, true);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s1;
+    } else {
+        s1 = '[' + s1 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s2 = bitarray2set(bitarr, true, true);
+
+    if (s2[0] === '^') {
+        s2 = s2.substr(1);
+        if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+ return s2; + } + } else { + s2 = '^' + s2; + } + s2 = '[' + s2 + ']'; + + // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end, + // we also check against the plain, unadulterated regex set expressions: + // + // First try to produce a minimum regex from the bitarray directly: + var s3 = bitarray2set(bitarr, false, false); + + // and when the regex set turns out to match a single pcode/escape, then + // use that one as-is: + if (s3.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s3; + } else { + s3 = '[' + s3 + ']'; + } + + // Now try to produce a minimum regex from the *inverted* bitarray via negation: + // Because we look at a negated bitset, there's no use looking for matches with + // special cases here. + var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; +} + + + + + + +var setmgmt = { + XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1, + CHR_RE: CHR_RE$1, + SET_PART_RE: SET_PART_RE$1, + NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1, + SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1, + + WHITESPACE_SETSTR: WHITESPACE_SETSTR$1, + DIGIT_SETSTR: DIGIT_SETSTR$1, + WORDCHAR_SETSTR: WORDCHAR_SETSTR$1, + + set2bitarray, + bitarray2set, + produceOptimizedRegex4Set, + reduceRegexToSetBitArray, +}; + +// Basic Lexer implemented using JavaScript regular expressions +// Zachary Carter +// MIT Licensed + +var rmCommonWS = helpers.rmCommonWS; +var camelCase = helpers.camelCase; +var code_exec = helpers.exec; +// import recast from '@gerhobbelt/recast'; +// import astUtils from '@gerhobbelt/ast-util'; +var version = '0.6.1-205'; // require('./package.json').version; + + + + +const XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +const CHR_RE = setmgmt.CHR_RE; +const SET_PART_RE = setmgmt.SET_PART_RE; +const NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; +const UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + +// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) +// +// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! +const ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + + + + +// see also ./lib/cli.js +/** +@public +@nocollapse +*/ +const defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. 
start+end indexes in the input string
+    trackPosition: true,            // track line+column position in the input string
+    caseInsensitive: false,
+    showSource: false,
+    exportSourceCode: false,
+    exportAST: false,
+    prettyCfg: true,
+    pre_lex: undefined,
+    post_lex: undefined,
+};
+
+
+// Merge sets of options.
+//
+// Convert alternative jison option names to their base option.
+//
+// The *last* option set which overrides the default wins, where 'override' is
+// defined as specifying a not-undefined value which is not equal to the
+// default value.
+//
+// When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the
+// default values available in `defaultJisonLexOptions`.
+//
+// Return a fresh set of options.
+/** @public */
+function mkStdOptions(/*...args*/) {
+    var h = Object.prototype.hasOwnProperty;
+
+    var opts = {};
+    var args = [].concat.apply([], arguments);
+    // clone defaults, so we do not modify those constants?
+    if (args[0] !== "NODEFAULT") {
+        args.unshift(defaultJisonLexOptions);
+    } else {
+        args.shift();
+    }
+
+    for (var i = 0, len = args.length; i < len; i++) {
+        var o = args[i];
+        if (!o) continue;
+
+        // clone input (while camel-casing the options), so we do not modify those either.
+        var o2 = {};
+
+        for (var p in o) {
+            if (typeof o[p] !== 'undefined' && h.call(o, p)) {
+                o2[camelCase(p)] = o[p];
+            }
+        }
+
+        // now clean them options up:
+        if (typeof o2.main !== 'undefined') {
+            o2.noMain = !o2.main;
+        }
+
+        delete o2.main;
+
+        // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI
+        // NOT overriding the moduleName set in the grammar definition file via an `%options` entry:
+        if (o2.moduleName === o2.defaultModuleName) {
+            delete o2.moduleName;
+        }
+
+        // now see if we have an overriding option here:
+        for (var p in o2) {
+            if (h.call(o2, p)) {
+                if (typeof o2[p] !== 'undefined') {
+                    opts[p] = o2[p];
+                }
+            }
+        }
+    }
+
+    return opts;
+}
+
+// set up export/output attributes of the `options` object instance
+function prepExportStructures(options) {
+    // set up the 'option' `exportSourceCode` as a hash object for returning
+    // all generated source code chunks to the caller
+    var exportSourceCode = options.exportSourceCode;
+    if (!exportSourceCode || typeof exportSourceCode !== 'object') {
+        exportSourceCode = {
+            enabled: !!exportSourceCode
+        };
+    } else if (typeof exportSourceCode.enabled !== 'boolean') {
+        exportSourceCode.enabled = true;
+    }
+    options.exportSourceCode = exportSourceCode;
+}
+
+// Autodetect if the input lexer spec is in JSON or JISON
+// format when the `options.json` flag is `true`.
+//
+// Produce the JSON lexer spec result when these are JSON formatted already as that
+// would save us the trouble of doing this again, anywhere else in the JISON
+// compiler/generator.
+//
+// Otherwise return the *parsed* lexer spec as it has
+// been processed through LexParser.
+function autodetectAndConvertToJSONformat(lexerSpec, options) {
+    var chk_l = null;
+    var ex1, err;
+
+    if (typeof lexerSpec === 'string') {
+        if (options.json) {
+            try {
+                chk_l = json5.parse(lexerSpec);
+
+                // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode`
+                // *OR* there's a JSON/JSON5 format error in the input:
+            } catch (e) {
+                ex1 = e;
+            }
+        }
+        if (!chk_l) {
+            // // WARNING: the lexer may receive options specified in the **grammar spec file**,
+            // // hence we should mix the options to ensure the lexParser always
+            // // receives the full set!
+ // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} + + +// expand macros and convert matchers to RegExp's +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, i, k, rule, action, conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! 
Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. + // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count, + }; +} + + + + + + + +// expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or +// elsewhere, which requires two different treatments to expand these macros. +function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! 
+ return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. + var has_expansions = (se.indexOf('{') >= 0); + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. + c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. 
+                    rv.push(c1);
+                }
+                break;
+            }
+        }
+
+        s = rv.join('');
+
+        // When this result is suitable for use in a set, then we should be able to compile
+        // it into a regex; that way we can easily validate whether macro X is fit to be used
+        // inside a regex set:
+        try {
+            var re;
+            re = new XRegExp(s);
+            re.test(s[0]);
+        } catch (ex) {
+            // make sure we produce a regex expression which will fail badly when it is used
+            // in actual code:
+            return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/');
+        }
+
+        assert(s);
+        return s;
+}
+
+
+// expand macros within macros and cache the result
+function prepareMacros(dict_macros, opts) {
+    var macros = {};
+
+    // expand a `{NAME}` macro which exists inside a `[...]` set:
+    function expandMacroInSet(i) {
+        var k, a, m;
+        if (!macros[i]) {
+            m = dict_macros[i];
+
+            if (m.indexOf('{') >= 0) {
+                // set up our own record so we can detect definition loops:
+                macros[i] = {
+                    in_set: false,
+                    elsewhere: null,
+                    raw: dict_macros[i]
+                };
+
+                for (k in dict_macros) {
+                    if (dict_macros.hasOwnProperty(k) && i !== k) {
+                        // it doesn't matter if the lexer recognized that the inner macro(s)
+                        // were sitting inside a `[...]` set or not: the fact that they are used
+                        // here in macro `i` which itself sits in a set, makes them *all* live in
+                        // a set so all of them get the same treatment: set expansion style.
+                        //
+                        // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}`
+                        // macros here:
+                        if (XRegExp._getUnicodeProperty(k)) {
+                            // Work-around so that you can use `\p{ascii}` for an XRegExp slug, a.k.a.
+                            // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories,
+                            // while using `\p{ASCII}` as a *macro expansion* of the `ASCII`
+                            // macro:
+                            if (k.toUpperCase() !== k) {
+                                m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.');
+                                break;
+                            }
+                        }
+
+                        a = m.split('{' + k + '}');
+                        if (a.length > 1) {
+                            var x = expandMacroInSet(k);
+                            assert(x);
+                            if (x instanceof Error) {
+                                m = x;
+                                break;
+                            }
+                            m = a.join(x);
+                        }
+                    }
+                }
+            }
+
+            var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts);
+
+            var s1;
+
+            // propagate deferred exceptions = error reports.
+            if (mba instanceof Error) {
+                s1 = mba;
+            } else {
+                s1 = setmgmt.bitarray2set(mba, false);
+
+                m = s1;
+            }
+
+            macros[i] = {
+                in_set: s1,
+                elsewhere: null,
+                raw: dict_macros[i]
+            };
+        } else {
+            m = macros[i].in_set;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return new Error(m.message);
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
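+    // Illustrative sketch (hypothetical macros): a looping definition such as
+    //
+    //     { A: '{B}', B: '{A}' }
+    //
+    // is caught by the in-progress markers set up above: expanding A registers
+    // in_set: false (resp. elsewhere: false) before recursing, so when the
+    // recursion comes back around to A, the 'm === false' checks (above and
+    // below) report the cycle instead of recursing forever.
+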
+    function expandMacroElsewhere(i) {
+        var k, a, m;
+
+        if (macros[i].elsewhere == null) {
+            m = dict_macros[i];
+
+            // set up our own record so we can detect definition loops:
+            macros[i].elsewhere = false;
+
+            // the macro MAY contain other macros which MAY be inside a `[...]` set in this
+            // macro or elsewhere, hence we must parse the regex:
+            m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+            // propagate deferred exceptions = error reports.
+            if (m instanceof Error) {
+                return m;
+            }
+
+            macros[i].elsewhere = m;
+        } else {
+            m = macros[i].elsewhere;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return m;
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandAllMacrosInSet(s) {
+        var i, x;
+
+        // process *all* the macros inside [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroInSet(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message);
+                        }
+                        s = a.join(x);
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, x;
+
+        // When we process the remaining macro occurrences in the regex,
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    // These are all submacro expansions, hence non-capturing grouping is applied:
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroElsewhere(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message);
+                        }
+                        s = a.join('(?:' + x + ')');
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+
+    var m, i;
+
+    if (opts.debug) console.log('\n############## RAW macros: ', dict_macros);
+
+    // First we create the part of the dictionary which targets the use of macros
+    // *inside* `[...]` sets; once we have completed that half of the expansion work,
+    // we go and expand the macros for when they are used elsewhere in a regex:
+    // if we then encounter submacros which are used *inside* a set, we can use that
+    // first-half dictionary to speed things up a bit, as we can use those expansions
+    // straight away!
+    for (i in dict_macros) {
+        if (dict_macros.hasOwnProperty(i)) {
+            expandMacroInSet(i);
+        }
+    }
+
+    for (i in dict_macros) {
+        if (dict_macros.hasOwnProperty(i)) {
+            expandMacroElsewhere(i);
+        }
+    }
+
+    if (opts.debug) console.log('\n############### expanded macros: ', macros);
+
+    return macros;
+}
+
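+// Illustrative sketch (hypothetical macros; the cache shape follows the code above):
+//
+//     prepareMacros({ ALPHA: '[a-z]', WORD: '{ALPHA}+' }, opts)
+//
+// yields, roughly:
+//
+//     {
+//         ALPHA: { in_set: 'a-z',  elsewhere: '[a-z]',      raw: '[a-z]' },
+//         WORD:  { in_set: '+a-z', elsewhere: '(?:[a-z])+', raw: '{ALPHA}+' }
+//     }
+//
+// Note how WORD's in_set expansion treats the '+' as a literal set member
+// (the regex is *reinterpreted* as a set there), while the elsewhere expansion
+// wraps the ALPHA submacro in a non-capturing '(?:...)' group.
+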
+
+// expand macros in a regex; expands them recursively
+function expandMacros(src, macros, opts) {
+    var expansion_count = 0;
+
+    // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already!
+    // Hence things should be easy in here:
+
+    function expandAllMacrosInSet(s) {
+        var i, m, x;
+
+        // process *all* the macros inside [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    m = macros[i];
+
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = m.in_set;
+
+                        assert(x);
+                        if (x instanceof Error) {
+                            // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                            throw x;
+                        }
+
+                        // detect definition loop:
+                        if (x === false) {
+                            return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                        }
+
+                        s = a.join(x);
+                        expansion_count++;
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, m, x;
+
+        // When we process the main macro occurrences in the regex,
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    m = macros[i];
+
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        // These are all main macro expansions, hence CAPTURING grouping is applied:
+                        x = m.elsewhere;
+                        assert(x);
+
+                        // detect definition loop:
+                        if (x === false) {
+                            return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+                        }
+
+                        s = a.join('(' + x + ')');
+                        expansion_count++;
+                    }
+
+                    // stop the brute-force expansion attempt when we've done 'em all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+
+    // When we process the macro occurrences in the regex,
+    // every macro used in a lexer rule will become its own capture group.
+    //
+    // Meanwhile the cached expansion will have expanded any submacros into
+    // *NON*-capturing groups so that the backreference indexes remain as you'd
+    // expect and using macros doesn't require you to know exactly what your
+    // used macro will expand into, i.e. which and how many submacros it has.
+    //
+    // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+    var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+    // propagate deferred exceptions = error reports.
+    if (s2 instanceof Error) {
+        throw s2;
+    }
+
+    // Only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex(),
+    // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on their own,
+    // as long as no macros are involved...
+    //
+    // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`,
+    // unless the `xregexp` output option has been enabled.
+    if (expansion_count > 0 || (src.indexOf('\\p{') >= 0 && !opts.options.xregexp)) {
+        src = s2;
+    } else {
+        // Check if the reduced regex is smaller in size; when it is, we still go with the new one!
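+        // Illustrative sketch (hypothetical macro): with { DIGIT: '[0-9]' }, a rule
+        // regex '{DIGIT}+' expands above to '([0-9])+' -- every macro used directly
+        // in a rule becomes its own capture group, while any submacros inside the
+        // cached expansions were already wrapped as non-capturing '(?:...)' groups.
+        // The size comparison below then picks the shorter equivalent form: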
+ if (s2.length < src.length) { + src = s2; + } + } + + return src; +} + +function prepareStartConditions(conditions) { + var sc, + hash = {}; + for (sc in conditions) { + if (conditions.hasOwnProperty(sc)) { + hash[sc] = {rules:[], inclusive: !conditions[sc]}; + } + } + return hash; +} + +function buildActions(dict, tokens, opts) { + var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;']; + var tok; + var toks = {}; + var caseHelper = []; + + // tokens: map/array of token numbers to token names + for (tok in tokens) { + var idx = parseInt(tok); + if (idx && idx > 0) { + toks[tokens[tok]] = idx; + } + } + + if (opts.options.flex && dict.rules) { + dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule! */']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: `function lexer__performAction(yy, yyrulenumber, YY_START) { + var yy_ = this; + + ${fun} + }`, + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count, + }; +} + +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + +var prelude = `/** + * See also: + * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508 + * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility + * with userland code which might access the derived class in a 'classic' way. 
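+ *
+ * Usage sketch (hypothetical userland code):
+ *
+ *     try {
+ *         lexer.lex();
+ *     } catch (ex) {
+ *         if (ex instanceof JisonLexerError) {
+ *             console.error(ex.message, ex.hash);
+ *         }
+ *     }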
+ * + * @public + * @constructor + * @nocollapse + */ +function JisonLexerError(msg, hash) { + Object.defineProperty(this, 'name', { + enumerable: false, + writable: false, + value: 'JisonLexerError' + }); + + if (msg == null) msg = '???'; + + Object.defineProperty(this, 'message', { + enumerable: false, + writable: true, + value: msg + }); + + this.hash = hash; + + var stacktrace; + if (hash && hash.exception instanceof Error) { + var ex2 = hash.exception; + this.message = ex2.message || msg; + stacktrace = ex2.stack; + } + if (!stacktrace) { + if (Error.hasOwnProperty('captureStackTrace')) { // V8 + Error.captureStackTrace(this, this.constructor); + } else { + stacktrace = (new Error(msg)).stack; + } + } + if (stacktrace) { + Object.defineProperty(this, 'stack', { + enumerable: false, + writable: false, + value: stacktrace + }); + } +} + +if (typeof Object.setPrototypeOf === 'function') { + Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype); +} else { + JisonLexerError.prototype = Object.create(Error.prototype); +} +JisonLexerError.prototype.constructor = JisonLexerError; +JisonLexerError.prototype.name = 'JisonLexerError';`; + + // --- END lexer error class --- + + return prelude; +} + + +const jisonLexerErrorDefinition = generateErrorClass(); + + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS` + var __hacky_counter__ = 0; + + /** + * @constructor + * @nocollapse + */ + function XRegExp(re, f) { + this.re = re; + this.flags = f; + this._getUnicodeProperty = function (k) {}; + var fake = /./; // WARNING: this exact 'fake' is also depended upon by the xregexp unit test! + __hacky_counter__++; + fake.__hacky_backy__ = __hacky_counter__; + return fake; + } + `; +} + + + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. + // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... 
}; + // ``` + var testcode = [ + '// provide a local version for test purposes:', + jisonLexerErrorDefinition, + '', + generateFakeXRegExpClassSrcCode(), + '', + source, + '', + 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (typeof lexer.options !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, ((dict.rules && dict.rules.length > 0) ? + 'One or more of your lexer state names are possibly botched?' 
: + 'Your custom lexer is somehow botched.'), ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = (dict.rules ? dict.rules.length : 0); i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; +} + +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) +// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. 
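+//
+// (The stripping decisions behind these numbers are driven by the analysis
+// flags -- lexerActionsUseYYLENG, lexerActionsUseMore, etc. -- which also get
+// stamped into the generated JISON-LEX-ANALYTICS-REPORT comment block; see
+// stripUnusedLexerCode() further below.)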
+// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- +return `{ + EOF: 1, + ERROR: 2, + + // JisonLexerError: JisonLexerError, /// <-- injected by the code generator + + // options: {}, /// <-- injected by the code generator + + // yy: ..., /// <-- injected by setInput() + + __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state + + __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup + + __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been 'unfolded' completely and is now ready for use + + done: false, /// INTERNAL USE ONLY + _backtrack: false, /// INTERNAL USE ONLY + _input: '', /// INTERNAL USE ONLY + _more: false, /// INTERNAL USE ONLY + _signaled_error_token: false, /// INTERNAL USE ONLY + + conditionStack: [], /// INTERNAL USE ONLY; managed via \`pushState()\`, \`popState()\`, \`topState()\` and \`stateStackSize()\` + + match: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. \`match\` is identical to \`yytext\` except that this one still contains the matched input string after \`lexer.performAction()\` has been invoked, where userland code MAY have changed/replaced the \`yytext\` value entirely! + matched: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far + matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt + yytext: '', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the 'token value' when the parser consumes the lexer token produced through a call to the \`lex()\` API. + offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the 'cursor position' in the input string, i.e. the number of characters matched so far + yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (\`yytext\`) + yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: 'line number' at which the token under construction is located + yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction + + /** + * INTERNAL USE: construct a suitable error info hash object instance for \`parseError\`. 
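+     *
+     * The produced hash carries { errStr, recoverable, text, token, line, loc,
+     * yy, lexer, destroy() }; invoke destroy() -- or rely on cleanupAfterLex() --
+     * once userland code no longer needs the error info instance.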
+ * + * @public + * @this {RegExpLexer} + */ + constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) { + msg = '' + msg; + + // heuristic to determine if the error message already contains a (partial) source code dump + // as produced by either \`showPosition()\` or \`prettyPrintRange()\`: + if (show_input_position == undefined) { + show_input_position = !(msg.indexOf('\\n') > 0 && msg.indexOf('^') > 0); + } + if (this.yylloc && show_input_position) { + if (typeof this.prettyPrintRange === 'function') { + var pretty_src = this.prettyPrintRange(this.yylloc); + + if (!/\\n\\s*$/.test(msg)) { + msg += '\\n'; + } + msg += '\\n Erroneous area:\\n' + this.prettyPrintRange(this.yylloc); + } else if (typeof this.showPosition === 'function') { + var pos_str = this.showPosition(); + if (pos_str) { + if (msg.length && msg[msg.length - 1] !== '\\n' && pos_str[0] !== '\\n') { + msg += '\\n' + pos_str; + } else { + msg += pos_str; + } + } + } + } + /** @constructor */ + var pei = { + errStr: msg, + recoverable: !!recoverable, + text: this.match, // This one MAY be empty; userland code should use the \`upcomingInput\` API to obtain more text which follows the 'lexer cursor position'... + token: null, + line: this.yylineno, + loc: this.yylloc, + yy: this.yy, + lexer: this, + + /** + * and make sure the error info doesn't stay due to potential + * ref cycle via userland code manipulations. + * These would otherwise all be memory leak opportunities! + * + * Note that only array and object references are nuked as those + * constitute the set of elements which can produce a cyclic ref. + * The rest of the members is kept intact as they are harmless. + * + * @public + * @this {LexErrorInfo} + */ + destroy: function destructLexErrorInfo() { + // remove cyclic references added to error info: + // info.yy = null; + // info.lexer = null; + // ... + var rec = !!this.recoverable; + for (var key in this) { + if (this.hasOwnProperty(key) && typeof key === 'object') { + this[key] = undefined; + } + } + this.recoverable = rec; + } + }; + // track this instance so we can \`destroy()\` it once we deem it superfluous and ready for garbage collection! + this.__error_infos.push(pei); + return pei; + }, + + /** + * handler which is invoked when a lexer error occurs. + * + * @public + * @this {RegExpLexer} + */ + parseError: function lexer_parseError(str, hash, ExceptionClass) { + if (!ExceptionClass) { + ExceptionClass = this.JisonLexerError; + } + if (this.yy) { + if (this.yy.parser && typeof this.yy.parser.parseError === 'function') { + return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } else if (typeof this.yy.parseError === 'function') { + return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } + } + throw new ExceptionClass(str, hash); + }, + + /** + * method which implements \`yyerror(str, ...args)\` functionality for use inside lexer actions. 
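+     *
+     * Usage sketch (inside a lexer rule action; hypothetical message and extras):
+     *
+     *     yyerror('unterminated string constant', extra1, extra2);
+     *
+     * Any extra arguments are forwarded on the error hash as
+     * hash.extra_error_attributes = [extra1, extra2].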
+ * + * @public + * @this {RegExpLexer} + */ + yyerror: function yyError(str /*, ...args */) { + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': ' + str, this.options.lexerErrorsAreRecoverable); + + // Add any extra args to the hash under the name \`extra_error_attributes\`: + var args = Array.prototype.slice.call(arguments, 1); + if (args.length) { + p.extra_error_attributes = args; + } + + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + }, + + /** + * final cleanup function for when we have completed lexing the input; + * make it an API so that external code can use this one once userland + * code has decided it's time to destroy any lingering lexer error + * hash object instances and the like: this function helps to clean + * up these constructs, which *may* carry cyclic references which would + * otherwise prevent the instances from being properly and timely + * garbage-collected, i.e. this function helps prevent memory leaks! + * + * @public + * @this {RegExpLexer} + */ + cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) { + // prevent lingering circular references from causing memory leaks: + this.setInput('', {}); + + // nuke the error hash info instances created during this run. + // Userland code must COPY any data/references + // in the error hash instance(s) it is more permanently interested in. + if (!do_not_nuke_errorinfos) { + for (var i = this.__error_infos.length - 1; i >= 0; i--) { + var el = this.__error_infos[i]; + if (el && typeof el.destroy === 'function') { + el.destroy(); + } + } + this.__error_infos.length = 0; + } + + return this; + }, + + /** + * clear the lexer token context; intended for internal use only + * + * @public + * @this {RegExpLexer} + */ + clear: function lexer_clear() { + this.yytext = ''; + this.yyleng = 0; + this.match = ''; + // - DO NOT reset \`this.matched\` + this.matches = false; + this._more = false; + this._backtrack = false; + + var col = (this.yylloc ? this.yylloc.last_column : 0); + this.yylloc = { + first_line: this.yylineno + 1, + first_column: col, + last_line: this.yylineno + 1, + last_column: col, + + range: [this.offset, this.offset] + }; + }, + + /** + * resets the lexer, sets new input + * + * @public + * @this {RegExpLexer} + */ + setInput: function lexer_setInput(input, yy) { + this.yy = yy || this.yy || {}; + + // also check if we've fully initialized the lexer instance, + // including expansion work to be done to go from a loaded + // lexer to a usable lexer: + if (!this.__decompressed) { + // step 1: decompress the regex list: + var rules = this.rules; + for (var i = 0, len = rules.length; i < len; i++) { + var rule_re = rules[i]; + + // compression: is the RE an xref to another RE slot in the rules[] table? + if (typeof rule_re === 'number') { + rules[i] = rules[rule_re]; + } + } + + // step 2: unfold the conditions[] set to make these ready for use: + var conditions = this.conditions; + for (var k in conditions) { + var spec = conditions[k]; + + var rule_ids = spec.rules; + + var len = rule_ids.length; + var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in \`lexer_next()\` fast and simple! 
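+                    // (illustrative: a condition spec { rules: [0, 3, 7] } unfolds
+                    // into __rule_regexes[1..3] = rules[0], rules[3], rules[7] and
+                    // __rule_count = 3; any rule regexes stored as plain numbers
+                    // were already xref-resolved in step 1 above.)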
+ var rule_new_ids = new Array(len + 1); + + for (var i = 0; i < len; i++) { + var idx = rule_ids[i]; + var rule_re = rules[idx]; + rule_regexes[i + 1] = rule_re; + rule_new_ids[i + 1] = idx; + } + + spec.rules = rule_new_ids; + spec.__rule_regexes = rule_regexes; + spec.__rule_count = len; + } + + this.__decompressed = true; + } + + this._input = input || ''; + this.clear(); + this._signaled_error_token = false; + this.done = false; + this.yylineno = 0; + this.matched = ''; + this.conditionStack = ['INITIAL']; + this.__currentRuleSet__ = null; + this.yylloc = { + first_line: 1, + first_column: 0, + last_line: 1, + last_column: 0, + + range: [0, 0] + }; + this.offset = 0; + return this; + }, + + /** + * edit the remaining input via user-specified callback. + * This can be used to forward-adjust the input-to-parse, + * e.g. inserting macro expansions and alike in the + * input which has yet to be lexed. + * The behaviour of this API contrasts the \`unput()\` et al + * APIs as those act on the *consumed* input, while this + * one allows one to manipulate the future, without impacting + * the current \`yyloc\` cursor location or any history. + * + * Use this API to help implement C-preprocessor-like + * \`#include\` statements, etc. + * + * The provided callback must be synchronous and is + * expected to return the edited input (string). + * + * The \`cpsArg\` argument value is passed to the callback + * as-is. + * + * \`callback\` interface: + * \`function callback(input, cpsArg)\` + * + * - \`input\` will carry the remaining-input-to-lex string + * from the lexer. + * - \`cpsArg\` is \`cpsArg\` passed into this API. + * + * The \`this\` reference for the callback will be set to + * reference this lexer instance so that userland code + * in the callback can easily and quickly access any lexer + * API. + * + * When the callback returns a non-string-type falsey value, + * we assume the callback did not edit the input and we + * will using the input as-is. + * + * When the callback returns a non-string-type value, it + * is converted to a string for lexing via the \`"" + retval\` + * operation. (See also why: http://2ality.com/2012/03/converting-to-string.html + * -- that way any returned object's \`toValue()\` and \`toString()\` + * methods will be invoked in a proper/desirable order.) + * + * @public + * @this {RegExpLexer} + */ + editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) { + var rv = callback.call(this, this._input, cpsArg); + if (typeof rv !== 'string') { + if (rv) { + this._input = '' + rv; + } + // else: keep \`this._input\` as is. + } else { + this._input = rv; + } + return this; + }, + + /** + * consumes and returns one char from the input + * + * @public + * @this {RegExpLexer} + */ + input: function lexer_input() { + if (!this._input) { + //this.done = true; -- don't set \`done\` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <> tokens and perform user action code for a <> match, but only does so *once*) + return null; + } + var ch = this._input[0]; + this.yytext += ch; + this.yyleng++; + this.offset++; + this.match += ch; + this.matched += ch; + // Count the linenumber up when we hit the LF (or a stand-alone CR). + // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo + // and we advance immediately past the LF as well, returning both together as if + // it was all a single 'character' only. 
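+            // (illustrative: with remaining input '\\r\\nx' a single input() call
+            // returns the CR+LF pair as one two-character string, bumps yylineno
+            // exactly once, and only then advances past both characters.)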
+ var slice_len = 1; + var lines = false; + if (ch === '\\n') { + lines = true; + } else if (ch === '\\r') { + lines = true; + var ch2 = this._input[1]; + if (ch2 === '\\n') { + slice_len++; + ch += ch2; + this.yytext += ch2; + this.yyleng++; + this.offset++; + this.match += ch2; + this.matched += ch2; + this.yylloc.range[1]++; + } + } + if (lines) { + this.yylineno++; + this.yylloc.last_line++; + this.yylloc.last_column = 0; + } else { + this.yylloc.last_column++; + } + this.yylloc.range[1]++; + + this._input = this._input.slice(slice_len); + return ch; + }, + + /** + * unshifts one char (or an entire string) into the input + * + * @public + * @this {RegExpLexer} + */ + unput: function lexer_unput(ch) { + var len = ch.length; + var lines = ch.split(/(?:\\r\\n?|\\n)/g); + + this._input = ch + this._input; + this.yytext = this.yytext.substr(0, this.yytext.length - len); + this.yyleng = this.yytext.length; + this.offset -= len; + this.match = this.match.substr(0, this.match.length - len); + this.matched = this.matched.substr(0, this.matched.length - len); + + if (lines.length > 1) { + this.yylineno -= lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + + // Get last entirely matched line into the \`pre_lines[]\` array's + // last index slot; we don't mind when other previously + // matched lines end up in the array too. + var pre = this.match; + var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + if (pre_lines.length === 1) { + pre = this.matched; + pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + } + this.yylloc.last_column = pre_lines[pre_lines.length - 1].length; + } else { + this.yylloc.last_column -= len; + } + + this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng; + + this.done = false; + return this; + }, + + /** + * cache matched text and append it on next action + * + * @public + * @this {RegExpLexer} + */ + more: function lexer_more() { + this._more = true; + return this; + }, + + /** + * signal the lexer that this rule fails to match the input, so the + * next matching rule (regex) should be tested instead. + * + * @public + * @this {RegExpLexer} + */ + reject: function lexer_reject() { + if (this.options.backtrack_lexer) { + this._backtrack = true; + } else { + // when the \`parseError()\` call returns, we MUST ensure that the error is registered. + // We accomplish this by signaling an 'error' token to be produced for the current + // \`.lex()\` run. + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).', false); + this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + return this; + }, + + /** + * retain first n characters of the match + * + * @public + * @this {RegExpLexer} + */ + less: function lexer_less(n) { + return this.unput(this.match.slice(n)); + }, + + /** + * return (part of the) already matched input, i.e. for error + * messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of + * input (default: 1). + * + * Negative limit values equal *unlimited*. 
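+     *
+     * Example (hypothetical): when the consumed input ends in 'alpha beta',
+     *
+     *     this.pastInput(4, 1);
+     *
+     * produces '...beta': clipped to 4 characters plus the '...' prefix.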
+ * + * @public + * @this {RegExpLexer} + */ + pastInput: function lexer_pastInput(maxSize, maxLines) { + var past = this.matched.substring(0, this.matched.length - this.match.length); + if (maxSize < 0) + maxSize = past.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = past.length; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substr\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + past = past.substr(-maxSize * 2 - 2); + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = past.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(-maxLines); + past = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis prefix... + if (past.length > maxSize) { + past = '...' + past.substr(-maxSize); + } + return past; + }, + + /** + * return (part of the) upcoming input, i.e. for error messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of input (default: 1). + * + * Negative limit values equal *unlimited*. + * + * > ### NOTE ### + * > + * > *"upcoming input"* is defined as the whole of the both + * > the *currently lexed* input, together with any remaining input + * > following that. *"currently lexed"* input is the input + * > already recognized by the lexer but not yet returned with + * > the lexer token. This happens when you are invoking this API + * > from inside any lexer rule action code block. + * > + * + * @public + * @this {RegExpLexer} + */ + upcomingInput: function lexer_upcomingInput(maxSize, maxLines) { + var next = this.match; + if (maxSize < 0) + maxSize = next.length + this._input.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = maxSize; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substring\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + if (next.length < maxSize * 2 + 2) { + next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8 + } + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = next.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(0, maxLines); + next = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis postfix... + if (next.length > maxSize) { + next = next.substring(0, maxSize) + '...'; + } + return next; + }, + + /** + * return a string which displays the character position where the + * lexing error occurred, i.e. 
for error messages + * + * @public + * @this {RegExpLexer} + */ + showPosition: function lexer_showPosition(maxPrefix, maxPostfix) { + var pre = this.pastInput(maxPrefix).replace(/\\s/g, ' '); + var c = new Array(pre.length + 1).join('-'); + return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, ' ') + '\\n' + c + '^'; + }, + + /** + * return a string which displays the lines & columns of input which are referenced + * by the given location info range, plus a few lines of context. + * + * This function pretty-prints the indicated section of the input, with line numbers + * and everything! + * + * This function is very useful to provide highly readable error reports, while + * the location range may be specified in various flexible ways: + * + * - \`loc\` is the location info object which references the area which should be + * displayed and 'marked up': these lines & columns of text are marked up by \`^\` + * characters below each character in the entire input range. + * + * - \`context_loc\` is the *optional* location info object which instructs this + * pretty-printer how much *leading* context should be displayed alongside + * the area referenced by \`loc\`. This can help provide context for the displayed + * error, etc. + * + * When this location info is not provided, a default context of 3 lines is + * used. + * + * - \`context_loc2\` is another *optional* location info object, which serves + * a similar purpose to \`context_loc\`: it specifies the amount of *trailing* + * context lines to display in the pretty-print output. + * + * When this location info is not provided, a default context of 1 line only is + * used. + * + * Special Notes: + * + * - when the \`loc\`-indicated range is very large (about 5 lines or more), then + * only the first and last few lines of this block are printed while a + * \`...continued...\` message will be printed between them. + * + * This serves the purpose of not printing a huge amount of text when the \`loc\` + * range happens to be huge: this way a manageable & readable output results + * for arbitrary large ranges. + * + * - this function can display lines of input which whave not yet been lexed. + * \`prettyPrintRange()\` can access the entire input! + * + * @public + * @this {RegExpLexer} + */ + prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) { + var error_size = loc.last_line - loc.first_line; + const CONTEXT = 3; + const CONTEXT_TAIL = 1; + const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2; + var input = this.matched + this._input; + var lines = input.split('\\n'); + //var show_context = (error_size < 5 || context_loc); + var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT)); + var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL)); + var lineno_display_width = (1 + Math.log10(l1 | 1) | 0); + var ws_prefix = new Array(lineno_display_width).join(' '); + var nonempty_line_indexes = []; + var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) { + var lno = index + l0; + var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width); + var rv = lno_pfx + ': ' + line; + var errpfx = (new Array(lineno_display_width + 1)).join('^'); + var offset = 2 + 1; + var len = 0; + + if (lno === loc.first_line) { + offset += loc.first_column; + + len = Math.max( + 2, + ((lno === loc.last_line ? 
loc.last_column : line.length)) - loc.first_column + 1 + ); + } else if (lno === loc.last_line) { + len = Math.max(2, loc.last_column + 1); + } else if (lno > loc.first_line && lno < loc.last_line) { + len = Math.max(2, line.length + 1); + } + + if (len) { + var lead = new Array(offset).join('.'); + var mark = new Array(len).join('^'); + rv += '\\n' + errpfx + lead + mark; + + if (line.trim().length > 0) { + nonempty_line_indexes.push(index); + } + } + + rv = rv.replace(/\\t/g, ' '); + return rv; + }); + + // now make sure we don't print an overly large amount of error area: limit it + // to the top and bottom line count: + if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) { + var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1; + var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1; + + var intermediate_line = (new Array(lineno_display_width + 1)).join(' ') + ' (...continued...)'; + intermediate_line += '\\n' + (new Array(lineno_display_width + 1)).join('-') + ' (---------------)'; + rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line); + } + return rv.join('\\n'); + }, + + /** + * helper function, used to produce a human readable description as a string, given + * the input \`yylloc\` location object. + * + * Set \`display_range_too\` to TRUE to include the string character index position(s) + * in the description if the \`yylloc.range\` is available. + * + * @public + * @this {RegExpLexer} + */ + describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) { + var l1 = yylloc.first_line; + var l2 = yylloc.last_line; + var c1 = yylloc.first_column; + var c2 = yylloc.last_column; + var dl = l2 - l1; + var dc = c2 - c1; + var rv; + if (dl === 0) { + rv = 'line ' + l1 + ', '; + if (dc <= 1) { + rv += 'column ' + c1; + } else { + rv += 'columns ' + c1 + ' .. ' + c2; + } + } else { + rv = 'lines ' + l1 + '(column ' + c1 + ') .. ' + l2 + '(column ' + c2 + ')'; + } + if (yylloc.range && display_range_too) { + var r1 = yylloc.range[0]; + var r2 = yylloc.range[1] - 1; + if (r2 <= r1) { + rv += ' {String Offset: ' + r1 + '}'; + } else { + rv += ' {String Offset range: ' + r1 + ' .. ' + r2 + '}'; + } + } + return rv; + }, + + /** + * test the lexed token: return FALSE when not a match, otherwise return token. + * + * \`match\` is supposed to be an array coming out of a regex match, i.e. \`match[0]\` + * contains the actually matched text string. 
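+     *
+     * When options.backtrack_lexer is enabled, the complete pre-match state
+     * (yylloc, yytext, offset, condition stack, ...) is snapshotted first, so
+     * a reject() call in the rule action can roll all of it back.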
+ * + * Also move the input cursor forward and update the match collectors: + * + * - \`yytext\` + * - \`yyleng\` + * - \`match\` + * - \`matches\` + * - \`yylloc\` + * - \`offset\` + * + * @public + * @this {RegExpLexer} + */ + test_match: function lexer_test_match(match, indexed_rule) { + var token, + lines, + backup, + match_str, + match_str_len; + + if (this.options.backtrack_lexer) { + // save context + backup = { + yylineno: this.yylineno, + yylloc: { + first_line: this.yylloc.first_line, + last_line: this.yylloc.last_line, + first_column: this.yylloc.first_column, + last_column: this.yylloc.last_column, + + range: this.yylloc.range.slice(0) + }, + yytext: this.yytext, + match: this.match, + matches: this.matches, + matched: this.matched, + yyleng: this.yyleng, + offset: this.offset, + _more: this._more, + _input: this._input, + //_signaled_error_token: this._signaled_error_token, + yy: this.yy, + conditionStack: this.conditionStack.slice(0), + done: this.done + }; + } + + match_str = match[0]; + match_str_len = match_str.length; + // if (match_str.indexOf('\\n') !== -1 || match_str.indexOf('\\r') !== -1) { + lines = match_str.split(/(?:\\r\\n?|\\n)/g); + if (lines.length > 1) { + this.yylineno += lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + this.yylloc.last_column = lines[lines.length - 1].length; + } else { + this.yylloc.last_column += match_str_len; + } + // } + this.yytext += match_str; + this.match += match_str; + this.matched += match_str; + this.matches = match; + this.yyleng = this.yytext.length; + this.yylloc.range[1] += match_str_len; + + // previous lex rules MAY have invoked the \`more()\` API rather than producing a token: + // those rules will already have moved this \`offset\` forward matching their match lengths, + // hence we must only add our own match length now: + this.offset += match_str_len; + this._more = false; + this._backtrack = false; + this._input = this._input.slice(match_str_len); + + // calling this method: + // + // function lexer__performAction(yy, yyrulenumber, YY_START) {...} + token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */); + // otherwise, when the action codes are all simple return token statements: + //token = this.simpleCaseActionClusters[indexed_rule]; + + if (this.done && this._input) { + this.done = false; + } + if (token) { + return token; + } else if (this._backtrack) { + // recover context + for (var k in backup) { + this[k] = backup[k]; + } + this.__currentRuleSet__ = null; + return false; // rule action called reject() implying the next rule should be tested instead. + } else if (this._signaled_error_token) { + // produce one 'error' token as \`.parseError()\` in \`reject()\` + // did not guarantee a failure signal by throwing an exception! + token = this._signaled_error_token; + this._signaled_error_token = false; + return token; + } + return false; + }, + + /** + * return next match in input + * + * @public + * @this {RegExpLexer} + */ + next: function lexer_next() { + if (this.done) { + this.clear(); + return this.EOF; + } + if (!this._input) { + this.done = true; + } + + var token, + match, + tempMatch, + index; + if (!this._more) { + this.clear(); + } + var spec = this.__currentRuleSet__; + if (!spec) { + // Update the ruleset cache as we apparently encountered a state change or just started lexing. 
+ // The cache is set up for fast lookup -- we assume a lexer will switch states much less often than it will + // invoke the \`lex()\` token-producing API and related APIs, hence caching the set for direct access helps + // speed up those activities a tiny bit. + spec = this.__currentRuleSet__ = this._currentRules(); + // Check whether a *sane* condition has been pushed before: this makes the lexer robust against + // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19 + if (!spec || !spec.rules) { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Internal lexer engine error' + lineno_msg + ': The lex grammar programmer pushed a non-existing condition name "' + this.topState() + '"; this is a fatal error and should be reported to the application programmer team!', false); + // produce one 'error' token until this situation has been resolved, most probably by parse termination! + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + } + + var rule_ids = spec.rules; + var regexes = spec.__rule_regexes; + var len = spec.__rule_count; + + // Note: the arrays are 1-based, while \`len\` itself is a valid index, + // hence the non-standard less-or-equal check in the next loop condition! + for (var i = 1; i <= len; i++) { + tempMatch = this._input.match(regexes[i]); + if (tempMatch && (!match || tempMatch[0].length > match[0].length)) { + match = tempMatch; + index = i; + if (this.options.backtrack_lexer) { + token = this.test_match(tempMatch, rule_ids[i]); + if (token !== false) { + return token; + } else if (this._backtrack) { + match = undefined; + continue; // rule action called reject() implying a rule MISmatch. + } else { + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + } else if (!this.options.flex) { + break; + } + } + } + if (match) { + token = this.test_match(match, rule_ids[index]); + if (token !== false) { + return token; + } + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + if (!this._input) { + this.done = true; + this.clear(); + return this.EOF; + } else { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': Unrecognized text.', this.options.lexerErrorsAreRecoverable); + + var pendingInput = this._input; + var activeCondition = this.topState(); + var conditionStackDepth = this.conditionStack.length; + + token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + if (token === this.ERROR) { + // we can try to recover from a lexer error that \`parseError()\` did not 'recover' for us + // by moving forward at least one character at a time IFF the (user-specified?) \`parseError()\` + // has not consumed/modified any pending input or changed state in the error handler: + if (!this.matches && + // and make sure the input has been modified/consumed ... + pendingInput === this._input && + // ...or the lexer state has been modified significantly enough + // to merit a non-consuming error handling action right now. 
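+                    // (illustrative: when a userland parseError() merely logs the
+                    // problem and yields this.ERROR, this check guarantees forward
+                    // progress -- one character is consumed per ERROR token -- so
+                    // the lexer cannot loop forever on the same unmatchable spot.)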
+ activeCondition === this.topState() && + conditionStackDepth === this.conditionStack.length + ) { + this.input(); + } + } + return token; + } + }, + + /** + * return next match that has a token + * + * @public + * @this {RegExpLexer} + */ + lex: function lexer_lex() { + var r; + // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer: + if (typeof this.options.pre_lex === 'function') { + r = this.options.pre_lex.call(this); + } + + while (!r) { + r = this.next(); + } + + if (typeof this.options.post_lex === 'function') { + // (also account for a userdef function which does not return any value: keep the token as is) + r = this.options.post_lex.call(this, r) || r; + } + return r; + }, + + /** + * backwards compatible alias for \`pushState()\`; + * the latter is symmetrical with \`popState()\` and we advise to use + * those APIs in any modern lexer code, rather than \`begin()\`. + * + * @public + * @this {RegExpLexer} + */ + begin: function lexer_begin(condition) { + return this.pushState(condition); + }, + + /** + * activates a new lexer condition state (pushes the new lexer + * condition state onto the condition stack) + * + * @public + * @this {RegExpLexer} + */ + pushState: function lexer_pushState(condition) { + this.conditionStack.push(condition); + this.__currentRuleSet__ = null; + return this; + }, + + /** + * pop the previously active lexer condition state off the condition + * stack + * + * @public + * @this {RegExpLexer} + */ + popState: function lexer_popState() { + var n = this.conditionStack.length - 1; + if (n > 0) { + this.__currentRuleSet__ = null; + return this.conditionStack.pop(); + } else { + return this.conditionStack[0]; + } + }, + + /** + * return the currently active lexer condition state; when an index + * argument is provided it produces the N-th previous condition state, + * if available + * + * @public + * @this {RegExpLexer} + */ + topState: function lexer_topState(n) { + n = this.conditionStack.length - 1 - Math.abs(n || 0); + if (n >= 0) { + return this.conditionStack[n]; + } else { + return 'INITIAL'; + } + }, + + /** + * (internal) determine the lexer rule set which is active for the + * currently active lexer condition state + * + * @public + * @this {RegExpLexer} + */ + _currentRules: function lexer__currentRules() { + if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) { + return this.conditions[this.conditionStack[this.conditionStack.length - 1]]; + } else { + return this.conditions['INITIAL']; + } + }, + + /** + * return the number of states currently on the stack + * + * @public + * @this {RegExpLexer} + */ + stateStackSize: function lexer_stateStackSize() { + return this.conditionStack.length; + } +}`; + // --- END lexer kernel --- +} + +RegExpLexer.prototype = (new Function(rmCommonWS` + return ${getRegExpLexerPrototype()}; +`))(); + + +// The lexer code stripper, driven by optimization analysis settings and +// lexer options, which cannot be changed at run-time. +function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. 
${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + +new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS` + // Code Generator Information Report + // --------------------------------- + // + // Options: + // + // backtracking: .................... ${opt.options.backtrack_lexer} + // location.ranges: ................. ${opt.options.ranges} + // location line+column tracking: ... ${opt.options.trackPosition} + // + // + // Forwarded Parser Analysis flags: + // + // uses yyleng: ..................... ${opt.parseActionsUseYYLENG} + // uses yylineno: ................... ${opt.parseActionsUseYYLINENO} + // uses yytext: ..................... ${opt.parseActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.parseActionsUseYYLOC} + // uses lexer values: ............... ${opt.parseActionsUseValueTracking} / ${opt.parseActionsUseValueAssignment} + // location tracking: ............... ${opt.parseActionsUseLocationTracking} + // location assignment: ............. ${opt.parseActionsUseLocationAssignment} + // + // + // Lexer Analysis flags: + // + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses yyerror: .................... ${opt.lexerActionsUseYYERROR} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + // + // --------- END OF REPORT ----------- + + `); + + return new_src; +} + + + + + +// generate lexer source from a grammar +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
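+        // (illustrative: a jison parser build might pass { parseActionsUseYYLENG: false },
+        // which is copied below; the lexerActionsUse* slots start out as '???'
+        // placeholders until the lexer-side analysis fills in a real verdict.)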
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???', + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
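+ // Note that option names may still carry their raw grammar spelling at this
+ // point; e.g. a
+ //
+ //     %options case-insensitive
+ //
+ // declaration would surface in `dict.options` as `{ 'case-insensitive': true }`.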
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = (dict.actionInclude || ''); + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; +} + +// Assemble the final source from the processed grammar +/** @public */ +function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; +} + +function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = (1 + Math.log10(a.length | 1) | 0); + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return `/* ${idx_str}: */ ${re}`; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return `/* ${idx_str}: */ new XRegExp("${re_src}", "${re.xregexp.flags}")`; + } else { + return `/* ${idx_str}: */ ${re}`; + } + }); + return b.join(',\n'); +} + +function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1, + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(` "(${ID_REGEX_BASE})": `, 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
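+ //
+ // The source text assembled below reads roughly like this (sketch):
+ //
+ //     var lexer = {
+ //         /* analysis report comment */
+ //         ...kernel methods taken from RegExpLexer.prototype...,
+ //         options: { ... },
+ //         JisonLexerError: JisonLexerError,
+ //         performAction: function lexer__performAction(yy, yyrulenumber, YY_START) { ... },
+ //         simpleCaseActionClusters: { ... },
+ //         rules: [ ... ],
+ //         conditions: { ... }
+ //     };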
+ var code = [rmCommonWS` + var lexer = { + `, '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc + .replace(/^[\s\r\n]*\{/, '') + .replace(/\s*\}[\s\r\n]*$/, '') + .trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS`, + JisonLexerError: JisonLexerError, + performAction: ${performActionCode}, + simpleCaseActionClusters: ${simpleCaseActionClustersCode}, + rules: [ + ${rulesCode} + ], + conditions: ${conditionsCode} + }; + `); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; +} + +function generateGenericHeaderComment() { + var out = rmCommonWS` + /* lexer generated by jison-lex ${version} */ + + /* + * Returns a Lexer object of the following structure: + * + * Lexer: { + * yy: {} The so-called "shared state" or rather the *source* of it; + * the real "shared state" \`yy\` passed around to + * the rule actions, etc. is a direct reference! + * + * This "shared context" object was passed to the lexer by way of + * the \`lexer.setInput(str, yy)\` API before you may use it. + * + * This "shared context" object is passed to the lexer action code in \`performAction()\` + * so userland code in the lexer actions may communicate with the outside world + * and/or other lexer rules' actions in more or less complex ways. + * + * } + * + * Lexer.prototype: { + * EOF: 1, + * ERROR: 2, + * + * yy: The overall "shared context" object reference. + * + * JisonLexerError: function(msg, hash), + * + * performAction: function lexer__performAction(yy, yyrulenumber, YY_START), + * + * The function parameters and \`this\` have the following value/meaning: + * - \`this\` : reference to the \`lexer\` instance. + * \`yy_\` is an alias for \`this\` lexer instance reference used internally. + * + * - \`yy\` : a reference to the \`yy\` "shared state" object which was passed to the lexer + * by way of the \`lexer.setInput(str, yy)\` API before. 
+ *
+ * Note:
+ * The extra arguments you specified in the \`%parse-param\` statement in your
+ * **parser** grammar definition file are passed to the lexer via this object
+ * reference as member variables.
+ *
+ * - \`yyrulenumber\` : index of the matched lexer rule (regex), used internally.
+ *
+ * - \`YY_START\`: the current lexer "start condition" state.
+ *
+ * parseError: function(str, hash, ExceptionClass),
+ *
+ * constructLexErrorInfo: function(error_message, is_recoverable),
+ * Helper function.
+ * Produces a new errorInfo \'hash object\' which can be passed into \`parseError()\`.
+ * See its use in this lexer kernel in many places; example usage:
+ *
+ * var infoObj = lexer.constructParseErrorInfo(\'fail!\', true);
+ * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);
+ *
+ * options: { ... lexer %options ... },
+ *
+ * lex: function(),
+ * Produce one token of lexed input, which was passed in earlier via the \`lexer.setInput()\` API.
+ * You MAY use the additional \`args...\` parameters as per \`%parse-param\` spec of the **lexer** grammar:
+ * these extra \`args...\` are added verbatim to the \`yy\` object reference as member variables.
+ *
+ * WARNING:
+ * Lexer's additional \`args...\` parameters (via lexer's \`%parse-param\`) MAY conflict with
+ * any attributes already added to \`yy\` by the **parser** or the jison run-time;
+ * when such a collision is detected an exception is thrown to prevent the generated run-time
+ * from silently accepting this confusing and potentially hazardous situation!
+ *
+ * cleanupAfterLex: function(do_not_nuke_errorinfos),
+ * Helper function.
+ *
+ * This helper API is invoked when the **parse process** has completed: it is the responsibility
+ * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired.
+ *
+ * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.
+ * + * setInput: function(input, [yy]), + * + * + * input: function(), + * + * + * unput: function(str), + * + * + * more: function(), + * + * + * reject: function(), + * + * + * less: function(n), + * + * + * pastInput: function(n), + * + * + * upcomingInput: function(n), + * + * + * showPosition: function(), + * + * + * test_match: function(regex_match_array, rule_index), + * + * + * next: function(), + * + * + * begin: function(condition), + * + * + * pushState: function(condition), + * + * + * popState: function(), + * + * + * topState: function(), + * + * + * _currentRules: function(), + * + * + * stateStackSize: function(), + * + * + * performAction: function(yy, yy_, yyrulenumber, YY_START), + * + * + * rules: [...], + * + * + * conditions: {associative list: name ==> set}, + * } + * + * + * token location info (\`yylloc\`): { + * first_line: n, + * last_line: n, + * first_column: n, + * last_column: n, + * range: [start_number, end_number] + * (where the numbers are indexes into the input string, zero-based) + * } + * + * --- + * + * The \`parseError\` function receives a \'hash\' object with these members for lexer errors: + * + * { + * text: (matched text) + * token: (the produced terminal token, if any) + * token_id: (the produced terminal token numeric ID, if any) + * line: (yylineno) + * loc: (yylloc) + * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule + * available for this particular error) + * yy: (object: the current parser internal "shared state" \`yy\` + * as is also available in the rule actions; this can be used, + * for instance, for advanced error analysis and reporting) + * lexer: (reference to the current lexer instance used by the parser) + * } + * + * while \`this\` will reference the current lexer instance. + * + * When \`parseError\` is invoked by the lexer, the default implementation will + * attempt to invoke \`yy.parser.parseError()\`; when this callback is not provided + * it will try to invoke \`yy.parseError()\` instead. When that callback is also not + * provided, a \`JisonLexerError\` exception will be thrown containing the error + * message and \`hash\`, as constructed by the \`constructLexErrorInfo()\` API. + * + * Note that the lexer\'s \`JisonLexerError\` error class is passed via the + * \`ExceptionClass\` argument, which is invoked to construct the exception + * instance to be thrown, so technically \`parseError\` will throw the object + * produced by the \`new ExceptionClass(str, hash)\` JavaScript expression. + * + * --- + * + * You can specify lexer options by setting / modifying the \`.options\` object of your Lexer instance. + * These options are available: + * + * (Options are permanent.) + * + * yy: { + * parseError: function(str, hash, ExceptionClass) + * optional: overrides the default \`parseError\` function. + * } + * + * lexer.options: { + * pre_lex: function() + * optional: is invoked before the lexer is invoked to produce another token. + * \`this\` refers to the Lexer object. + * post_lex: function(token) { return token; } + * optional: is invoked when the lexer has produced a token \`token\`; + * this function can override the returned token value by returning another. + * When it does not return any (truthy) value, the lexer will return + * the original \`token\`. + * \`this\` refers to the Lexer object. + * + * WARNING: the next set of options are not meant to be changed. They echo the abilities of + * the lexer as per when it was compiled! 
+ *
+ * ranges: boolean
+ * optional: \`true\` ==> token location info will include a .range[] member.
+ * flex: boolean
+ * optional: \`true\` ==> flex-like lexing behaviour where the rules are tested
+ * exhaustively to find the longest match.
+ * backtrack_lexer: boolean
+ * optional: \`true\` ==> lexer regexes are tested in order and for each matching
+ * regex the action code is invoked; the lexer terminates the scan when a token
+ * is returned by the action code.
+ * xregexp: boolean
+ * optional: \`true\` ==> lexer rule regexes are "extended regex format" requiring the
+ * \`XRegExp\` library. When this %option has not been specified at compile time, all lexer
+ * rule regexes have been written as standard JavaScript RegExp expressions.
+ * }
+ */
+ `;
+
+ return out;
+}
+
+function prepareOptions(opt) {
+ opt = opt || {};
+
+ // check for illegal identifier
+ if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) {
+ if (opt.moduleName) {
+ var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.';
+ if (typeof opt.warn_cb === 'function') {
+ opt.warn_cb(msg);
+ } else {
+ // do not treat as warning; barf hairball instead so that this oddity gets noticed right away!
+ throw new Error(msg);
+ }
+ }
+ opt.moduleName = 'lexer';
+ }
+
+ prepExportStructures(opt);
+
+ return opt;
+}
+
+function generateModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var ' + opt.moduleName + ' = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '})();'
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateAMDModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'define([], function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '});'
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateESModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var lexer = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '})();',
+ '',
+ 'function yylex() {',
+ ' return lexer.lex.apply(lexer, arguments);',
+ '}',
+ rmCommonWS`
+ export {
+ lexer,
+ yylex as lex
+ };
+ `
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateCommonJSModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var ' + opt.moduleName + ' = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ?
opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();', + '', + 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', + ' exports.lexer = ' + opt.moduleName + ';', + ' exports.lex = function () {', + ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', + ' };', + '}' + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +RegExpLexer.generate = generate; + +RegExpLexer.version = version; +RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; +RegExpLexer.mkStdOptions = mkStdOptions; +RegExpLexer.camelCase = camelCase; +RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + +export default RegExpLexer; diff --git a/dist/regexp-lexer-umd-es5.js b/dist/regexp-lexer-umd-es5.js new file mode 100644 index 0000000..9245659 --- /dev/null +++ b/dist/regexp-lexer-umd-es5.js @@ -0,0 +1,2549 @@ +'use strict'; + +var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) { return typeof obj; } : function (obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }; + +var _templateObject = _taggedTemplateLiteral(['\n var __hacky_counter__ = 0;\n\n /**\n * @constructor\n * @nocollapse\n */\n function XRegExp(re, f) {\n this.re = re;\n this.flags = f;\n this._getUnicodeProperty = function (k) {};\n var fake = /./; // WARNING: this exact \'fake\' is also depended upon by the xregexp unit test!\n __hacky_counter__++;\n fake.__hacky_backy__ = __hacky_counter__;\n return fake;\n }\n '], ['\n var __hacky_counter__ = 0;\n\n /**\n * @constructor\n * @nocollapse\n */\n function XRegExp(re, f) {\n this.re = re;\n this.flags = f;\n this._getUnicodeProperty = function (k) {};\n var fake = /./; // WARNING: this exact \'fake\' is also depended upon by the xregexp unit test!\n __hacky_counter__++;\n fake.__hacky_backy__ = __hacky_counter__;\n return fake;\n }\n ']), + _templateObject2 = _taggedTemplateLiteral(['\n return ', ';\n'], ['\n return ', ';\n']), + _templateObject3 = _taggedTemplateLiteral(['\n // Code Generator Information Report\n // ---------------------------------\n //\n // Options:\n //\n // backtracking: .................... ', '\n // location.ranges: ................. ', '\n // location line+column tracking: ... ', '\n //\n //\n // Forwarded Parser Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses lexer values: ............... ', ' / ', '\n // location tracking: ............... ', '\n // location assignment: ............. ', '\n //\n //\n // Lexer Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses ParseError API: ............. ', '\n // uses yyerror: .................... ', '\n // uses location tracking & editing: ', '\n // uses more() API: ................. ', '\n // uses unput() API: ................ ', '\n // uses reject() API: ............... ', '\n // uses less() API: ................. ', '\n // uses display APIs pastInput(), upcomingInput(), showPosition():\n // ............................. ', '\n // uses describeYYLLOC() API: ....... 
', '\n //\n // --------- END OF REPORT -----------\n\n '], ['\n // Code Generator Information Report\n // ---------------------------------\n //\n // Options:\n //\n // backtracking: .................... ', '\n // location.ranges: ................. ', '\n // location line+column tracking: ... ', '\n //\n //\n // Forwarded Parser Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses lexer values: ............... ', ' / ', '\n // location tracking: ............... ', '\n // location assignment: ............. ', '\n //\n //\n // Lexer Analysis flags:\n //\n // uses yyleng: ..................... ', '\n // uses yylineno: ................... ', '\n // uses yytext: ..................... ', '\n // uses yylloc: ..................... ', '\n // uses ParseError API: ............. ', '\n // uses yyerror: .................... ', '\n // uses location tracking & editing: ', '\n // uses more() API: ................. ', '\n // uses unput() API: ................ ', '\n // uses reject() API: ............... ', '\n // uses less() API: ................. ', '\n // uses display APIs pastInput(), upcomingInput(), showPosition():\n // ............................. ', '\n // uses describeYYLLOC() API: ....... ', '\n //\n // --------- END OF REPORT -----------\n\n ']), + _templateObject4 = _taggedTemplateLiteral(['\n var lexer = {\n '], ['\n var lexer = {\n ']), + _templateObject5 = _taggedTemplateLiteral([',\n JisonLexerError: JisonLexerError,\n performAction: ', ',\n simpleCaseActionClusters: ', ',\n rules: [\n ', '\n ],\n conditions: ', '\n };\n '], [',\n JisonLexerError: JisonLexerError,\n performAction: ', ',\n simpleCaseActionClusters: ', ',\n rules: [\n ', '\n ],\n conditions: ', '\n };\n ']), + _templateObject6 = _taggedTemplateLiteral(['\n /* lexer generated by jison-lex ', ' */\n\n /*\n * Returns a Lexer object of the following structure:\n *\n * Lexer: {\n * yy: {} The so-called "shared state" or rather the *source* of it;\n * the real "shared state" `yy` passed around to\n * the rule actions, etc. is a direct reference!\n *\n * This "shared context" object was passed to the lexer by way of \n * the `lexer.setInput(str, yy)` API before you may use it.\n *\n * This "shared context" object is passed to the lexer action code in `performAction()`\n * so userland code in the lexer actions may communicate with the outside world \n * and/or other lexer rules\' actions in more or less complex ways.\n *\n * }\n *\n * Lexer.prototype: {\n * EOF: 1,\n * ERROR: 2,\n *\n * yy: The overall "shared context" object reference.\n *\n * JisonLexerError: function(msg, hash),\n *\n * performAction: function lexer__performAction(yy, yyrulenumber, YY_START),\n *\n * The function parameters and `this` have the following value/meaning:\n * - `this` : reference to the `lexer` instance. 
\n * `yy_` is an alias for `this` lexer instance reference used internally.\n *\n * - `yy` : a reference to the `yy` "shared state" object which was passed to the lexer\n * by way of the `lexer.setInput(str, yy)` API before.\n *\n * Note:\n * The extra arguments you specified in the `%parse-param` statement in your\n * **parser** grammar definition file are passed to the lexer via this object\n * reference as member variables.\n *\n * - `yyrulenumber` : index of the matched lexer rule (regex), used internally.\n *\n * - `YY_START`: the current lexer "start condition" state.\n *\n * parseError: function(str, hash, ExceptionClass),\n *\n * constructLexErrorInfo: function(error_message, is_recoverable),\n * Helper function.\n * Produces a new errorInfo \'hash object\' which can be passed into `parseError()`.\n * See it\'s use in this lexer kernel in many places; example usage:\n *\n * var infoObj = lexer.constructParseErrorInfo(\'fail!\', true);\n * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);\n *\n * options: { ... lexer %options ... },\n *\n * lex: function(),\n * Produce one token of lexed input, which was passed in earlier via the `lexer.setInput()` API.\n * You MAY use the additional `args...` parameters as per `%parse-param` spec of the **lexer** grammar:\n * these extra `args...` are added verbatim to the `yy` object reference as member variables.\n *\n * WARNING:\n * Lexer\'s additional `args...` parameters (via lexer\'s `%parse-param`) MAY conflict with\n * any attributes already added to `yy` by the **parser** or the jison run-time; \n * when such a collision is detected an exception is thrown to prevent the generated run-time \n * from silently accepting this confusing and potentially hazardous situation! \n *\n * cleanupAfterLex: function(do_not_nuke_errorinfos),\n * Helper function.\n *\n * This helper API is invoked when the **parse process** has completed: it is the responsibility\n * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. 
\n *\n * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.\n *\n * setInput: function(input, [yy]),\n *\n *\n * input: function(),\n *\n *\n * unput: function(str),\n *\n *\n * more: function(),\n *\n *\n * reject: function(),\n *\n *\n * less: function(n),\n *\n *\n * pastInput: function(n),\n *\n *\n * upcomingInput: function(n),\n *\n *\n * showPosition: function(),\n *\n *\n * test_match: function(regex_match_array, rule_index),\n *\n *\n * next: function(),\n *\n *\n * begin: function(condition),\n *\n *\n * pushState: function(condition),\n *\n *\n * popState: function(),\n *\n *\n * topState: function(),\n *\n *\n * _currentRules: function(),\n *\n *\n * stateStackSize: function(),\n *\n *\n * performAction: function(yy, yy_, yyrulenumber, YY_START),\n *\n *\n * rules: [...],\n *\n *\n * conditions: {associative list: name ==> set},\n * }\n *\n *\n * token location info (`yylloc`): {\n * first_line: n,\n * last_line: n,\n * first_column: n,\n * last_column: n,\n * range: [start_number, end_number]\n * (where the numbers are indexes into the input string, zero-based)\n * }\n *\n * ---\n *\n * The `parseError` function receives a \'hash\' object with these members for lexer errors:\n *\n * {\n * text: (matched text)\n * token: (the produced terminal token, if any)\n * token_id: (the produced terminal token numeric ID, if any)\n * line: (yylineno)\n * loc: (yylloc)\n * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n * available for this particular error)\n * yy: (object: the current parser internal "shared state" `yy`\n * as is also available in the rule actions; this can be used,\n * for instance, for advanced error analysis and reporting)\n * lexer: (reference to the current lexer instance used by the parser)\n * }\n *\n * while `this` will reference the current lexer instance.\n *\n * When `parseError` is invoked by the lexer, the default implementation will\n * attempt to invoke `yy.parser.parseError()`; when this callback is not provided\n * it will try to invoke `yy.parseError()` instead. When that callback is also not\n * provided, a `JisonLexerError` exception will be thrown containing the error\n * message and `hash`, as constructed by the `constructLexErrorInfo()` API.\n *\n * Note that the lexer\'s `JisonLexerError` error class is passed via the\n * `ExceptionClass` argument, which is invoked to construct the exception\n * instance to be thrown, so technically `parseError` will throw the object\n * produced by the `new ExceptionClass(str, hash)` JavaScript expression.\n *\n * ---\n *\n * You can specify lexer options by setting / modifying the `.options` object of your Lexer instance.\n * These options are available:\n *\n * (Options are permanent.)\n * \n * yy: {\n * parseError: function(str, hash, ExceptionClass)\n * optional: overrides the default `parseError` function.\n * }\n *\n * lexer.options: {\n * pre_lex: function()\n * optional: is invoked before the lexer is invoked to produce another token.\n * `this` refers to the Lexer object.\n * post_lex: function(token) { return token; }\n * optional: is invoked when the lexer has produced a token `token`;\n * this function can override the returned token value by returning another.\n * When it does not return any (truthy) value, the lexer will return\n * the original `token`.\n * `this` refers to the Lexer object.\n *\n * WARNING: the next set of options are not meant to be changed. 
They echo the abilities of\n * the lexer as per when it was compiled!\n *\n * ranges: boolean\n * optional: `true` ==> token location info will include a .range[] member.\n * flex: boolean\n * optional: `true` ==> flex-like lexing behaviour where the rules are tested\n * exhaustively to find the longest match.\n * backtrack_lexer: boolean\n * optional: `true` ==> lexer regexes are tested in order and for invoked;\n * the lexer terminates the scan when a token is returned by the action code.\n * xregexp: boolean\n * optional: `true` ==> lexer rule regexes are "extended regex format" requiring the\n * `XRegExp` library. When this %option has not been specified at compile time, all lexer\n * rule regexes have been written as standard JavaScript RegExp expressions.\n * }\n */\n '], ['\n /* lexer generated by jison-lex ', ' */\n\n /*\n * Returns a Lexer object of the following structure:\n *\n * Lexer: {\n * yy: {} The so-called "shared state" or rather the *source* of it;\n * the real "shared state" \\`yy\\` passed around to\n * the rule actions, etc. is a direct reference!\n *\n * This "shared context" object was passed to the lexer by way of \n * the \\`lexer.setInput(str, yy)\\` API before you may use it.\n *\n * This "shared context" object is passed to the lexer action code in \\`performAction()\\`\n * so userland code in the lexer actions may communicate with the outside world \n * and/or other lexer rules\' actions in more or less complex ways.\n *\n * }\n *\n * Lexer.prototype: {\n * EOF: 1,\n * ERROR: 2,\n *\n * yy: The overall "shared context" object reference.\n *\n * JisonLexerError: function(msg, hash),\n *\n * performAction: function lexer__performAction(yy, yyrulenumber, YY_START),\n *\n * The function parameters and \\`this\\` have the following value/meaning:\n * - \\`this\\` : reference to the \\`lexer\\` instance. \n * \\`yy_\\` is an alias for \\`this\\` lexer instance reference used internally.\n *\n * - \\`yy\\` : a reference to the \\`yy\\` "shared state" object which was passed to the lexer\n * by way of the \\`lexer.setInput(str, yy)\\` API before.\n *\n * Note:\n * The extra arguments you specified in the \\`%parse-param\\` statement in your\n * **parser** grammar definition file are passed to the lexer via this object\n * reference as member variables.\n *\n * - \\`yyrulenumber\\` : index of the matched lexer rule (regex), used internally.\n *\n * - \\`YY_START\\`: the current lexer "start condition" state.\n *\n * parseError: function(str, hash, ExceptionClass),\n *\n * constructLexErrorInfo: function(error_message, is_recoverable),\n * Helper function.\n * Produces a new errorInfo \\\'hash object\\\' which can be passed into \\`parseError()\\`.\n * See it\\\'s use in this lexer kernel in many places; example usage:\n *\n * var infoObj = lexer.constructParseErrorInfo(\\\'fail!\\\', true);\n * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);\n *\n * options: { ... lexer %options ... 
},\n *\n * lex: function(),\n * Produce one token of lexed input, which was passed in earlier via the \\`lexer.setInput()\\` API.\n * You MAY use the additional \\`args...\\` parameters as per \\`%parse-param\\` spec of the **lexer** grammar:\n * these extra \\`args...\\` are added verbatim to the \\`yy\\` object reference as member variables.\n *\n * WARNING:\n * Lexer\'s additional \\`args...\\` parameters (via lexer\'s \\`%parse-param\\`) MAY conflict with\n * any attributes already added to \\`yy\\` by the **parser** or the jison run-time; \n * when such a collision is detected an exception is thrown to prevent the generated run-time \n * from silently accepting this confusing and potentially hazardous situation! \n *\n * cleanupAfterLex: function(do_not_nuke_errorinfos),\n * Helper function.\n *\n * This helper API is invoked when the **parse process** has completed: it is the responsibility\n * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired. \n *\n * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.\n *\n * setInput: function(input, [yy]),\n *\n *\n * input: function(),\n *\n *\n * unput: function(str),\n *\n *\n * more: function(),\n *\n *\n * reject: function(),\n *\n *\n * less: function(n),\n *\n *\n * pastInput: function(n),\n *\n *\n * upcomingInput: function(n),\n *\n *\n * showPosition: function(),\n *\n *\n * test_match: function(regex_match_array, rule_index),\n *\n *\n * next: function(),\n *\n *\n * begin: function(condition),\n *\n *\n * pushState: function(condition),\n *\n *\n * popState: function(),\n *\n *\n * topState: function(),\n *\n *\n * _currentRules: function(),\n *\n *\n * stateStackSize: function(),\n *\n *\n * performAction: function(yy, yy_, yyrulenumber, YY_START),\n *\n *\n * rules: [...],\n *\n *\n * conditions: {associative list: name ==> set},\n * }\n *\n *\n * token location info (\\`yylloc\\`): {\n * first_line: n,\n * last_line: n,\n * first_column: n,\n * last_column: n,\n * range: [start_number, end_number]\n * (where the numbers are indexes into the input string, zero-based)\n * }\n *\n * ---\n *\n * The \\`parseError\\` function receives a \\\'hash\\\' object with these members for lexer errors:\n *\n * {\n * text: (matched text)\n * token: (the produced terminal token, if any)\n * token_id: (the produced terminal token numeric ID, if any)\n * line: (yylineno)\n * loc: (yylloc)\n * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n * available for this particular error)\n * yy: (object: the current parser internal "shared state" \\`yy\\`\n * as is also available in the rule actions; this can be used,\n * for instance, for advanced error analysis and reporting)\n * lexer: (reference to the current lexer instance used by the parser)\n * }\n *\n * while \\`this\\` will reference the current lexer instance.\n *\n * When \\`parseError\\` is invoked by the lexer, the default implementation will\n * attempt to invoke \\`yy.parser.parseError()\\`; when this callback is not provided\n * it will try to invoke \\`yy.parseError()\\` instead. 
When that callback is also not\n * provided, a \\`JisonLexerError\\` exception will be thrown containing the error\n * message and \\`hash\\`, as constructed by the \\`constructLexErrorInfo()\\` API.\n *\n * Note that the lexer\\\'s \\`JisonLexerError\\` error class is passed via the\n * \\`ExceptionClass\\` argument, which is invoked to construct the exception\n * instance to be thrown, so technically \\`parseError\\` will throw the object\n * produced by the \\`new ExceptionClass(str, hash)\\` JavaScript expression.\n *\n * ---\n *\n * You can specify lexer options by setting / modifying the \\`.options\\` object of your Lexer instance.\n * These options are available:\n *\n * (Options are permanent.)\n * \n * yy: {\n * parseError: function(str, hash, ExceptionClass)\n * optional: overrides the default \\`parseError\\` function.\n * }\n *\n * lexer.options: {\n * pre_lex: function()\n * optional: is invoked before the lexer is invoked to produce another token.\n * \\`this\\` refers to the Lexer object.\n * post_lex: function(token) { return token; }\n * optional: is invoked when the lexer has produced a token \\`token\\`;\n * this function can override the returned token value by returning another.\n * When it does not return any (truthy) value, the lexer will return\n * the original \\`token\\`.\n * \\`this\\` refers to the Lexer object.\n *\n * WARNING: the next set of options are not meant to be changed. They echo the abilities of\n * the lexer as per when it was compiled!\n *\n * ranges: boolean\n * optional: \\`true\\` ==> token location info will include a .range[] member.\n * flex: boolean\n * optional: \\`true\\` ==> flex-like lexing behaviour where the rules are tested\n * exhaustively to find the longest match.\n * backtrack_lexer: boolean\n * optional: \\`true\\` ==> lexer regexes are tested in order and for invoked;\n * the lexer terminates the scan when a token is returned by the action code.\n * xregexp: boolean\n * optional: \\`true\\` ==> lexer rule regexes are "extended regex format" requiring the\n * \\`XRegExp\\` library. When this %option has not been specified at compile time, all lexer\n * rule regexes have been written as standard JavaScript RegExp expressions.\n * }\n */\n ']), + _templateObject7 = _taggedTemplateLiteral(['\n export {\n lexer,\n yylex as lex\n };\n '], ['\n export {\n lexer,\n yylex as lex\n };\n ']); + +function _taggedTemplateLiteral(strings, raw) { return Object.freeze(Object.defineProperties(strings, { raw: { value: Object.freeze(raw) } })); } + +(function (global, factory) { + (typeof exports === 'undefined' ? 'undefined' : _typeof(exports)) === 'object' && typeof module !== 'undefined' ? module.exports = factory(require('@gerhobbelt/xregexp'), require('@gerhobbelt/json5'), require('@gerhobbelt/lex-parser'), require('assert'), require('jison-helpers-lib')) : typeof define === 'function' && define.amd ? define(['@gerhobbelt/xregexp', '@gerhobbelt/json5', '@gerhobbelt/lex-parser', 'assert', 'jison-helpers-lib'], factory) : global['regexp-lexer'] = factory(global.XRegExp, global.json5, global.lexParser, global.assert, global.helpers); +})(undefined, function (XRegExp, json5, lexParser, assert, helpers) { + 'use strict'; + + XRegExp = XRegExp && XRegExp.hasOwnProperty('default') ? XRegExp['default'] : XRegExp; + json5 = json5 && json5.hasOwnProperty('default') ? json5['default'] : json5; + lexParser = lexParser && lexParser.hasOwnProperty('default') ? lexParser['default'] : lexParser; + assert = assert && assert.hasOwnProperty('default') ? 
assert['default'] : assert; + helpers = helpers && helpers.hasOwnProperty('default') ? helpers['default'] : helpers; + + // + // Helper library for set definitions + // + // MIT Licensed + // + // + // This code is intended to help parse regex set expressions and mix them + // together, i.e. to answer questions like this: + // + // what is the resulting regex set expression when we mix the regex set + // `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any + // input which matches either input regex should match the resulting + // regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) + // + + 'use strict'; + + var XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` + var CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; + var SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; + var NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; + var SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + + var UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + + // The expanded regex sets which are equivalent to the given `\\{c}` escapes: + // + // `/\s/`: + var WHITESPACE_SETSTR$1 = ' \f\n\r\t\x0B\xA0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\uFEFF'; + // `/\d/`: + var DIGIT_SETSTR$1 = '0-9'; + // `/\w/`: + var WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + + // Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex + function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: + // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: + // '[' + return '\\['; + + case 92: + // '\\' + return '\\\\'; + + case 93: + // ']' + return '\\]'; + + case 94: + // ']' + return '\\^'; + } + if (i < 32 || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || i >= 0xD800 && i <= 0xDFFF /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go... 
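+ //
+ // A few illustrative conversions for this function as a whole:
+ //
+ //     i2c(0x41)   --> 'A'                      (plain character, as-is)
+ //     i2c(10)     --> the 2-char string '\n'   (via the switch above)
+ //     i2c(0x2028) --> the string '\u2028'      (escaped right below)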
+ x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); + } + + // Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating + // this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a + // `\\p{NAME}` shorthand to represent [part of] the bitarray: + var Pcodes_bitarray_cache = {}; + var Pcodes_bitarray_cache_test_order = []; + + // Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by + // a single regex 'escape', e.g. `\d` for digits 0-9. + var EscCode_bitarray_output_refs; + + // now initialize the EscCodes_... table above: + init_EscCode_lookup_table(); + + function init_EscCode_lookup_table() { + var s, + bitarr, + set2esc = {}, + esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); + } + + function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. 
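+ //
+ // (With only the six standard escapes \S, \s, \D, \d, \W, \w in the cache
+ // this cannot hold: every character sits in exactly one member of each
+ // complementary pair, i.e. in three of the six pcodes at once.)
+ //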
+ // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as than this pcode + // won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... + var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; + } + + // 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. + function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. 
\\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for characters set. Inversion means the complement set of character is st instead. + // This results in an OR operations when sets are joined/chained. + + while (s.length) { + c1 = s.match(CHR_RE$1); + if (!c1) { + // hit an illegal escape sequence? cope anyway! + c1 = s[0]; + } else { + c1 = c1[0]; + // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those + // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit + // XRegExp support, but alas, we'll get there when we get there... ;-) + switch (c1) { + case '\\p': + s = s.substr(c1.length); + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + // do we have this one cached already? + var pex = c1 + c2; + var ba4p = Pcodes_bitarray_cache[pex]; + if (!ba4p) { + // expand escape: + var xr = new XRegExp('[' + pex + ']'); // TODO: case-insensitive grammar??? + // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`: + var xs = '' + xr; + // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside: + xs = xs.substr(1, xs.length - 2); + + ba4p = reduceRegexToSetBitArray(xs, pex, opts); + + Pcodes_bitarray_cache[pex] = ba4p; + updatePcodesBitarrayCacheTestOrder(opts); + } + // merge bitarrays: + add2bitarray(bitarr, ba4p); + continue; + } + break; + + case '\\S': + case '\\s': + case '\\W': + case '\\w': + case '\\d': + case '\\D': + // these can't participate in a range, but need to be treated special: + s = s.substr(c1.length); + // check for \S, \s, \D, \d, \W, \w and expand them: + var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]]; + assert(ba4e); + add2bitarray(bitarr, ba4e); + continue; + + case '\\b': + // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace + c1 = '\b'; + break; + } + } + var v1 = eval_escaped_code(c1); + // propagate deferred exceptions = error reports. + if (v1 instanceof Error) { + return v1; + } + v1 = v1.charCodeAt(0); + s = s.substr(c1.length); + + if (s[0] === '-' && s.length >= 2) { + // we can expect a range like 'a-z': + s = s.substr(1); + c2 = s.match(CHR_RE$1); + if (!c2) { + // hit an illegal escape sequence? cope anyway! + c2 = s[0]; + } else { + c2 = c2[0]; + } + var v2 = eval_escaped_code(c2); + // propagate deferred exceptions = error reports. + if (v2 instanceof Error) { + return v1; + } + v2 = v2.charCodeAt(0); + s = s.substr(c2.length); + + // legal ranges go UP, not /DOWN! 
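+ //
+ // E.g. the out-of-order input `z-a` is degraded to the three separate
+ // characters 'z', '-' and 'a' below, where a native RegExp would throw
+ // a "range out of order" SyntaxError for /[z-a]/.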
+ if (v1 <= v2) { + mark(v1, v2); + } else { + console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 }); + mark(v1); + mark('-'.charCodeAt(0)); + mark(v2); + } + continue; + } + mark(v1); + } + + // When we have marked all slots, '^' NEGATES the set, hence we flip all slots. + // + // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK + // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire + // range then. + if (set_is_inverted) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!bitarr[i]) { + bitarr_orig[i] = true; + } + } + } + } + return false; + } + + // convert a simple bitarray back into a regex set `[...]` content: + function bitarray2set(l, output_inverted_variant, output_minimized) { + // construct the inverse(?) set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! 
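+ //
+ // (In other words: clone `l` on first write, so the edits stay local
+ // and the caller's array is never modified.)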
+ if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + j = UNICODE_BASE_PLANE_MAX_CP$1 + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? 
'-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; + } + + // Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; + // ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. + function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE$1); + if (!c1) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"'); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + while (s.length) { + var inner = s.match(SET_PART_RE$1); + if (!inner) { + inner = s.match(CHR_RE$1); + if (!inner) { + // cope with illegal escape sequences too! + return new Error('illegal escape sequence at start of regex part: ' + s + '" of regex "' + orig + '"'); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + var c2 = s.match(CHR_RE$1); + if (!c2) { + // cope with illegal escape sequences too! + return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error('regex set expression is broken in regex: ' + orig); + } + s = s.substr(c2.length); + + var se = set_content.join(''); + if (!internal_state) { + derr = set2bitarray(l, se, opts); + // propagate deferred exceptions = error reports. + if (derr instanceof Error) { + return derr; + } + + // a set is to use like a single character in a longer literal phrase, hence input `[abc]word[def]` would thus produce output `[abc]`: + internal_state = 1; + } + break; + + // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into + // something ready for use inside a regex set, e.g. `\\r\\n`. + // + // > Of course, we realize that converting more complex piped constructs this way + // > will produce something you might not expect, e.g. `A|WORD2` which + // > would end up as the set `[AW]` which is something else than the input + // > entirely. + // > + // > However, we can only depend on the user (grammar writer) to realize this and + // > prevent this from happening by not creating such oddities in the input grammar. + case '|': + // a|b --> [ab] + internal_state = 0; + break; + + case '(': + // (a) --> a + // + // TODO - right now we treat this as 'too complex': + + // Strip off some possible outer wrappers which we know how to remove. + // We don't worry about 'damaging' the regex as any too-complex regex will be caught + // in the validation check at the end; our 'strippers' here would not damage useful + // regexes anyway and them damaging the unacceptable ones is fine. + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... 
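+                // (Explanatory note: the replace above and the two below are only a
+                // best-effort unwrapping pass -- e.g. '(?:^[a-z]$)' is peeled down to
+                // '[a-z]' -- before this case nonetheless rejects the macro as too
+                // complex; see the TODO above.)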
+ s = s.replace(/^\^?(.*?)\$?$/, '$1'); // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: one before, once after this) + s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1'); // (?:...) -> ... and (...) -> ... + + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + case '.': + case '*': + case '+': + case '?': + // wildcard + // + // TODO - right now we treat this as 'too complex': + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + case '{': + // range, e.g. `x{1,3}`, or macro? + // TODO - right now we treat this as 'too complex': + return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]'); + + default: + // literal character or word: take the first character only and ignore the rest, so that + // the constructed set for `word|noun` would be `[wb]`: + if (!internal_state) { + derr = set2bitarray(l, c1, opts); + // propagate deferred exceptions = error reports. + if (derr instanceof Error) { + return derr; + } + + internal_state = 2; + } + break; + } + } + + s = bitarray2set(l); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + assert(s); + assert(!(s instanceof Error)); + re = new XRegExp('[' + s + ']'); + re.test(s[0]); + + // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]` + // so we check for lingering UNESCAPED brackets in here as those cannot be: + if (/[^\\][\[\]]/.exec(s)) { + throw new Error('unescaped brackets in set data'); + } + } catch (ex) { + // make sure we produce a set range expression which will fail badly when it is used + // in actual code: + s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message); + } + + assert(s); + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + return l; + } + + // Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]` + // -- or in this example it can be further optimized to only `\d`! + function produceOptimizedRegex4Set(bitarr) { + // First try to produce a minimum regex from the bitarray directly: + var s1 = bitarray2set(bitarr, false, true); + + // and when the regex set turns out to match a single pcode/escape, then + // use that one as-is: + if (s1.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s1; + } else { + s1 = '[' + s1 + ']'; + } + + // Now try to produce a minimum regex from the *inverted* bitarray via negation: + // Because we look at a negated bitset, there's no use looking for matches with + // special cases here. + var s2 = bitarray2set(bitarr, true, true); + + if (s2[0] === '^') { + s2 = s2.substr(1); + if (s2.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! 
+ return s2; + } + } else { + s2 = '^' + s2; + } + s2 = '[' + s2 + ']'; + + // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end, + // we also check against the plain, unadulterated regex set expressions: + // + // First try to produce a minimum regex from the bitarray directly: + var s3 = bitarray2set(bitarr, false, false); + + // and when the regex set turns out to match a single pcode/escape, then + // use that one as-is: + if (s3.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s3; + } else { + s3 = '[' + s3 + ']'; + } + + // Now try to produce a minimum regex from the *inverted* bitarray via negation: + // Because we look at a negated bitset, there's no use looking for matches with + // special cases here. + var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; + } + + var setmgmt = { + XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1, + CHR_RE: CHR_RE$1, + SET_PART_RE: SET_PART_RE$1, + NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1, + SET_IS_SINGLE_PCODE_RE: SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1, + + WHITESPACE_SETSTR: WHITESPACE_SETSTR$1, + DIGIT_SETSTR: DIGIT_SETSTR$1, + WORDCHAR_SETSTR: WORDCHAR_SETSTR$1, + + set2bitarray: set2bitarray, + bitarray2set: bitarray2set, + produceOptimizedRegex4Set: produceOptimizedRegex4Set, + reduceRegexToSetBitArray: reduceRegexToSetBitArray + }; + + // Basic Lexer implemented using JavaScript regular expressions + // Zachary Carter + // MIT Licensed + + var rmCommonWS = helpers.rmCommonWS; + var camelCase = helpers.camelCase; + var code_exec = helpers.exec; + // import recast from '@gerhobbelt/recast'; + // import astUtils from '@gerhobbelt/ast-util'; + var version = '0.6.1-205'; // require('./package.json').version; + + + var XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` + var CHR_RE = setmgmt.CHR_RE; + var SET_PART_RE = setmgmt.SET_PART_RE; + var NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; + var UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + + // WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) + // + // This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! 
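+    //
+    // For illustration: under this definition 'fooBar', '_tmp1' and 'été2' are all
+    // valid IDs (XRegExp's \p{Alphabetic} also covers non-ASCII letters), while
+    // '1abc' is not, since the leading character must not be a \p{Number}.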
+ var ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + + // see also ./lib/cli.js + /** + @public + @nocollapse + */ + var defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. start+end indexes in the input string + trackPosition: true, // track line+column position in the input string + caseInsensitive: false, + showSource: false, + exportSourceCode: false, + exportAST: false, + prettyCfg: true, + pre_lex: undefined, + post_lex: undefined + }; + + // Merge sets of options. + // + // Convert alternative jison option names to their base option. + // + // The *last* option set which overrides the default wins, where 'override' is + // defined as specifying a not-undefined value which is not equal to the + // default value. + // + // When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the + // default values avialable in Jison.defaultJisonOptions. + // + // Return a fresh set of options. + /** @public */ + function mkStdOptions() /*...args*/{ + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; + } + + // set up export/output attributes of the `options` object instance + function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || (typeof exportSourceCode === 'undefined' ? 'undefined' : _typeof(exportSourceCode)) !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; + } + + // Autodetect if the input lexer spec is in JSON or JISON + // format when the `options.json` flag is `true`. 
+ // + // Produce the JSON lexer spec result when these are JSON formatted already as that + // would save us the trouble of doing this again, anywhere else in the JISON + // compiler/generator. + // + // Otherwise return the *parsed* lexer spec as it has + // been processed through LexParser. + function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; + } + + // expand macros and convert matchers to RegExp's + function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, + i, + k, + rule, + action, + conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. 
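+        // Example (explanatory comment only): an action chunk 'a */ b' comes out as
+        // 'a *\/ b', so pasting it into the generated '/*! Rule:: ... */' banner can
+        // no longer terminate that banner comment prematurely.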
+ return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. + // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count + }; + } + + // expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or + // elsewhere, which requires two different treatments to expand these macros. 
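+    //
+    // Illustration (explanatory comment only): given the macro definitions
+    //
+    //     DIGIT    [0-9]
+    //     ID       [a-zA-Z_]{DIGIT}*
+    //
+    // a reference inside a set, as in '[{DIGIT}_]', must be unfolded to the set
+    // *content* to yield '[0-9_]', while the same reference anywhere else, as in
+    // '{DIGIT}+', is expanded as a (possibly grouped) regex: '(?:[0-9])+'.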
+ function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. + var has_expansions = se.indexOf('{') >= 0; + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. 
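+                        // (e.g. something like '{foo(' -- the brace chunk is cut short
+                        // by a special character before any '}' appears, so it cannot
+                        // be a macro reference and is passed through verbatim below.)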
+ c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; + } + + // expand macros within macros and cache the result + function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. + // + // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}` + // macros here: + if (XRegExp._getUnicodeProperty(k)) { + // Work-around so that you can use `\p{ascii}` for a XRegExp slug, a.k.a. + // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories, + // while using `\p{ASCII}` as a *macro expansion* of the `ASCII` + // macro: + if (k.toUpperCase() !== k) { + m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.'); + break; + } + } + + a = m.split('{' + k + '}'); + if (a.length > 1) { + var x = expandMacroInSet(k); + assert(x); + if (x instanceof Error) { + m = x; + break; + } + m = a.join(x); + } + } + } + } + + var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts); + + var s1; + + // propagate deferred exceptions = error reports. + if (mba instanceof Error) { + s1 = mba; + } else { + s1 = setmgmt.bitarray2set(mba, false); + + m = s1; + } + + macros[i] = { + in_set: s1, + elsewhere: null, + raw: dict_macros[i] + }; + } else { + m = macros[i].in_set; + + if (m instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! 
+                return new Error(m.message);
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandMacroElsewhere(i) {
+        var k, a, m;
+
+        if (macros[i].elsewhere == null) {
+            m = dict_macros[i];
+
+            // set up our own record so we can detect definition loops:
+            macros[i].elsewhere = false;
+
+            // the macro MAY contain other macros which MAY be inside a `[...]` set in this
+            // macro or elsewhere, hence we must parse the regex:
+            m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere);
+            // propagate deferred exceptions = error reports.
+            if (m instanceof Error) {
+                return m;
+            }
+
+            macros[i].elsewhere = m;
+        } else {
+            m = macros[i].elsewhere;
+
+            if (m instanceof Error) {
+                // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away!
+                return m;
+            }
+
+            // detect definition loop:
+            if (m === false) {
+                return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.');
+            }
+        }
+
+        return m;
+    }
+
+    function expandAllMacrosInSet(s) {
+        var i, x;
+
+        // process *all* the macros inside the [...] set:
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroInSet(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message);
+                        }
+                        s = a.join(x);
+                    }
+
+                    // stop the brute-force expansion attempt once we've done them all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    function expandAllMacrosElsewhere(s) {
+        var i, x;
+
+        // When we process the remaining macro occurrences in the regex,
+        // every macro used in a lexer rule will become its own capture group.
+        //
+        // Meanwhile the cached expansion will expand any submacros into
+        // *NON*-capturing groups so that the backreference indexes remain as you'd
+        // expect and using macros doesn't require you to know exactly what your
+        // used macro will expand into, i.e. which and how many submacros it has.
+        //
+        // This is a BREAKING CHANGE from vanilla jison 0.4.15!
+        if (s.indexOf('{') >= 0) {
+            for (i in macros) {
+                if (macros.hasOwnProperty(i)) {
+                    // These are all submacro expansions, hence non-capturing grouping is applied:
+                    var a = s.split('{' + i + '}');
+                    if (a.length > 1) {
+                        x = expandMacroElsewhere(i);
+                        assert(x);
+                        if (x instanceof Error) {
+                            return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message);
+                        }
+                        s = a.join('(?:' + x + ')');
+                    }
+
+                    // stop the brute-force expansion attempt once we've done them all:
+                    if (s.indexOf('{') === -1) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return s;
+    }
+
+    var m, i;
+
+    if (opts.debug) console.log('\n############## RAW macros: ', dict_macros);
+
+    // first we create the part of the dictionary which is targeting the use of macros
+    // *inside* `[...]` sets; once we have completed that half of the expansion work,
+    // we then go and expand the macros for when they are used elsewhere in a regex:
+    // if we encounter submacros which are used *inside* a set, we can use that
+    // first half of the dictionary to speed things up a bit, as we can use those
+    // expansions straight away!
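+
+    // (Explanatory note: the two loops below form that two-pass driver -- pass 1
+    // caches every macro's in-set expansion via expandMacroInSet(), pass 2 its
+    // 'elsewhere' expansion via expandMacroElsewhere(); an Error produced here is
+    // cached in `macros` too and only surfaces once the offending macro is
+    // actually used by a lexer rule.)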
+ for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroInSet(i); + } + } + + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroElsewhere(i); + } + } + + if (opts.debug) console.log('\n############### expanded macros: ', macros); + + return macros; + } + + // expand macros in a regex; expands them recursively + function expandMacros(src, macros, opts) { + var expansion_count = 0; + + // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already! + // Hence things should be easy in there: + + function expandAllMacrosInSet(s) { + var i, m, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = m.in_set; + + assert(x); + if (x instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + throw x; + } + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join(x); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, m, x; + + // When we process the main macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + // These are all main macro expansions, hence CAPTURING grouping is applied: + x = m.elsewhere; + assert(x); + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join('(' + x + ')'); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + // When we process the macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will have expanded any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. 
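+        // (Deferred-exception convention, explanatory note: reduceRegex() reports
+        // trouble by *returning* an Error instance rather than throwing, so each
+        // caller can pick its own failure mode -- this one throws, right below.)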
+ if (s2 instanceof Error) { + throw s2; + } + + // only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex() + // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on his own, + // as long as no macros are involved... + // + // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`, + // unless the `xregexp` output option has been enabled. + if (expansion_count > 0 || src.indexOf('\\p{') >= 0 && !opts.options.xregexp) { + src = s2; + } else { + // Check if the reduced regex is smaller in size; when it is, we still go with the new one! + if (s2.length < src.length) { + src = s2; + } + } + + return src; + } + + function prepareStartConditions(conditions) { + var sc, + hash = {}; + for (sc in conditions) { + if (conditions.hasOwnProperty(sc)) { + hash[sc] = { rules: [], inclusive: !conditions[sc] }; + } + } + return hash; + } + + function buildActions(dict, tokens, opts) { + var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;']; + var tok; + var toks = {}; + var caseHelper = []; + + // tokens: map/array of token numbers to token names + for (tok in tokens) { + var idx = parseInt(tok); + if (idx && idx > 0) { + toks[tokens[tok]] = idx; + } + } + + if (opts.options.flex && dict.rules) { + dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule! */']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: 'function lexer__performAction(yy, yyrulenumber, YY_START) {\n var yy_ = this;\n\n ' + fun + '\n }', + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count + }; + } + + // + // NOTE: this is *almost* a copy of the JisonParserError producing code in + // jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass + // + function generateErrorClass() { + // --- START lexer error class --- + + var prelude = '/**\n * See also:\n * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508\n * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility\n * with userland code which might access the derived class in a \'classic\' way.\n *\n * @public\n * @constructor\n * @nocollapse\n */\nfunction JisonLexerError(msg, hash) {\n Object.defineProperty(this, \'name\', {\n enumerable: false,\n writable: false,\n value: \'JisonLexerError\'\n });\n\n if (msg == null) msg = \'???\';\n\n Object.defineProperty(this, \'message\', {\n enumerable: false,\n writable: true,\n value: msg\n });\n\n this.hash = hash;\n\n var stacktrace;\n if (hash && hash.exception instanceof Error) {\n var ex2 = hash.exception;\n this.message = ex2.message || msg;\n stacktrace = ex2.stack;\n }\n if (!stacktrace) {\n if (Error.hasOwnProperty(\'captureStackTrace\')) { // V8\n Error.captureStackTrace(this, this.constructor);\n } else {\n stacktrace = (new Error(msg)).stack;\n }\n }\n if (stacktrace) {\n Object.defineProperty(this, \'stack\', {\n enumerable: false,\n writable: false,\n value: 
stacktrace\n });\n }\n}\n\nif (typeof Object.setPrototypeOf === \'function\') {\n Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype);\n} else {\n JisonLexerError.prototype = Object.create(Error.prototype);\n}\nJisonLexerError.prototype.constructor = JisonLexerError;\nJisonLexerError.prototype.name = \'JisonLexerError\';'; + + // --- END lexer error class --- + + return prelude; + } + + var jisonLexerErrorDefinition = generateErrorClass(); + + function generateFakeXRegExpClassSrcCode() { + return rmCommonWS(_templateObject); + } + + /** @constructor */ + function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. + // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... }; + // ``` + var testcode = ['// provide a local version for test purposes:', jisonLexerErrorDefinition, '', generateFakeXRegExpClassSrcCode(), '', source, '', 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (_typeof(lexer.options) !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! 
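+            // (Explanatory note: test_me() is re-invoked by the failure analysis
+            // further below with ever larger parts of the spec stubbed out --
+            // conditions, rules, action bodies -- so the first stubbed variant that
+            // *does* reach this point fingers the part just stubbed out as the culprit.)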
+ if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, dict.rules && dict.rules.length > 0 ? 'One or more of your lexer state names are possibly botched?' : 'Your custom lexer is somehow botched.', ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = dict.rules ? 
dict.rules.length : 0; i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; + } + + // code stripping performance test for very simple grammar: + // + // - removing backtracking parser code branches: 730K -> 750K rounds + // - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds + // - no `yyleng`: 900K -> 905K rounds + // - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds + // - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds + // - lexers which have only return stmts, i.e. only a + // `simpleCaseActionClusters` lookup table to produce + // lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds + // - given all the above, you can *inline* what's left of + // `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) + // + // Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: + // + // 730 -> 950 ~ 30% performance gain. 
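+    //
+    // (Sketch, for illustration only: a stripped-down hot loop in the spirit of the
+    // measurements above reduces to little more than
+    //
+    //     var token;
+    //     while ((token = lexer.next()) !== lexer.EOF) {
+    //         /* consume token */
+    //     }
+    //
+    // where every feature removed shaves work off each next() call.)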
+ // + + // As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk + // of code in a function so that we can easily get it including it comments, etc.: + /** + @public + @nocollapse + */ + function getRegExpLexerPrototype() { + // --- START lexer kernel --- + return '{\n EOF: 1,\n ERROR: 2,\n\n // JisonLexerError: JisonLexerError, /// <-- injected by the code generator\n\n // options: {}, /// <-- injected by the code generator\n\n // yy: ..., /// <-- injected by setInput()\n\n __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state\n\n __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup\n\n __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been \'unfolded\' completely and is now ready for use\n\n done: false, /// INTERNAL USE ONLY\n _backtrack: false, /// INTERNAL USE ONLY\n _input: \'\', /// INTERNAL USE ONLY\n _more: false, /// INTERNAL USE ONLY\n _signaled_error_token: false, /// INTERNAL USE ONLY\n\n conditionStack: [], /// INTERNAL USE ONLY; managed via `pushState()`, `popState()`, `topState()` and `stateStackSize()`\n\n match: \'\', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. `match` is identical to `yytext` except that this one still contains the matched input string after `lexer.performAction()` has been invoked, where userland code MAY have changed/replaced the `yytext` value entirely!\n matched: \'\', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far\n matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt\n yytext: \'\', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the \'token value\' when the parser consumes the lexer token produced through a call to the `lex()` API.\n offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the \'cursor position\' in the input string, i.e. 
the number of characters matched so far\n yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (`yytext`)\n yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: \'line number\' at which the token under construction is located\n yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction\n\n /**\n * INTERNAL USE: construct a suitable error info hash object instance for `parseError`.\n * \n * @public\n * @this {RegExpLexer}\n */\n constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) {\n msg = \'\' + msg;\n\n // heuristic to determine if the error message already contains a (partial) source code dump\n // as produced by either `showPosition()` or `prettyPrintRange()`:\n if (show_input_position == undefined) {\n show_input_position = !(msg.indexOf(\'\\n\') > 0 && msg.indexOf(\'^\') > 0);\n }\n if (this.yylloc && show_input_position) {\n if (typeof this.prettyPrintRange === \'function\') {\n var pretty_src = this.prettyPrintRange(this.yylloc);\n\n if (!/\\n\\s*$/.test(msg)) {\n msg += \'\\n\';\n }\n msg += \'\\n Erroneous area:\\n\' + this.prettyPrintRange(this.yylloc); \n } else if (typeof this.showPosition === \'function\') {\n var pos_str = this.showPosition();\n if (pos_str) {\n if (msg.length && msg[msg.length - 1] !== \'\\n\' && pos_str[0] !== \'\\n\') {\n msg += \'\\n\' + pos_str;\n } else {\n msg += pos_str;\n }\n }\n }\n }\n /** @constructor */\n var pei = {\n errStr: msg,\n recoverable: !!recoverable,\n text: this.match, // This one MAY be empty; userland code should use the `upcomingInput` API to obtain more text which follows the \'lexer cursor position\'...\n token: null,\n line: this.yylineno,\n loc: this.yylloc,\n yy: this.yy,\n lexer: this,\n\n /**\n * and make sure the error info doesn\'t stay due to potential\n * ref cycle via userland code manipulations.\n * These would otherwise all be memory leak opportunities!\n * \n * Note that only array and object references are nuked as those\n * constitute the set of elements which can produce a cyclic ref.\n * The rest of the members is kept intact as they are harmless.\n * \n * @public\n * @this {LexErrorInfo}\n */\n destroy: function destructLexErrorInfo() {\n // remove cyclic references added to error info:\n // info.yy = null;\n // info.lexer = null;\n // ...\n var rec = !!this.recoverable;\n for (var key in this) {\n if (this.hasOwnProperty(key) && typeof key === \'object\') {\n this[key] = undefined;\n }\n }\n this.recoverable = rec;\n }\n };\n // track this instance so we can `destroy()` it once we deem it superfluous and ready for garbage collection!\n this.__error_infos.push(pei);\n return pei;\n },\n\n /**\n * handler which is invoked when a lexer error occurs.\n * \n * @public\n * @this {RegExpLexer}\n */\n parseError: function lexer_parseError(str, hash, ExceptionClass) {\n if (!ExceptionClass) {\n ExceptionClass = this.JisonLexerError;\n }\n if (this.yy) {\n if (this.yy.parser && typeof this.yy.parser.parseError === \'function\') {\n return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;\n } else if (typeof this.yy.parseError === \'function\') {\n return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;\n } \n }\n throw new ExceptionClass(str, hash);\n },\n\n /**\n * method which implements `yyerror(str, ...args)` functionality for use inside lexer actions.\n * \n * 
@public\n * @this {RegExpLexer}\n */\n yyerror: function yyError(str /*, ...args */) {\n var lineno_msg = \'\';\n if (this.yylloc) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': \' + str, this.options.lexerErrorsAreRecoverable);\n\n // Add any extra args to the hash under the name `extra_error_attributes`:\n var args = Array.prototype.slice.call(arguments, 1);\n if (args.length) {\n p.extra_error_attributes = args;\n }\n\n return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n },\n\n /**\n * final cleanup function for when we have completed lexing the input;\n * make it an API so that external code can use this one once userland\n * code has decided it\'s time to destroy any lingering lexer error\n * hash object instances and the like: this function helps to clean\n * up these constructs, which *may* carry cyclic references which would\n * otherwise prevent the instances from being properly and timely\n * garbage-collected, i.e. this function helps prevent memory leaks!\n * \n * @public\n * @this {RegExpLexer}\n */\n cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) {\n // prevent lingering circular references from causing memory leaks:\n this.setInput(\'\', {});\n\n // nuke the error hash info instances created during this run.\n // Userland code must COPY any data/references\n // in the error hash instance(s) it is more permanently interested in.\n if (!do_not_nuke_errorinfos) {\n for (var i = this.__error_infos.length - 1; i >= 0; i--) {\n var el = this.__error_infos[i];\n if (el && typeof el.destroy === \'function\') {\n el.destroy();\n }\n }\n this.__error_infos.length = 0;\n }\n\n return this;\n },\n\n /**\n * clear the lexer token context; intended for internal use only\n * \n * @public\n * @this {RegExpLexer}\n */\n clear: function lexer_clear() {\n this.yytext = \'\';\n this.yyleng = 0;\n this.match = \'\';\n // - DO NOT reset `this.matched`\n this.matches = false;\n this._more = false;\n this._backtrack = false;\n\n var col = (this.yylloc ? 
this.yylloc.last_column : 0);\n this.yylloc = {\n first_line: this.yylineno + 1,\n first_column: col,\n last_line: this.yylineno + 1,\n last_column: col,\n\n range: [this.offset, this.offset]\n };\n },\n\n /**\n * resets the lexer, sets new input\n * \n * @public\n * @this {RegExpLexer}\n */\n setInput: function lexer_setInput(input, yy) {\n this.yy = yy || this.yy || {};\n\n // also check if we\'ve fully initialized the lexer instance,\n // including expansion work to be done to go from a loaded\n // lexer to a usable lexer:\n if (!this.__decompressed) {\n // step 1: decompress the regex list:\n var rules = this.rules;\n for (var i = 0, len = rules.length; i < len; i++) {\n var rule_re = rules[i];\n\n // compression: is the RE an xref to another RE slot in the rules[] table?\n if (typeof rule_re === \'number\') {\n rules[i] = rules[rule_re];\n }\n }\n\n // step 2: unfold the conditions[] set to make these ready for use:\n var conditions = this.conditions;\n for (var k in conditions) {\n var spec = conditions[k];\n\n var rule_ids = spec.rules;\n\n var len = rule_ids.length;\n var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in `lexer_next()` fast and simple!\n var rule_new_ids = new Array(len + 1);\n\n for (var i = 0; i < len; i++) {\n var idx = rule_ids[i];\n var rule_re = rules[idx];\n rule_regexes[i + 1] = rule_re;\n rule_new_ids[i + 1] = idx;\n }\n\n spec.rules = rule_new_ids;\n spec.__rule_regexes = rule_regexes;\n spec.__rule_count = len;\n }\n\n this.__decompressed = true;\n }\n\n this._input = input || \'\';\n this.clear();\n this._signaled_error_token = false;\n this.done = false;\n this.yylineno = 0;\n this.matched = \'\';\n this.conditionStack = [\'INITIAL\'];\n this.__currentRuleSet__ = null;\n this.yylloc = {\n first_line: 1,\n first_column: 0,\n last_line: 1,\n last_column: 0,\n\n range: [0, 0]\n };\n this.offset = 0;\n return this;\n },\n\n /**\n * edit the remaining input via user-specified callback.\n * This can be used to forward-adjust the input-to-parse, \n * e.g. inserting macro expansions and alike in the\n * input which has yet to be lexed.\n * The behaviour of this API contrasts the `unput()` et al\n * APIs as those act on the *consumed* input, while this\n * one allows one to manipulate the future, without impacting\n * the current `yyloc` cursor location or any history. \n * \n * Use this API to help implement C-preprocessor-like\n * `#include` statements, etc.\n * \n * The provided callback must be synchronous and is\n * expected to return the edited input (string).\n *\n * The `cpsArg` argument value is passed to the callback\n * as-is.\n *\n * `callback` interface: \n * `function callback(input, cpsArg)`\n * \n * - `input` will carry the remaining-input-to-lex string\n * from the lexer.\n * - `cpsArg` is `cpsArg` passed into this API.\n * \n * The `this` reference for the callback will be set to\n * reference this lexer instance so that userland code\n * in the callback can easily and quickly access any lexer\n * API. \n *\n * When the callback returns a non-string-type falsey value,\n * we assume the callback did not edit the input and we\n * will using the input as-is.\n *\n * When the callback returns a non-string-type value, it\n * is converted to a string for lexing via the `"" + retval`\n * operation. 
(See also why: http://2ality.com/2012/03/converting-to-string.html \n * -- that way any returned object\'s `valueOf()` and `toString()`\n * methods will be invoked in a proper/desirable order.)\n * \n * @public\n * @this {RegExpLexer}\n */\n editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) {\n var rv = callback.call(this, this._input, cpsArg);\n if (typeof rv !== \'string\') {\n if (rv) {\n this._input = \'\' + rv; \n }\n // else: keep `this._input` as is. \n } else {\n this._input = rv; \n }\n return this;\n },\n\n /**\n * consumes and returns one char from the input\n * \n * @public\n * @this {RegExpLexer}\n */\n input: function lexer_input() {\n if (!this._input) {\n //this.done = true; -- don\'t set `done` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <<EOF>> tokens and perform user action code for a <<EOF>> match, but only does so *once*)\n return null;\n }\n var ch = this._input[0];\n this.yytext += ch;\n this.yyleng++;\n this.offset++;\n this.match += ch;\n this.matched += ch;\n // Count the linenumber up when we hit the LF (or a stand-alone CR).\n // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo\n // and we advance immediately past the LF as well, returning both together as if\n // it was all a single \'character\' only.\n var slice_len = 1;\n var lines = false;\n if (ch === \'\\n\') {\n lines = true;\n } else if (ch === \'\\r\') {\n lines = true;\n var ch2 = this._input[1];\n if (ch2 === \'\\n\') {\n slice_len++;\n ch += ch2;\n this.yytext += ch2;\n this.yyleng++;\n this.offset++;\n this.match += ch2;\n this.matched += ch2;\n this.yylloc.range[1]++;\n }\n }\n if (lines) {\n this.yylineno++;\n this.yylloc.last_line++;\n this.yylloc.last_column = 0;\n } else {\n this.yylloc.last_column++;\n }\n this.yylloc.range[1]++;\n\n this._input = this._input.slice(slice_len);\n return ch;\n },\n\n /**\n * unshifts one char (or an entire string) into the input\n * \n * @public\n * @this {RegExpLexer}\n */\n unput: function lexer_unput(ch) {\n var len = ch.length;\n var lines = ch.split(/(?:\\r\\n?|\\n)/g);\n\n this._input = ch + this._input;\n this.yytext = this.yytext.substr(0, this.yytext.length - len);\n this.yyleng = this.yytext.length;\n this.offset -= len;\n this.match = this.match.substr(0, this.match.length - len);\n this.matched = this.matched.substr(0, this.matched.length - len);\n\n if (lines.length > 1) {\n this.yylineno -= lines.length - 1;\n\n this.yylloc.last_line = this.yylineno + 1;\n\n // Get last entirely matched line into the `pre_lines[]` array\'s\n // last index slot; we don\'t mind when other previously \n // matched lines end up in the array too. 
\n var pre = this.match;\n var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g);\n if (pre_lines.length === 1) {\n pre = this.matched;\n pre_lines = pre.split(/(?:\\r\\n?|\\n)/g);\n }\n this.yylloc.last_column = pre_lines[pre_lines.length - 1].length;\n } else {\n this.yylloc.last_column -= len;\n }\n\n this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng;\n\n this.done = false;\n return this;\n },\n\n /**\n * cache matched text and append it on next action\n * \n * @public\n * @this {RegExpLexer}\n */\n more: function lexer_more() {\n this._more = true;\n return this;\n },\n\n /**\n * signal the lexer that this rule fails to match the input, so the\n * next matching rule (regex) should be tested instead.\n * \n * @public\n * @this {RegExpLexer}\n */\n reject: function lexer_reject() {\n if (this.options.backtrack_lexer) {\n this._backtrack = true;\n } else {\n // when the `parseError()` call returns, we MUST ensure that the error is registered.\n // We accomplish this by signaling an \'error\' token to be produced for the current\n // `.lex()` run.\n var lineno_msg = \'\';\n if (this.yylloc) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).\', false);\n this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n }\n return this;\n },\n\n /**\n * retain first n characters of the match\n * \n * @public\n * @this {RegExpLexer}\n */\n less: function lexer_less(n) {\n return this.unput(this.match.slice(n));\n },\n\n /**\n * return (part of the) already matched input, i.e. for error\n * messages.\n * \n * Limit the returned string length to `maxSize` (default: 20).\n * \n * Limit the returned string to the `maxLines` number of lines of\n * input (default: 1).\n * \n * Negative limit values equal *unlimited*.\n * \n * @public\n * @this {RegExpLexer}\n */\n pastInput: function lexer_pastInput(maxSize, maxLines) {\n var past = this.matched.substring(0, this.matched.length - this.match.length);\n if (maxSize < 0)\n maxSize = past.length;\n else if (!maxSize)\n maxSize = 20;\n if (maxLines < 0)\n maxLines = past.length; // can\'t ever have more input lines than this!\n else if (!maxLines)\n maxLines = 1;\n // `substr` anticipation: treat \\r\\n as a single character and take a little\n // more than necessary so that we can still properly check against maxSize\n // after we\'ve transformed and limited the newLines in here:\n past = past.substr(-maxSize * 2 - 2);\n // now that we have a significantly reduced string to process, transform the newlines\n // and chop them, then limit them:\n var a = past.replace(/\\r\\n|\\r/g, \'\\n\').split(\'\\n\');\n a = a.slice(-maxLines);\n past = a.join(\'\\n\');\n // When, after limiting to maxLines, we still have too much to return,\n // do add an ellipsis prefix...\n if (past.length > maxSize) {\n past = \'...\' + past.substr(-maxSize);\n }\n return past;\n },\n\n /**\n * return (part of the) upcoming input, i.e. for error messages.\n * \n * Limit the returned string length to `maxSize` (default: 20).\n * \n * Limit the returned string to the `maxLines` number of lines of input (default: 1).\n * \n * Negative limit values equal *unlimited*.\n *\n * > ### NOTE ###\n * >\n * > *"upcoming input"* is defined as the whole of both\n * > the *currently lexed* input, together with any remaining input\n * > following that. 
*"currently lexed"* input is the input \n * > already recognized by the lexer but not yet returned with\n * > the lexer token. This happens when you are invoking this API\n * > from inside any lexer rule action code block. \n * >\n * \n * @public\n * @this {RegExpLexer}\n */\n upcomingInput: function lexer_upcomingInput(maxSize, maxLines) {\n var next = this.match;\n if (maxSize < 0)\n maxSize = next.length + this._input.length;\n else if (!maxSize)\n maxSize = 20;\n if (maxLines < 0)\n maxLines = maxSize; // can\'t ever have more input lines than this!\n else if (!maxLines)\n maxLines = 1;\n // `substring` anticipation: treat \\r\\n as a single character and take a little\n // more than necessary so that we can still properly check against maxSize\n // after we\'ve transformed and limited the newLines in here:\n if (next.length < maxSize * 2 + 2) {\n next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8\n }\n // now that we have a significantly reduced string to process, transform the newlines\n // and chop them, then limit them:\n var a = next.replace(/\\r\\n|\\r/g, \'\\n\').split(\'\\n\');\n a = a.slice(0, maxLines);\n next = a.join(\'\\n\');\n // When, after limiting to maxLines, we still have too much to return,\n // do add an ellipsis postfix...\n if (next.length > maxSize) {\n next = next.substring(0, maxSize) + \'...\';\n }\n return next;\n },\n\n /**\n * return a string which displays the character position where the\n * lexing error occurred, i.e. for error messages\n * \n * @public\n * @this {RegExpLexer}\n */\n showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {\n var pre = this.pastInput(maxPrefix).replace(/\\s/g, \' \');\n var c = new Array(pre.length + 1).join(\'-\');\n return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, \' \') + \'\\n\' + c + \'^\';\n },\n\n /**\n * return a string which displays the lines & columns of input which are referenced \n * by the given location info range, plus a few lines of context.\n * \n * This function pretty-prints the indicated section of the input, with line numbers \n * and everything!\n * \n * This function is very useful to provide highly readable error reports, while\n * the location range may be specified in various flexible ways:\n * \n * - `loc` is the location info object which references the area which should be\n * displayed and \'marked up\': these lines & columns of text are marked up by `^`\n * characters below each character in the entire input range.\n * \n * - `context_loc` is the *optional* location info object which instructs this\n * pretty-printer how much *leading* context should be displayed alongside\n * the area referenced by `loc`. 
This can help provide context for the displayed\n * error, etc.\n * \n * When this location info is not provided, a default context of 3 lines is\n * used.\n * \n * - `context_loc2` is another *optional* location info object, which serves\n * a similar purpose to `context_loc`: it specifies the amount of *trailing*\n * context lines to display in the pretty-print output.\n * \n * When this location info is not provided, a default context of 1 line only is\n * used.\n * \n * Special Notes:\n * \n * - when the `loc`-indicated range is very large (about 5 lines or more), then\n * only the first and last few lines of this block are printed while a\n * `...continued...` message will be printed between them.\n * \n * This serves the purpose of not printing a huge amount of text when the `loc`\n * range happens to be huge: this way a manageable & readable output results\n * for arbitrarily large ranges.\n * \n * - this function can display lines of input which have not yet been lexed.\n * `prettyPrintRange()` can access the entire input!\n * \n * @public\n * @this {RegExpLexer}\n */\n prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {\n var error_size = loc.last_line - loc.first_line;\n const CONTEXT = 3;\n const CONTEXT_TAIL = 1;\n const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;\n var input = this.matched + this._input;\n var lines = input.split(\'\\n\');\n //var show_context = (error_size < 5 || context_loc);\n var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));\n var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));\n var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);\n var ws_prefix = new Array(lineno_display_width).join(\' \');\n var nonempty_line_indexes = [];\n var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {\n var lno = index + l0;\n var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);\n var rv = lno_pfx + \': \' + line;\n var errpfx = (new Array(lineno_display_width + 1)).join(\'^\');\n var offset = 2 + 1;\n var len = 0;\n\n if (lno === loc.first_line) {\n offset += loc.first_column;\n\n len = Math.max(\n 2,\n ((lno === loc.last_line ? 
loc.last_column : line.length)) - loc.first_column + 1\n );\n } else if (lno === loc.last_line) {\n len = Math.max(2, loc.last_column + 1);\n } else if (lno > loc.first_line && lno < loc.last_line) {\n len = Math.max(2, line.length + 1);\n }\n\n if (len) {\n var lead = new Array(offset).join(\'.\');\n var mark = new Array(len).join(\'^\');\n rv += \'\\n\' + errpfx + lead + mark;\n\n if (line.trim().length > 0) {\n nonempty_line_indexes.push(index);\n }\n }\n\n rv = rv.replace(/\\t/g, \' \');\n return rv;\n });\n\n // now make sure we don\'t print an overly large amount of error area: limit it \n // to the top and bottom line count:\n if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) {\n var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1;\n var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1;\n\n var intermediate_line = (new Array(lineno_display_width + 1)).join(\' \') + \' (...continued...)\';\n intermediate_line += \'\\n\' + (new Array(lineno_display_width + 1)).join(\'-\') + \' (---------------)\';\n rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line);\n }\n return rv.join(\'\\n\');\n },\n\n /**\n * helper function, used to produce a human readable description as a string, given\n * the input `yylloc` location object.\n * \n * Set `display_range_too` to TRUE to include the string character index position(s)\n * in the description if the `yylloc.range` is available.\n * \n * @public\n * @this {RegExpLexer}\n */\n describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) {\n var l1 = yylloc.first_line;\n var l2 = yylloc.last_line;\n var c1 = yylloc.first_column;\n var c2 = yylloc.last_column;\n var dl = l2 - l1;\n var dc = c2 - c1;\n var rv;\n if (dl === 0) {\n rv = \'line \' + l1 + \', \';\n if (dc <= 1) {\n rv += \'column \' + c1;\n } else {\n rv += \'columns \' + c1 + \' .. \' + c2;\n }\n } else {\n rv = \'lines \' + l1 + \'(column \' + c1 + \') .. \' + l2 + \'(column \' + c2 + \')\';\n }\n if (yylloc.range && display_range_too) {\n var r1 = yylloc.range[0];\n var r2 = yylloc.range[1] - 1;\n if (r2 <= r1) {\n rv += \' {String Offset: \' + r1 + \'}\';\n } else {\n rv += \' {String Offset range: \' + r1 + \' .. \' + r2 + \'}\';\n }\n }\n return rv;\n },\n\n /**\n * test the lexed token: return FALSE when not a match, otherwise return token.\n * \n * `match` is supposed to be an array coming out of a regex match, i.e. 
`match[0]`\n * contains the actually matched text string.\n * \n * Also move the input cursor forward and update the match collectors:\n * \n * - `yytext`\n * - `yyleng`\n * - `match`\n * - `matches`\n * - `yylloc`\n * - `offset`\n * \n * @public\n * @this {RegExpLexer}\n */\n test_match: function lexer_test_match(match, indexed_rule) {\n var token,\n lines,\n backup,\n match_str,\n match_str_len;\n\n if (this.options.backtrack_lexer) {\n // save context\n backup = {\n yylineno: this.yylineno,\n yylloc: {\n first_line: this.yylloc.first_line,\n last_line: this.yylloc.last_line,\n first_column: this.yylloc.first_column,\n last_column: this.yylloc.last_column,\n\n range: this.yylloc.range.slice(0)\n },\n yytext: this.yytext,\n match: this.match,\n matches: this.matches,\n matched: this.matched,\n yyleng: this.yyleng,\n offset: this.offset,\n _more: this._more,\n _input: this._input,\n //_signaled_error_token: this._signaled_error_token,\n yy: this.yy,\n conditionStack: this.conditionStack.slice(0),\n done: this.done\n };\n }\n\n match_str = match[0];\n match_str_len = match_str.length;\n // if (match_str.indexOf(\'\\n\') !== -1 || match_str.indexOf(\'\\r\') !== -1) {\n lines = match_str.split(/(?:\\r\\n?|\\n)/g);\n if (lines.length > 1) {\n this.yylineno += lines.length - 1;\n\n this.yylloc.last_line = this.yylineno + 1;\n this.yylloc.last_column = lines[lines.length - 1].length;\n } else {\n this.yylloc.last_column += match_str_len;\n }\n // }\n this.yytext += match_str;\n this.match += match_str;\n this.matched += match_str;\n this.matches = match;\n this.yyleng = this.yytext.length;\n this.yylloc.range[1] += match_str_len;\n\n // previous lex rules MAY have invoked the `more()` API rather than producing a token:\n // those rules will already have moved this `offset` forward matching their match lengths,\n // hence we must only add our own match length now:\n this.offset += match_str_len;\n this._more = false;\n this._backtrack = false;\n this._input = this._input.slice(match_str_len);\n\n // calling this method:\n //\n // function lexer__performAction(yy, yyrulenumber, YY_START) {...}\n token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */);\n // otherwise, when the action codes are all simple return token statements:\n //token = this.simpleCaseActionClusters[indexed_rule];\n\n if (this.done && this._input) {\n this.done = false;\n }\n if (token) {\n return token;\n } else if (this._backtrack) {\n // recover context\n for (var k in backup) {\n this[k] = backup[k];\n }\n this.__currentRuleSet__ = null;\n return false; // rule action called reject() implying the next rule should be tested instead.\n } else if (this._signaled_error_token) {\n // produce one \'error\' token as `.parseError()` in `reject()`\n // did not guarantee a failure signal by throwing an exception!\n token = this._signaled_error_token;\n this._signaled_error_token = false;\n return token;\n }\n return false;\n },\n\n /**\n * return next match in input\n * \n * @public\n * @this {RegExpLexer}\n */\n next: function lexer_next() {\n if (this.done) {\n this.clear();\n return this.EOF;\n }\n if (!this._input) {\n this.done = true;\n }\n\n var token,\n match,\n tempMatch,\n index;\n if (!this._more) {\n this.clear();\n }\n var spec = this.__currentRuleSet__;\n if (!spec) {\n // Update the ruleset cache as we apparently encountered a state change or just started lexing.\n // The cache is set up for fast lookup -- we assume a lexer will switch states 
much less often than it will\n // invoke the `lex()` token-producing API and related APIs, hence caching the set for direct access helps\n // speed up those activities a tiny bit.\n spec = this.__currentRuleSet__ = this._currentRules();\n // Check whether a *sane* condition has been pushed before: this makes the lexer robust against\n // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19\n if (!spec || !spec.rules) {\n var lineno_msg = \'\';\n if (this.options.trackPosition) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Internal lexer engine error\' + lineno_msg + \': The lex grammar programmer pushed a non-existing condition name "\' + this.topState() + \'"; this is a fatal error and should be reported to the application programmer team!\', false);\n // produce one \'error\' token until this situation has been resolved, most probably by parse termination!\n return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n }\n }\n\n var rule_ids = spec.rules;\n var regexes = spec.__rule_regexes;\n var len = spec.__rule_count;\n\n // Note: the arrays are 1-based, while `len` itself is a valid index,\n // hence the non-standard less-or-equal check in the next loop condition!\n for (var i = 1; i <= len; i++) {\n tempMatch = this._input.match(regexes[i]);\n if (tempMatch && (!match || tempMatch[0].length > match[0].length)) {\n match = tempMatch;\n index = i;\n if (this.options.backtrack_lexer) {\n token = this.test_match(tempMatch, rule_ids[i]);\n if (token !== false) {\n return token;\n } else if (this._backtrack) {\n match = undefined;\n continue; // rule action called reject() implying a rule MISmatch.\n } else {\n // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)\n return false;\n }\n } else if (!this.options.flex) {\n break;\n }\n }\n }\n if (match) {\n token = this.test_match(match, rule_ids[index]);\n if (token !== false) {\n return token;\n }\n // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)\n return false;\n }\n if (!this._input) {\n this.done = true;\n this.clear();\n return this.EOF;\n } else {\n var lineno_msg = \'\';\n if (this.options.trackPosition) {\n lineno_msg = \' on line \' + (this.yylineno + 1);\n }\n var p = this.constructLexErrorInfo(\'Lexical error\' + lineno_msg + \': Unrecognized text.\', this.options.lexerErrorsAreRecoverable);\n\n var pendingInput = this._input;\n var activeCondition = this.topState();\n var conditionStackDepth = this.conditionStack.length;\n\n token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);\n if (token === this.ERROR) {\n // we can try to recover from a lexer error that `parseError()` did not \'recover\' for us\n // by moving forward at least one character at a time IFF the (user-specified?) 
`parseError()`\n // has not consumed/modified any pending input or changed state in the error handler:\n if (!this.matches && \n // and make sure the input has been modified/consumed ...\n pendingInput === this._input &&\n // ...or the lexer state has been modified significantly enough\n // to merit a non-consuming error handling action right now.\n activeCondition === this.topState() && \n conditionStackDepth === this.conditionStack.length\n ) {\n this.input();\n }\n }\n return token;\n }\n },\n\n /**\n * return next match that has a token\n * \n * @public\n * @this {RegExpLexer}\n */\n lex: function lexer_lex() {\n var r;\n // allow the PRE/POST handlers to set/modify the return token for maximum flexibility of the generated lexer:\n if (typeof this.options.pre_lex === \'function\') {\n r = this.options.pre_lex.call(this);\n }\n\n while (!r) {\n r = this.next();\n }\n\n if (typeof this.options.post_lex === \'function\') {\n // (also account for a userdef function which does not return any value: keep the token as is)\n r = this.options.post_lex.call(this, r) || r;\n }\n return r;\n },\n\n /**\n * backwards-compatible alias for `pushState()`;\n * the latter is symmetrical with `popState()` and we advise using\n * those APIs in any modern lexer code, rather than `begin()`.\n * \n * @public\n * @this {RegExpLexer}\n */\n begin: function lexer_begin(condition) {\n return this.pushState(condition);\n },\n\n /**\n * activates a new lexer condition state (pushes the new lexer\n * condition state onto the condition stack)\n * \n * @public\n * @this {RegExpLexer}\n */\n pushState: function lexer_pushState(condition) {\n this.conditionStack.push(condition);\n this.__currentRuleSet__ = null;\n return this;\n },\n\n /**\n * pop the previously active lexer condition state off the condition\n * stack\n * \n * @public\n * @this {RegExpLexer}\n */\n popState: function lexer_popState() {\n var n = this.conditionStack.length - 1;\n if (n > 0) {\n this.__currentRuleSet__ = null; \n return this.conditionStack.pop();\n } else {\n return this.conditionStack[0];\n }\n },\n\n /**\n * return the currently active lexer condition state; when an index\n * argument is provided it produces the N-th previous condition state,\n * if available\n * \n * @public\n * @this {RegExpLexer}\n */\n topState: function lexer_topState(n) {\n n = this.conditionStack.length - 1 - Math.abs(n || 0);\n if (n >= 0) {\n return this.conditionStack[n];\n } else {\n return \'INITIAL\';\n }\n },\n\n /**\n * (internal) determine the lexer rule set which is active for the\n * currently active lexer condition state\n * \n * @public\n * @this {RegExpLexer}\n */\n _currentRules: function lexer__currentRules() {\n if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) {\n return this.conditions[this.conditionStack[this.conditionStack.length - 1]];\n } else {\n return this.conditions[\'INITIAL\'];\n }\n },\n\n /**\n * return the number of states currently on the stack\n * \n * @public\n * @this {RegExpLexer}\n */\n stateStackSize: function lexer_stateStackSize() {\n return this.conditionStack.length;\n }\n}'; + // --- END lexer kernel --- + } + + RegExpLexer.prototype = new Function(rmCommonWS(_templateObject2, getRegExpLexerPrototype()))(); + + // The lexer code stripper, driven by optimization analysis settings and + // lexer options, which cannot be changed at run-time. + function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... 
${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + + new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS(_templateObject3, opt.options.backtrack_lexer, opt.options.ranges, opt.options.trackPosition, opt.parseActionsUseYYLENG, opt.parseActionsUseYYLINENO, opt.parseActionsUseYYTEXT, opt.parseActionsUseYYLOC, opt.parseActionsUseValueTracking, opt.parseActionsUseValueAssignment, opt.parseActionsUseLocationTracking, opt.parseActionsUseLocationAssignment, opt.lexerActionsUseYYLENG, opt.lexerActionsUseYYLINENO, opt.lexerActionsUseYYTEXT, opt.lexerActionsUseYYLOC, opt.lexerActionsUseParseError, opt.lexerActionsUseYYERROR, opt.lexerActionsUseLocationTracking, opt.lexerActionsUseMore, opt.lexerActionsUseUnput, opt.lexerActionsUseReject, opt.lexerActionsUseLess, opt.lexerActionsUseDisplayAPIs, opt.lexerActionsUseDescribeYYLOC)); + + return new_src; + } + + // generate lexer source from a grammar + /** @public */ + function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); + } + + // process the grammar and build final data structures and functions + /** @public */ + function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???' + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = dict.actionInclude || ''; + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; + } + + // Assemble the final source from the processed grammar + /** @public */ + function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; + } + + function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = 1 + Math.log10(a.length | 1) | 0; + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return '/* ' + idx_str + ': */ ' + re; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return '/* ' + idx_str + ': */ new XRegExp("' + re_src + '", "' + re.xregexp.flags + '")'; + } else { + return '/* ' + idx_str + ': */ ' + re; + } + }); + return b.join(',\n'); + } + + function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1 + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(' "(' + ID_REGEX_BASE + ')": ', 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
+ var code = [rmCommonWS(_templateObject4), '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc.replace(/^[\s\r\n]*\{/, '').replace(/\s*\}[\s\r\n]*$/, '').trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS(_templateObject5, performActionCode, simpleCaseActionClustersCode, rulesCode, conditionsCode)); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; + } + + function generateGenericHeaderComment() { + var out = rmCommonWS(_templateObject6, version); + + return out; + } + + function prepareOptions(opt) { + opt = opt || {}; + + // check for illegal identifier + if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) { + if (opt.moduleName) { + var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.'; + if (typeof opt.warn_cb === 'function') { + opt.warn_cb(msg); + } else { + // do not treat as warning; barf hairball instead so that this oddity gets noticed right away! + throw new Error(msg); + } + } + opt.moduleName = 'lexer'; + } + + prepExportStructures(opt); + + return opt; + } + + function generateModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var ' + opt.moduleName + ' = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; + } + + function generateAMDModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'define([], function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? 
opt.moduleInclude + ';' : '', '', 'return lexer;', '});']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; + } + + function generateESModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var lexer = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();', '', 'function yylex() {', ' return lexer.lex.apply(lexer, arguments);', '}', rmCommonWS(_templateObject7)]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; + } + + function generateCommonJSModule(opt) { + opt = prepareOptions(opt); + + var out = [generateGenericHeaderComment(), '', 'var ' + opt.moduleName + ' = (function () {', jisonLexerErrorDefinition, '', generateModuleBody(opt), '', opt.moduleInclude ? opt.moduleInclude + ';' : '', '', 'return lexer;', '})();', '', 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', ' exports.lexer = ' + opt.moduleName + ';', ' exports.lex = function () {', ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', ' };', '}']; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; + } + + RegExpLexer.generate = generate; + + RegExpLexer.version = version; + RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; + RegExpLexer.mkStdOptions = mkStdOptions; + RegExpLexer.camelCase = camelCase; + RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + + return RegExpLexer; +}); diff --git a/dist/regexp-lexer-umd.js b/dist/regexp-lexer-umd.js new file mode 100644 index 0000000..1467f9c --- /dev/null +++ b/dist/regexp-lexer-umd.js @@ -0,0 +1,4055 @@ +(function (global, factory) { + typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory(require('@gerhobbelt/xregexp'), require('@gerhobbelt/json5'), require('@gerhobbelt/lex-parser'), require('assert'), require('jison-helpers-lib')) : + typeof define === 'function' && define.amd ? define(['@gerhobbelt/xregexp', '@gerhobbelt/json5', '@gerhobbelt/lex-parser', 'assert', 'jison-helpers-lib'], factory) : + (global['regexp-lexer'] = factory(global.XRegExp,global.json5,global.lexParser,global.assert,global.helpers)); +}(this, (function (XRegExp,json5,lexParser,assert,helpers) { 'use strict'; + +XRegExp = XRegExp && XRegExp.hasOwnProperty('default') ? XRegExp['default'] : XRegExp; +json5 = json5 && json5.hasOwnProperty('default') ? json5['default'] : json5; +lexParser = lexParser && lexParser.hasOwnProperty('default') ? lexParser['default'] : lexParser; +assert = assert && assert.hasOwnProperty('default') ? assert['default'] : assert; +helpers = helpers && helpers.hasOwnProperty('default') ? helpers['default'] : helpers; + +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. 
Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +const XREGEXP_UNICODE_ESCAPE_RE$1 = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +const CHR_RE$1 = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/; +const SET_PART_RE$1 = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const NOTHING_SPECIAL_RE$1 = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/; +const SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/; + +const UNICODE_BASE_PLANE_MAX_CP$1 = 65535; + +// The expanded regex sets which are equivalent to the given `\\{c}` escapes: +// +// `/\s/`: +const WHITESPACE_SETSTR$1 = ' \f\n\r\t\v\u00a0\u1680\u180e\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff'; +// `/\d/`: +const DIGIT_SETSTR$1 = '0-9'; +// `/\w/`: +const WORDCHAR_SETSTR$1 = 'A-Za-z0-9_'; + + + + + +// Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex +function i2c(i) { + var c, x; + + switch (i) { + case 10: + return '\\n'; + + case 13: + return '\\r'; + + case 9: + return '\\t'; + + case 8: + return '\\b'; + + case 12: + return '\\f'; + + case 11: + return '\\v'; + + case 45: // ASCII/Unicode for '-' dash + return '\\-'; + + case 91: // '[' + return '\\['; + + case 92: // '\\' + return '\\\\'; + + case 93: // ']' + return '\\]'; + + case 94: // '^' + return '\\^'; + } + if (i < 32 + || i > 0xFFF0 /* Unicode Specials, also in UTF16 */ + || (i >= 0xD800 && i <= 0xDFFF) /* Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */ + || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */ + ) { + // Detail about a detail: + // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript + // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report + // a b0rked generated parser, as the generated code would include this regex right here. + // Hence we MUST escape these buggers everywhere we go... + x = i.toString(16); + if (x.length >= 1 && i <= 0xFFFF) { + c = '0000' + x; + return '\\u' + c.substr(c.length - 4); + } else { + return '\\u{' + x + '}'; + } + } + return String.fromCharCode(i); +} + + +// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating +// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a +// `\\p{NAME}` shorthand to represent [part of] the bitarray: +var Pcodes_bitarray_cache = {}; +var Pcodes_bitarray_cache_test_order = []; + +// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by +// a single regex 'escape', e.g. `\d` for digits 0-9. +var EscCode_bitarray_output_refs; + +// now initialize the EscCodes_... 
table above: +init_EscCode_lookup_table(); + +function init_EscCode_lookup_table() { + var s, bitarr, set2esc = {}, esc2bitarr = {}; + + // patch global lookup tables for the time being, while we calculate their *real* content in this function: + EscCode_bitarray_output_refs = { + esc2bitarr: {}, + set2esc: {} + }; + Pcodes_bitarray_cache_test_order = []; + + // `/\S': + bitarr = []; + set2bitarray(bitarr, '^' + WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['S'] = bitarr; + set2esc[s] = 'S'; + // set2esc['^' + s] = 's'; + Pcodes_bitarray_cache['\\S'] = bitarr; + + // `/\s': + bitarr = []; + set2bitarray(bitarr, WHITESPACE_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['s'] = bitarr; + set2esc[s] = 's'; + // set2esc['^' + s] = 'S'; + Pcodes_bitarray_cache['\\s'] = bitarr; + + // `/\D': + bitarr = []; + set2bitarray(bitarr, '^' + DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['D'] = bitarr; + set2esc[s] = 'D'; + // set2esc['^' + s] = 'd'; + Pcodes_bitarray_cache['\\D'] = bitarr; + + // `/\d': + bitarr = []; + set2bitarray(bitarr, DIGIT_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['d'] = bitarr; + set2esc[s] = 'd'; + // set2esc['^' + s] = 'D'; + Pcodes_bitarray_cache['\\d'] = bitarr; + + // `/\W': + bitarr = []; + set2bitarray(bitarr, '^' + WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['W'] = bitarr; + set2esc[s] = 'W'; + // set2esc['^' + s] = 'w'; + Pcodes_bitarray_cache['\\W'] = bitarr; + + // `/\w': + bitarr = []; + set2bitarray(bitarr, WORDCHAR_SETSTR$1); + s = bitarray2set(bitarr); + esc2bitarr['w'] = bitarr; + set2esc[s] = 'w'; + // set2esc['^' + s] = 'W'; + Pcodes_bitarray_cache['\\w'] = bitarr; + + EscCode_bitarray_output_refs = { + esc2bitarr: esc2bitarr, + set2esc: set2esc + }; + + updatePcodesBitarrayCacheTestOrder(); +} + +function updatePcodesBitarrayCacheTestOrder(opts) { + var t = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var l = {}; + var user_has_xregexp = opts && opts.options && opts.options.xregexp; + var i, j, k, ba; + + // mark every character with which regex pcodes they are part of: + for (k in Pcodes_bitarray_cache) { + ba = Pcodes_bitarray_cache[k]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + var cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + cnt++; + if (!t[i]) { + t[i] = [k]; + } else { + t[i].push(k); + } + } + } + l[k] = cnt; + } + + // now dig out the unique ones: only need one per pcode. + // + // We ASSUME every \\p{NAME} 'pcode' has at least ONE character + // in it that is ONLY matched by that particular pcode. + // If this assumption fails, nothing is lost, but our 'regex set + // optimized representation' will be sub-optimal as then this pcode + // won't be tested during optimization. + // + // Now that would be a pity, so the assumption better holds... + // Turns out the assumption doesn't hold already for /\S/ + /\D/ + // as the second one (\D) is a pure subset of \S. So we have to + // look for markers which match multiple escapes/pcodes for those + // ones where a unique item isn't available... 
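+ //
+ // For illustration only (the actual probe codepoints are whatever the two
+ // scan passes below settle on): the `lut` built here is a list of
+ // `[codepoint, pcode]` pairs, ordered from largest to smallest set, e.g.
+ // something like
+ //
+ //     [ [0x1234, '\\S'], [0x0030, '\\d'], ... ]
+ //
+ // where a hit on `l[0x1234]` serves as a cheap first probe for "might this
+ // set be representable by `\\S`?" before `bitarray2set()` runs the full
+ // bitarray comparison for that pcode.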
+ var lut = []; + var done = {}; + var keys = Object.keys(Pcodes_bitarray_cache); + + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + k = t[i][0]; + if (t[i].length === 1 && !done[k]) { + assert(l[k] > 0); + lut.push([i, k]); + done[k] = true; + } + } + + for (j = 0; keys[j]; j++) { + k = keys[j]; + + if (!user_has_xregexp && k.indexOf('\\p{') >= 0) { + continue; + } + + if (!done[k]) { + assert(l[k] > 0); + // find a minimum span character to mark this one: + var w = Infinity; + var rv; + ba = Pcodes_bitarray_cache[k]; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (ba[i]) { + var tl = t[i].length; + if (tl > 1 && tl < w) { + assert(l[k] > 0); + rv = [i, k]; + w = tl; + } + } + } + if (rv) { + done[k] = true; + lut.push(rv); + } + } + } + + // order from large set to small set so that small sets don't gobble + // characters also represented by overlapping larger set pcodes. + // + // Again we assume something: that finding the large regex pcode sets + // before the smaller, more specialized ones, will produce a more + // optimal minification of the regex set expression. + // + // This is a guestimate/heuristic only! + lut.sort(function (a, b) { + var k1 = a[1]; + var k2 = b[1]; + var ld = l[k2] - l[k1]; + if (ld) { + return ld; + } + // and for same-size sets, order from high to low unique identifier. + return b[0] - a[0]; + }); + + Pcodes_bitarray_cache_test_order = lut; +} + + + + + + +// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set. +function set2bitarray(bitarr, s, opts) { + var orig = s; + var set_is_inverted = false; + var bitarr_orig; + + function mark(d1, d2) { + if (d2 == null) d2 = d1; + for (var i = d1; i <= d2; i++) { + bitarr[i] = true; + } + } + + function add2bitarray(dst, src) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (src[i]) { + dst[i] = true; + } + } + } + + function eval_escaped_code(s) { + var c; + // decode escaped code? If none, just take the character as-is + if (s.indexOf('\\') === 0) { + var l = s.substr(0, 2); + switch (l) { + case '\\c': + c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1; + return String.fromCharCode(c); + + case '\\x': + s = s.substr(2); + c = parseInt(s, 16); + return String.fromCharCode(c); + + case '\\u': + s = s.substr(2); + if (s[0] === '{') { + s = s.substr(1, s.length - 2); + } + c = parseInt(s, 16); + if (c >= 0x10000) { + return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U+FFFF) in regex set expressions, e.g. \\u{' + s + '}'); + } + return String.fromCharCode(c); + + case '\\0': + case '\\1': + case '\\2': + case '\\3': + case '\\4': + case '\\5': + case '\\6': + case '\\7': + s = s.substr(1); + c = parseInt(s, 8); + return String.fromCharCode(c); + + case '\\r': + return '\r'; + + case '\\n': + return '\n'; + + case '\\v': + return '\v'; + + case '\\f': + return '\f'; + + case '\\t': + return '\t'; + + case '\\b': + return '\b'; + + default: + // just the character itself: + return s.substr(1); + } + } else { + return s; + } + } + + if (s && s.length) { + var c1, c2; + + // inverted set? + if (s[0] === '^') { + set_is_inverted = true; + s = s.substr(1); + bitarr_orig = bitarr; + bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + } + + // BITARR collects flags for the characters in the set. Inversion means the complement set of characters is set instead. + // This results in an OR operation when sets are joined/chained. 
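+ //
+ // A quick sketch of that OR behaviour (hypothetical set contents):
+ //
+ //     var ba = [];
+ //     set2bitarray(ba, 'a-c');      // marks slots 97..99
+ //     set2bitarray(ba, 'b-e');      // marks slots 98..101; 98+99 simply remain marked
+ //     bitarray2set(ba);             // --> 'a-e'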
+ + while (s.length) { + c1 = s.match(CHR_RE$1); + if (!c1) { + // hit an illegal escape sequence? cope anyway! + c1 = s[0]; + } else { + c1 = c1[0]; + // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those + // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit + // XRegExp support, but alas, we'll get there when we get there... ;-) + switch (c1) { + case '\\p': + s = s.substr(c1.length); + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE$1); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + // do we have this one cached already? + var pex = c1 + c2; + var ba4p = Pcodes_bitarray_cache[pex]; + if (!ba4p) { + // expand escape: + var xr = new XRegExp('[' + pex + ']'); // TODO: case-insensitive grammar??? + // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`: + var xs = '' + xr; + // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside: + xs = xs.substr(1, xs.length - 2); + + ba4p = reduceRegexToSetBitArray(xs, pex, opts); + + Pcodes_bitarray_cache[pex] = ba4p; + updatePcodesBitarrayCacheTestOrder(opts); + } + // merge bitarrays: + add2bitarray(bitarr, ba4p); + continue; + } + break; + + case '\\S': + case '\\s': + case '\\W': + case '\\w': + case '\\d': + case '\\D': + // these can't participate in a range, but need to be treated special: + s = s.substr(c1.length); + // check for \S, \s, \D, \d, \W, \w and expand them: + var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]]; + assert(ba4e); + add2bitarray(bitarr, ba4e); + continue; + + case '\\b': + // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace + c1 = '\u0008'; + break; + } + } + var v1 = eval_escaped_code(c1); + // propagate deferred exceptions = error reports. + if (v1 instanceof Error) { + return v1; + } + v1 = v1.charCodeAt(0); + s = s.substr(c1.length); + + if (s[0] === '-' && s.length >= 2) { + // we can expect a range like 'a-z': + s = s.substr(1); + c2 = s.match(CHR_RE$1); + if (!c2) { + // hit an illegal escape sequence? cope anyway! + c2 = s[0]; + } else { + c2 = c2[0]; + } + var v2 = eval_escaped_code(c2); + // propagate deferred exceptions = error reports. + if (v2 instanceof Error) { + return v1; + } + v2 = v2.charCodeAt(0); + s = s.substr(c2.length); + + // legal ranges go UP, not /DOWN! + if (v1 <= v2) { + mark(v1, v2); + } else { + console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 }); + mark(v1); + mark('-'.charCodeAt(0)); + mark(v2); + } + continue; + } + mark(v1); + } + + // When we have marked all slots, '^' NEGATES the set, hence we flip all slots. + // + // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK + // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire + // range then. + if (set_is_inverted) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!bitarr[i]) { + bitarr_orig[i] = true; + } + } + } + } + return false; +} + + +// convert a simple bitarray back into a regex set `[...]` content: +function bitarray2set(l, output_inverted_variant, output_minimized) { + // construct the inverse(?) 
set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } + else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... 
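+ //
+ // For example (illustrative bits only): with the slots for '0'-'9' and
+ // 'a'-'z' marked, the run-length scan further below finds the spans
+ // 48..57 and 97..122 and emits '0', '-9', 'a', '-z', i.e. the set
+ // content '0-9a-z', which a caller may then wrap as `[0-9a-z]`.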
+ cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP$1; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } + else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP$1 + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP$1; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP$1) { + // find first character not in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP$1 + 1) { + j = UNICODE_BASE_PLANE_MAX_CP$1 + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; +} + + + + + +// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; +// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. +function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP$1 + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE$1); + if (!c1) { + // cope with illegal escape sequences too! 
+        return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+    } else {
+        c1 = c1[0];
+    }
+    s = s.substr(c1.length);
+
+    switch (c1) {
+    case '[':
+        // this is starting a set within the regex: scan until end of set!
+        var set_content = [];
+        while (s.length) {
+            var inner = s.match(SET_PART_RE$1);
+            if (!inner) {
+                inner = s.match(CHR_RE$1);
+                if (!inner) {
+                    // cope with illegal escape sequences too!
+                    return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+                } else {
+                    inner = inner[0];
+                }
+                if (inner === ']') break;
+            } else {
+                inner = inner[0];
+            }
+            set_content.push(inner);
+            s = s.substr(inner.length);
+        }
+
+        // ensure that we hit the terminating ']':
+        var c2 = s.match(CHR_RE$1);
+        if (!c2) {
+            // cope with illegal escape sequences too!
+            return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"');
+        } else {
+            c2 = c2[0];
+        }
+        if (c2 !== ']') {
+            return new Error('regex set expression is broken in regex: ' + orig);
+        }
+        s = s.substr(c2.length);
+
+        var se = set_content.join('');
+        if (!internal_state) {
+            derr = set2bitarray(l, se, opts);
+            // propagate deferred exceptions = error reports.
+            if (derr instanceof Error) {
+                return derr;
+            }
+
+            // a set is to be used like a single character in a longer literal phrase, hence input `[abc]word[def]` would thus produce output `[abc]`:
+            internal_state = 1;
+        }
+        break;
+
+    // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into
+    // something ready for use inside a regex set, e.g. `\\r\\n`.
+    //
+    // > Of course, we realize that converting more complex piped constructs this way
+    // > will produce something you might not expect, e.g. `A|WORD2` which
+    // > would end up as the set `[AW]`, which is something entirely different from
+    // > the input.
+    // >
+    // > However, we can only depend on the user (grammar writer) to realize this and
+    // > prevent this from happening by not creating such oddities in the input grammar.
+    case '|':
+        // a|b --> [ab]
+        internal_state = 0;
+        break;
+
+    case '(':
+        // (a) --> a
+        //
+        // TODO - right now we treat this as 'too complex':
+
+        // Strip off some possible outer wrappers which we know how to remove.
+        // We don't worry about 'damaging' the regex as any too-complex regex will be caught
+        // in the validation check at the end; our 'strippers' here would not damage useful
+        // regexes anyway, and if they damage the unacceptable ones, that is fine.
+        s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');         // (?:...) -> ... and (...) -> ...
+        s = s.replace(/^\^?(.*?)\$?$/, '$1');               // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: once before, once after this)
+        s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');         // (?:...) -> ... and (...) -> ...
+
+        return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+    case '.':
+    case '*':
+    case '+':
+    case '?':
+        // wildcard
+        //
+        // TODO - right now we treat this as 'too complex':
+        return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+    case '{':                        // range, e.g. `x{1,3}`, or macro?
+        // TODO - right now we treat this as 'too complex':
+        return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+    default:
+        // literal character or word: take the first character only and ignore the rest, so that
+        // the constructed set for `word|noun` would be `[wn]`:
+        if (!internal_state) {
+            derr = set2bitarray(l, c1, opts);
+            // propagate deferred exceptions = error reports.
+            if (derr instanceof Error) {
+                return derr;
+            }
+
+            internal_state = 2;
+        }
+        break;
+    }
+    }
+
+    s = bitarray2set(l);
+
+    // When this result is suitable for use in a set, then we should be able to compile
+    // it in a regex; that way we can easily validate whether macro X is fit to be used
+    // inside a regex set:
+    try {
+        var re;
+        assert(s);
+        assert(!(s instanceof Error));
+        re = new XRegExp('[' + s + ']');
+        re.test(s[0]);
+
+        // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`,
+        // so we check for any lingering UNESCAPED brackets in here, as those cannot be legal:
+        if (/[^\\][\[\]]/.exec(s)) {
+            throw new Error('unescaped brackets in set data');
+        }
+    } catch (ex) {
+        // make sure we produce a set range expression which will fail badly when it is used
+        // in actual code:
+        s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+    }
+
+    assert(s);
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+    return l;
+}
+
+
+
+
+// Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]`
+// -- or in this example it can be further optimized to only `\d`!
+function produceOptimizedRegex4Set(bitarr) {
+    // First try to produce a minimum regex from the bitarray directly:
+    var s1 = bitarray2set(bitarr, false, true);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s1;
+    } else {
+        s1 = '[' + s1 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s2 = bitarray2set(bitarr, true, true);
+
+    if (s2[0] === '^') {
+        s2 = s2.substr(1);
+        if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s2;
+        }
+    } else {
+        s2 = '^' + s2;
+    }
+    s2 = '[' + s2 + ']';
+
+    // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end,
+    // we also check against the plain, unadulterated regex set expressions:
+    //
+    // First try to produce a minimum regex from the bitarray directly:
+    var s3 = bitarray2set(bitarr, false, false);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s3.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s3;
+    } else {
+        s3 = '[' + s3 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s4 = bitarray2set(bitarr, true, false);
+
+    if (s4[0] === '^') {
+        s4 = s4.substr(1);
+        if (s4.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s4;
+        }
+    } else {
+        s4 = '^' + s4;
+    }
+    s4 = '[' + s4 + ']';
+
+    if (s2.length < s1.length) {
+        s1 = s2;
+    }
+    if (s3.length < s1.length) {
+        s1 = s3;
+    }
+    if (s4.length < s1.length) {
+        s1 = s4;
+    }
+
+    return s1;
+}
+
+
+
+
+
+
+var setmgmt = {
+    XREGEXP_UNICODE_ESCAPE_RE: XREGEXP_UNICODE_ESCAPE_RE$1,
+    CHR_RE: CHR_RE$1,
+    SET_PART_RE: SET_PART_RE$1,
+    NOTHING_SPECIAL_RE: NOTHING_SPECIAL_RE$1,
+    SET_IS_SINGLE_PCODE_RE,
+
+    UNICODE_BASE_PLANE_MAX_CP: UNICODE_BASE_PLANE_MAX_CP$1,
+
+    WHITESPACE_SETSTR: WHITESPACE_SETSTR$1,
+    DIGIT_SETSTR: DIGIT_SETSTR$1,
+    WORDCHAR_SETSTR: WORDCHAR_SETSTR$1,
+
+    set2bitarray,
+    bitarray2set,
+    produceOptimizedRegex4Set,
+    reduceRegexToSetBitArray,
+};
+
+// Basic Lexer implemented using JavaScript regular expressions
+// Zachary Carter
+// MIT Licensed
+
+var rmCommonWS = helpers.rmCommonWS;
+var camelCase = helpers.camelCase;
+var code_exec = helpers.exec;
+// import recast from '@gerhobbelt/recast';
+// import astUtils from '@gerhobbelt/ast-util';
+var version = '0.6.1-205';                              // require('./package.json').version;
+
+
+
+
+const XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE;              // Matches the XRegExp Unicode escape braced part, e.g. `{Number}`
+const CHR_RE = setmgmt.CHR_RE;
+const SET_PART_RE = setmgmt.SET_PART_RE;
+const NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE;
+const UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP;
+
+// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`)
+//
+// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well!
+const ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*';
+
+
+
+
+// see also ./lib/cli.js
+/**
+@public
+@nocollapse
+*/
+const defaultJisonLexOptions = {
+    moduleType: 'commonjs',
+    debug: false,
+    enableDebugLogs: false,
+    json: false,
+    main: false,                        // CLI: not:(--main option)
+    dumpSourceCodeOnFailure: true,
+    throwErrorOnCompileFailure: true,
+
+    moduleName: undefined,
+    defaultModuleName: 'lexer',
+    file: undefined,
+    outfile: undefined,
+    inputPath: undefined,
+    inputFilename: undefined,
+    warn_cb: undefined,                 // function(msg) | true (= use Jison.Print) | false (= throw Exception)
+
+    xregexp: false,
+    lexerErrorsAreRecoverable: false,
+    flex: false,
+    backtrack_lexer: false,
+    ranges: false,                      // track position range, i.e. start+end indexes in the input string
+    trackPosition: true,                // track line+column position in the input string
+    caseInsensitive: false,
+    showSource: false,
+    exportSourceCode: false,
+    exportAST: false,
+    prettyCfg: true,
+    pre_lex: undefined,
+    post_lex: undefined,
+};
+
+
+// Merge sets of options.
+//
+// Convert alternative jison option names to their base option.
+//
+// The *last* option set which overrides the default wins, where 'override' is
+// defined as specifying a not-undefined value which is not equal to the
+// default value.
+//
+// When the FIRST argument is the STRING "NODEFAULT", then we MUST NOT mix in the
+// default values available in defaultJisonLexOptions.
+//
+// Return a fresh set of options.
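+//
+// E.g. (an illustrative sketch, not part of the module itself):
+//
+// ```
+// var opts = mkStdOptions({ 'case-insensitive': true }, { main: true });
+// // --> opts.caseInsensitive === true    (option names are camelCased on the way in)
+// // --> opts.noMain === false            (`main` is translated to its `noMain` base option)
+// ```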
+/** @public */ +function mkStdOptions(/*...args*/) { + var h = Object.prototype.hasOwnProperty; + + var opts = {}; + var args = [].concat.apply([], arguments); + // clone defaults, so we do not modify those constants? + if (args[0] !== "NODEFAULT") { + args.unshift(defaultJisonLexOptions); + } else { + args.shift(); + } + + for (var i = 0, len = args.length; i < len; i++) { + var o = args[i]; + if (!o) continue; + + // clone input (while camel-casing the options), so we do not modify those either. + var o2 = {}; + + for (var p in o) { + if (typeof o[p] !== 'undefined' && h.call(o, p)) { + o2[camelCase(p)] = o[p]; + } + } + + // now clean them options up: + if (typeof o2.main !== 'undefined') { + o2.noMain = !o2.main; + } + + delete o2.main; + + // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI + // NOT overriding the moduleName set in the grammar definition file via an `%options` entry: + if (o2.moduleName === o2.defaultModuleName) { + delete o2.moduleName; + } + + // now see if we have an overriding option here: + for (var p in o2) { + if (h.call(o2, p)) { + if (typeof o2[p] !== 'undefined') { + opts[p] = o2[p]; + } + } + } + } + + return opts; +} + +// set up export/output attributes of the `options` object instance +function prepExportStructures(options) { + // set up the 'option' `exportSourceCode` as a hash object for returning + // all generated source code chunks to the caller + var exportSourceCode = options.exportSourceCode; + if (!exportSourceCode || typeof exportSourceCode !== 'object') { + exportSourceCode = { + enabled: !!exportSourceCode + }; + } else if (typeof exportSourceCode.enabled !== 'boolean') { + exportSourceCode.enabled = true; + } + options.exportSourceCode = exportSourceCode; +} + +// Autodetect if the input lexer spec is in JSON or JISON +// format when the `options.json` flag is `true`. +// +// Produce the JSON lexer spec result when these are JSON formatted already as that +// would save us the trouble of doing this again, anywhere else in the JISON +// compiler/generator. +// +// Otherwise return the *parsed* lexer spec as it has +// been processed through LexParser. +function autodetectAndConvertToJSONformat(lexerSpec, options) { + var chk_l = null; + var ex1, err; + + if (typeof lexerSpec === 'string') { + if (options.json) { + try { + chk_l = json5.parse(lexerSpec); + + // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode` + // *OR* there's a JSON/JSON5 format error in the input: + } catch (e) { + ex1 = e; + } + } + if (!chk_l) { + // // WARNING: the lexer may receive options specified in the **grammar spec file**, + // // hence we should mix the options to ensure the lexParser always + // // receives the full set! + // // + // // make sure all options are 'standardized' before we go and mix them together: + // options = mkStdOptions(grammar.options, options); + try { + chk_l = lexParser.parse(lexerSpec, options); + } catch (e) { + if (options.json) { + err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')'); + err.secondary_exception = e; + err.stack = ex1.stack; + } else { + err = new Error('Could not parse lexer spec\nError: ' + e.message); + err.stack = e.stack; + } + throw err; + } + } + } else { + chk_l = lexerSpec; + } + + // Save time! 
Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} + + +// expand macros and convert matchers to RegExp's +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, i, k, rule, action, conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } + + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); + } + + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); + } + + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; + + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { + // implicit add to all inclusive start conditions + for (k in startConditions) { + if (startConditions[k].inclusive) { + active_conditions.push(k); + startConditions[k].rules.push(i); + } + } + } else if (m[0] === '*') { + // Add to ALL start conditions + active_conditions.push('*'); + for (k in startConditions) { + startConditions[k].rules.push(i); + } + rule.shift(); + m = rule[0]; + } else { + // Add to explicit start conditions + conditions = rule.shift(); + m = rule[0]; + for (k = 0; k < conditions.length; k++) { + if (!startConditions.hasOwnProperty(conditions[k])) { + startConditions[conditions[k]] = { + rules: [], + inclusive: false + }; + console.warn('Lexer Warning:', '"' + conditions[k] + '" start condition should be defined as %s or %x; assuming %x now.'); + } + active_conditions.push(conditions[k]); + startConditions[conditions[k]].rules.push(i); + } + } + + if (typeof m === 'string') { + m = expandMacros(m, macros, opts); + m = new XRegExp('^(?:' + m + ')', opts.options.caseInsensitive ? 'i' : ''); + } + newRules.push(m); + if (typeof rule[1] === 'function') { + rule[1] = String(rule[1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, ''); + } + action = rule[1]; + action = action.replace(/return '((?:\\'|[^']+)+)'/g, tokenNumberReplacement); + action = action.replace(/return "((?:\\"|[^"]+)+)"/g, tokenNumberReplacement); + + var code = ['\n/*! Conditions::']; + code.push(postprocessComment(active_conditions)); + code.push('*/', '\n/*! Rule:: '); + code.push(postprocessComment(rules[i][0])); + code.push('*/', '\n'); + + // When the action is *only* a simple `return TOKEN` statement, then add it to the caseHelpers; + // otherwise add the additional `break;` at the end. + // + // Note: we do NOT analyze the action block any more to see if the *last* line is a simple + // `return NNN;` statement as there are too many shoddy idioms, e.g. 
+ // + // ``` + // %{ if (cond) + // return TOKEN; + // %} + // ``` + // + // which would then cause havoc when our action code analysis (using regexes or otherwise) was 'too simple' + // to catch these culprits; hence we resort and stick with the most fundamental approach here: + // always append `break;` even when it would be obvious to a human that such would be 'unreachable code'. + var match_nr = /^return[\s\r\n]+((?:'(?:\\'|[^']+)+')|(?:"(?:\\"|[^"]+)+")|\d+)[\s\r\n]*;?$/.exec(action.trim()); + if (match_nr) { + simple_rule_count++; + caseHelper.push([].concat(code, i, ':', match_nr[1]).join(' ').replace(/[\n]/g, '\n ')); + } else { + regular_rule_count++; + actions.push([].concat('case', i, ':', code, action, '\nbreak;').join(' ')); + } + } + actions.push('default:'); + actions.push(' return this.simpleCaseActionClusters[yyrulenumber];'); + actions.push('}'); + + return { + rules: newRules, + macros: macros, + + regular_rule_count: regular_rule_count, + simple_rule_count: simple_rule_count, + }; +} + + + + + + + +// expand all macros (with maybe one exception) in the given regex: the macros may exist inside `[...]` regex sets or +// elsewhere, which requires two different treatments to expand these macros. +function reduceRegex(s, name, opts, expandAllMacrosInSet_cb, expandAllMacrosElsewhere_cb) { + var orig = s; + + function errinfo() { + if (name) { + return 'macro [[' + name + ']]'; + } else { + return 'regex [[' + orig + ']]'; + } + } + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var c1, c2; + var rv = []; + var derr; + var se; + + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + c1 = c1[0]; + } + s = s.substr(c1.length); + + switch (c1) { + case '[': + // this is starting a set within the regex: scan until end of set! + var set_content = []; + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + + while (s.length) { + var inner = s.match(SET_PART_RE); + if (!inner) { + inner = s.match(CHR_RE); + if (!inner) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': illegal escape sequence at start of regex part: ' + s); + } else { + inner = inner[0]; + } + if (inner === ']') break; + } else { + inner = inner[0]; + } + set_content.push(inner); + s = s.substr(inner.length); + } + + // ensure that we hit the terminating ']': + c2 = s.match(CHR_RE); + if (!c2) { + // cope with illegal escape sequences too! + return new Error(errinfo() + ': regex set expression is broken: "' + s + '"'); + } else { + c2 = c2[0]; + } + if (c2 !== ']') { + return new Error(errinfo() + ': regex set expression is broken: apparently unterminated'); + } + s = s.substr(c2.length); + + se = set_content.join(''); + + // expand any macros in here: + if (expandAllMacrosInSet_cb) { + se = expandAllMacrosInSet_cb(se); + assert(se); + if (se instanceof Error) { + return new Error(errinfo() + ': ' + se.message); + } + } + + derr = setmgmt.set2bitarray(l, se, opts); + if (derr instanceof Error) { + return new Error(errinfo() + ': ' + derr.message); + } + + // find out which set expression is optimal in size: + var s1 = setmgmt.produceOptimizedRegex4Set(l); + + // check if the source regex set potentially has any expansions (guestimate!) + // + // The indexOf('{') picks both XRegExp Unicode escapes and JISON lexer macros, which is perfect for us here. 
+ var has_expansions = (se.indexOf('{') >= 0); + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. + c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, than we should be able to compile + // it in a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; +} + + +// expand macros within macros and cache the result +function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. 
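+                    //
+                    // E.g. given these (hypothetical) macro definitions:
+                    //
+                    // ```
+                    // DIGIT   [0-9]
+                    // HEX     [{DIGIT}a-fA-F]
+                    // ```
+                    //
+                    // expanding `HEX` lands here: the inner `{DIGIT}` is unfolded via its
+                    // *set* expansion, so `HEX` resolves to the set `[0-9a-fA-F]`.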
+ // + // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}` + // macros here: + if (XRegExp._getUnicodeProperty(k)) { + // Work-around so that you can use `\p{ascii}` for a XRegExp slug, a.k.a. + // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories, + // while using `\p{ASCII}` as a *macro expansion* of the `ASCII` + // macro: + if (k.toUpperCase() !== k) { + m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.'); + break; + } + } + + a = m.split('{' + k + '}'); + if (a.length > 1) { + var x = expandMacroInSet(k); + assert(x); + if (x instanceof Error) { + m = x; + break; + } + m = a.join(x); + } + } + } + } + + var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts); + + var s1; + + // propagate deferred exceptions = error reports. + if (mba instanceof Error) { + s1 = mba; + } else { + s1 = setmgmt.bitarray2set(mba, false); + + m = s1; + } + + macros[i] = { + in_set: s1, + elsewhere: null, + raw: dict_macros[i] + }; + } else { + m = macros[i].in_set; + + if (m instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + return new Error(m.message); + } + + // detect definition loop: + if (m === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + } + + return m; + } + + function expandMacroElsewhere(i) { + var k, a, m; + + if (macros[i].elsewhere == null) { + m = dict_macros[i]; + + // set up our own record so we can detect definition loops: + macros[i].elsewhere = false; + + // the macro MAY contain other macros which MAY be inside a `[...]` set in this + // macro or elsewhere, hence we must parse the regex: + m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (m instanceof Error) { + return m; + } + + macros[i].elsewhere = m; + } else { + m = macros[i].elsewhere; + + if (m instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + return m; + } + + // detect definition loop: + if (m === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + } + + return m; + } + + function expandAllMacrosInSet(s) { + var i, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroInSet(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message); + } + s = a.join(x); + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, x; + + // When we process the remaining macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. 
+ // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + // These are all submacro expansions, hence non-capturing grouping is applied: + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroElsewhere(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message); + } + s = a.join('(?:' + x + ')'); + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + + var m, i; + + if (opts.debug) console.log('\n############## RAW macros: ', dict_macros); + + // first we create the part of the dictionary which is targeting the use of macros + // *inside* `[...]` sets; once we have completed that half of the expansions work, + // we then go and expand the macros for when they are used elsewhere in a regex: + // iff we encounter submacros then which are used *inside* a set, we can use that + // first half dictionary to speed things up a bit as we can use those expansions + // straight away! + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroInSet(i); + } + } + + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroElsewhere(i); + } + } + + if (opts.debug) console.log('\n############### expanded macros: ', macros); + + return macros; +} + + + +// expand macros in a regex; expands them recursively +function expandMacros(src, macros, opts) { + var expansion_count = 0; + + // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already! + // Hence things should be easy in there: + + function expandAllMacrosInSet(s) { + var i, m, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = m.in_set; + + assert(x); + if (x instanceof Error) { + // this turns out to be an macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + throw x; + } + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join(x); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, m, x; + + // When we process the main macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! 
+ if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + // These are all main macro expansions, hence CAPTURING grouping is applied: + x = m.elsewhere; + assert(x); + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping, definition, i.e. it\'s definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join('(' + x + ')'); + expansion_count++; + } + + // stop the brute-force expansion attempt when we done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + + // When we process the macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will have expanded any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'ld + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (s2 instanceof Error) { + throw s2; + } + + // only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex() + // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on his own, + // as long as no macros are involved... + // + // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`, + // unless the `xregexp` output option has been enabled. + if (expansion_count > 0 || (src.indexOf('\\p{') >= 0 && !opts.options.xregexp)) { + src = s2; + } else { + // Check if the reduced regex is smaller in size; when it is, we still go with the new one! + if (s2.length < src.length) { + src = s2; + } + } + + return src; +} + +function prepareStartConditions(conditions) { + var sc, + hash = {}; + for (sc in conditions) { + if (conditions.hasOwnProperty(sc)) { + hash[sc] = {rules:[], inclusive: !conditions[sc]}; + } + } + return hash; +} + +function buildActions(dict, tokens, opts) { + var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;']; + var tok; + var toks = {}; + var caseHelper = []; + + // tokens: map/array of token numbers to token names + for (tok in tokens) { + var idx = parseInt(tok); + if (idx && idx > 0) { + toks[tokens[tok]] = idx; + } + } + + if (opts.options.flex && dict.rules) { + dict.rules.push(['.', 'console.log("", yytext); /* `flex` lexing mode: the last resort rule! 
*/']); + } + + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); + }); + + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: `function lexer__performAction(yy, yyrulenumber, YY_START) { + var yy_ = this; + + ${fun} + }`, + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count, + }; +} + +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + +var prelude = `/** + * See also: + * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508 + * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility + * with userland code which might access the derived class in a 'classic' way. + * + * @public + * @constructor + * @nocollapse + */ +function JisonLexerError(msg, hash) { + Object.defineProperty(this, 'name', { + enumerable: false, + writable: false, + value: 'JisonLexerError' + }); + + if (msg == null) msg = '???'; + + Object.defineProperty(this, 'message', { + enumerable: false, + writable: true, + value: msg + }); + + this.hash = hash; + + var stacktrace; + if (hash && hash.exception instanceof Error) { + var ex2 = hash.exception; + this.message = ex2.message || msg; + stacktrace = ex2.stack; + } + if (!stacktrace) { + if (Error.hasOwnProperty('captureStackTrace')) { // V8 + Error.captureStackTrace(this, this.constructor); + } else { + stacktrace = (new Error(msg)).stack; + } + } + if (stacktrace) { + Object.defineProperty(this, 'stack', { + enumerable: false, + writable: false, + value: stacktrace + }); + } +} + +if (typeof Object.setPrototypeOf === 'function') { + Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype); +} else { + JisonLexerError.prototype = Object.create(Error.prototype); +} +JisonLexerError.prototype.constructor = JisonLexerError; +JisonLexerError.prototype.name = 'JisonLexerError';`; + + // --- END lexer error class --- + + return prelude; +} + + +const jisonLexerErrorDefinition = generateErrorClass(); + + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS` + var __hacky_counter__ = 0; + + /** + * @constructor + * @nocollapse + */ + function XRegExp(re, f) { + this.re = re; + this.flags = f; + this._getUnicodeProperty = function (k) {}; + var fake = /./; // WARNING: this exact 'fake' is also depended upon by the xregexp unit test! + __hacky_counter__++; + fake.__hacky_backy__ = __hacky_counter__; + return fake; + } + `; +} + + + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. 
+ // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... }; + // ``` + var testcode = [ + '// provide a local version for test purposes:', + jisonLexerErrorDefinition, + '', + generateFakeXRegExpClassSrcCode(), + '', + source, + '', + 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (typeof lexer.options !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. + // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, ((dict.rules && dict.rules.length > 0) ? + 'One or more of your lexer state names are possibly botched?' 
: + 'Your custom lexer is somehow botched.'), ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = (dict.rules ? dict.rules.length : 0); i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; + + return lexer; +} + +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) +// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. 
+// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- +return `{ + EOF: 1, + ERROR: 2, + + // JisonLexerError: JisonLexerError, /// <-- injected by the code generator + + // options: {}, /// <-- injected by the code generator + + // yy: ..., /// <-- injected by setInput() + + __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state + + __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup + + __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been 'unfolded' completely and is now ready for use + + done: false, /// INTERNAL USE ONLY + _backtrack: false, /// INTERNAL USE ONLY + _input: '', /// INTERNAL USE ONLY + _more: false, /// INTERNAL USE ONLY + _signaled_error_token: false, /// INTERNAL USE ONLY + + conditionStack: [], /// INTERNAL USE ONLY; managed via \`pushState()\`, \`popState()\`, \`topState()\` and \`stateStackSize()\` + + match: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. \`match\` is identical to \`yytext\` except that this one still contains the matched input string after \`lexer.performAction()\` has been invoked, where userland code MAY have changed/replaced the \`yytext\` value entirely! + matched: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far + matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt + yytext: '', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the 'token value' when the parser consumes the lexer token produced through a call to the \`lex()\` API. + offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the 'cursor position' in the input string, i.e. the number of characters matched so far + yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (\`yytext\`) + yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: 'line number' at which the token under construction is located + yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction + + /** + * INTERNAL USE: construct a suitable error info hash object instance for \`parseError\`. 
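+     *
+     * A typical (illustrative) use from inside the lexer kernel, mirroring
+     * what yyerror() below does:
+     *
+     *     var p = this.constructLexErrorInfo('Lexical error: ...', this.options.lexerErrorsAreRecoverable);
+     *     return this.parseError(p.errStr, p, this.JisonLexerError);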
+ * + * @public + * @this {RegExpLexer} + */ + constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) { + msg = '' + msg; + + // heuristic to determine if the error message already contains a (partial) source code dump + // as produced by either \`showPosition()\` or \`prettyPrintRange()\`: + if (show_input_position == undefined) { + show_input_position = !(msg.indexOf('\\n') > 0 && msg.indexOf('^') > 0); + } + if (this.yylloc && show_input_position) { + if (typeof this.prettyPrintRange === 'function') { + var pretty_src = this.prettyPrintRange(this.yylloc); + + if (!/\\n\\s*$/.test(msg)) { + msg += '\\n'; + } + msg += '\\n Erroneous area:\\n' + this.prettyPrintRange(this.yylloc); + } else if (typeof this.showPosition === 'function') { + var pos_str = this.showPosition(); + if (pos_str) { + if (msg.length && msg[msg.length - 1] !== '\\n' && pos_str[0] !== '\\n') { + msg += '\\n' + pos_str; + } else { + msg += pos_str; + } + } + } + } + /** @constructor */ + var pei = { + errStr: msg, + recoverable: !!recoverable, + text: this.match, // This one MAY be empty; userland code should use the \`upcomingInput\` API to obtain more text which follows the 'lexer cursor position'... + token: null, + line: this.yylineno, + loc: this.yylloc, + yy: this.yy, + lexer: this, + + /** + * and make sure the error info doesn't stay due to potential + * ref cycle via userland code manipulations. + * These would otherwise all be memory leak opportunities! + * + * Note that only array and object references are nuked as those + * constitute the set of elements which can produce a cyclic ref. + * The rest of the members is kept intact as they are harmless. + * + * @public + * @this {LexErrorInfo} + */ + destroy: function destructLexErrorInfo() { + // remove cyclic references added to error info: + // info.yy = null; + // info.lexer = null; + // ... + var rec = !!this.recoverable; + for (var key in this) { + if (this.hasOwnProperty(key) && typeof key === 'object') { + this[key] = undefined; + } + } + this.recoverable = rec; + } + }; + // track this instance so we can \`destroy()\` it once we deem it superfluous and ready for garbage collection! + this.__error_infos.push(pei); + return pei; + }, + + /** + * handler which is invoked when a lexer error occurs. + * + * @public + * @this {RegExpLexer} + */ + parseError: function lexer_parseError(str, hash, ExceptionClass) { + if (!ExceptionClass) { + ExceptionClass = this.JisonLexerError; + } + if (this.yy) { + if (this.yy.parser && typeof this.yy.parser.parseError === 'function') { + return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } else if (typeof this.yy.parseError === 'function') { + return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } + } + throw new ExceptionClass(str, hash); + }, + + /** + * method which implements \`yyerror(str, ...args)\` functionality for use inside lexer actions. 
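+     *
+     * E.g. (illustrative) from inside a lexer rule action:
+     *
+     *     this.yyerror('unsupported character: ' + yytext);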
+ * + * @public + * @this {RegExpLexer} + */ + yyerror: function yyError(str /*, ...args */) { + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': ' + str, this.options.lexerErrorsAreRecoverable); + + // Add any extra args to the hash under the name \`extra_error_attributes\`: + var args = Array.prototype.slice.call(arguments, 1); + if (args.length) { + p.extra_error_attributes = args; + } + + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + }, + + /** + * final cleanup function for when we have completed lexing the input; + * make it an API so that external code can use this one once userland + * code has decided it's time to destroy any lingering lexer error + * hash object instances and the like: this function helps to clean + * up these constructs, which *may* carry cyclic references which would + * otherwise prevent the instances from being properly and timely + * garbage-collected, i.e. this function helps prevent memory leaks! + * + * @public + * @this {RegExpLexer} + */ + cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) { + // prevent lingering circular references from causing memory leaks: + this.setInput('', {}); + + // nuke the error hash info instances created during this run. + // Userland code must COPY any data/references + // in the error hash instance(s) it is more permanently interested in. + if (!do_not_nuke_errorinfos) { + for (var i = this.__error_infos.length - 1; i >= 0; i--) { + var el = this.__error_infos[i]; + if (el && typeof el.destroy === 'function') { + el.destroy(); + } + } + this.__error_infos.length = 0; + } + + return this; + }, + + /** + * clear the lexer token context; intended for internal use only + * + * @public + * @this {RegExpLexer} + */ + clear: function lexer_clear() { + this.yytext = ''; + this.yyleng = 0; + this.match = ''; + // - DO NOT reset \`this.matched\` + this.matches = false; + this._more = false; + this._backtrack = false; + + var col = (this.yylloc ? this.yylloc.last_column : 0); + this.yylloc = { + first_line: this.yylineno + 1, + first_column: col, + last_line: this.yylineno + 1, + last_column: col, + + range: [this.offset, this.offset] + }; + }, + + /** + * resets the lexer, sets new input + * + * @public + * @this {RegExpLexer} + */ + setInput: function lexer_setInput(input, yy) { + this.yy = yy || this.yy || {}; + + // also check if we've fully initialized the lexer instance, + // including expansion work to be done to go from a loaded + // lexer to a usable lexer: + if (!this.__decompressed) { + // step 1: decompress the regex list: + var rules = this.rules; + for (var i = 0, len = rules.length; i < len; i++) { + var rule_re = rules[i]; + + // compression: is the RE an xref to another RE slot in the rules[] table? + if (typeof rule_re === 'number') { + rules[i] = rules[rule_re]; + } + } + + // step 2: unfold the conditions[] set to make these ready for use: + var conditions = this.conditions; + for (var k in conditions) { + var spec = conditions[k]; + + var rule_ids = spec.rules; + + var len = rule_ids.length; + var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in \`lexer_next()\` fast and simple! 
+ var rule_new_ids = new Array(len + 1); + + for (var i = 0; i < len; i++) { + var idx = rule_ids[i]; + var rule_re = rules[idx]; + rule_regexes[i + 1] = rule_re; + rule_new_ids[i + 1] = idx; + } + + spec.rules = rule_new_ids; + spec.__rule_regexes = rule_regexes; + spec.__rule_count = len; + } + + this.__decompressed = true; + } + + this._input = input || ''; + this.clear(); + this._signaled_error_token = false; + this.done = false; + this.yylineno = 0; + this.matched = ''; + this.conditionStack = ['INITIAL']; + this.__currentRuleSet__ = null; + this.yylloc = { + first_line: 1, + first_column: 0, + last_line: 1, + last_column: 0, + + range: [0, 0] + }; + this.offset = 0; + return this; + }, + + /** + * edit the remaining input via user-specified callback. + * This can be used to forward-adjust the input-to-parse, + * e.g. inserting macro expansions and alike in the + * input which has yet to be lexed. + * The behaviour of this API contrasts the \`unput()\` et al + * APIs as those act on the *consumed* input, while this + * one allows one to manipulate the future, without impacting + * the current \`yyloc\` cursor location or any history. + * + * Use this API to help implement C-preprocessor-like + * \`#include\` statements, etc. + * + * The provided callback must be synchronous and is + * expected to return the edited input (string). + * + * The \`cpsArg\` argument value is passed to the callback + * as-is. + * + * \`callback\` interface: + * \`function callback(input, cpsArg)\` + * + * - \`input\` will carry the remaining-input-to-lex string + * from the lexer. + * - \`cpsArg\` is \`cpsArg\` passed into this API. + * + * The \`this\` reference for the callback will be set to + * reference this lexer instance so that userland code + * in the callback can easily and quickly access any lexer + * API. + * + * When the callback returns a non-string-type falsey value, + * we assume the callback did not edit the input and we + * will using the input as-is. + * + * When the callback returns a non-string-type value, it + * is converted to a string for lexing via the \`"" + retval\` + * operation. (See also why: http://2ality.com/2012/03/converting-to-string.html + * -- that way any returned object's \`toValue()\` and \`toString()\` + * methods will be invoked in a proper/desirable order.) + * + * @public + * @this {RegExpLexer} + */ + editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) { + var rv = callback.call(this, this._input, cpsArg); + if (typeof rv !== 'string') { + if (rv) { + this._input = '' + rv; + } + // else: keep \`this._input\` as is. + } else { + this._input = rv; + } + return this; + }, + + /** + * consumes and returns one char from the input + * + * @public + * @this {RegExpLexer} + */ + input: function lexer_input() { + if (!this._input) { + //this.done = true; -- don't set \`done\` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <> tokens and perform user action code for a <> match, but only does so *once*) + return null; + } + var ch = this._input[0]; + this.yytext += ch; + this.yyleng++; + this.offset++; + this.match += ch; + this.matched += ch; + // Count the linenumber up when we hit the LF (or a stand-alone CR). + // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo + // and we advance immediately past the LF as well, returning both together as if + // it was all a single 'character' only. 
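+        // E.g. (illustrative): for input 'a\\r\\nb', the first input() call returns 'a',
+        // the second returns the combined '\\r\\n' and bumps yylineno once.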
+ var slice_len = 1; + var lines = false; + if (ch === '\\n') { + lines = true; + } else if (ch === '\\r') { + lines = true; + var ch2 = this._input[1]; + if (ch2 === '\\n') { + slice_len++; + ch += ch2; + this.yytext += ch2; + this.yyleng++; + this.offset++; + this.match += ch2; + this.matched += ch2; + this.yylloc.range[1]++; + } + } + if (lines) { + this.yylineno++; + this.yylloc.last_line++; + this.yylloc.last_column = 0; + } else { + this.yylloc.last_column++; + } + this.yylloc.range[1]++; + + this._input = this._input.slice(slice_len); + return ch; + }, + + /** + * unshifts one char (or an entire string) into the input + * + * @public + * @this {RegExpLexer} + */ + unput: function lexer_unput(ch) { + var len = ch.length; + var lines = ch.split(/(?:\\r\\n?|\\n)/g); + + this._input = ch + this._input; + this.yytext = this.yytext.substr(0, this.yytext.length - len); + this.yyleng = this.yytext.length; + this.offset -= len; + this.match = this.match.substr(0, this.match.length - len); + this.matched = this.matched.substr(0, this.matched.length - len); + + if (lines.length > 1) { + this.yylineno -= lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + + // Get last entirely matched line into the \`pre_lines[]\` array's + // last index slot; we don't mind when other previously + // matched lines end up in the array too. + var pre = this.match; + var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + if (pre_lines.length === 1) { + pre = this.matched; + pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + } + this.yylloc.last_column = pre_lines[pre_lines.length - 1].length; + } else { + this.yylloc.last_column -= len; + } + + this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng; + + this.done = false; + return this; + }, + + /** + * cache matched text and append it on next action + * + * @public + * @this {RegExpLexer} + */ + more: function lexer_more() { + this._more = true; + return this; + }, + + /** + * signal the lexer that this rule fails to match the input, so the + * next matching rule (regex) should be tested instead. + * + * @public + * @this {RegExpLexer} + */ + reject: function lexer_reject() { + if (this.options.backtrack_lexer) { + this._backtrack = true; + } else { + // when the \`parseError()\` call returns, we MUST ensure that the error is registered. + // We accomplish this by signaling an 'error' token to be produced for the current + // \`.lex()\` run. + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).', false); + this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + return this; + }, + + /** + * retain first n characters of the match + * + * @public + * @this {RegExpLexer} + */ + less: function lexer_less(n) { + return this.unput(this.match.slice(n)); + }, + + /** + * return (part of the) already matched input, i.e. for error + * messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of + * input (default: 1). + * + * Negative limit values equal *unlimited*. 
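+ *
+ * A rough usage sketch (actual output depends on the lexer state):
+ *
+ * lexer.pastInput(10, 1); // at most the last 10 chars of the current line,
+ * // prefixed with '...' when the text was clipped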
+ * + * @public + * @this {RegExpLexer} + */ + pastInput: function lexer_pastInput(maxSize, maxLines) { + var past = this.matched.substring(0, this.matched.length - this.match.length); + if (maxSize < 0) + maxSize = past.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = past.length; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substr\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + past = past.substr(-maxSize * 2 - 2); + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = past.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(-maxLines); + past = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis prefix... + if (past.length > maxSize) { + past = '...' + past.substr(-maxSize); + } + return past; + }, + + /** + * return (part of the) upcoming input, i.e. for error messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of input (default: 1). + * + * Negative limit values equal *unlimited*. + * + * > ### NOTE ### + * > + * > *"upcoming input"* is defined as the whole of the both + * > the *currently lexed* input, together with any remaining input + * > following that. *"currently lexed"* input is the input + * > already recognized by the lexer but not yet returned with + * > the lexer token. This happens when you are invoking this API + * > from inside any lexer rule action code block. + * > + * + * @public + * @this {RegExpLexer} + */ + upcomingInput: function lexer_upcomingInput(maxSize, maxLines) { + var next = this.match; + if (maxSize < 0) + maxSize = next.length + this._input.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = maxSize; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substring\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + if (next.length < maxSize * 2 + 2) { + next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8 + } + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = next.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(0, maxLines); + next = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis postfix... + if (next.length > maxSize) { + next = next.substring(0, maxSize) + '...'; + } + return next; + }, + + /** + * return a string which displays the character position where the + * lexing error occurred, i.e. 
 for error messages
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {
+ var pre = this.pastInput(maxPrefix).replace(/\\s/g, ' ');
+ var c = new Array(pre.length + 1).join('-');
+ return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, ' ') + '\\n' + c + '^';
+ },
+
+ /**
+ * return a string which displays the lines & columns of input which are referenced
+ * by the given location info range, plus a few lines of context.
+ *
+ * This function pretty-prints the indicated section of the input, with line numbers
+ * and everything!
+ *
+ * This function is very useful for providing highly readable error reports, while
+ * the location range may be specified in various flexible ways:
+ *
+ * - \`loc\` is the location info object which references the area which should be
+ * displayed and 'marked up': these lines & columns of text are marked up by \`^\`
+ * characters below each character in the entire input range.
+ *
+ * - \`context_loc\` is the *optional* location info object which instructs this
+ * pretty-printer how much *leading* context should be displayed alongside
+ * the area referenced by \`loc\`. This can help provide context for the displayed
+ * error, etc.
+ *
+ * When this location info is not provided, a default context of 3 lines is
+ * used.
+ *
+ * - \`context_loc2\` is another *optional* location info object, which serves
+ * a similar purpose to \`context_loc\`: it specifies the amount of *trailing*
+ * context lines to display in the pretty-print output.
+ *
+ * When this location info is not provided, a default context of 1 line only is
+ * used.
+ *
+ * Special Notes:
+ *
+ * - when the \`loc\`-indicated range is very large (about 5 lines or more), then
+ * only the first and last few lines of this block are printed while a
+ * \`...continued...\` message will be printed between them.
+ *
+ * This serves the purpose of not printing a huge amount of text when the \`loc\`
+ * range happens to be huge: this way a manageable & readable output results
+ * for arbitrarily large ranges.
+ *
+ * - this function can display lines of input which have not yet been lexed.
+ * \`prettyPrintRange()\` can access the entire input!
+ *
+ * @public
+ * @this {RegExpLexer}
+ */
+ prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {
+ var error_size = loc.last_line - loc.first_line;
+ const CONTEXT = 3;
+ const CONTEXT_TAIL = 1;
+ const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;
+ var input = this.matched + this._input;
+ var lines = input.split('\\n');
+ //var show_context = (error_size < 5 || context_loc);
+ var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));
+ var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));
+ var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);
+ var ws_prefix = new Array(lineno_display_width).join(' ');
+ var nonempty_line_indexes = [];
+ var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {
+ var lno = index + l0;
+ var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);
+ var rv = lno_pfx + ': ' + line;
+ var errpfx = (new Array(lineno_display_width + 1)).join('^');
+ var offset = 2 + 1;
+ var len = 0;
+
+ if (lno === loc.first_line) {
+ offset += loc.first_column;
+
+ len = Math.max(
+ 2,
+ ((lno === loc.last_line ?
loc.last_column : line.length)) - loc.first_column + 1 + ); + } else if (lno === loc.last_line) { + len = Math.max(2, loc.last_column + 1); + } else if (lno > loc.first_line && lno < loc.last_line) { + len = Math.max(2, line.length + 1); + } + + if (len) { + var lead = new Array(offset).join('.'); + var mark = new Array(len).join('^'); + rv += '\\n' + errpfx + lead + mark; + + if (line.trim().length > 0) { + nonempty_line_indexes.push(index); + } + } + + rv = rv.replace(/\\t/g, ' '); + return rv; + }); + + // now make sure we don't print an overly large amount of error area: limit it + // to the top and bottom line count: + if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) { + var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1; + var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1; + + var intermediate_line = (new Array(lineno_display_width + 1)).join(' ') + ' (...continued...)'; + intermediate_line += '\\n' + (new Array(lineno_display_width + 1)).join('-') + ' (---------------)'; + rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line); + } + return rv.join('\\n'); + }, + + /** + * helper function, used to produce a human readable description as a string, given + * the input \`yylloc\` location object. + * + * Set \`display_range_too\` to TRUE to include the string character index position(s) + * in the description if the \`yylloc.range\` is available. + * + * @public + * @this {RegExpLexer} + */ + describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) { + var l1 = yylloc.first_line; + var l2 = yylloc.last_line; + var c1 = yylloc.first_column; + var c2 = yylloc.last_column; + var dl = l2 - l1; + var dc = c2 - c1; + var rv; + if (dl === 0) { + rv = 'line ' + l1 + ', '; + if (dc <= 1) { + rv += 'column ' + c1; + } else { + rv += 'columns ' + c1 + ' .. ' + c2; + } + } else { + rv = 'lines ' + l1 + '(column ' + c1 + ') .. ' + l2 + '(column ' + c2 + ')'; + } + if (yylloc.range && display_range_too) { + var r1 = yylloc.range[0]; + var r2 = yylloc.range[1] - 1; + if (r2 <= r1) { + rv += ' {String Offset: ' + r1 + '}'; + } else { + rv += ' {String Offset range: ' + r1 + ' .. ' + r2 + '}'; + } + } + return rv; + }, + + /** + * test the lexed token: return FALSE when not a match, otherwise return token. + * + * \`match\` is supposed to be an array coming out of a regex match, i.e. \`match[0]\` + * contains the actually matched text string. 
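+ *
+ * (Illustrative sketch only: given a regex match \`m\` produced by one of the
+ * lexer's own rule regexes, \`this.test_match(m, ruleIndex)\` consumes \`m[0]\`
+ * and executes the action code of rule \`ruleIndex\`.)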
+ * + * Also move the input cursor forward and update the match collectors: + * + * - \`yytext\` + * - \`yyleng\` + * - \`match\` + * - \`matches\` + * - \`yylloc\` + * - \`offset\` + * + * @public + * @this {RegExpLexer} + */ + test_match: function lexer_test_match(match, indexed_rule) { + var token, + lines, + backup, + match_str, + match_str_len; + + if (this.options.backtrack_lexer) { + // save context + backup = { + yylineno: this.yylineno, + yylloc: { + first_line: this.yylloc.first_line, + last_line: this.yylloc.last_line, + first_column: this.yylloc.first_column, + last_column: this.yylloc.last_column, + + range: this.yylloc.range.slice(0) + }, + yytext: this.yytext, + match: this.match, + matches: this.matches, + matched: this.matched, + yyleng: this.yyleng, + offset: this.offset, + _more: this._more, + _input: this._input, + //_signaled_error_token: this._signaled_error_token, + yy: this.yy, + conditionStack: this.conditionStack.slice(0), + done: this.done + }; + } + + match_str = match[0]; + match_str_len = match_str.length; + // if (match_str.indexOf('\\n') !== -1 || match_str.indexOf('\\r') !== -1) { + lines = match_str.split(/(?:\\r\\n?|\\n)/g); + if (lines.length > 1) { + this.yylineno += lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + this.yylloc.last_column = lines[lines.length - 1].length; + } else { + this.yylloc.last_column += match_str_len; + } + // } + this.yytext += match_str; + this.match += match_str; + this.matched += match_str; + this.matches = match; + this.yyleng = this.yytext.length; + this.yylloc.range[1] += match_str_len; + + // previous lex rules MAY have invoked the \`more()\` API rather than producing a token: + // those rules will already have moved this \`offset\` forward matching their match lengths, + // hence we must only add our own match length now: + this.offset += match_str_len; + this._more = false; + this._backtrack = false; + this._input = this._input.slice(match_str_len); + + // calling this method: + // + // function lexer__performAction(yy, yyrulenumber, YY_START) {...} + token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */); + // otherwise, when the action codes are all simple return token statements: + //token = this.simpleCaseActionClusters[indexed_rule]; + + if (this.done && this._input) { + this.done = false; + } + if (token) { + return token; + } else if (this._backtrack) { + // recover context + for (var k in backup) { + this[k] = backup[k]; + } + this.__currentRuleSet__ = null; + return false; // rule action called reject() implying the next rule should be tested instead. + } else if (this._signaled_error_token) { + // produce one 'error' token as \`.parseError()\` in \`reject()\` + // did not guarantee a failure signal by throwing an exception! + token = this._signaled_error_token; + this._signaled_error_token = false; + return token; + } + return false; + }, + + /** + * return next match in input + * + * @public + * @this {RegExpLexer} + */ + next: function lexer_next() { + if (this.done) { + this.clear(); + return this.EOF; + } + if (!this._input) { + this.done = true; + } + + var token, + match, + tempMatch, + index; + if (!this._more) { + this.clear(); + } + var spec = this.__currentRuleSet__; + if (!spec) { + // Update the ruleset cache as we apparently encountered a state change or just started lexing. 
+ // The cache is set up for fast lookup -- we assume a lexer will switch states much less often than it will + // invoke the \`lex()\` token-producing API and related APIs, hence caching the set for direct access helps + // speed up those activities a tiny bit. + spec = this.__currentRuleSet__ = this._currentRules(); + // Check whether a *sane* condition has been pushed before: this makes the lexer robust against + // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19 + if (!spec || !spec.rules) { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Internal lexer engine error' + lineno_msg + ': The lex grammar programmer pushed a non-existing condition name "' + this.topState() + '"; this is a fatal error and should be reported to the application programmer team!', false); + // produce one 'error' token until this situation has been resolved, most probably by parse termination! + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + } + + var rule_ids = spec.rules; + var regexes = spec.__rule_regexes; + var len = spec.__rule_count; + + // Note: the arrays are 1-based, while \`len\` itself is a valid index, + // hence the non-standard less-or-equal check in the next loop condition! + for (var i = 1; i <= len; i++) { + tempMatch = this._input.match(regexes[i]); + if (tempMatch && (!match || tempMatch[0].length > match[0].length)) { + match = tempMatch; + index = i; + if (this.options.backtrack_lexer) { + token = this.test_match(tempMatch, rule_ids[i]); + if (token !== false) { + return token; + } else if (this._backtrack) { + match = undefined; + continue; // rule action called reject() implying a rule MISmatch. + } else { + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + } else if (!this.options.flex) { + break; + } + } + } + if (match) { + token = this.test_match(match, rule_ids[index]); + if (token !== false) { + return token; + } + // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) + return false; + } + if (!this._input) { + this.done = true; + this.clear(); + return this.EOF; + } else { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': Unrecognized text.', this.options.lexerErrorsAreRecoverable); + + var pendingInput = this._input; + var activeCondition = this.topState(); + var conditionStackDepth = this.conditionStack.length; + + token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + if (token === this.ERROR) { + // we can try to recover from a lexer error that \`parseError()\` did not 'recover' for us + // by moving forward at least one character at a time IFF the (user-specified?) \`parseError()\` + // has not consumed/modified any pending input or changed state in the error handler: + if (!this.matches && + // and make sure the input has been modified/consumed ... + pendingInput === this._input && + // ...or the lexer state has been modified significantly enough + // to merit a non-consuming error handling action right now. 
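+ // (In other words: the skip-one-character recovery below only fires when
+ // \`parseError()\` left both the pending input and the lexer condition stack
+ // exactly as it found them.)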
+ activeCondition === this.topState() && + conditionStackDepth === this.conditionStack.length + ) { + this.input(); + } + } + return token; + } + }, + + /** + * return next match that has a token + * + * @public + * @this {RegExpLexer} + */ + lex: function lexer_lex() { + var r; + // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer: + if (typeof this.options.pre_lex === 'function') { + r = this.options.pre_lex.call(this); + } + + while (!r) { + r = this.next(); + } + + if (typeof this.options.post_lex === 'function') { + // (also account for a userdef function which does not return any value: keep the token as is) + r = this.options.post_lex.call(this, r) || r; + } + return r; + }, + + /** + * backwards compatible alias for \`pushState()\`; + * the latter is symmetrical with \`popState()\` and we advise to use + * those APIs in any modern lexer code, rather than \`begin()\`. + * + * @public + * @this {RegExpLexer} + */ + begin: function lexer_begin(condition) { + return this.pushState(condition); + }, + + /** + * activates a new lexer condition state (pushes the new lexer + * condition state onto the condition stack) + * + * @public + * @this {RegExpLexer} + */ + pushState: function lexer_pushState(condition) { + this.conditionStack.push(condition); + this.__currentRuleSet__ = null; + return this; + }, + + /** + * pop the previously active lexer condition state off the condition + * stack + * + * @public + * @this {RegExpLexer} + */ + popState: function lexer_popState() { + var n = this.conditionStack.length - 1; + if (n > 0) { + this.__currentRuleSet__ = null; + return this.conditionStack.pop(); + } else { + return this.conditionStack[0]; + } + }, + + /** + * return the currently active lexer condition state; when an index + * argument is provided it produces the N-th previous condition state, + * if available + * + * @public + * @this {RegExpLexer} + */ + topState: function lexer_topState(n) { + n = this.conditionStack.length - 1 - Math.abs(n || 0); + if (n >= 0) { + return this.conditionStack[n]; + } else { + return 'INITIAL'; + } + }, + + /** + * (internal) determine the lexer rule set which is active for the + * currently active lexer condition state + * + * @public + * @this {RegExpLexer} + */ + _currentRules: function lexer__currentRules() { + if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) { + return this.conditions[this.conditionStack[this.conditionStack.length - 1]]; + } else { + return this.conditions['INITIAL']; + } + }, + + /** + * return the number of states currently on the stack + * + * @public + * @this {RegExpLexer} + */ + stateStackSize: function lexer_stateStackSize() { + return this.conditionStack.length; + } +}`; + // --- END lexer kernel --- +} + +RegExpLexer.prototype = (new Function(rmCommonWS` + return ${getRegExpLexerPrototype()}; +`))(); + + +// The lexer code stripper, driven by optimization analysis settings and +// lexer options, which cannot be changed at run-time. +function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. 
${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + +new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS` + // Code Generator Information Report + // --------------------------------- + // + // Options: + // + // backtracking: .................... ${opt.options.backtrack_lexer} + // location.ranges: ................. ${opt.options.ranges} + // location line+column tracking: ... ${opt.options.trackPosition} + // + // + // Forwarded Parser Analysis flags: + // + // uses yyleng: ..................... ${opt.parseActionsUseYYLENG} + // uses yylineno: ................... ${opt.parseActionsUseYYLINENO} + // uses yytext: ..................... ${opt.parseActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.parseActionsUseYYLOC} + // uses lexer values: ............... ${opt.parseActionsUseValueTracking} / ${opt.parseActionsUseValueAssignment} + // location tracking: ............... ${opt.parseActionsUseLocationTracking} + // location assignment: ............. ${opt.parseActionsUseLocationAssignment} + // + // + // Lexer Analysis flags: + // + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses yyerror: .................... ${opt.lexerActionsUseYYERROR} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + // + // --------- END OF REPORT ----------- + + `); + + return new_src; +} + + + + + +// generate lexer source from a grammar +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
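+ // (For example, when the parser side never touches `yylloc`, the code
+ // stripper can drop the lexer's location tracking support entirely.)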
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???', + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
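+ // (For instance, a grammar-level `%options case-insensitive` will surface
+ // here as `caseInsensitive: true` once the normalization below has run.)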
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); + + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; + + opts.conditionStack = ['INITIAL']; + + opts.actionInclude = (dict.actionInclude || ''); + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + + return opts; +} + +// Assemble the final source from the processed grammar +/** @public */ +function generateFromOpts(opt) { + var code = ''; + + switch (opt.moduleType) { + case 'js': + code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; + } + + return code; +} + +function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = (1 + Math.log10(a.length | 1) | 0); + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return `/* ${idx_str}: */ ${re}`; + } + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return `/* ${idx_str}: */ new XRegExp("${re_src}", "${re.xregexp.flags}")`; + } else { + return `/* ${idx_str}: */ ${re}`; + } + }); + return b.join(',\n'); +} + +function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; + } + + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1, + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } + + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } + + var js = JSON.stringify(obj, null, 2); + + js = js.replace(new XRegExp(` "(${ID_REGEX_BASE})": `, 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } + + + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
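+ // The chunk assembled below reads, roughly:
+ //
+ //     var lexer = {
+ //         /* kernel methods from RegExpLexer.prototype */
+ //         options: { /* ... */ },
+ //         JisonLexerError: JisonLexerError,
+ //         performAction: function lexer__performAction(/* ... */) { /* ... */ },
+ //         simpleCaseActionClusters: { /* ... */ },
+ //         rules: [ /* rule regexes */ ],
+ //         conditions: { /* condition name ==> rule set */ }
+ //     };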
+ var code = [rmCommonWS` + var lexer = { + `, '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc + .replace(/^[\s\r\n]*\{/, '') + .replace(/\s*\}[\s\r\n]*$/, '') + .trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS`, + JisonLexerError: JisonLexerError, + performAction: ${performActionCode}, + simpleCaseActionClusters: ${simpleCaseActionClustersCode}, + rules: [ + ${rulesCode} + ], + conditions: ${conditionsCode} + }; + `); + + opt.is_custom_lexer = false; + + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. + return out; +} + +function generateGenericHeaderComment() { + var out = rmCommonWS` + /* lexer generated by jison-lex ${version} */ + + /* + * Returns a Lexer object of the following structure: + * + * Lexer: { + * yy: {} The so-called "shared state" or rather the *source* of it; + * the real "shared state" \`yy\` passed around to + * the rule actions, etc. is a direct reference! + * + * This "shared context" object was passed to the lexer by way of + * the \`lexer.setInput(str, yy)\` API before you may use it. + * + * This "shared context" object is passed to the lexer action code in \`performAction()\` + * so userland code in the lexer actions may communicate with the outside world + * and/or other lexer rules' actions in more or less complex ways. + * + * } + * + * Lexer.prototype: { + * EOF: 1, + * ERROR: 2, + * + * yy: The overall "shared context" object reference. + * + * JisonLexerError: function(msg, hash), + * + * performAction: function lexer__performAction(yy, yyrulenumber, YY_START), + * + * The function parameters and \`this\` have the following value/meaning: + * - \`this\` : reference to the \`lexer\` instance. + * \`yy_\` is an alias for \`this\` lexer instance reference used internally. + * + * - \`yy\` : a reference to the \`yy\` "shared state" object which was passed to the lexer + * by way of the \`lexer.setInput(str, yy)\` API before. 
+ *
+ * Note:
+ * The extra arguments you specified in the \`%parse-param\` statement in your
+ * **parser** grammar definition file are passed to the lexer via this object
+ * reference as member variables.
+ *
+ * - \`yyrulenumber\` : index of the matched lexer rule (regex), used internally.
+ *
+ * - \`YY_START\`: the current lexer "start condition" state.
+ *
+ * parseError: function(str, hash, ExceptionClass),
+ *
+ * constructLexErrorInfo: function(error_message, is_recoverable),
+ * Helper function.
+ * Produces a new errorInfo \'hash object\' which can be passed into \`parseError()\`.
+ * See its use in this lexer kernel in many places; example usage:
+ *
+ * var infoObj = lexer.constructLexErrorInfo(\'fail!\', true);
+ * var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);
+ *
+ * options: { ... lexer %options ... },
+ *
+ * lex: function(),
+ * Produce one token of lexed input, which was passed in earlier via the \`lexer.setInput()\` API.
+ * You MAY use the additional \`args...\` parameters as per \`%parse-param\` spec of the **lexer** grammar:
+ * these extra \`args...\` are added verbatim to the \`yy\` object reference as member variables.
+ *
+ * WARNING:
+ * Lexer's additional \`args...\` parameters (via lexer's \`%parse-param\`) MAY conflict with
+ * any attributes already added to \`yy\` by the **parser** or the jison run-time;
+ * when such a collision is detected an exception is thrown to prevent the generated run-time
+ * from silently accepting this confusing and potentially hazardous situation!
+ *
+ * cleanupAfterLex: function(do_not_nuke_errorinfos),
+ * Helper function.
+ *
+ * This helper API is invoked when the **parse process** has completed: it is the responsibility
+ * of the **parser** (or the calling userland code) to invoke this method once cleanup is desired.
+ *
+ * This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.
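+ *
+ * A minimal usage sketch (the loop shape is illustrative, not prescribed):
+ *
+ * var tok;
+ * do { tok = lexer.lex(); } while (tok !== lexer.EOF);
+ * lexer.cleanupAfterLex();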
+ * + * setInput: function(input, [yy]), + * + * + * input: function(), + * + * + * unput: function(str), + * + * + * more: function(), + * + * + * reject: function(), + * + * + * less: function(n), + * + * + * pastInput: function(n), + * + * + * upcomingInput: function(n), + * + * + * showPosition: function(), + * + * + * test_match: function(regex_match_array, rule_index), + * + * + * next: function(), + * + * + * begin: function(condition), + * + * + * pushState: function(condition), + * + * + * popState: function(), + * + * + * topState: function(), + * + * + * _currentRules: function(), + * + * + * stateStackSize: function(), + * + * + * performAction: function(yy, yy_, yyrulenumber, YY_START), + * + * + * rules: [...], + * + * + * conditions: {associative list: name ==> set}, + * } + * + * + * token location info (\`yylloc\`): { + * first_line: n, + * last_line: n, + * first_column: n, + * last_column: n, + * range: [start_number, end_number] + * (where the numbers are indexes into the input string, zero-based) + * } + * + * --- + * + * The \`parseError\` function receives a \'hash\' object with these members for lexer errors: + * + * { + * text: (matched text) + * token: (the produced terminal token, if any) + * token_id: (the produced terminal token numeric ID, if any) + * line: (yylineno) + * loc: (yylloc) + * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule + * available for this particular error) + * yy: (object: the current parser internal "shared state" \`yy\` + * as is also available in the rule actions; this can be used, + * for instance, for advanced error analysis and reporting) + * lexer: (reference to the current lexer instance used by the parser) + * } + * + * while \`this\` will reference the current lexer instance. + * + * When \`parseError\` is invoked by the lexer, the default implementation will + * attempt to invoke \`yy.parser.parseError()\`; when this callback is not provided + * it will try to invoke \`yy.parseError()\` instead. When that callback is also not + * provided, a \`JisonLexerError\` exception will be thrown containing the error + * message and \`hash\`, as constructed by the \`constructLexErrorInfo()\` API. + * + * Note that the lexer\'s \`JisonLexerError\` error class is passed via the + * \`ExceptionClass\` argument, which is invoked to construct the exception + * instance to be thrown, so technically \`parseError\` will throw the object + * produced by the \`new ExceptionClass(str, hash)\` JavaScript expression. + * + * --- + * + * You can specify lexer options by setting / modifying the \`.options\` object of your Lexer instance. + * These options are available: + * + * (Options are permanent.) + * + * yy: { + * parseError: function(str, hash, ExceptionClass) + * optional: overrides the default \`parseError\` function. + * } + * + * lexer.options: { + * pre_lex: function() + * optional: is invoked before the lexer is invoked to produce another token. + * \`this\` refers to the Lexer object. + * post_lex: function(token) { return token; } + * optional: is invoked when the lexer has produced a token \`token\`; + * this function can override the returned token value by returning another. + * When it does not return any (truthy) value, the lexer will return + * the original \`token\`. + * \`this\` refers to the Lexer object. + * + * WARNING: the next set of options are not meant to be changed. They echo the abilities of + * the lexer as per when it was compiled! 
+ *
+ * ranges: boolean
+ * optional: \`true\` ==> token location info will include a .range[] member.
+ * flex: boolean
+ * optional: \`true\` ==> flex-like lexing behaviour where the rules are tested
+ * exhaustively to find the longest match.
+ * backtrack_lexer: boolean
+ * optional: \`true\` ==> lexer regexes are tested in order and, for each matched
+ * regex, the action code is invoked; the lexer terminates the scan when a token
+ * is returned by the action code.
+ * xregexp: boolean
+ * optional: \`true\` ==> lexer rule regexes are "extended regex format" requiring the
+ * \`XRegExp\` library. When this %option has not been specified at compile time, all lexer
+ * rule regexes have been written as standard JavaScript RegExp expressions.
+ * }
+ */
+ `;
+
+ return out;
+}
+
+function prepareOptions(opt) {
+ opt = opt || {};
+
+ // check for illegal identifier
+ if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) {
+ if (opt.moduleName) {
+ var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.';
+ if (typeof opt.warn_cb === 'function') {
+ opt.warn_cb(msg);
+ } else {
+ // do not treat as warning; barf hairball instead so that this oddity gets noticed right away!
+ throw new Error(msg);
+ }
+ }
+ opt.moduleName = 'lexer';
+ }
+
+ prepExportStructures(opt);
+
+ return opt;
+}
+
+function generateModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var ' + opt.moduleName + ' = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '})();'
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateAMDModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'define([], function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '});'
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateESModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var lexer = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '})();',
+ '',
+ 'function yylex() {',
+ ' return lexer.lex.apply(lexer, arguments);',
+ '}',
+ rmCommonWS`
+ export {
+ lexer,
+ yylex as lex
+ };
+ `
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+function generateCommonJSModule(opt) {
+ opt = prepareOptions(opt);
+
+ var out = [
+ generateGenericHeaderComment(),
+ '',
+ 'var ' + opt.moduleName + ' = (function () {',
+ jisonLexerErrorDefinition,
+ '',
+ generateModuleBody(opt),
+ '',
+ (opt.moduleInclude ?
 opt.moduleInclude + ';' : ''),
+ '',
+ 'return lexer;',
+ '})();',
+ '',
+ 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {',
+ ' exports.lexer = ' + opt.moduleName + ';',
+ ' exports.lex = function () {',
+ // apply on the module object itself so custom moduleName values keep working:
+ ' return ' + opt.moduleName + '.lex.apply(' + opt.moduleName + ', arguments);',
+ ' };',
+ '}'
+ ];
+
+ var src = out.join('\n') + '\n';
+ src = stripUnusedLexerCode(src, opt);
+ opt.exportSourceCode.all = src;
+ return src;
+}
+
+RegExpLexer.generate = generate;
+
+RegExpLexer.version = version;
+RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions;
+RegExpLexer.mkStdOptions = mkStdOptions;
+RegExpLexer.camelCase = camelCase;
+RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat;
+
+return RegExpLexer;
+
+})));
diff --git a/examples/basic2_lex.jison b/examples/basic2_lex.jison
new file mode 100644
index 0000000..65b89b6
--- /dev/null
+++ b/examples/basic2_lex.jison
@@ -0,0 +1,7 @@
+%%
+
+\s+ {/* skip whitespace */}
+"x" {return 'x';}
+
+%%
+
diff --git a/examples/basic_lex.jison b/examples/basic_lex.jison
new file mode 100644
index 0000000..0a14bbb
--- /dev/null
+++ b/examples/basic_lex.jison
@@ -0,0 +1,6 @@
+
+%%
+\s+ {/* skip whitespace */}
+[0-9]+ {return 'NAT';}
+"+" {return '+';}
+
diff --git a/examples/benchmark.js b/examples/benchmark.js
new file mode 100644
index 0000000..ce5260e
--- /dev/null
+++ b/examples/benchmark.js
@@ -0,0 +1,360 @@
+
+/**
+ * Provide a generic performance timer, which strives to produce time measurements of the highest possible accuracy.
+ *
+ * methods:
+ *
+ * - `start()` (re)starts the timer and 'marks' the current time for ID="start".
+ * `.start()` also CLEARS ALL .mark_delta() timers!
+ *
+ * - `mark(ID)` calculates the elapsed time for the current timer in MILLISECONDS (floating point)
+ * since `.start()`. `.mark()` then updates the 'start/mark time' for the given ID.
+ *
+ * ID *may* be NULL, in which case `.mark()` will not update any 'start/mark time'.
+ *
+ * - `mark_delta(ID, START_ID)` calculates the elapsed time for the current timer in MILLISECONDS (floating point) since
+ * the last call to `.mark_delta()` or `.mark()` with the same ID. `.mark_delta()` then updates the
+ * 'start/mark time' for the given ID.
+ *
+ * When the optional START_ID is specified, the delta is calculated against the last marked time
+ * for that START_ID.
+ *
+ * When the ID is NULL or not specified, then the default ID of "start" will be assumed.
+ *
+ * This results in consecutive calls to `.mark_delta()` with the same ID producing
+ * each of the time intervals between the calls, while consecutive calls to
+ * `.mark()` with the same ID would produce an increase each time instead, as the time
+ * between the `.mark()` call and the original `.start()` increases.
+ *
+ * Notes:
+ *
+ * - when you invoke `.mark()` or `.mark_delta()` without having called .start() before,
+ * then the timer is started at the mark.
+ *
+ * - `.start()` will erase all stored 'start/mark times' which may have been
+ * set by `.mark()` or `.mark_delta()` before -- you may call `.start()` multiple times for
+ * the same timer instance, after all.
+ *
+ * - you are responsible for managing the IDs for `.mark()` and `.mark_delta()`. The ID MUST NOT be "start"
+ * as ID = "start" identifies the .start() timer.
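+ *
+ * Usage sketch (`heavyWork()` is a hypothetical stand-in workload):
+ *
+ * var t = PerformanceTimer().start();
+ * heavyWork();
+ * console.log(t.mark(null)); // ms since .start(); updates no mark
+ * heavyWork();
+ * console.log(t.mark('phase', 'start')); // ms since .start(); (re)marks 'phase'
+ * heavyWork();
+ * console.log(t.mark_delta('phase')); // ms since the previous 'phase' mark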
+ * + * References for the internal implementation: + * + * - http://updates.html5rocks.com/2012/08/When-milliseconds-are-not-enough-performance-now + * - http://ejohn.org/blog/accuracy-of-javascript-time/ + * + * @class + * @constructor + */ +function PerformanceTimer() { + /* @private */ var start_time = false; + var obj = { + }; + // feature detect: + /* @private */ var f, tv; + /* @private */ var p = (typeof window !== 'undefined' && window.performance); + if (p && p.timing.navigationStart && p.now) { + f = function () { + return p.now(); + }; + } else if (p && typeof p.webkitNow === 'function') { + f = function () { + return p.webkitNow(); + }; + } else { + p = (typeof process !== 'undefined' && process.hrtime); + if (typeof p === 'function') { + tv = p(); + if (tv && tv.length === 2) { + f = function () { + var rv = p(); + return rv[0] * 1e3 + rv[1] * 1e-6; + }; + } + } + if (!f) { + f = function () { + return Date.now(); + }; + try { + f(); + } catch (ex) { + f = function () { + return +new Date(); + }; + } + } + } + + obj.start = function () { + start_time = { + start: f() + }; + return obj; + }; + + obj.mark = function (id, start_id) { + if (start_time === false) this.start(); + var end_time = f(); + var begin_time = start_time[start_id || id || "start"]; + if (!begin_time) { + begin_time = end_time; + } + var rv = end_time - begin_time; + if (id) { + start_time[id] = end_time; + } + return rv; + }; + + obj.mark_delta = function (id) { + if (start_time === false) this.start(); + id = id || "start"; + var end_time = f(); + var begin_time = start_time[id]; + if (!begin_time) { + begin_time = end_time; + } + var rv = end_time - begin_time; + start_time[id] = end_time; + return rv; + }; + + obj.reset_mark = function (id) { + id = id || "start"; + start_time[id] = null; + return obj; + }; + + obj.get_mark = function (id) { + id = id || "start"; + return start_time[id]; + }; + + obj.mark_sample_and_hold = function (id) { + if (start_time === false) this.start(); + id = id || "start"; + // sample ... + var end_time = f(); + var begin_time = start_time[id]; + if (!begin_time) { + begin_time = end_time; + // ... and hold + start_time[id] = begin_time; + } + var rv = end_time - begin_time; + return rv; + }; + + return obj; +} + +var perf = PerformanceTimer(); + + + +// round to the number of decimal digits: +function r(v, n) { + var m = Math.pow(10, n | 0); + v *= m; + v = Math.round(v); + return v / m; +} + +// run the benchmark on function `f` for at least 5 seconds. +function bench(f, n, minimum_run_time, setup_f, destroy_f) { + var factor = 50; + var run = 1; // factor of 50 ! + n |= 0; + n /= run; + n |= 0; + n = Math.max(n, 1); // --> minimum number of tests: 1*run*factor + + minimum_run_time |= 0; + if (!minimum_run_time) { + // default: 5 seconds minimum run time: + minimum_run_time = 5000 * 1.01 /* overhead compensation */; + } + minimum_run_time = Math.max(minimum_run_time, 1000); // absolute minimum run time: 1 second + + perf.mark('monitor'); + + if (setup_f) { + setup_f(f, n, minimum_run_time); + } + + // measure a short run and determine the run count based on this result: + perf.mark('bench'); + // 50 x f(): that seems a sort of 'sweet spot' for NodeJS v5, at least for some benchmarks... 
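+ // (The 50 calls are unrolled by hand so that loop bookkeeping overhead
+ // does not pollute the per-call timing sample.)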
+ f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + var sample1 = perf.mark('bench'); + var fmultiplier = 250 / sample1; + var multiplier = Math.max(1, (fmultiplier + 0.5) | 0); + run = Math.max(run, multiplier); + console.log("run multiplier: ", run); + + // get the number of tests internal to the test function: 1 or more + var internal_cnt = f(); + if (typeof internal_cnt === 'number' && (internal_cnt | 0) === internal_cnt) { + factor *= internal_cnt; + } + + var last_report = 500; + var ts = []; + for (var i = 0; i < n; i++) { + perf.mark('bench'); + for (var j = 0; j < run; j++) { + // 50 x f(): that seems a sort of 'sweet spot' for NodeJS v5, at least for some benchmarks... + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + f(); + } + ts.push(perf.mark('bench')); + var consumed = perf.mark_sample_and_hold('monitor'); + //console.log('consumed', consumed, ts[ts.length - 1], i); + if (last_report <= consumed) { + console.log('#' + (ts.length * factor)); + last_report = consumed + 1000; + } + if (consumed < minimum_run_time || ts.length < 10) { + // stay in the loop until 5 seconds have expired or at least 10 rounds have been executed! + i = Math.min(i, n - 2); + } + } + + if (destroy_f) { + destroy_f(f, n, minimum_run_time); + } + + var consumed = perf.mark_sample_and_hold('monitor'); + + var sum = 0; + for (var i = 0, cnt = ts.length; i < cnt; i++) { + sum += ts[i]; + } + var avg = sum / cnt; + + var dev = 0; + var peak = 0; + for (var i = 0; i < cnt; i++) { + var delta = Math.abs(ts[i] - avg); + dev += delta; + peak = Math.max(peak, delta); + } + dev /= cnt; + var sample_size = run * factor; + console.log(["Time: total: ", r(sum, 0) + 'ms', + ", sample_count: ", cnt, + ", # runs: ", cnt * sample_size, + ", # runs/sec: ", r(cnt * sample_size * 1000 / sum, 1), + ", average: ", r(avg / sample_size, 4) + 'ms', + ", deviation: ", r(100 * dev / avg, 2) + '%', + ", peak_deviation: ", r(100 * peak / avg, 2) + '%', + ", total overhead: ", r(consumed - sum, 0) + 'ms'].join('') + ); +} diff --git a/examples/c99.l b/examples/c99.l new file mode 100644 index 0000000..4e2dc84 --- /dev/null +++ b/examples/c99.l @@ -0,0 +1,269 @@ + +/* + * ANSI C grammar, Lex specification + * + * (This Lex file is accompanied by a matching Yacc file.) + * + * In 1985, Jeff Lee published his Yacc grammar based on a draft version of the ANSI C standard, + * along with a supporting Lex specification. Tom Stockfisch reposted those files to net.sources + * in 1987; as mentioned in the answer to question 17.25 of the comp.lang.c FAQ, they used to + * be available from ftp.uu.net as usenet/net.sources/ansi.c.grammar.Z. + * + * The version you see here has been updated based on the 2011 ISO C standard. + * (The previous version's Lex and Yacc files for ANSI C9X still exist as archived copies.) + * + * It is assumed that translation phases 1..5 have already been completed, including + * preprocessing and _Pragma processing. 
The Lex rule for string literals will perform + * concatenation (translation phase 6). Transliteration of universal character names + * (\uHHHH or \UHHHHHHHH) must have been done by either the preprocessor or a replacement + * for the input() macro used by Lex (or the YY_INPUT function used by Flex) to read + * characters. Although comments should have been changed to space characters during + * translation phase 3, there are Lex rules for them anyway. + * + * I want to keep this version as close to the current C Standard grammar as possible; + * please let me know if you discover discrepancies. + * (There is an FAQ for this grammar that you might want to read first.) + * + * jutta@pobox.com, 2012 + * + * Last edit: 2012-12-19 DAGwyn@aol.com + * + * + * Note: The following %-parameters are the minimum sizes needed for real Lex. + * + * %e number of parsed tree nodes + * %p number of positions + * %n number of states + * %k number of packed character classes + * %a number of transitions + * %o size of output array + * + * %e 1019 + * %p 2807 + * %n 371 + * %k 284 + * %a 1213 + * %o 1117 + * + * ----------------------------------------------------------------------------------------- + */ + + +%options easy_keyword_rules + + + +O [0-7] +D [0-9] +NZ [1-9] +L [a-zA-Z_] +A [a-zA-Z_0-9] +H [a-fA-F0-9] +HP (0[xX]) +E ([Ee][+-]?{D}+) +P ([Pp][+-]?{D}+) +FS (f|F|l|L) +IS (((u|U)(l|L|ll|LL)?)|((l|L|ll|LL)(u|U)?)) +CP (u|U|L) +SP (u8|u|U|L) +ES (\\(['"\?\\abfnrtv]|[0-7]{1,3}|x[a-fA-F0-9]+)) +WS [ \t\v\n\f] + + + +// %{ +// #include +// #include "y.tab.h" +// +// extern void yyerror(const char *); /* prints grammar violation message */ +// +// extern int sym_type(const char *); /* returns type from symbol table */ +// +// #define sym_type(identifier) IDENTIFIER /* with no symbol table, fake it */ +// +// static void comment(void); +// static int check_type(void); +// %} + + +%% + +"/*" { comment(); } +"//".* { /* consume //-comment */ } + +"auto" { return 'AUTO'; } +"break" { return 'BREAK'; } +"case" { return 'CASE'; } +"char" { return 'CHAR'; } +"const" { return 'CONST'; } +"continue" { return 'CONTINUE'; } +"default" { return 'DEFAULT'; } +"do" { return 'DO'; } +"double" { return 'DOUBLE'; } +"else" { return 'ELSE'; } +"enum" { return 'ENUM'; } +"extern" { return 'EXTERN'; } +"float" { return 'FLOAT'; } +"for" { return 'FOR'; } +"goto" { return 'GOTO'; } +"if" { return 'IF'; } +"inline" { return 'INLINE'; } +"int" { return 'INT'; } +"long" { return 'LONG'; } +"register" { return 'REGISTER'; } +"restrict" { return 'RESTRICT'; } +"return" { return 'RETURN'; } +"short" { return 'SHORT'; } +"signed" { return 'SIGNED'; } +"sizeof" { return 'SIZEOF'; } +"static" { return 'STATIC'; } +"struct" { return 'STRUCT'; } +"switch" { return 'SWITCH'; } +"typedef" { return 'TYPEDEF'; } +"union" { return 'UNION'; } +"unsigned" { return 'UNSIGNED'; } +"void" { return 'VOID'; } +"volatile" { return 'VOLATILE'; } +"while" { return 'WHILE'; } +"_Alignas" { return 'ALIGNAS'; } +"_Alignof" { return 'ALIGNOF'; } +"_Atomic" { return 'ATOMIC'; } +"_Bool" { return 'BOOL'; } +"_Complex" { return 'COMPLEX'; } +"_Generic" { return 'GENERIC'; } +"_Imaginary" { return 'IMAGINARY'; } +"_Noreturn" { return 'NORETURN'; } +"_Static_assert" { return 'STATIC_ASSERT'; } +"_Thread_local" { return 'THREAD_LOCAL'; } +"__func__" { return 'FUNC_NAME'; } + +{L}{A}* { + // return check_type(); + + switch (sym_type(yytext)) { + case TYPEDEF_NAME: /* previously defined */ + return 'TYPEDEF_NAME'; + + case ENUMERATION_CONSTANT: /* previously 
defined */ + return 'ENUMERATION_CONSTANT'; + + default: /* includes undefined */ + return 'IDENTIFIER'; + } + } + +{HP}{H}+{IS}? { return 'I_CONSTANT'; } +{NZ}{D}*{IS}? { return 'I_CONSTANT'; } +"0"{O}*{IS}? { return 'I_CONSTANT'; } +{CP}?"'"([^'\\\n]|{ES})+"'" + { return 'I_CONSTANT'; } + +{D}+{E}{FS}? { return 'F_CONSTANT'; } +{D}*"."{D}+{E}?{FS}? { return 'F_CONSTANT'; } +{D}+"."{E}?{FS}? { return 'F_CONSTANT'; } +{HP}{H}+{P}{FS}? { return 'F_CONSTANT'; } +{HP}{H}*"."{H}+{P}{FS}? + { return 'F_CONSTANT'; } +{HP}{H}+"."{P}{FS}? { return 'F_CONSTANT'; } + +({SP}?\"([^\"\\\n]|{ES})*\"{WS}*)+ + { return 'STRING_LITERAL'; } + +"..." { return 'ELLIPSIS'; } +">>=" { return 'RIGHT_ASSIGN'; } +"<<=" { return 'LEFT_ASSIGN'; } +"+=" { return 'ADD_ASSIGN'; } +"-=" { return 'SUB_ASSIGN'; } +"*=" { return 'MUL_ASSIGN'; } +"/=" { return 'DIV_ASSIGN'; } +"%=" { return 'MOD_ASSIGN'; } +"&=" { return 'AND_ASSIGN'; } +"^=" { return 'XOR_ASSIGN'; } +"|=" { return 'OR_ASSIGN'; } +">>" { return 'RIGHT_OP'; } +"<<" { return 'LEFT_OP'; } +"++" { return 'INC_OP'; } +"--" { return 'DEC_OP'; } +"->" { return 'PTR_OP'; } +"&&" { return 'AND_OP'; } +"||" { return 'OR_OP'; } +"<=" { return 'LE_OP'; } +">=" { return 'GE_OP'; } +"==" { return 'EQ_OP'; } +"!=" { return 'NE_OP'; } +";" { return ';'; } +("{"|"<%") { return '{'; } +("}"|"%>") { return '}'; } +"," { return ','; } +":" { return ':'; } +"=" { return '='; } +"(" { return '('; } +")" { return ')'; } +("["|"<:") { return '['; } +("]"|":>") { return ']'; } +"." { return '.'; } +"&" { return '&'; } +"!" { return '!'; } +"~" { return '~'; } +"-" { return '-'; } +"+" { return '+'; } +"*" { return '*'; } +"/" { return '/'; } +"%" { return '%'; } +"<" { return '<'; } +">" { return '>'; } +"^" { return '^'; } +"|" { return '|'; } +"?" { return '?'; } + +{WS}+ { /* whitespace separates tokens */ } +. { /* discard bad characters */ } + + + + + + +%% + + + +// +// int yywrap(void) /* called at end of input */ +// { +// return 1; /* terminate now */ +// } +// +// static void comment(void) +// { +// int c; +// +// while ((c = input()) != 0) +// if (c == '*') +// { +// while ((c = input()) == '*') +// ; +// +// if (c == '/') +// return; +// +// if (c == 0) +// break; +// } +// yyerror("unterminated comment"); +// } +// +// static int check_type(void) +// { +// switch (sym_type(yytext)) +// { +// case TYPEDEF_NAME: /* previously defined */ +// return TYPEDEF_NAME; +// case ENUMERATION_CONSTANT: /* previously defined */ +// return ENUMERATION_CONSTANT; +// default: /* includes undefined */ +// return IDENTIFIER; +// } +// } +// +// diff --git a/examples/ccalc-lex.l b/examples/ccalc-lex.l new file mode 100644 index 0000000..37839be --- /dev/null +++ b/examples/ccalc-lex.l @@ -0,0 +1,77 @@ +/*! 
@file lex.l
+ * @brief Lexical Analysis
+ *********************************************************************
+ * a simple calculator with variables
+ *
+ * sample files for an article on developerworks.ibm.com
+ * Author: Christian Hagen, chagen@de.ibm.com
+ *
+ * @par lex.l & lex.c
+ * input for flex, the lexical analysis generator
+ *
+ *********************************************************************
+ */
+
+// %option noyywrap
+
+%{
+%}
+
+/*--------------------------------------------------------------------
+ *
+ * flex definitions
+ *
+ *------------------------------------------------------------------*/
+DIGIT [0-9]
+ID [_a-zA-Z][_a-zA-Z0-9]*
+
+%%
+
+[ \t\r\n]+ {
+    /* eat up whitespace */
+    BeginToken(yytext);
+  }
+{DIGIT}+ {
+    BeginToken(yytext);
+    yylval.value = atof(yytext);
+    return VALUE;
+  }
+{DIGIT}+"."{DIGIT}* {
+    BeginToken(yytext);
+    yylval.value = atof(yytext);
+    return VALUE;
+  }
+{DIGIT}+[eE]["+""-"]?{DIGIT}* {
+    BeginToken(yytext);
+    yylval.value = atof(yytext);
+    return VALUE;
+  }
+{DIGIT}+"."{DIGIT}*[eE]["+""-"]?{DIGIT}* {
+    BeginToken(yytext);
+    yylval.value = atof(yytext);
+    return VALUE;
+  }
+{ID} {
+    BeginToken(yytext);
+    yylval.string = malloc(strlen(yytext)+1);
+    strcpy(yylval.string, yytext);
+    return IDENTIFIER;
+  }
+"+" { BeginToken(yytext); return ADD; }
+"-" { BeginToken(yytext); return SUB; }
+"*" { BeginToken(yytext); return MULT; }
+"/" { BeginToken(yytext); return DIV; }
+"(" { BeginToken(yytext); return LBRACE; }
+")" { BeginToken(yytext); return RBRACE; }
+";" { BeginToken(yytext); return SEMICOLON; }
+"=" { BeginToken(yytext); return ASSIGN; }
+
+. {
+    BeginToken(yytext);
+    return yytext[0];
+  }
+%%
+
+/*--------------------------------------------------------------------
+ * lex.l
+ *------------------------------------------------------------------*/
 diff --git a/examples/classy.jisonlex b/examples/classy.jisonlex
new file mode 100644
index 0000000..e0e7edf
--- /dev/null
+++ b/examples/classy.jisonlex
@@ -0,0 +1,39 @@
+digit [0-9]
+id [a-zA-Z][a-zA-Z0-9]*
+
+%%
+"//".* /* ignore comment */
+"main" return 'MAIN';
+"class" return 'CLASS';
+"extends" return 'EXTENDS';
+"nat" return 'NATTYPE';
+"if" return 'IF';
+"else" return 'ELSE';
+"for" return 'FOR';
+"printNat" return 'PRINTNAT';
+"readNat" return 'READNAT';
+"this" return 'THIS';
+"new" return 'NEW';
+"var" return 'VAR';
+"null" return 'NUL';
+{digit}+ return 'NATLITERAL';
+{id} return 'ID';
+"==" return 'EQUALITY';
+"=" return 'ASSIGN';
+"+" return 'PLUS';
+"-" return 'MINUS';
+"*" return 'TIMES';
+">" return 'GREATER';
+"||" return 'OR';
+"!" return 'NOT';
+"." return 'DOT';
+"{" return 'LBRACE';
+"}" return 'RBRACE';
+"(" return 'LPAREN';
+")" return 'RPAREN';
+";" return 'SEMICOLON';
+\s+ /* skip whitespace */
+. throw 'Illegal character';
+<<EOF>> return 'ENDOFFILE';
+
 diff --git a/examples/codegen-feature-tester-base.jison b/examples/codegen-feature-tester-base.jison
new file mode 100644
index 0000000..1cad86b
--- /dev/null
+++ b/examples/codegen-feature-tester-base.jison
@@ -0,0 +1,197 @@
+
+// %options backtrack_lexer
+
+%s PERCENT_ALLOWED
+
+%%
+
+// `%`: the grammar is not LALR(1) unless we make the lexer smarter and have
+// it disambiguate the `%` between `percent` and `modulo` functionality by
+// additional look-ahead:
+// we introduce a lexical predicate here to disambiguate the `%` and thus
+// keep the grammar LALR(1)!
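+// (A "lexical predicate" here means a zero-width test: the (?=...) look-ahead
+// in the rule below inspects the upcoming input without consuming it, so the
+// percent-vs-modulo decision is made while still emitting exactly one token.)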
+// https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions
+// we also use an (inclusive) lexical scope which turns this rule on only
+// immediately after a number was lexed previously.
+
+<PERCENT_ALLOWED>"%"(?=\s*(?:[^0-9)]|E\b|PI\b|$))
+                    // followed by another operator, i.e. anything that's
+                    // not a number, or The End: then this is a unary
+                    // `percent` operator.
+                    //
+                    // `1%-2` would be ambiguous but isn't: the `-` is
+                    // considered as a unary minus and thus `%` is a
+                    // `modulo` operator.
+                    //
+                    // `1%*5` thus is treated the same: any operator
+                    // following the `%` is assumed to be a *binary*
+                    // operator. Hence `1% times 5`, which brings us to
+                    // operators which only exist in unary form: `!`, and
+                    // values which are not numbers, e.g. `PI` and `E`:
+                    // how about
+                    // - `1%E`  -> modulo E,
+                    // - `1%!0` -> modulo 1 (as !0 -> 1)
+                    //
+                    // Of course, the easier way to handle this would be to
+                    // keep the lexer itself dumb and put this additional
+                    // logic inside a post_lex handler which should then be
+                    // able to obtain additional look-ahead tokens and queue
+                    // them for later, while using those to inspect and
+                    // adjust the lexer output now -- a trick which is used
+                    // in the cockroachDB SQL parser code, for example.
+                    //
+                    // The above regex solution however is a more local
+                    // extra-lookahead solution and thus should cost us less
+                    // overhead than the suggested post_lex alternative, but
+                    // it comes at a cost itself: a complex regex and
+                    // duplication of language knowledge in the lexer itself,
+                    // plus inclusion of *grammar* (syntactic) knowledge in
+                    // the lexer too, where it doesn't belong in an ideal
+                    // world...
+                    console.log('percent: ', yytext);
+                    return '%';
+
+<PERCENT_ALLOWED>.
+                    this.popState();
+                    this.unput(yytext);
+                    // this.unput(yytext); can be used here instead of
+                    // this.reject(); which would only work when we set the
+                    // backtrack_lexer option
+
+
+\s+                 /* skip whitespace */
+
+[0-9]+("."[0-9]+)?\b
+                    this.pushState('PERCENT_ALLOWED');
+                    return 'NUMBER';
+
+"*"                 return '*';
+"/"                 return '/';
+"-"                 return '-';
+"+"                 return '+';
+"^"                 return '^';
+"!"                 return '!';
+"%"                 return 'MOD';
+"("                 return '(';
+")"                 return ')';
+"PI"                return 'PI';
+"E"                 return 'E';
+<<EOF>>             return 'EOF';
+.                   return 'INVALID';
+
+
+
+%%
+
+// feature of the GH fork: specify your own main.
+//
+// compile with
+//
+//      jison -o test.js --main that/will/be/me.jison
+//
+// then run
+//
+//      node ./test.js
+//
+// to see the output.
+
+var assert = require("assert");
+
+
+var print = (typeof console !== 'undefined' ? function __print__() {
+    console.log.apply(null, ['  '].concat(Array.prototype.slice.call(arguments, 0)));
+} : function __dummy__() {});
+
+
+
+
+
+
+
+
+
+
+parser.pre_parse = function (yy) {
+    print("parsing: ", yy.lexer.upcomingInput(-1 /* i.e. produce the entire (unparsed) input string */));
+
+    parser.lexer.options.post_lex = function (token) {
+        print("lex() ==> ", token, '[' + this.yytext + ']', parser.describeSymbol(token));
+    };
+};
+
+
+
+if (0) {
+    parser.trace = function () {
+        print.apply(null, ['TRACE: '].concat(Array.prototype.slice.call(arguments, 0)));
+    };
+}
+
+
+
+parser.yy.parseError = function parseError(str, hash, ExceptionClass) {
+    assert(hash.yy);
+    assert(this);
+    assert(this !== parser.yy);
+    assert(this === hash.yy.parser || this === hash.yy.lexer);
+    if (hash.recoverable) {
+        hash.yy.parser.trace(str);
+        hash.yy.lastErrorMessage = str;
+        hash.yy.lastErrorHash = hash;
+    } else {
+        console.error(str, hash && hash.exception);
+        throw new ExceptionClass(str, hash);
+    }
+};
+
+
+
+%include benchmark.js
+
+
+
+
+parser.main = function () {
+    print("Running benchmark...");
+    var t1 = perf.start();
+
+    var basenum = 1;
+
+    function test() {
+        const formulas_and_expectations = [
+            basenum + '+2*(3-5--+--+6!)-7/-8%', 1523.5 + basenum,
+            basenum + '+2*0.7%^PI^2+4+5', 9 + basenum,   /* this bets on JS floating point calculations discarding the small difference with this integer value... */
+            basenum + '+(2+3*++++)+5+6+7+8+9 9', 74 + basenum,   // with error recovery and all it gives you a value...
+            basenum + '+2*(3!-5!-6!)/7/8', -29.785714285714285 + basenum,
+        ];
+
+        basenum++;
+
+        for (var i = 0, len = formulas_and_expectations.length; i < len; i += 2) {
+            var formula = formulas_and_expectations[i];
+            var expectation = formulas_and_expectations[i + 1];
+
+            var rv = parser.parse(formula);
+            print("'" + formula + "' ==> ", rv, "\n");
+            if (isNaN(rv) && isNaN(expectation)) {
+                assert(1);
+            } else {
+                assert.equal(rv, expectation);
+            }
+        }
+        return formulas_and_expectations.length / 2;
+    }
+
+    if (0) {
+        print = function dummy() {};
+    }
+    if (01) {
+        test();
+    } else {
+        bench(test);
+    }
+
+    // if you get past the assert(), you're good.
+    print("tested OK @", r(perf.mark(), 2), " ms");
+};
 diff --git a/examples/comments.jison b/examples/comments.jison
new file mode 100644
index 0000000..d88e1e4
--- /dev/null
+++ b/examples/comments.jison
@@ -0,0 +1,79 @@
+
+lineEnd (\n\r|\r\n|[\n\r])
+commentName ([a-zA-Z]+("|"|[a-zA-Z]+)*(?=[\s]*))
+%s area commentBody inlineCommentBody
+
+%%
+<area>("//"\n)
+    %{
+        yytext = '';
+        this.popState();
+        return 'areaEnd';
+    %}
+<area>(?=("//"|"/*"))
+    %{
+        this.popState();
+        return 'areaEnd';
+    %}
+<area>(.|{lineEnd})
+    %{
+        return 'areaString';
+    %}
+<area>(?=<<EOF>>)
+    %{
+        this.popState();
+        return 'areaEnd';
+    %}
+("//"){commentName}(?={lineEnd})
+    %{
+        this.begin('area');
+        yytext = getTypes(yytext.substring(2, yyleng));
+        return 'areaType';
+    %}
+
+
+<commentBody>("*/")
+    %{
+        this.popState();
+        return 'commentEnd';
+    %}
+<commentBody>(.|{lineEnd})
+    %{
+        return 'bodyString';
+    %}
+("/*"){commentName}
+    %{
+        this.begin('commentBody');
+        yytext = getTypes(yytext.substring(2, yyleng));
+        return 'commentType';
+    %}
+
+
+<inlineCommentBody>(.)
+    %{
+        return 'inlineBodyString';
+    %}
+<inlineCommentBody>(?={lineEnd})
+    %{
+        this.popState();
+        return 'inlineCommentEnd';
+    %}
+<inlineCommentBody>(?=<<EOF>>)
+    %{
+        this.popState();
+        return 'inlineCommentEnd';
+    %}
+"//"{commentName}
+    %{
+        this.begin('inlineCommentBody');
+        yytext = getTypes(yytext.substring(2, yyleng));
+        return 'inlineCommentType';
+    %}
+
+
+([A-Za-z0-9 .,?;]+) return 'string';
+([ ]) return 'string';
+{lineEnd} return 'string';
+(.) return 'string';
+<<EOF>> return 'eof';
 diff --git a/examples/compiled_calc_parse.jison b/examples/compiled_calc_parse.jison
new file mode 100644
index 0000000..c30b3f1
--- /dev/null
+++ b/examples/compiled_calc_parse.jison
@@ -0,0 +1,115 @@
+
+
+
+%import symbols "compiled_calc_AST_symbols.json5"
+
+
+
+
+//%options flex
+%options case-insensitive
+//%options xregexp
+//%options backtrack_lexer
+//%options ranges
+%options easy_keyword_rules
+
+
+%%
+
+// 1.0e7
+[0-9]+\.[0-9]*(?:[eE][-+]*[0-9]+)?\b
+    %{
+        yytext = parseFloat(yytext);
+        return 'NUM';
+    %}
+
+// .5e7
+[0-9]*\.[0-9]+(?:[eE][-+]*[0-9]+)?\b
+    %{
+        yytext = parseFloat(yytext);
+        return 'NUM';
+    %}
+
+// 5 / 3e4
+[0-9]+(?:[eE][-+]*[0-9]+)?\b
+    %{
+        yytext = parseFloat(yytext);
+        return 'NUM';
+    %}
+
+// reserved keywords:
+'and' return 'AND';
+'or' return 'OR';
+'xor' return 'XOR';
+'not' return 'NOT';
+'if' return 'IF';
+'then' return 'THEN';
+'else' return 'ELSE';
+
+
+// accept variable names with dots in them, e.g. `store.item`:
+[a-zA-Z_$]+[a-zA-Z_0-9.$]*\b
+    %{
+        var rv = lookup_constant(yytext);
+        if (rv) {
+            yytext = rv;
+            return 'CONSTANT';
+        }
+        rv = lookup_function(yytext);
+        if (rv) {
+            yytext = rv;
+            return 'FUNCTION';
+        }
+        rv = lookup_or_register_variable(yytext);
+        yytext = rv;
+        return 'VAR';
+    %}
+
+\/\/.* yytext = yytext.substr(2).trim(); return 'COMMENT';   // C++-style comment
+\/\*[\s\S]*?\*\/ yytext = yytext.substring(2, yyleng - 2).trim(); return 'COMMENT';   // C-style multi-line comment
+
+'===' return 'EQ';
+'==' return 'EQ';
+'!=' return 'NEQ';
+'<=' return 'LEQ';
+'>=' return 'GEQ';
+
+'||' return 'OR';
+'^^' return 'XOR';
+'&&' return 'AND';
+
+'**' return 'POWER';   /* Exponentiation */
+
+'<' return 'LT';
+'>' return 'GT';
+
+'=' return '=';
+'-' return '-';
+'+' return '+';
+'*' return '*';
+'/' return '/';
+'(' return '(';
+')' return ')';
+',' return ',';
+'!' return '!';
+'%' return '%';
+'~' return '~';
+
+'?' return '?';   // IF
+':' return ':';   // ELSE
+
+'|' return '|';
+'^' return '^';
+'&' return '&';
+
+
+\\[\r\n]     // accept C-style line continuation: ignore this bit.
+
+[\r\n] return 'EOL';
+
+[^\S\r\n]+   // ignore whitespace
+
+<<EOF>> return 'EOF';
+. return 'INVALID';
+
 diff --git a/examples/faking-multiple-start-rules-alt.jison b/examples/faking-multiple-start-rules-alt.jison
new file mode 100644
index 0000000..cea8390
--- /dev/null
+++ b/examples/faking-multiple-start-rules-alt.jison
@@ -0,0 +1,559 @@
+
+
+// Off Topic
+// ---------
+//
+// Do not specify the xregexp option as we want the XRegExp \p{...} regex macros converted to
+// native regexes and used as such:
+//
+//     %options xregexp
+
+
+/*
+ * We have several 'lexer states', all of which are defined here: `%x` means it's an _exclusive_ lexer state, while
+ * JISON considers `%s` states to be _inclusive_, i.e. states which include the set of unmarked lexer rules alongside
+ * the ones that are marked up as belonging to the given state.
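+ *
+ * A minimal illustrative sketch (hypothetical state and rule names, not part
+ * of this grammar):
+ *
+ *     %x STR        // exclusive: only <STR>-marked rules apply while in STR
+ *     %s EXPR       // inclusive: unmarked rules stay active alongside
+ *     %%
+ *     <STR>[^"]+    return 'CHARS';
+ *     <EXPR>","     return ',';
+ *     [0-9]+        return 'NUM';    // also tried in EXPR, but not in STR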
+ */ + +%x PARSE_MODE_DETECTION +%s VALUE_MODE + + +ASCII_LETTER [a-zA-z] +// \p{Alphabetic} already includes [a-zA-z], hence we don't need to merge with {ASCII_LETTER}: +UNICODE_LETTER [\p{Alphabetic}] +DIGIT [\p{Number}] +WHITESPACE [\s\r\n\p{Separator}] + +// Match simple floating point values, for example `1.0`, but also `9.`, `.05` or just `7`: +BASIC_FLOATING_POINT_NUMBER (?:[0-9]+(?:"."[0-9]*)?|"."[0-9]+) + + + +%% + +/* + * A word on the `PARSE_MODE_DETECTION` 'hack' / We have multiple `%start` Rules + * ----------------------------------------------------------------------------- + * + * The `PARSE_MODE_DETECTION` mode is a parser/lexer communications hack to give us multiple start rules, i.e. + * we use this hack as the code generator (JISON) does not support multiple `%start` rules. + * + * We 'hack' this feature into the grammar by setting up a start rules which first checks which start + * rule we really desire and then goes and tweaks the input fed to the lexer (and switches to the + * `PARSE_MODE_DETECTION` mode alongside) to help the lexer 'fake' a token which the parser can then + * use to switch to the desired start rule. + * + * As the hack involves using the JISON lexer `.unput()` method at the very beginning of the parsing/lexing + * process, the 'hack' byte which is meant to tickle the lexer as described above, lands in NEGATIVE `yylloc` + * space. In other words: the hack does not damage the input position information of the real text/input + * being lexed/parsed subsequently. + * + * The intricacies of the 'hack' involve: + * + * - a *grammar* subrule to set it all up, which itself does not require any lexer tokens (is 'empty') nor any + * look-ahead, thus allowing the parser to 'reduce' this `init_phase` rule without having had to call the + * lexer *yet*. This means that any parser action code attached to this `init_phase` rule will execute + * before the lexer is demanded to deliver any tokens. + * + * - us pushing a special character value as a prefix of the lexed input via `.unput()`: this character is + * later recognized by the lexer and produces a special token which is used to direct the parser + * towards the desired 'start rule'. + * + * The crux here is that we do not want any look-ahead or other lexer tokenization activity before we have + * been able to set up the context for switching to a particular start rule. + * + * To protect against the 'magic characters' `\u0001 .. \u0003` occurring in (possibly malicious/illegal) input, we use a + * lexer mode which will only be used at the very start of the parse process: `PARSE_MODE_DETECTION`. + */ + +"\u0001" + %{ + this.popState(); + console.log('detected date mode'); + return 'DATE_PARSE_MODE'; + %} + +"\u0002" + %{ + this.popState(); + console.log('detected time mode'); + return 'TIME_PARSE_MODE'; + %} + +"\u0003" + %{ + this.popState(); + console.log('detected value mode'); + this.pushState('VALUE_MODE'); + return 'VALUE_PARSE_MODE'; + %} + +/* + * Catch all other possible initial input characters, make sure we do not consume them and + * process the input in the default parse mode: `INITIAL` + */ +. + %{ + this.popState(); + console.log('detected DEFAULT (value) mode'); + /* + * When we did not observe one of the special character codes at the forefront of our + * input stream then we will parsing the entire input in the default mode, i.e. as a numeric value. + * + * Therefore, let the previous lexer state (should be `INITIAL`) process this bit instead; + * do not consume the matched input. 
+ * + * **WARNING**: you might think this would be easily accomplished using the lexer.reject() + * call like this: + * + * this.reject(); + * + * but `reject()` only works as expected _as long as you do NOT switch lexer states_! + * + * Some understanding of the lexer internals is required here: when you call `reject()`, the + * lexer will simply test the input against the next regex in the current set. The key here + * is _the current set_: when the lexer is required to produce a token, it will construct + * a _regex set_ given the _current lexer state_. + * + * What we need here is the lexer retrying matching the same input after we changed the + * lexer state above when we called: + * + * this.popState(); + * + * The way to accomplish this is to 'push back' the matched content into the input buffer using + * `.unput()` and then signal the lexer that we matched nothing by returning no token at all: + * + * return false; + * + * That `return false` will make sure the lexer considers this action as 'complete' (by + * us `return`ing from the lexer), while the boolean `false` tells the lexer it will need + * to run another round in order to provide its caller with a 'real' lexed token. + * + * + * ### For the technically inquisitive + * + * The crux is us employing the side effects of the jison lexer engine, + * more specifically this bit, where I'd like you to take notice of + * the recursive nature of the `.lex()` method in here, plus the fact that `.next()` + * will call `._currentRules()` each time it is invoked (while this is a very much + * reduced and somewhat paraphrased extract of the original): + * + * // generated by jison-lex... + * parser.lexer = { + * ..., + * next: function () { + * ... + * var match, token, rule_under_test; + * var rules = this._currentRules(); + * for (var i = 0; i < rules.length; i++) { + * rule_under_test = this.rules[rules[i]]; + * match = this._input.match(rule_under_test); + * ... + * if (match) { + * // exec the matching lexer action code: + * token = this.test_match(match, rule_under_test); + * + * // stay in this loop when .reject() was called, + * // otherwise we'll run with this match: + * if (!this.rejected) break; + * } + * } + * if (match) { + * ... + * if (token !== false) { + * return token; + * } + * // else: this is a lexer rule which consumes input + * // without producing a token (e.g. whitespace) + * return false; + * } + * ... + * }, + * + * // return next match that has a token + * lex: function lex() { + * var r = this.next(); + * if (r) { + * return r; + * } else { + * return this.lex(); + * } + * }, + * + * // produce the lexer rule set which is active + * // for the currently active lexer condition state + * _currentRules: function _currentRules() { + * ... + * return this.conditions[...].rules; + * }, + * + * ... + * + * conditions: { + * "PARSE_MODE_DETECTION": { + * rules: [ + * 0, 1, 2, 3, 4 + * ], + * inclusive: false + * }, + * ... + * "INITIAL": { + * rules: [ + * 5, 6, 7, 8, 9, + * ... + * ], + * inclusive: true + * } + * } + * }; + * + */ + this.unput(this.matches[0]); + + // Pick the default parse mode: + this.pushState('VALUE_MODE'); + return 'VALUE_PARSE_MODE'; + %} + +<> + %{ + this.popState(); + // let the previous lexer state process that EOF for real... + return false; + %} + + + + + +/* + * And here our lexer rule sets starts for real... 
+ * ----------------------------------------------- + */ + + + + +{UNICODE_LETTER}+ + %{ + return 'MONTH'; + %} + + +// As the {BASIC_FLOATING_POINT_NUMBER} can also lex a year or hour or other integer number, +// we use a lexer condition to help us only recognize floating numbers when we actually expect +// them: + +{BASIC_FLOATING_POINT_NUMBER} + %{ + yytext = parseFloat(yytext); + return 'FLOAT'; + %} + +// Recognize any integer value, e.g. 2016 +{DIGIT}+ + %{ + yytext = parseInt(yytext, 10); + return 'INTEGER'; + %} + +'-' return '-'; +'+' return '+'; +'/' return '/'; +':' return ':'; +'.' return '.'; + + + + +/* + * The sag wagon, which mops up the dregs + * -------------------------------------- + */ + +\s+ /*: skip whitespace */ + +<> return 'EOF'; + +. + %{ + yy.lexer.parseError("Don't know what to do with this: it's unsupported input: '" + yytext + "'"); + return 'error'; + %} + + + + + + +/* + * And here endeth the parser proper + * --------------------------------- + * + * This concludes the grammar rules definitions themselves. + * What follows is a chunk of support code that JISON will include in the generated parser. + */ + + +%% + + + +/* + * This chunk is included in the parser object code, + * following the 'init' code block that may be set in `%{ ... %}` at the top of this + * grammar definition file. + */ + + + +/* @const */ var DATE_MODE = 'D'; +/* @const */ var TIME_MODE = 'T'; +/* @const */ var VALUE_MODE = 'V'; + +var parseModeInitialized = 0; + +function initParseMode(yy, parser_mode) { + /* + * The 'init phase' is always invoked for every parse invocation. + * + * At this point in time, nothing has happened yet: no token has + * been lexed, no real statement has been parsed yet. + */ + + /* + * Depending on parser mode we must push a 'magick marker' into the lexer stream + * which is a hack offering a working alternative to having the parser generator + * support multiple %start rules. + */ + yy.lexer.pushState('PARSE_MODE_DETECTION'); + + + parseModeInitialized = 1; + + + // prevent crash in lexer as the look-ahead activity in there may already have + // changed yytext to become another type (not string any more): + //yy.lexer.yytext = yy.lexer.match; + + + switch (parser_mode) { + default: + break; + + case DATE_MODE: + yy.lexer.unput("\u0001"); + break; + + case TIME_MODE: + yy.lexer.unput("\u0002"); + break; + + case VALUE_MODE: + yy.lexer.unput("\u0003"); + break; + } +}; + + + + + +// Demo main() to showcase the use of this example: +parser.main = function () { + // set up a custom parseError handler. + // + // Note that this one has an extra feature: it returns a special (truthy) 'parse value' + // which will be returned by the parse() call when this handler was invoked: this is + // very useful to quickly make the parse process a known result even when errors have + // occurred: + parser.parseError = function altParseError(msg, hash) { + if (hash && hash.exception) { + msg = hash.exception.message; + //console.log('ex:', hash.exception, hash.exception.stack); + } + console.log("### ERROR: " + msg); + return { + error: 'parse error' + }; + }; + + // Because JISON doesn't reduce epsilon rules *before* the next non-epsilon rule is + // inspected, i.e. 
JISON *always* fetches a single look-ahead token from the lexer, + // and we cannot have that as we need to push back the parse mode marker *before* + // anything is lexed in there, so we have to complicate this hack by hooking into + // the lexer and making it spit out dummy tokens until we observe the parse mode + // being set up via initParseMode() above... + parser.lexer.options.pre_lex = function () { + // This callback can return a token ID to prevent the lexer from munching any + // input: + if (!parseModeInitialized) { + //console.log('pre_lex pre init --> DUMMY'); + return parser.symbols_['DUMMY']; + } + //console.log('pre_lex'); + }; + + // And hook into setInput AI to reset the global flag... + var si_f = parser.lexer.setInput; + parser.lexer.setInput = function (input, yy) { + parseModeInitialized = 0; + + console.log('setting input: ', input); + + return si_f.call(this, input, yy); + }; + + // End of fixup to make the hack work. + // + // Compare this with the code in the faking-multiple-start-rules.jison example which + // employs the pre_parse() API: that one is much cleaner than this! + + + console.log('\n\nUsing number parse mode:\n'); + + var input = '-7.42'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, VALUE_MODE) + }, null, 2)); + + console.log('\n\nErrors in this mode:\n'); + + input = '2016-03-27'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, VALUE_MODE) + }, null, 2)); + + input = '2016 march 13'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, VALUE_MODE) + }, null, 2)); + + input = '17:05'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, VALUE_MODE) + }, null, 2)); + + input = '08:30:22'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, VALUE_MODE) + }, null, 2)); + + + + console.log('\n\nUsing date parse mode:\n'); + + input = '2016-03-27'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, DATE_MODE) + }, null, 2)); + + input = '2016 march 13'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, DATE_MODE) + }, null, 2)); + + console.log('\n\nErrors in this mode:\n'); + + input = '-7.42'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, DATE_MODE) + }, null, 2)); + + input = '17:05'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, DATE_MODE) + }, null, 2)); + + input = '08:30:22'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, DATE_MODE) + }, null, 2)); + + + + console.log('\n\nUsing time parse mode:\n'); + + input = '17:05'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, TIME_MODE) + }, null, 2)); + + input = '08:30:22'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, TIME_MODE) + }, null, 2)); + + console.log('\n\nErrors in this mode:\n'); + + input = '-7.42'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, TIME_MODE) + }, null, 2)); + + input = '2016-03-27'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, TIME_MODE) + }, null, 2)); + + input = '2016 march 13'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input, TIME_MODE) + }, null, 2)); + + + + + console.log('\n\nUsing DEFAULT parse mode:\n'); + + input = '-7.42'; + console.log(JSON.stringify({ + 
input: input, + parse_result: parser.parse(input) + }, null, 2)); + + console.log('\n\nErrors in this mode:\n'); + + input = '2016-03-27'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input) + }, null, 2)); + + input = '2016 march 13'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input) + }, null, 2)); + + input = '17:05'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input) + }, null, 2)); + + input = '08:30:22'; + console.log(JSON.stringify({ + input: input, + parse_result: parser.parse(input) + }, null, 2)); + + + +}; diff --git a/examples/floop.l b/examples/floop.l new file mode 100644 index 0000000..a83810c --- /dev/null +++ b/examples/floop.l @@ -0,0 +1,53 @@ +ID [A-Z-]+"?"? +NUM ([1-9][0-9]+|[0-9]) + +%options flex case-insensitive + +%% + +\s+ /* ignore */ +{NUM} return 'NUMBER' + +DEFINE return 'DEFINE' +PROCEDURE return 'PROCEDURE' +BLOCK return 'BLOCK' +BEGIN return 'BEGIN' +OUTPUT return 'OUTPUT' +CELL return 'CELL' +IF return 'IF' +THEN return 'THEN' +LOOP return 'LOOP' +"MU-LOOP" return yy.bloop ? 'INVALID' : 'MU_LOOP' +AT return 'AT' +MOST return 'MOST' +TIMES return 'TIMES' +ABORT return 'ABORT' +END return 'END' +QUIT return 'QUIT' +AND return 'AND' +YES return 'YES' +NO return 'NO' +{ID} return 'IDENT' +"." return '.' +"''" return 'QUOTE' +"[" return '[' +"]" return ']' +"(" return '(' +")" return ')' +"{" return '{' +"}" return '}' +":" return ':' +";" return ';' +"," return ',' +"+" return '+' +"*" return '*' +"×" return '*' //non-ascii +"<=" return '<=' +"⇐" return '<=' //non-ascii +"<" return '<' +">" return '>' +"=" return '=' +<> return 'EOF' +. return 'INVALID' + + diff --git a/examples/handlebars.jison.l b/examples/handlebars.jison.l new file mode 100644 index 0000000..2cbebba --- /dev/null +++ b/examples/handlebars.jison.l @@ -0,0 +1,136 @@ +/* + * Handlebars template language definition. + * + * This example has split the lexer and parser/grammar sections of the language definition into separate files a la lex/yacc. + * + * Use Jison to generate a parser from this example, e.g.: + * $ jison handlebars.jison.l handlebars.jison.y + */ + + +%x mu emu com raw + +%{ + +function strip(start, end) { + return yytext = yytext.substr(start, yyleng-end); +} + +%} + +LEFT_STRIP "~" +RIGHT_STRIP "~" + +LOOKAHEAD [=~}\s\/.)|] +LITERAL_LOOKAHEAD [~}\s)] + +/* +ID is the inverse of control characters. +Control characters ranges: + [\s] Whitespace + [!"#%-,\./] !, ", #, %, &, ', (, ), *, +, ,, ., /, Exceptions in range: $, - + [;->@] ;, <, =, >, @, Exceptions in range: :, ? + [\[-\^`] [, \, ], ^, `, Exceptions in range: _ + [\{-~] {, |, }, ~ +*/ +ID [^\s!"#%-,\.\/;->@\[-\^`\{-~]+/{LOOKAHEAD} + +%% + +[^\x00]*?/("{{") { + if(yytext.slice(-2) === "\\\\") { + strip(0,1); + this.begin("mu"); + } else if(yytext.slice(-1) === "\\") { + strip(0,1); + this.begin("emu"); + } else { + this.begin("mu"); + } + if(yytext) return 'CONTENT'; + } + +[^\x00]+ return 'CONTENT'; + +// marks CONTENT up to the next mustache or escaped mustache +[^\x00]{2,}?/("{{"|"\\{{"|"\\\\{{"|<>) { + this.popState(); + return 'CONTENT'; + } + +// nested raw block will create stacked 'raw' condition +"{{{{"/[^/] this.begin('raw'); return 'CONTENT'; +"{{{{/"[^\s!"#%-,\.\/;->@\[-\^`\{-~]+/[=}\s\/.]"}}}}" { + this.popState(); + // Should be using `this.topState()` below, but it currently + // returns the second top instead of the first top. 
Opened an + // issue about it at https://github.com/zaach/jison/issues/291 + if (this.conditionStack[this.conditionStack.length-1] === 'raw') { + return 'CONTENT'; + } else { + yytext = yytext.substr(5, yyleng-9); + return 'END_RAW_BLOCK'; + } + } +[^\x00]*?/("{{{{") { return 'CONTENT'; } + +[\s\S]*?"--"{RIGHT_STRIP}?"}}" { + this.popState(); + return 'COMMENT'; +} + +"(" return 'OPEN_SEXPR'; +")" return 'CLOSE_SEXPR'; + +"{{{{" { return 'OPEN_RAW_BLOCK'; } +"}}}}" { + this.popState(); + this.begin('raw'); + return 'CLOSE_RAW_BLOCK'; + } +"{{"{LEFT_STRIP}?">" return 'OPEN_PARTIAL'; +"{{"{LEFT_STRIP}?"#>" return 'OPEN_PARTIAL_BLOCK'; +"{{"{LEFT_STRIP}?"#""*"? return 'OPEN_BLOCK'; +"{{"{LEFT_STRIP}?"/" return 'OPEN_ENDBLOCK'; +"{{"{LEFT_STRIP}?"^"\s*{RIGHT_STRIP}?"}}" this.popState(); return 'INVERSE'; +"{{"{LEFT_STRIP}?\s*"else"\s*{RIGHT_STRIP}?"}}" this.popState(); return 'INVERSE'; +"{{"{LEFT_STRIP}?"^" return 'OPEN_INVERSE'; +"{{"{LEFT_STRIP}?\s*"else" return 'OPEN_INVERSE_CHAIN'; +"{{"{LEFT_STRIP}?"{" return 'OPEN_UNESCAPED'; +"{{"{LEFT_STRIP}?"&" return 'OPEN'; +"{{"{LEFT_STRIP}?"!--" { + this.unput(yytext); + this.popState(); + this.begin('com'); +} +"{{"{LEFT_STRIP}?"!"[\s\S]*?"}}" { + this.popState(); + return 'COMMENT'; +} +"{{"{LEFT_STRIP}?"*"? return 'OPEN'; + +"=" return 'EQUALS'; +".." return 'ID'; +"."/{LOOKAHEAD} return 'ID'; +[\/.] return 'SEP'; +\s+ // ignore whitespace +"}"{RIGHT_STRIP}?"}}" this.popState(); return 'CLOSE_UNESCAPED'; +{RIGHT_STRIP}?"}}" this.popState(); return 'CLOSE'; +'"'("\\"["]|[^"])*'"' yytext = strip(1,2).replace(/\\"/g,'"'); return 'STRING'; +"'"("\\"[']|[^'])*"'" yytext = strip(1,2).replace(/\\'/g,"'"); return 'STRING'; +"@" return 'DATA'; +"true"/{LITERAL_LOOKAHEAD} return 'BOOLEAN'; +"false"/{LITERAL_LOOKAHEAD} return 'BOOLEAN'; +"undefined"/{LITERAL_LOOKAHEAD} return 'UNDEFINED'; +"null"/{LITERAL_LOOKAHEAD} return 'NULL'; +\-?[0-9]+(?:\.[0-9]+)?/{LITERAL_LOOKAHEAD} return 'NUMBER'; +"as"\s+"|" return 'OPEN_BLOCK_PARAMS'; +"|" return 'CLOSE_BLOCK_PARAMS'; + +{ID} return 'ID'; + +'['('\\]'|[^\]])*']' yytext = yytext.replace(/\\([\\\]])/g,'$1'); return 'ID'; +. return 'INVALID'; + +<> return 'EOF'; + diff --git a/examples/issue-19-jison_lex-fixed.jison b/examples/issue-19-jison_lex-fixed.jison new file mode 100644 index 0000000..efccaa1 --- /dev/null +++ b/examples/issue-19-jison_lex-fixed.jison @@ -0,0 +1,67 @@ + +%x MLC + + +%% + +\s+ /* skip whitespace */ +a return 'ID'; +';' %{ + this.begin("MLC"); /* corrected... */ + return ';'; + %} +. return 'ERROR'; + +%% + +// feature of the GH fork: specify your own main. +// +// compile with +// +// jison -o test.js --main that/will/be/me.jison +// +// then run +// +// node ./test.js +// +// to see the output. + +var assert = require("assert"); + +parser.main = function () { + // set up an aborting error handler which does not throw an exception + // but returns a special parse 'result' instead: + var errmsg = null; + var errReturnValue = '@@@'; + parser.yy.parseError = function (msg, hash) { + console.log("ERROR: ", msg); + errmsg = msg; + return errReturnValue + (hash.parser ? 
hash.value_stack.slice(0, hash.stack_pointer).join('.') : '???'); + }; + + var rv = parser.parse(';aa;'); + console.log("test #1: ';aa;' ==> ", rv); + assert.equal(rv, '@@@.;.a.a.;'); + + rv = parser.parse(';;a;'); + console.log("test #2: ';;a;' ==> ", rv); + assert.equal(rv, '@@@.;.;.a.;'); + + console.log("\nAnd now the failing inputs: even these deliver a result:\n"); + + rv = parser.parse('a;'); + console.log("test #3: 'a;' ==> ", rv); + assert.equal(rv, '@@@'); + + rv = parser.parse('a'); + console.log("test #4: 'a' ==> ", rv); + assert.equal(rv, '@@@'); + + rv = parser.parse('b'); + console.log("test #5: 'b' ==> ", rv); + assert.equal(rv, '@@@'); + + // if you get past the assert(), you're good. + console.log("tested OK"); +}; + diff --git a/examples/issue-19-jison_lex.jison b/examples/issue-19-jison_lex.jison new file mode 100644 index 0000000..6f60530 --- /dev/null +++ b/examples/issue-19-jison_lex.jison @@ -0,0 +1,68 @@ + +%x MLC + + +%% + +\s+ /* skip whitespace */ +a return 'ID'; +';' %{ + this.begin("MUTLI"); /* intentional error in condition name string! */ + return ';'; + %} +. return 'ERROR'; + + +%% + +// feature of the GH fork: specify your own main. +// +// compile with +// +// jison -o test.js --main that/will/be/me.jison +// +// then run +// +// node ./test.js +// +// to see the output. + +var assert = require("assert"); + +parser.main = function () { + // set up an aborting error handler which does not throw an exception + // but returns a special parse 'result' instead: + var errmsg = null; + var errReturnValue = '@@@'; + parser.yy.parseError = function (msg, hash) { + console.log("ERROR: ", msg); + errmsg = msg; + return errReturnValue + (hash.parser ? hash.value_stack.slice(0, hash.stack_pointer).join('.') : '???'); + }; + + var rv = parser.parse(';aa;'); + console.log("test #1: ';aa;' ==> ", rv); + assert.equal(rv, '@@@.;'); + + rv = parser.parse(';;a;'); + console.log("test #2: ';;a;' ==> ", rv); + assert.equal(rv, '@@@.;'); + + console.log("\nAnd now the failing inputs: even these deliver a result:\n"); + + rv = parser.parse('a;'); + console.log("test #3: 'a;' ==> ", rv); + assert.equal(rv, '@@@'); + + rv = parser.parse('a'); + console.log("test #4: 'a' ==> ", rv); + assert.equal(rv, '@@@'); + + rv = parser.parse('b'); + console.log("test #5: 'b' ==> ", rv); + assert.equal(rv, '@@@'); + + // if you get past the assert(), you're good. + console.log("tested OK"); +}; + diff --git a/examples/issue-357-url-lexing.jison b/examples/issue-357-url-lexing.jison new file mode 100644 index 0000000..75e318c --- /dev/null +++ b/examples/issue-357-url-lexing.jison @@ -0,0 +1,45 @@ + +%% + +// You can either encapsulate literal ':' colons in quotes or doublequotes, but another way is to +// wrap these in a regex set: `[:]` as shown below: +(ftp|http|https)[:]\/\/(\w+[:]{0,1}\w*@)?(\S+)([:][0-9]+)?(\/|\/([\w#!:.?+=&%@!\-\/]))? return 'URL'; + + +\s+ /* skip whitespace */ +[a-zA-Z]+ return 'ID'; +[^a-zA-Z\s\r\n]+ return 'OTHER'; +. return 'MISC'; + + + + +%% + +// feature of the GH fork: specify your own main. +// +// compile with +// +// jison -o test.js --main that/will/be/me.jison +// +// then run +// +// node ./test.js +// +// to see the output. + +var assert = require("assert"); + +parser.main = function () { + var rv = parser.parse('a+b'); + console.log("test #1: 'a+b' ==> ", rv, parser.yy); + // assert.equal(rv, '+aDabX:a'); + + rv = parser.parse('a-b'); + console.log("test #2: 'a-b' ==> ", rv); + // assert.equal(rv, 'XE'); + + // if you get past the assert(), you're good. 
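+
+    // Extra (hypothetical) probe, logged only: an embedded absolute URL should
+    // be matched by the URL rule above instead of being split into ID/OTHER bits:
+    rv = parser.parse('fetch http://example.com:8080/index now');
+    console.log("test #3: URL sample ==> ", rv);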
+ console.log("tested OK"); +}; + diff --git a/examples/lex.l b/examples/lex.l index 515984d..cc01388 100644 --- a/examples/lex.l +++ b/examples/lex.l @@ -1,90 +1,296 @@ -NAME [a-zA-Z_][a-zA-Z0-9_-]* -BR \r\n|\n|\r +ASCII_LETTER [a-zA-z] +// \p{Alphabetic} already includes [a-zA-z], hence we don't need to merge +// with {UNICODE_LETTER} (though jison has code to optimize if you *did* +// include the `[a-zA-Z]` anyway): +UNICODE_LETTER [\p{Alphabetic}] +ALPHA [{UNICODE_LETTER}_] +DIGIT [\p{Number}] +WHITESPACE [\s\r\n\p{Separator}] +ALNUM [{ALPHA}{DIGIT}] + +NAME [{ALPHA}](?:[{ALNUM}-]*{ALNUM})? +ID [{ALPHA}]{ALNUM}* +DECIMAL_NUMBER [1-9][0-9]* +HEX_NUMBER "0"[xX][0-9a-fA-F]+ +BR \r\n|\n|\r +// WhiteSpace MUST NOT match CR/LF and the regex `\s` DOES, so we cannot use +// that one directly. Instead we define the {WS} macro here: +WS [^\S\r\n] + +// Quoted string content: support *escaped* quotes inside strings: +QUOTED_STRING_CONTENT (?:\\\'|\\[^\']|[^\\\'])* +DOUBLEQUOTED_STRING_CONTENT (?:\\\"|\\[^\"]|[^\\\"])* + +// Accept any non-regex-special character as a direct literal without +// the need to put quotes around it: +ANY_LITERAL_CHAR [^\s\r\n<>\[\](){}.*+?:!=|%\/\\^$,\'\";] + + +%s indented trail rules macro +%x code start_condition options conditions action path set + + +%options easy_keyword_rules +//%options ranges +%options xregexp + -%s indented trail rules -%x code start_condition options conditions action %% -"/*"(.|\n|\r)*?"*/" return 'ACTION_BODY'; -"//".* return 'ACTION_BODY'; -"/"[^ /]*?['"{}'][^ ]*?"/" return 'ACTION_BODY'; // regexp with braces or quotes (and no spaces) -\"("\\\\"|'\"'|[^"])*\" return 'ACTION_BODY'; -"'"("\\\\"|"\'"|[^'])*"'" return 'ACTION_BODY'; -[/"'][^{}/"']+ return 'ACTION_BODY'; -[^{}/"']+ return 'ACTION_BODY'; -"{" yy.depth++; return '{' -"}" yy.depth == 0 ? this.begin('trail') : yy.depth--; return '}' - -{NAME} return 'NAME'; -">" this.popState(); return '>'; -"," return ','; -"*" return '*'; - -{BR}+ /* */ -\s+{BR}+ /* */ -\s+ this.begin('indented') -"%%" this.begin('code'); return '%%' -[a-zA-Z0-9_]+ return 'CHARACTER_LIT' - -{NAME} yy.options[yytext] = true -{BR}+ this.begin('INITIAL') -\s+{BR}+ this.begin('INITIAL') -\s+ /* empty */ - -{NAME} return 'START_COND' -{BR}+ this.begin('INITIAL') -\s+{BR}+ this.begin('INITIAL') -\s+ /* empty */ - -.*{BR}+ this.begin('rules') - -"{" yy.depth = 0; this.begin('action'); return '{' -"%{"(.|{BR})*?"%}" this.begin('trail'); yytext = yytext.substr(2, yytext.length-4);return 'ACTION' -"%{"(.|{BR})*?"%}" yytext = yytext.substr(2, yytext.length-4); return 'ACTION' -.+ this.begin('rules'); return 'ACTION' - -"/*"(.|\n|\r)*?"*/" /* ignore */ -"//".* /* ignore */ - -{BR}+ /* */ -\s+ /* */ -{NAME} return 'NAME'; -\"("\\\\"|'\"'|[^"])*\" yytext = yytext.replace(/\\"/g,'"'); return 'STRING_LIT'; -"'"("\\\\"|"\'"|[^'])*"'" yytext = yytext.replace(/\\'/g,"'"); return 'STRING_LIT'; -"|" return '|'; -"["("\\\\"|"\]"|[^\]])*"]" return 'ANY_GROUP_REGEX'; -"(?:" return 'SPECIAL_GROUP'; -"(?=" return 'SPECIAL_GROUP'; -"(?!" return 'SPECIAL_GROUP'; -"(" return '('; -")" return ')'; -"+" return '+'; -"*" return '*'; -"?" return '?'; -"^" return '^'; -"," return ','; -"<>" return '$'; -"<" this.begin('conditions'); return '<'; -"/!" return '/!'; -"/" return '/'; -"\\"([0-7]{1,3}|[rfntvsSbBwWdD\\*+()${}|[\]\/.^?]|"c"[A-Z]|"x"[0-9A-F]{2}|"u"[a-fA-F0-9]{4}) return 'ESCAPE_CHAR'; -"\\". yytext = yytext.replace(/^\\/g,''); return 'ESCAPE_CHAR'; -"$" return '$'; -"." 
return '.'; -"%options" yy.options = {}; this.begin('options'); -"%s" this.begin('start_condition'); return 'START_INC'; -"%x" this.begin('start_condition'); return 'START_EXC'; -"%%" this.begin('rules'); return '%%'; -"{"\d+(","\s?\d+|",")?"}" return 'RANGE_REGEX'; -"{"{NAME}"}" return 'NAME_BRACE'; -"{" return '{'; -"}" return '}'; -. /* ignore bad characters */ -<*><> return 'EOF'; - -(.|{BR})+ return 'CODE'; +"/*"[^]*?"*/" return 'ACTION_BODY'; +"//".* return 'ACTION_BODY'; +// regexp with braces or quotes (and no spaces, so we don't mistake +// a *division operator* `/` for a regex delimiter here in most circumstances): +"/"[^\s/]*?['"{}][^\s]*?"/" return 'ACTION_BODY'; +\"{DOUBLEQUOTED_STRING_CONTENT}\" + return 'ACTION_BODY'; +\'{QUOTED_STRING_CONTENT}\' + return 'ACTION_BODY'; +[/"'][^{}/"']+ return 'ACTION_BODY'; +[^{}/"']+ return 'ACTION_BODY'; +"{" yy.depth++; return '{'; +"}" %{ + if (yy.depth == 0) { + this.popState(); + this.pushState('trail'); + } else { + yy.depth--; + } + return '}'; + %} + +{NAME} return 'NAME'; +">" this.popState(); return '>'; +"," return ','; +"*" return '*'; + +{BR}+ /* empty */ +{WS}+{BR}+ /* empty */ +{WS}+ this.pushState('indented'); +"%%" this.popState(); this.pushState('code'); return '%%'; +// Accept any non-regex-special character as a direct literal without +// the need to put quotes around it: +{ANY_LITERAL_CHAR}+ + %{ + // accept any non-regex, non-lex, non-string-delim, + // non-escape-starter, non-space character as-is + return 'CHARACTER_LIT'; + %} +{NAME} return 'NAME'; +"=" return '='; +\"{DOUBLEQUOTED_STRING_CONTENT}\" + yytext = unescQuote(this.matches[1]); return 'OPTION_STRING_VALUE'; // value is always a string type +\'{QUOTED_STRING_CONTENT}\' + yytext = unescQuote(this.matches[1]); return 'OPTION_STRING_VALUE'; // value is always a string type + +// Comments should be gobbled and discarded anywhere *except* the code/action blocks: +"//"[^\r\n]* + /* skip single-line comment */ +"/*"(.|\n|\r)*?"*/" + /* skip multi-line comment */ + +[^\s\r\n]+ return 'OPTION_VALUE'; +{BR}{WS}+(?=\S) /* skip leading whitespace on the next line of input, when followed by more options */ +{BR} this.popState(); return 'OPTIONS_END'; +{WS}+ /* skip whitespace */ + +{ID} return 'START_COND'; +{BR}+ this.popState(); +{WS}+ /* empty */ + +{WS}*{BR}+ this.popState(); this.unput(yytext); /* this.unput(yytext); can be used here instead of this.reject(); which would only work when we set the backtrack_lexer option */ + +{WS}*{BR}+ this.popState(); +"{" yy.depth = 0; this.pushState('action'); return '{'; +"%{"((?:.|{BR})*?)"%}" this.pushState('trail'); yytext = this.matches[1]; return 'ACTION'; +"%{"((?:.|{BR})*?)"%}" yytext = this.matches[1]; return 'ACTION'; +"%include" %{ + // This is an include instruction in place of an action: + // thanks to the `.+` rule immediately below we need to semi-duplicate + // the `%include` token recognition here vs. the almost-identical rule for the same + // further below. + // There's no real harm as we need to do something special in this case anyway: + // push 2 (two!) conditions. + // + // (Anecdotal: to find that we needed to place this almost-copy here to make the test grammar + // parse correctly took several hours as the debug facilities were - and are - too meager to + // quickly diagnose the problem while we hadn't. So the code got littered with debug prints + // and finally it hit me what the *F* went wrong, after which I saw I needed to add *this* rule!) 
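+                                // Net effect, as a sketch: for an action line like
+                                //
+                                //     %include some-helpers.js
+                                //
+                                // the condition stack grows by two entries,
+                                //
+                                //     [...] --> [..., 'trail', 'path']
+                                //
+                                // so the upcoming PATH token is lexed under the 'path' rules
+                                // first, and the 'trail' state takes over once the path has
+                                // been consumed (the 'path' rules pop that state again).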
+ + // first push the 'trail' condition which will be the follow-up after we're done parsing the path parameter... + this.pushState('trail'); + // then push the immediate need: the 'path' condition. + this.pushState('path'); + return 'INCLUDE'; + %} +.* this.popState(); return 'ACTION'; + +{ID} this.pushState('macro'); return 'NAME'; +{BR}+ this.popState(); + +// Accept any non-regex-special character as a direct literal without +// the need to put quotes around it: +{ANY_LITERAL_CHAR}+ %{ + // accept any non-regex, non-lex, non-string-delim, + // non-escape-starter, non-space character as-is + return 'CHARACTER_LIT'; + %} + +{BR}+ /* empty */ +\s+ /* empty */ + +\"{DOUBLEQUOTED_STRING_CONTENT}\" %{ + yytext = unescQuote(this.matches[1]); + return 'STRING_LIT'; + %} +\'{QUOTED_STRING_CONTENT}\' %{ + yytext = unescQuote(this.matches[1]); + return 'STRING_LIT'; + %} +"[" this.pushState('set'); return 'REGEX_SET_START'; +"|" return '|'; +"(?:" return 'SPECIAL_GROUP'; +"(?=" return 'SPECIAL_GROUP'; +"(?!" return 'SPECIAL_GROUP'; +"(" return '('; +")" return ')'; +"+" return '+'; +"*" return '*'; +"?" return '?'; +"^" return '^'; +"," return ','; +"<>" return '$'; +"<" this.pushState('conditions'); return '<'; +"/!" return '/!'; // treated as `(?!atom)` +"/" return '/'; // treated as `(?=atom)` +"\\"([0-7]{1,3}|[rfntvsSbBwWdD\\*+()${}|[\]\/.^?]|"c"[A-Z]|"x"[0-9A-F]{2}|"u"[a-fA-F0-9]{4}) + return 'ESCAPE_CHAR'; +"\\". yytext = yytext.replace(/^\\/g, ''); return 'ESCAPE_CHAR'; +"$" return '$'; +"." return '.'; +"%options" this.pushState('options'); return 'OPTIONS'; +"%s" this.pushState('start_condition'); return 'START_INC'; +"%x" this.pushState('start_condition'); return 'START_EXC'; +"%include" this.pushState('path'); return 'INCLUDE'; +"%"{NAME}([^\r\n]*) + %{ + /* ignore unrecognized decl */ + var l0 = Math.max(0, yylloc.last_column - yylloc.first_column); + var l2 = 19; + var l1 = Math.min(79 - 4 - l0 - l2, yylloc.first_column, 0); + this.warn('LEX: ignoring unsupported lexer option', dquote(yytext), 'while lexing in', this.topState(), 'state:\n' + indent(this.showPosition(l1, l2), 4) + // , '\n', { + // remaining_input: this._input, + // matched: this.matched, + // matches: this.matches + // } + ); + yytext = [ + this.matches[1], // {NAME} + this.matches[2].trim() // optional value/parameters + ]; + return 'UNKNOWN_DECL'; + %} +"%%" this.pushState('rules'); return '%%'; +"{"\d+(","\s*\d+|",")?"}" return 'RANGE_REGEX'; +"{"{ID}"}" return 'NAME_BRACE'; +"{"{ID}"}" return 'NAME_BRACE'; +"{" return '{'; +"}" return '}'; + + +(?:"\\\\"|"\\]"|[^\]{])+ return 'REGEX_SET'; +"{" return 'REGEX_SET'; +"]" this.popState(); + return 'REGEX_SET_END'; + + +// in the trailing CODE block, only accept these `%include` macros when +// they appear at the start of a line and make sure the rest of lexer +// regexes account for this one so it'll match that way only: +[^\r\n]*(\r|\n)+ return 'CODE'; +[^\r\n]+ return 'CODE'; // the bit of CODE just before EOF... + + +{BR} this.popState(); this.unput(yytext); +\"{DOUBLEQUOTED_STRING_CONTENT}\" + yytext = unescQuote(this.matches[1]); + this.popState(); + return 'PATH'; +\'{QUOTED_STRING_CONTENT}\' + yytext = unescQuote(this.matches[1]); + this.popState(); + return 'PATH'; +{WS}+ // skip whitespace in the line +[^\s\r\n]+ this.popState(); + return 'PATH'; + +. 
%{ + /* b0rk on bad characters */ + var l0 = Math.max(0, yylloc.last_column - yylloc.first_column); + var l2 = 39; + var l1 = Math.min(79 - 4 - l0 - l2, yylloc.first_column, 0); + var rules = (this.topState() === 'macro' ? 'macro\'s' : this.topState()); + var pos_str = this.showPosition(l1, l2); + if (pos_str && pos_str[0] !== '\n') { + pos_str = '\n\n Offending input:\n' + indent(pos_str, 4); + } + yyerror('unsupported lexer input: ' + dquote(yytext) + ' while lexing ' + rules + '\n (i.e. jison lex regexes).\n\n NOTE: When you want the input ' + dquote(yytext) + ' to be interpreted as a literal part\n of a lex rule regex, you MUST enclose it in double or single quotes,\n e.g. as shown in this error message just before. If not, then know\n that this is not accepted as a regex operator here in\n jison-lex ' + rules + '.' + pos_str); + %} + +<*>. %{ + /* b0rk on bad characters */ + var l0 = Math.max(0, yylloc.last_column - yylloc.first_column); + var l2 = 39; + var l1 = Math.min(79 - 4 - l0 - l2, yylloc.first_column, 0); + var pos_str = this.showPosition(l1, l2); + if (pos_str && pos_str[0] !== '\n') { + pos_str = '\n\n Offending input:\n' + indent(pos_str, 4); + } + yyerror('unsupported lexer input: ' + dquote(yytext) + ' while lexing in ' + dquote(this.topState()) + ' state.' + pos_str); + %} + +<*><> return 'EOF'; %% + +var helpers = require('../../modules/helpers-lib'); +var dquote = helpers.dquote; + + +function indent(s, i) { + var a = s.split('\n'); + var pf = (new Array(i + 1)).join(' '); + return pf + a.join('\n' + pf); +} + +// unescape a string value which is wrapped in quotes/doublequotes +function unescQuote(str) { + str = '' + str; + var a = str.split('\\\\'); + a = a.map(function (s) { + return s.replace(/\\'/g, "'").replace(/\\"/g, '"'); + }); + str = a.join('\\\\'); + return str; +} + + +lexer.warn = function l_warn() { + if (this.yy && this.yy.parser && typeof this.yy.parser.warn === 'function') { + return this.yy.parser.warn.apply(this, arguments); + } else { + console.warn.apply(console, arguments); + } +}; + +lexer.log = function l_log() { + if (this.yy && this.yy.parser && typeof this.yy.parser.log === 'function') { + return this.yy.parser.log.apply(this, arguments); + } else { + console.log.apply(console, arguments); + } +}; diff --git a/examples/lex_grammar.jisonlex b/examples/lex_grammar.jisonlex new file mode 100644 index 0000000..7717888 --- /dev/null +++ b/examples/lex_grammar.jisonlex @@ -0,0 +1,29 @@ + +%% +\n+ {yy.freshLine = true;} +\s+ {yy.freshLine = false;} +"y{"[^}]*"}" {yytext = yytext.substr(2, yyleng - 3); return 'ACTION';} +[a-zA-Z_][a-zA-Z0-9_-]* {return 'NAME';} +'"'([^"]|'\"')*'"' {return 'STRING_LIT';} +"'"([^']|"\'")*"'" {return 'STRING_LIT';} +"|" {return '|';} +"["("\]"|[^\]])*"]" {return 'ANY_GROUP_REGEX';} +"(" {return '(';} +")" {return ')';} +"+" {return '+';} +"*" {return '*';} +"?" {return '?';} +"^" {return '^';} +"/" {return '/';} +"\\"[a-zA-Z0] {return 'ESCAPE_CHAR';} +"$" {return '$';} +"<>" {return '$';} +"." {return '.';} +"%%" {return '%%';} +"{"\d+(","\s?\d+|",")?"}" {return 'RANGE_REGEX';} +/"{" %{if (yy.freshLine) { this.input('{'); return '{'; } else { this.unput('y'); }%} +"}" %{return '}';%} +"%{"(.|\n)*?"}%" {yytext = yytext.substr(2, yyleng - 4); return 'ACTION';} +. 
{/* ignore bad characters */}
+<<EOF>> {return 'EOF';}
 diff --git a/examples/parser-to-lexer-communication-test-w-debug.jison b/examples/parser-to-lexer-communication-test-w-debug.jison
new file mode 100644
index 0000000..132db0e
--- /dev/null
+++ b/examples/parser-to-lexer-communication-test-w-debug.jison
@@ -0,0 +1,108 @@
+
+//%debug   // cost ~ 2-4% having it in there when not used. Much higher cost when actually used.
+//%options output-debug-tables
+
+%options ranges
+
+%x alt
+
+%%
+
+'(' return '(';
+')' return ')';
+. return 'A';
+
+<alt>'(' return 'BEGIN';
+<alt>')' return 'END';
+<alt>. return 'B';
+
+
+%%
+
+
+%include 'benchmark.js'
+
+
+// rephrase for display: error info objects which have been pushed onto the vstack:
+function get_filtered_value_stack(vstack) {
+    var rv = [];
+    for (var i = 0, len = vstack.length; i < len; i++) {
+        var o = vstack[i];
+        if (o && o.errStr) {
+            o = '#ERRORINFO#: ' + o.errStr;
+        }
+        rv.push(o);
+    }
+    return rv;
+}
+
+function get_reduced_error_info_obj(hash) {
+    if (!hash || !hash.errStr) {
+        return null;
+    }
+    return {
+        text: hash.text,
+        token: hash.token,
+        token_id: hash.token_id,
+        expected: hash.expected,
+        matched: (hash.lexer && hash.lexer.matched) || '(-nada-)',
+        lexerConditionStack: (hash.lexer && hash.lexer.conditionStack) || '(???)',
+        remaining_input: (hash.lexer && hash.lexer._input) || '(-nada-)',
+        recoverable: hash.recoverable,
+        state_stack: hash.state_stack,
+        value_stack: get_filtered_value_stack(hash.value_stack)
+    };
+}
+
+parser.main = function compiledRunner(args) {
+    var inp = 'xxx(x(x)x)xxx';
+    console.log('input = ', inp);
+
+
+    // set up a custom parseError handler.
+    //
+    // Note that this one has an extra feature: it tweaks the `yytext` value to propagate
+    // the error info into the parser error rules as `$error`:
+    parser.parseError = function altParseError(msg, hash) {
+        if (hash && hash.exception) {
+            msg = hash.exception.message;
+            //console.log('ex:', hash.exception, hash.exception.stack);
+        }
+        console.log("### ERROR: " + msg, get_reduced_error_info_obj(hash));
+        if (hash && hash.lexer) {
+            hash.lexer.yytext = hash;
+        }
+    };
+
+    parser.lexer.options.post_lex = function (tok) {
+        parser.trace('lexer produces one token: ', tok, parser.describeSymbol(tok));
+    };
+
+    parser.options.debug = false;
+
+    function execute() {
+        parser.parse(inp);
+    }
+
+    if (0) {
+        execute();
+    } else {
+        // nuke the console output via trace() and output minimal progress while we run the benchmark:
+        parser.trace = function nada_trace() {};
+        // make sure to disable debug output at all, so we only get the conditional check as cost when `%debug` is enabled for this grammar
+        parser.options.debug = false;
+
+        // track number of calls for minimal/FAST status update while benchmarking...
+        var logcount = 0;
+        parser.post_parse = function (tok) {
+            logcount++;
+        };
+
+        bench(execute, 0, 10e3, null, function () {
+            console.log('run #', logcount);
+        });
+    }
+
+    return 0;
+};
 diff --git a/examples/pascal.l b/examples/pascal.l
new file mode 100644
index 0000000..30fb6f5
--- /dev/null
+++ b/examples/pascal.l
@@ -0,0 +1,151 @@
+%{
+/*
+ * scan.l
+ *
+ * lex input file for pascal scanner
+ *
+ * extensions: two ways to spell "external" and "->" ok for "^".
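+ *
+ * Reading aid (not part of the original header): the one-letter macros below,
+ * {A} .. {Z}, spell keywords case-insensitively the classic lex way, e.g.
+ * {B}{E}{G}{I}{N} matches BEGIN, begin, Begin, bEgIn, ...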
+ */ + +var line_no = 1; + +%} + +A [aA] +B [bB] +C [cC] +D [dD] +E [eE] +F [fF] +G [gG] +H [hH] +I [iI] +J [jJ] +K [kK] +L [lL] +M [mM] +N [nN] +O [oO] +P [pP] +Q [qQ] +R [rR] +S [sS] +T [tT] +U [uU] +V [vV] +W [wW] +X [xX] +Y [yY] +Z [zZ] +NQUOTE [^'] + +%% + +{A}{N}{D} return(AND); +{A}{R}{R}{A}{Y} return(ARRAY); +{C}{A}{S}{E} return(CASE); +{C}{O}{N}{S}{T} return(CONST); +{D}{I}{V} return(DIV); +{D}{O} return(DO); +{D}{O}{W}{N}{T}{O} return(DOWNTO); +{E}{L}{S}{E} return(ELSE); +{E}{N}{D} return(END); +{E}{X}{T}{E}{R}{N} return(EXTERNAL); +{E}{X}{T}{E}{R}{N}{A}{L} return(EXTERNAL); +{F}{O}{R} return(FOR); +{F}{O}{R}{W}{A}{R}{D} return(FORWARD); +{F}{U}{N}{C}{T}{I}{O}{N} return(FUNCTION); +{G}{O}{T}{O} return(GOTO); +{I}{F} return(IF); +{I}{N} return(IN); +{L}{A}{B}{E}{L} return(LABEL); +{M}{O}{D} return(MOD); +{N}{I}{L} return(NIL); +{N}{O}{T} return(NOT); +{O}{F} return(OF); +{O}{R} return(OR); +{O}{T}{H}{E}{R}{W}{I}{S}{E} return(OTHERWISE); +{P}{A}{C}{K}{E}{D} return(PACKED); +{B}{E}{G}{I}{N} return(PBEGIN); +{F}{I}{L}{E} return(PFILE); +{P}{R}{O}{C}{E}{D}{U}{R}{E} return(PROCEDURE); +{P}{R}{O}{G}{R}{A}{M} return(PROGRAM); +{R}{E}{C}{O}{R}{D} return(RECORD); +{R}{E}{P}{E}{A}{T} return(REPEAT); +{S}{E}{T} return(SET); +{T}{H}{E}{N} return(THEN); +{T}{O} return(TO); +{T}{Y}{P}{E} return(TYPE); +{U}{N}{T}{I}{L} return(UNTIL); +{V}{A}{R} return(VAR); +{W}{H}{I}{L}{E} return(WHILE); +{W}{I}{T}{H} return(WITH); +[a-zA-Z]([a-zA-Z0-9])+ return(IDENTIFIER); + +":=" return(ASSIGNMENT); +'({NQUOTE}|'')+' return(CHARACTER_STRING); +":" return(COLON); +"," return(COMMA); +[0-9]+ return(DIGSEQ); +"." return(DOT); +".." return(DOTDOT); +"=" return(EQUAL); +">=" return(GE); +">" return(GT); +"[" return(LBRAC); +"<=" return(LE); +"(" return(LPAREN); +"<" return(LT); +"-" return(MINUS); +"<>" return(NOTEQUAL); +"+" return(PLUS); +"]" return(RBRAC); +[0-9]+"."[0-9]+ return(REALNUMBER); +")" return(RPAREN); +";" return(SEMICOLON); +"/" return(SLASH); +"*" return(STAR); +"**" return(STARSTAR); +"->" return(UPARROW); +"^" return(UPARROW); + +"(*"|"{" { + var c; + while ((c = input())) + { + if (c == '}') + break; + else if (c == '*') + { + if ((c = input()) == ')') + break; + else + unput (c); + } + else if (c == '\n') + line_no++; + else if (c == 0) + commenteof(); + } + } + +[ \t\f] ; + +\n line_no++; + +. { + fprintf (stderr, "'%c' (0%o): illegal charcter at line %d\n", yytext[0], yytext[0], line_no); + } + +%% + +function commenteof() { + fprintf (stderr, "unexpected EOF inside comment at line %d\n", line_no); + exit(1); +} + +function yywrap() { + return (1); +} + + diff --git a/examples/regex.jison b/examples/regex.jison new file mode 100644 index 0000000..89d0d89 --- /dev/null +++ b/examples/regex.jison @@ -0,0 +1,22 @@ +/* +Copyright 2015 Mathew Reny + +Regular expression parser for Jison Yacc. +*/ + + +%% + + + +[a-zA-Z0-9] return 'TERMINAL' +"|" return '|' +"*" return '*' +"+" return '+' +"?" return '?' +"(" return '(' +")" return ')' +<> return 'EOF' + + + diff --git a/examples/semwhitespace_lex.jison b/examples/semwhitespace_lex.jison new file mode 100644 index 0000000..a812b8e --- /dev/null +++ b/examples/semwhitespace_lex.jison @@ -0,0 +1,55 @@ +/* Demonstrates semantic whitespace pseudo-tokens, INDENT/DEDENT. 
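+
+   A quick sketch of the idea (hypothetical input):
+
+       if (x):
+           print "hi"
+       print "bye"
+
+   lexes roughly as IF LPAREN ID RPAREN COLON INDENT PRINT STRING DEDENT
+   PRINT STRING, with the INDENT/DEDENT pseudo-tokens synthesized from the
+   leading whitespace by the rules below.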
+ */
+
+id [a-zA-Z_][a-zA-Z0-9_]*
+spc [\t \u00a0\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u3000]
+
+%s EXPR
+
+%%
+"if" return 'IF';
+"else" return 'ELSE';
+"print" return 'PRINT';
+":" return 'COLON';
+"(" this.begin('EXPR'); return 'LPAREN';
+")" this.popState(); return 'RPAREN';
+\"[^\"]*\"|\'[^\']*\' yytext = yytext.substr(1, yyleng - 2); return 'STRING';
+"+" return 'PLUS';
+"-" return 'MINUS';
+{id} return 'ID';
+\d+ return 'NATLITERAL';
+<<EOF>> return "ENDOFFILE";
+\s*<<EOF>> %{
+    // remaining DEDENTs implied by EOF, regardless of tabs/spaces
+    var tokens = [];
+
+    while (0 < _iemitstack[0]) {
+        this.popState();
+        tokens.unshift("DEDENT");
+        _iemitstack.shift();
+    }
+
+    if (tokens.length) return tokens;
+%}
+[\n\r]+{spc}*/![^\n\r] /* eat blank lines */
+[\n\r]{spc}* %{
+    var indentation = yyleng - yytext.search(/\s/) - 1;
+    if (indentation > _iemitstack[0]) {
+        _iemitstack.unshift(indentation);
+        return 'INDENT';
+    }
+
+    var tokens = [];
+
+    while (indentation < _iemitstack[0]) {
+        this.popState();
+        tokens.unshift("DEDENT");
+        _iemitstack.shift();
+    }
+    if (tokens.length) return tokens;
+%}
+{spc}+ /* ignore all other whitespace */
+
+%%
+/* initialize the pseudo-token stack with 0 indents */
+_iemitstack = [0];
+
diff --git a/examples/tikiwikiparser.jison b/examples/tikiwikiparser.jison
new file mode 100644
index 0000000..2eb8c51
--- /dev/null
+++ b/examples/tikiwikiparser.jison
@@ -0,0 +1,129 @@
+
+
+PLUGIN_ID [A-Z]+
+INLINE_PLUGIN_ID [a-z]+
+SMILE [a-z]+
+
+%s bold box center colortext italic header6 header5 header4 header3 header2 header1 link strikethrough table titlebar underscore wikilink
+
+%%
+
+"{"{INLINE_PLUGIN_ID}.*?"}"
+    %{
+        yytext = parserlib.inlinePlugin(yytext);
+        return 'INLINE_PLUGIN';
+    %}
+
+"{"{PLUGIN_ID}"(".*?")}"
+    %{
+        yy.pluginStack = parserlib.stackPlugin(yytext, yy.pluginStack);
+
+        if (parserlib.size(yy.pluginStack) == 1) {
+            return 'PLUGIN_START';
+        } else {
+            return 'CONTENT';
+        }
+    %}
+
+"{"{PLUGIN_ID}"}"
+    %{
+        if (yy.pluginStack) {
+            if (
+                parserlib.size(yy.pluginStack) > 0 &&
+                parserlib.substring(yytext, 1, -1) == yy.pluginStack[parserlib.size(yy.pluginStack) - 1].name
+            ) {
+                if (parserlib.size(yy.pluginStack) == 1) {
+                    yytext = yy.pluginStack[parserlib.size(yy.pluginStack) - 1];
+                    yy.pluginStack = parserlib.pop(yy.pluginStack);
+                    return 'PLUGIN_END';
+                } else {
+                    yy.pluginStack = parserlib.pop(yy.pluginStack);
+                    return 'CONTENT';
+                }
+            }
+        }
+        return 'CONTENT';
+    %}
+
+("~np~")
+    %{
+        yy.npStack = parserlib.push(yy.npStack, true);
+        this.yy.npOn = true;
+
+        return 'NP_START';
+    %}
+
+("~/np~")
+    %{
+        this.yy.npStack = parserlib.pop(yy.npStack);
+        if (parserlib.size(yy.npStack) < 1) yy.npOn = false;
+        return 'NP_END';
+    %}
+
+"---"
+    %{
+        yytext = parserlib.hr();
+        return 'HORIZONTAL_BAR';
+    %}
+
+"(:"{SMILE}":)"
+    %{
+        yytext = parserlib.substring(yytext, 2, -2);
+        yytext = parserlib.smile(yytext);
+        return 'SMILE';
+    %}
+
+"[[".*?
+    %{
+        yytext = parserlib.substring(yytext, 2, -1);
+        return 'CONTENT';
+    %}
+
+<bold>[_][_] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'BOLD_END'); %}
+[_][_] %{ this.begin('bold'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'BOLD_START'); %}
+<box>[\^] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'BOX_END'); %}
+[\^] %{ this.begin('box'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'BOX_START'); %}
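+
+/*
+ * Each markup flavour here is handled by a rule pair: the condition-scoped
+ * *_END rule fires only inside the start condition entered via `this.begin(...)`
+ * and pops back out, while the unscoped *_START rule (all conditions above are
+ * inclusive `%s` states) opens that condition.
+ */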
+<center>[:][:] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'CENTER_END'); %}
+[:][:] %{ this.begin('center'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'CENTER_START'); %}
+<colortext>[\~][\~] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'COLORTEXT_END'); %}
+[\~][\~][#] %{ this.begin('colortext'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'COLORTEXT_START'); %}
+<header6>[\n] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER6_END'); %}
+[\n]("!!!!!!") %{ this.begin('header6'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER6_START'); %}
+<header5>[\n] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER5_END'); %}
+[\n]("!!!!!") %{ this.begin('header5'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER5_START'); %}
+<header4>[\n] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER4_END'); %}
+[\n]("!!!!") %{ this.begin('header4'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER4_START'); %}
+<header3>[\n] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER3_END'); %}
+[\n]("!!!") %{ this.begin('header3'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER3_START'); %}
+<header2>[\n] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER2_END'); %}
+[\n]("!!") %{ this.begin('header2'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER2_START'); %}
+<header1>[\n] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER1_END'); %}
+[\n]("!") %{ this.begin('header1'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'HEADER1_START'); %}
+<italic>[']['] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'ITALIC_END'); %}
+[']['] %{ this.begin('italic'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'ITALIC_START'); %}
+<link>("]") %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'LINK_END'); %}
+("[") %{ this.begin('link'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'LINK_START'); %}
+<strikethrough>[-][-] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'STRIKETHROUGH_END'); %}
+[-][-] %{ this.begin('strikethrough'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'STRIKETHROUGH_START'); %}
+<table>[|][|] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'TABLE_END'); %}
+[|][|] %{ this.begin('table'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'TABLE_START'); %}
+<titlebar>[=][-] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'TITLEBAR_END'); %}
+[-][=] %{ this.begin('titlebar'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'TITLEBAR_START'); %}
+<underscore>[=][=][=] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'UNDERSCORE_END'); %}
+[=][=][=] %{ this.begin('underscore'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'UNDERSCORE_START'); %}
+<wikilink>[)][)] %{ this.popState(); return parserlib.npState(this.yy.npOn, 'CONTENT', 'WIKILINK_END'); %}
+[(][(] %{ this.begin('wikilink'); return parserlib.npState(this.yy.npOn, 'CONTENT', 'WIKILINK_START'); %}
+
+"<"(.|\n)*?">" return 'HTML'
+(.) return 'CONTENT'
+(\n)
+    %{
+        if (parserlib.npState(this.yy.npOn, false, true) == true) {
+            yytext = parserlib.formatContent(yytext);
+        }
+
+        return 'CONTENT';
+    %}
+
+<<EOF>> return 'EOF'
+
diff --git a/examples/unicode.jison b/examples/unicode.jison
new file mode 100644
index 0000000..ef9aa65
--- /dev/null
+++ b/examples/unicode.jison
@@ -0,0 +1,965 @@
+/*
+ * Which advanced JISON features are showcased in this grammar?
+ * ============================================================ + * + * - lexer macro expansion inside regex sets, e.g. `[{NAME}...]` + * + * - `%options xregexp`, i.e. allowing the use of XRegExp escapes, e.g. to identify Unicode + * 'letter' and/or 'digit' ranges. See also http://xregexp.com/syntax/#unicode and + http://xregexp.com/plugins/#unicode + * + * The sample grammar itself is a toy language and only there to show the lexer features + * at work. + */ + + + + + + + +%options ranges +%options backtrack_lexer +%options xregexp + + + + + + + + +ASCII_LETTER [a-zA-z] +// \p{Alphabetic} already includes [a-zA-z], hence we don't need to merge with {ASCII_LETTER}: +UNICODE_LETTER_RANGE [\p{Alphabetic}] + +IDENTIFIER_START [{UNICODE_LETTER_RANGE}_] +IDENTIFIER_LAST [{IDENTIFIER_START}\p{Number}_] +IDENTIFIER_MIDDLE [{IDENTIFIER_LAST}.] + +WHITESPACE [\s\r\n\p{Separator}] + +NON_OPERATOR_CHAR [{WHITESPACE}{IDENTIFIER_LAST}] + + + + + +/* + https://github.com/mishoo/UglifyJS2/blob/master/lib/parse.js#L121 +*/ +ID [{IDENTIFIER_START}][{IDENTIFIER_LAST}]* +DOTTED_ID [{IDENTIFIER_START}](?:[{IDENTIFIER_MIDDLE}]*[{IDENTIFIER_LAST}])? +WORD [{IDENTIFIER_LAST}]+ +WORDS [{IDENTIFIER_LAST}](?:[\s{IDENTIFIER_LAST}]*[{IDENTIFIER_LAST}])? +DOTTED_WORDS [{IDENTIFIER_LAST}](?:[\s{IDENTIFIER_MIDDLE}]*[{IDENTIFIER_LAST}])? + +OPERATOR [^{NON_OPERATOR_CHAR}]{1,3} + +// Match simple floating point values, for example `1.0`, but also `9.`, `.05` or just `7`: +BASIC_FLOATING_POINT_NUMBER (?:[0-9]+(?:"."[0-9]*)?|"."[0-9]+) + + + +%% + +// 1.0e7 +[0-9]+\.[0-9]*(?:[eE][-+]*[0-9]+)?\b + %{ + yytext = parseFloat(yytext); + return 'NUM'; + %} + +// .5e7 +[0-9]*\.[0-9]+(?:[eE][-+]*[0-9]+)?\b + %{ + yytext = parseFloat(yytext); + return 'NUM'; + %} + +// 5 / 3e4 +[0-9]+(?:[eE][-+]*[0-9]+)?\b + %{ + yytext = parseFloat(yytext); + return 'NUM'; + %} + +[a-zA-Z_]+[a-zA-Z_0-9]*\b + %{ + if (is_constant(yytext)) { + return 'CONSTANT'; + } + if (is_function(yytext)) { + return 'FUNCTION'; + } + return 'VAR'; + %} + + + +{OPERATOR} + %{ + /* + * Check if the matched string STARTS WITH an operator in the list below. + * + * On the first pass, a hash table is created (and cached) to speed up matching. + */ + if (!this.__operator_hash_table) { + var definition_table = [ + { + name: "$", + lexer_opcode: FKA_FIXED_ROW_OR_COLUMN_MARKER, + produce: function () { + return '$'; + } + }, + { + name: ":", + lexer_opcode: FKA_RANGE_MARKER, + produce: function () { + return ':'; + } + }, + { + name: "...", /* .. and ... equal : */ + lexer_opcode: FKA_RANGE_MARKER, + produce: function () { + return ':'; + } + }, + { + name: "..", /* .. and ... equal : */ + lexer_opcode: FKA_RANGE_MARKER, + produce: function () { + return ':'; + } + }, + { + name: ",", + lexer_opcode: FKA_COMMA, + produce: function () { + return ','; + } + }, + { + name: "/*", + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["*/"]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! 
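+                            // (NB: `this.reject()` is only legal here because this grammar sets
+                            // `%options backtrack_lexer` above; without lexer backtracking enabled,
+                            // jison-lex treats a reject() call as a hard error.)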
+ this.reject(); + } + }, + { + name: "(*", + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["*)"]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "{*", + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["*}"]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "#", + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["#"]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "\u203c", /* ‼ */ + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["!!", "\u203c" /* ‼ */]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "\u2590", /* ▐ */ + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["\u258c" /* ▌ */, "\u2590" /* ▐ */]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! 
+ this.reject(); + } + }, + { + name: "&&", + opcode: FKW_BOOLEAN_AND_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'BOOLEAN_AND_OPERATOR'; + } + }, + { + name: "||", + opcode: FKW_BOOLEAN_OR_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'BOOLEAN_OR_OPERATOR'; + } + }, + { + name: "&", + opcode: FKW_STRING_CONCATENATION_OPERATOR | FT_STRING | FU_STRING, + produce: function () { + return 'STRING_CONCATENATION_OPERATOR'; + } + }, + { + name: "<=", // Unicode alternatives: \u22dc + opcode: FKW_LESS_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'LESS_OR_EQUAL'; + } + }, + { + name: ">=", // Unicode alternatives: \u22dd + opcode: FKW_GREATER_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'GREATER_OR_EQUAL'; + } + }, + { + name: "\u2264", + opcode: FKW_LESS_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'LESS_OR_EQUAL'; /* ≤ */ + } + }, + { + name: "\u2266", + opcode: FKW_LESS_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'LESS_OR_EQUAL'; /* ≦ */ + } + }, + { + name: "\u2265", + opcode: FKW_GREATER_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'GREATER_OR_EQUAL'; /* ≥ */ + } + }, + { + name: "\u2267", + opcode: FKW_GREATER_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'GREATER_OR_EQUAL'; /* ≧ */ + } + }, + { + name: "<>", // Unicode alternatives: \u2276, \u2277 + opcode: FKW_NOT_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'NOT_EQUAL'; + } + }, + { + name: "!=", // Unicode alternatives: \u2260 + opcode: FKW_NOT_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'NOT_EQUAL'; + } + }, + { + name: "!==", + opcode: FKW_NOT_IDENTICAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'NOT_IDENTICAL'; + } + }, + { + name: "<", + opcode: FKW_LESS_THAN | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '<'; + } + }, + { + name: ">", + opcode: FKW_GREATER_THAN | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '>'; + } + }, + { + name: "===", + opcode: FKW_IS_IDENTICAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'IS_IDENTICAL'; + } + }, + { + name: "==", + opcode: FKW_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'IS_EQUAL'; + } + }, + { + name: "=", + opcode: FKW_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + // This MAY be the `=` starting a formula: mark the event for the inline comments: + if (this.options.inline_comment_mode > 0) { + if (!this.inline_comments_monitor) { + this.inline_comments_monitor = this.options.inline_comment_mode + 1; + } + } + return '='; + } + }, + { + name: "**", + opcode: FKW_POWER | FT_NUMBER | FU_ANY, + produce: function () { + return '^'; + } + }, + { + name: "*", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; + } + }, + { + name: "/", + opcode: FKW_DIVIDE | FT_NUMBER | FU_DERIVED, + produce: function () { + return '/'; + } + }, + { + name: "-", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; + } + }, + { + name: "+", + opcode: FKW_ADD | FT_NUMBER | FU_DERIVED, + produce: function () { + return '+'; + } + }, + { + name: "^", + opcode: FKW_POWER | FT_NUMBER | FU_ANY, + produce: function () { + return '^'; + } + }, + { + name: "%", + opcode: FKW_MODULO_OPERATOR, + produce: function () { + return 'MODULO_OPERATOR'; + } + }, + { + name: "\u2030", + opcode: FKW_PROMILAGE_OPERATOR, + produce: 
function () { + return 'PROMILAGE_OPERATOR'; /* ‰ */ + } + }, + { + name: "\u221a", + opcode: FKW_SQRT_OPERATOR | FT_NUMBER | FU_ANY, + produce: function () { + return 'SQRT_OPERATOR'; /* √ */ + } + }, + { + name: "\u2248", + opcode: FKW_ALMOST_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'ALMOST_EQUAL'; /* ≈ */ + } + }, + { + name: "\u2260", + opcode: FKW_NOT_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'NOT_EQUAL'; /* ≠ */ + } + }, + { + name: "\u2264", + opcode: FKW_LESS_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'LESS_OR_EQUAL'; /* ≤ */ + } + }, + { + name: "\u2265", + opcode: FKW_GREATER_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'GREATER_OR_EQUAL'; /* ≥ */ + } + }, + { + name: "\u2212", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; /* − */ + } + }, + { + name: "\u2013", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; /* – */ + } + }, + { + name: "\u2012", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; /* ‒ */ + } + }, + { + name: "\u2014", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; /* — */ + } + }, + { + name: "\u2215", + opcode: FKW_DIVIDE | FT_NUMBER | FU_DERIVED, + produce: function () { + return '/'; /* ∕ */ + } + }, + { + name: "\u2044", + opcode: FKW_DIVIDE | FT_NUMBER | FU_DERIVED, + produce: function () { + return '/'; /* ⁄ */ + } + }, + { + name: "\u2219", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* ∙ */ + } + }, + { + name: "\u2022", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* • */ + } + }, + { + name: "\u2261", + opcode: FKW_IS_IDENTICAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'IS_IDENTICAL'; /* ≡ */ + } + }, + { + name: "\u2310", + opcode: FKW_BOOLEAN_NOT_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '!'; /* ⌐ */ + } + }, + { + name: "\u00ac", + opcode: FKW_BOOLEAN_NOT_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '!'; /* ¬ */ + } + }, + { + name: "!", + opcode: FKW_BOOLEAN_NOT_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '!'; + } + }, + { + name: "\u2229", + opcode: FKW_BOOLEAN_AND_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'BOOLEAN_AND_OPERATOR'; /* ∩ */ + } + }, + { + name: "\u00f7", + opcode: FKW_DIVIDE | FT_NUMBER | FU_DERIVED, + produce: function () { + return '/'; /* ÷ */ + } + }, + { + name: "\u00d7", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* × */ + } + }, + { + name: "\u00b7", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* · */ + } + }, + { + name: "\u2219", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* ∙ */ + } + }, + { + name: "\u00b0", + opcode: FKW_DEGREES_OPERATOR, + produce: function () { + return 'DEGREES_OPERATOR'; /* ° */ + } + }, + { + name: "\u00b2", + opcode: FKW_SQUARE_OPERATOR | FT_NUMBER | FU_DERIVED, + produce: function () { + return 'SQUARE_OPERATOR'; /* ² */ + } + }, + { + name: "\u00b3", + opcode: FKW_CUBE_OPERATOR | FT_NUMBER | FU_DERIVED, + produce: function () { + return 'CUBE_OPERATOR'; /* ³ */ + } + }, + { + /* + * This token is an alternative notation which does not require the curly braces around + * a 'fragmented range 
reference', e.g. `{A1, A2, A3, B1}` is equivalent to `A1 ○ A2 ○ A3 ○ B1` + * which could also be written as `A1:A3 ○ B1` + */ + name: "\u25cb", + opcode: FKW_ARRAY_CONCATENATION_OPERATOR, + produce: function () { + return 'ARRAY_CONCATENATION_OPERATOR'; /* ○ */ + } + }, + { + /* + * This token is an alternative notation which does not require the curly braces around + * a 'fragmented range reference', e.g. `{A1, A2, A3, B1}` is equivalent to `A1 ◦ A2 ◦ A3 ◦ B1` + * which could also be written as `A1:A3 ◦ B1` + */ + name: "\u25e6", + opcode: FKW_ARRAY_CONCATENATION_OPERATOR, + produce: function () { + return 'ARRAY_CONCATENATION_OPERATOR'; /* ◦ */ + } + }, + { + name: "@", + opcode: FKW_DATA_MARKER, + produce: function () { + return '@'; + } + }, + { + name: ".", + opcode: FKW_DOT, + produce: function () { + // switch lexer modes RIGHT NOW: next up is the `json_filter_expression` rule! + assert(this.topState() !== 'JSON_FILTERING'); + //this.pushState('JSON_FILTERING'); -- Fixed #880 + + return '.'; + } + } + ]; + var k, d, tlen, ht; + + ht = [{}, {}, {}, {}]; + for (var k = 0, tlen = definition_table.length; k < tlen; k++) { + d = definition_table[k]; + assert(d.name); + ht[d.name.length][d.name] = d; + } + + this.__operator_hash_table = ht; + } + + var s1 = false, s2 = false, s3 = false; + + s = yytext; + switch (s.length) { + case 3: + s3 = s; + s = s.substr(0, 2); + // fall through + case 2: + s2 = s; + s = s.substr(0, 1); + // fall through + case 1: + s1 = s; + break; + default: + assert(0, "should never get here"); + break; + } + + // reset `s`: + s = yytext; + + // now find matches in the operator lookup table, largest match first: + rv = this.__operator_hash_table[3][s3] || this.__operator_hash_table[2][s2] || this.__operator_hash_table[1][s1]; + if (rv) { + // push the remainder back into the buffer before we continue: + if (s.length > rv.name.length) { + this.unput(s.substr(rv.name.length)); + } + + if (rv.opcode) { + yytext = (new Visyond.FormulaParser.ASTopcode(rv.opcode)) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(rv.name); + } else if (rv.lexer_opcode) { + yytext = (new Visyond.FormulaParser.lexerToken(rv.lexer_opcode)) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(rv.name); + } + return rv.produce.call(this, yylloc, yytext); + } + + /* This may be a single Unicode character representing some constant or currency */ + if (s.length > 1) { + this.unput(s.substr(1)); + } + s = s1; + + rv = parser.getSymbol4Currency(s); + if (rv) { + yytext = (new Visyond.FormulaParser.ASTcurrency.ASTcurrency(rv)) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(s); + return 'CURRENCY'; + } + + // no dice, now see if this is a predefined constant + rv = parser.getSymbol4DefinedConstant(s); + if (rv) { + yytext = (new Visyond.FormulaParser.ASTvalue(rv.value, rv.attributes)) + .setPredefinedConstantInfo(rv) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(s); + switch (yytext.getValueType()) { + default: + return 'CONSTANT'; + + case FT_BOOLEAN: + if (rv.value) + return 'TRUE'; + else + return 'FALSE'; + + case FT_STRING: + return 'STRING'; + } + } + + // when we don't have a match at all, we leave it to the other rules to hit something: + this.reject(); + %} + + + + + + +/* + * String Handling + * --------------- + */ + + +"\u2039"([^\u203a]*)"\u203a" + %{ /* ‹string› */ + s = this.matches[1]; + yytext = (new 
Visyond.FormulaParser.ASTvalue(s, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_2039) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + +"\u201c"([^\u201d]*)"\u201d" + %{ /* “string” */ + s = this.matches[1]; + yytext = (new Visyond.FormulaParser.ASTvalue(s, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_201C) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + +"\u00ab"([^\u00bb]*)"\u00bb" + %{ /* «string» */ + s = this.matches[1]; + yytext = (new Visyond.FormulaParser.ASTvalue(s, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_00AB) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + + + +"'"([^']*(?:"''"[^']*)*)"'"(?={DUALIC_OPERATOR_MUST_FOLLOW}) + %{ + // this.unput(this.matches[2]); + + s = this.matches[1]; + s2 = parser.dedupQuotedString(s, "'"); + yytext = (new Visyond.FormulaParser.ASTvalue(s2, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_SINGLEQUOTE) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + +'"'([^"]*(?:'""'[^"]*)*)'"'(?={DUALIC_OPERATOR_MUST_FOLLOW}) + %{ + // this.unput(this.matches[2]); + + s = this.matches[1]; + s2 = parser.dedupQuotedString(s, '"'); + yytext = (new Visyond.FormulaParser.ASTvalue(s2, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_DOUBLEQUOTE) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + + + + + +/* + * Comment parsing + * --------------- + */ + + +[^\/\*\)\}#!\u203c\u258c\u2590]+ + %{ /* * / ) | # ! ‼ ▌ ▐ */ + /* keep it all; we haven't hit an end-of-comment marker starting character yet! */ + this.more(); + %} + +. + %{ + for (rv = 0, len = this.inline_comment_end_markers.length; rv < len; rv++) { + s2 = this.inline_comment_end_markers[rv]; + if (s2[0] === this.matches[0]) { + // we got a POTENTIAL MATCH; let's see if we need more: + if (s2.length > 1) { + // when yes, test the next rule! + this.reject(); + return false; + } else { + /* + * Full match! end of comment reached. + * + * Remove this last bit from the parsed text and strip leading / trailing whitespace. + * + * > ### Notes + * > + * > Since returning actual tokens for any inline comments would + * > break the LALR(1) grammar most severely, we concatenate + * > comments and attach them to the next token. + * > + * > Since the 'next token' MAY be `EOF`, we need the parser + * > to check if there's any leech called `comment` hanging + * > off that EOF it might've got dropped in the in-box... + */ + parser.pushComment(); + this.popState(); + return false; + } + } + } + // collect input until we hit something we know: + this.more(); + %} + +.. + %{ + /* + * We only hit this rule when the previous one was `reject()`-ed + * as that rule will match anything that's the start of this one. + * + * Hence we know we have a partial match on a comment terminator, + * but we need to make sure. + * + * We also know that our longest 'end markers' are 2 characters wide, + * so this solution is sufficient and complete. 
+ * + * Now all we have to do is scan the longer-than-1-character + * comment markers against what we've got here and if there's + * NO MATCH, we need to keep in mind that nasty people can write + * comments like `{***}` and we have a hit on `**}` so we may only + * consume one character here in that case. + */ + for (rv = 0, len = this.inline_comment_end_markers.length; rv < len; rv++) { + s2 = this.inline_comment_end_markers[rv]; + if (s2 === this.matches[0]) { + /* + * Full match! end of comment reached. + * + * Remove this last bit from the parsed text and strip leading/trailing whitespace. + * + * Since returning actual tokens for any inline comments would + * break the LALR(1) grammar most severely, we concatenate + * comments and attach them to the next token. + * + * Since the 'next token' MAY be `EOF`, we need the parser + * to check if there's any leech called `comment` hanging + * of that EOF it might've got dropped in the in-box... + */ + parser.pushComment(); + this.popState(); + return false; + } + } + // we may only consume a single character, so we `unput()` the last one: + this.less(1); + + // collect input until we hit something we know: + this.more(); + %} + +<> + %{ + // Check if this is a comment type which does not have to be 'terminated': + for (rv = 0, len = this.inline_comment_end_markers.length; rv < len; rv++) { + s2 = this.inline_comment_end_markers[rv]; + if (s2 === "") { + /* + * Full match! end of comment reached. + * + * Remove this last bit from the parsed text and strip leading / trailing whitespace. + * + * > ### Notes + * > + * > Since returning actual tokens for any inline comments would + * > break the LALR(1) grammar most severely, we concatenate + * > comments and attach them to the next token. + * > + * > Since the 'next token' MAY be `EOF`, we need the parser + * > to check if there's any leech called `comment` hanging + * > off that EOF it might've got dropped in the in-box... + */ + parser.pushComment(); + this.popState(); + return false; + } + } + + // Otherwise, flag this as an unterminated and thus illegal comment chunk. + parser.pushComment(); + + yytext = (new Visyond.FormulaParser.ASTerror(FERR_UNTERMINATED_INLINE_COMMENT, "Unterminated inline comment.")) + .setErrorArguments(this.inline_comment_end_markers) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(yytext); + return 'error'; + %} + + + + + +'=' return '='; +'-' return '-'; +'+' return '+'; +'*' return '*'; +'/' return '/'; +'^' return 'POWER'; /* Exponentiation */ +'(' return '('; +')' return ')'; +',' return ','; +'!' return '!'; +'%' return '%'; + + +[\r\n]+ return 'NL'; + +[^\S\r\n]+ // ignore whitespace + +\/\/.* // skip comments +\/\*.*?\*\/ // skip comments + +<> return 'EOF'; +. return 'INVALID'; + + + + +/* End of grammar */ + + +%% + diff --git a/examples/unicode2.jison b/examples/unicode2.jison new file mode 100644 index 0000000..d011bf2 --- /dev/null +++ b/examples/unicode2.jison @@ -0,0 +1,967 @@ +/* + * Which advanced JISON features are showcased in this grammar? + * ============================================================ + * + * - lexer macro expansion inside regex sets, e.g. `[{NAME}...]` + * + * - `%options xregexp`, i.e. allowing the use of XRegExp escapes, e.g. to identify Unicode + * 'letter' and/or 'digit' ranges. See also http://xregexp.com/syntax/#unicode and + http://xregexp.com/plugins/#unicode + * + * The sample grammar itself is a toy language and only there to show the lexer features + * at work. 
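+ *
+ * (Apart from a little extra whitespace, this grammar is a copy of
+ * examples/unicode.jison above; it carries the same operator table and
+ * inline-comment lexing machinery.)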
+ */ + + + + + + + + + +%options ranges +%options backtrack_lexer +%options xregexp + + + + + + + + +ASCII_LETTER [a-zA-z] +// \p{Alphabetic} already includes [a-zA-z], hence we don't need to merge with {ASCII_LETTER}: +UNICODE_LETTER_RANGE [\p{Alphabetic}] + +IDENTIFIER_START [{UNICODE_LETTER_RANGE}_] +IDENTIFIER_LAST [{IDENTIFIER_START}\p{Number}_] +IDENTIFIER_MIDDLE [{IDENTIFIER_LAST}.] + +WHITESPACE [\s\r\n\p{Separator}] + +NON_OPERATOR_CHAR [{WHITESPACE}{IDENTIFIER_LAST}] + + + + + +/* + https://github.com/mishoo/UglifyJS2/blob/master/lib/parse.js#L121 +*/ +ID [{IDENTIFIER_START}][{IDENTIFIER_LAST}]* +DOTTED_ID [{IDENTIFIER_START}](?:[{IDENTIFIER_MIDDLE}]*[{IDENTIFIER_LAST}])? +WORD [{IDENTIFIER_LAST}]+ +WORDS [{IDENTIFIER_LAST}](?:[\s{IDENTIFIER_LAST}]*[{IDENTIFIER_LAST}])? +DOTTED_WORDS [{IDENTIFIER_LAST}](?:[\s{IDENTIFIER_MIDDLE}]*[{IDENTIFIER_LAST}])? + +OPERATOR [^{NON_OPERATOR_CHAR}]{1,3} + +// Match simple floating point values, for example `1.0`, but also `9.`, `.05` or just `7`: +BASIC_FLOATING_POINT_NUMBER (?:[0-9]+(?:"."[0-9]*)?|"."[0-9]+) + + + +%% + +// 1.0e7 +[0-9]+\.[0-9]*(?:[eE][-+]*[0-9]+)?\b + %{ + yytext = parseFloat(yytext); + return 'NUM'; + %} + +// .5e7 +[0-9]*\.[0-9]+(?:[eE][-+]*[0-9]+)?\b + %{ + yytext = parseFloat(yytext); + return 'NUM'; + %} + +// 5 / 3e4 +[0-9]+(?:[eE][-+]*[0-9]+)?\b + %{ + yytext = parseFloat(yytext); + return 'NUM'; + %} + +[a-zA-Z_]+[a-zA-Z_0-9]*\b + %{ + if (is_constant(yytext)) { + return 'CONSTANT'; + } + if (is_function(yytext)) { + return 'FUNCTION'; + } + return 'VAR'; + %} + + + +{OPERATOR} + %{ + /* + * Check if the matched string STARTS WITH an operator in the list below. + * + * On the first pass, a hash table is created (and cached) to speed up matching. + */ + if (!this.__operator_hash_table) { + var definition_table = [ + { + name: "$", + lexer_opcode: FKA_FIXED_ROW_OR_COLUMN_MARKER, + produce: function () { + return '$'; + } + }, + { + name: ":", + lexer_opcode: FKA_RANGE_MARKER, + produce: function () { + return ':'; + } + }, + { + name: "...", /* .. and ... equal : */ + lexer_opcode: FKA_RANGE_MARKER, + produce: function () { + return ':'; + } + }, + { + name: "..", /* .. and ... equal : */ + lexer_opcode: FKA_RANGE_MARKER, + produce: function () { + return ':'; + } + }, + { + name: ",", + lexer_opcode: FKA_COMMA, + produce: function () { + return ','; + } + }, + { + name: "/*", + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["*/"]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "(*", + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["*)"]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! 
+ this.reject(); + } + }, + { + name: "{*", + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["*}"]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "#", + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["#"]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "\u203c", /* ‼ */ + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["!!", "\u203c" /* ‼ */]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "\u2590", /* ▐ */ + produce: function (loc) { + // set the end-of-comment marker for this comment and switch to parsing the comment + if (this.options.inline_comment_mode < this.inline_comments_monitor) { + this.inline_comment_end_markers = ["\u258c" /* ▌ */, "\u2590" /* ▐ */]; + this.inline_comment_start_yylloc = parser.deepCopy(loc); + this.pushState('INLINE_COMMENT'); + return false; + } + // no dice, try another! + this.reject(); + } + }, + { + name: "&&", + opcode: FKW_BOOLEAN_AND_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'BOOLEAN_AND_OPERATOR'; + } + }, + { + name: "||", + opcode: FKW_BOOLEAN_OR_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'BOOLEAN_OR_OPERATOR'; + } + }, + { + name: "&", + opcode: FKW_STRING_CONCATENATION_OPERATOR | FT_STRING | FU_STRING, + produce: function () { + return 'STRING_CONCATENATION_OPERATOR'; + } + }, + { + name: "<=", // Unicode alternatives: \u22dc + opcode: FKW_LESS_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'LESS_OR_EQUAL'; + } + }, + { + name: ">=", // Unicode alternatives: \u22dd + opcode: FKW_GREATER_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'GREATER_OR_EQUAL'; + } + }, + { + name: "\u2264", + opcode: FKW_LESS_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'LESS_OR_EQUAL'; /* ≤ */ + } + }, + { + name: "\u2266", + opcode: FKW_LESS_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'LESS_OR_EQUAL'; /* ≦ */ + } + }, + { + name: "\u2265", + opcode: FKW_GREATER_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'GREATER_OR_EQUAL'; /* ≥ */ + } + }, + { + name: "\u2267", + opcode: FKW_GREATER_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'GREATER_OR_EQUAL'; /* ≧ */ + } + }, + { + name: "<>", // Unicode alternatives: \u2276, \u2277 + opcode: FKW_NOT_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'NOT_EQUAL'; + } + }, + { + name: "!=", // Unicode alternatives: \u2260 + opcode: FKW_NOT_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'NOT_EQUAL'; + } + }, + { + name: "!==", + opcode: FKW_NOT_IDENTICAL | FT_BOOLEAN | FU_DERIVED, + produce: 
function () { + return 'NOT_IDENTICAL'; + } + }, + { + name: "<", + opcode: FKW_LESS_THAN | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '<'; + } + }, + { + name: ">", + opcode: FKW_GREATER_THAN | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '>'; + } + }, + { + name: "===", + opcode: FKW_IS_IDENTICAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'IS_IDENTICAL'; + } + }, + { + name: "==", + opcode: FKW_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'IS_EQUAL'; + } + }, + { + name: "=", + opcode: FKW_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + // This MAY be the `=` starting a formula: mark the event for the inline comments: + if (this.options.inline_comment_mode > 0) { + if (!this.inline_comments_monitor) { + this.inline_comments_monitor = this.options.inline_comment_mode + 1; + } + } + return '='; + } + }, + { + name: "**", + opcode: FKW_POWER | FT_NUMBER | FU_ANY, + produce: function () { + return '^'; + } + }, + { + name: "*", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; + } + }, + { + name: "/", + opcode: FKW_DIVIDE | FT_NUMBER | FU_DERIVED, + produce: function () { + return '/'; + } + }, + { + name: "-", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; + } + }, + { + name: "+", + opcode: FKW_ADD | FT_NUMBER | FU_DERIVED, + produce: function () { + return '+'; + } + }, + { + name: "^", + opcode: FKW_POWER | FT_NUMBER | FU_ANY, + produce: function () { + return '^'; + } + }, + { + name: "%", + opcode: FKW_MODULO_OPERATOR, + produce: function () { + return 'MODULO_OPERATOR'; + } + }, + { + name: "\u2030", + opcode: FKW_PROMILAGE_OPERATOR, + produce: function () { + return 'PROMILAGE_OPERATOR'; /* ‰ */ + } + }, + { + name: "\u221a", + opcode: FKW_SQRT_OPERATOR | FT_NUMBER | FU_ANY, + produce: function () { + return 'SQRT_OPERATOR'; /* √ */ + } + }, + { + name: "\u2248", + opcode: FKW_ALMOST_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'ALMOST_EQUAL'; /* ≈ */ + } + }, + { + name: "\u2260", + opcode: FKW_NOT_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'NOT_EQUAL'; /* ≠ */ + } + }, + { + name: "\u2264", + opcode: FKW_LESS_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'LESS_OR_EQUAL'; /* ≤ */ + } + }, + { + name: "\u2265", + opcode: FKW_GREATER_OR_EQUAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'GREATER_OR_EQUAL'; /* ≥ */ + } + }, + { + name: "\u2212", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; /* − */ + } + }, + { + name: "\u2013", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; /* – */ + } + }, + { + name: "\u2012", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; /* ‒ */ + } + }, + { + name: "\u2014", + opcode: FKW_SUBTRACT | FT_NUMBER | FU_DERIVED, + produce: function () { + return '-'; /* — */ + } + }, + { + name: "\u2215", + opcode: FKW_DIVIDE | FT_NUMBER | FU_DERIVED, + produce: function () { + return '/'; /* ∕ */ + } + }, + { + name: "\u2044", + opcode: FKW_DIVIDE | FT_NUMBER | FU_DERIVED, + produce: function () { + return '/'; /* ⁄ */ + } + }, + { + name: "\u2219", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* ∙ */ + } + }, + { + name: "\u2022", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* • */ + } + }, + { + name: "\u2261", + 
opcode: FKW_IS_IDENTICAL | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'IS_IDENTICAL'; /* ≡ */ + } + }, + { + name: "\u2310", + opcode: FKW_BOOLEAN_NOT_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '!'; /* ⌐ */ + } + }, + { + name: "\u00ac", + opcode: FKW_BOOLEAN_NOT_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '!'; /* ¬ */ + } + }, + { + name: "!", + opcode: FKW_BOOLEAN_NOT_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return '!'; + } + }, + { + name: "\u2229", + opcode: FKW_BOOLEAN_AND_OPERATOR | FT_BOOLEAN | FU_DERIVED, + produce: function () { + return 'BOOLEAN_AND_OPERATOR'; /* ∩ */ + } + }, + { + name: "\u00f7", + opcode: FKW_DIVIDE | FT_NUMBER | FU_DERIVED, + produce: function () { + return '/'; /* ÷ */ + } + }, + { + name: "\u00d7", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* × */ + } + }, + { + name: "\u00b7", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* · */ + } + }, + { + name: "\u2219", + opcode: FKW_MULTIPLY | FT_NUMBER | FU_DERIVED, + produce: function () { + return '*'; /* ∙ */ + } + }, + { + name: "\u00b0", + opcode: FKW_DEGREES_OPERATOR, + produce: function () { + return 'DEGREES_OPERATOR'; /* ° */ + } + }, + { + name: "\u00b2", + opcode: FKW_SQUARE_OPERATOR | FT_NUMBER | FU_DERIVED, + produce: function () { + return 'SQUARE_OPERATOR'; /* ² */ + } + }, + { + name: "\u00b3", + opcode: FKW_CUBE_OPERATOR | FT_NUMBER | FU_DERIVED, + produce: function () { + return 'CUBE_OPERATOR'; /* ³ */ + } + }, + { + /* + * This token is an alternative notation which does not require the curly braces around + * a 'fragmented range reference', e.g. `{A1, A2, A3, B1}` is equivalent to `A1 ○ A2 ○ A3 ○ B1` + * which could also be written as `A1:A3 ○ B1` + */ + name: "\u25cb", + opcode: FKW_ARRAY_CONCATENATION_OPERATOR, + produce: function () { + return 'ARRAY_CONCATENATION_OPERATOR'; /* ○ */ + } + }, + { + /* + * This token is an alternative notation which does not require the curly braces around + * a 'fragmented range reference', e.g. `{A1, A2, A3, B1}` is equivalent to `A1 ◦ A2 ◦ A3 ◦ B1` + * which could also be written as `A1:A3 ◦ B1` + */ + name: "\u25e6", + opcode: FKW_ARRAY_CONCATENATION_OPERATOR, + produce: function () { + return 'ARRAY_CONCATENATION_OPERATOR'; /* ◦ */ + } + }, + { + name: "@", + opcode: FKW_DATA_MARKER, + produce: function () { + return '@'; + } + }, + { + name: ".", + opcode: FKW_DOT, + produce: function () { + // switch lexer modes RIGHT NOW: next up is the `json_filter_expression` rule! 
+ assert(this.topState() !== 'JSON_FILTERING'); + //this.pushState('JSON_FILTERING'); -- Fixed #880 + + return '.'; + } + } + ]; + var k, d, tlen, ht; + + ht = [{}, {}, {}, {}]; + for (var k = 0, tlen = definition_table.length; k < tlen; k++) { + d = definition_table[k]; + assert(d.name); + ht[d.name.length][d.name] = d; + } + + this.__operator_hash_table = ht; + } + + var s1 = false, s2 = false, s3 = false; + + s = yytext; + switch (s.length) { + case 3: + s3 = s; + s = s.substr(0, 2); + // fall through + case 2: + s2 = s; + s = s.substr(0, 1); + // fall through + case 1: + s1 = s; + break; + default: + assert(0, "should never get here"); + break; + } + + // reset `s`: + s = yytext; + + // now find matches in the operator lookup table, largest match first: + rv = this.__operator_hash_table[3][s3] || this.__operator_hash_table[2][s2] || this.__operator_hash_table[1][s1]; + if (rv) { + // push the remainder back into the buffer before we continue: + if (s.length > rv.name.length) { + this.unput(s.substr(rv.name.length)); + } + + if (rv.opcode) { + yytext = (new Visyond.FormulaParser.ASTopcode(rv.opcode)) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(rv.name); + } else if (rv.lexer_opcode) { + yytext = (new Visyond.FormulaParser.lexerToken(rv.lexer_opcode)) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(rv.name); + } + return rv.produce.call(this, yylloc, yytext); + } + + /* This may be a single Unicode character representing some constant or currency */ + if (s.length > 1) { + this.unput(s.substr(1)); + } + s = s1; + + rv = parser.getSymbol4Currency(s); + if (rv) { + yytext = (new Visyond.FormulaParser.ASTcurrency.ASTcurrency(rv)) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(s); + return 'CURRENCY'; + } + + // no dice, now see if this is a predefined constant + rv = parser.getSymbol4DefinedConstant(s); + if (rv) { + yytext = (new Visyond.FormulaParser.ASTvalue(rv.value, rv.attributes)) + .setPredefinedConstantInfo(rv) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(s); + switch (yytext.getValueType()) { + default: + return 'CONSTANT'; + + case FT_BOOLEAN: + if (rv.value) + return 'TRUE'; + else + return 'FALSE'; + + case FT_STRING: + return 'STRING'; + } + } + + // when we don't have a match at all, we leave it to the other rules to hit something: + this.reject(); + %} + + + + + + +/* + * String Handling + * --------------- + */ + + +"\u2039"([^\u203a]*)"\u203a" + %{ /* ‹string› */ + s = this.matches[1]; + yytext = (new Visyond.FormulaParser.ASTvalue(s, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_2039) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + +"\u201c"([^\u201d]*)"\u201d" + %{ /* “string” */ + s = this.matches[1]; + yytext = (new Visyond.FormulaParser.ASTvalue(s, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_201C) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + +"\u00ab"([^\u00bb]*)"\u00bb" + %{ /* «string» */ + s = this.matches[1]; + yytext = (new Visyond.FormulaParser.ASTvalue(s, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_00AB) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + + + +"'"([^']*(?:"''"[^']*)*)"'"(?={DUALIC_OPERATOR_MUST_FOLLOW}) + 
%{ + // this.unput(this.matches[2]); + + s = this.matches[1]; + s2 = parser.dedupQuotedString(s, "'"); + yytext = (new Visyond.FormulaParser.ASTvalue(s2, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_SINGLEQUOTE) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + +'"'([^"]*(?:'""'[^"]*)*)'"'(?={DUALIC_OPERATOR_MUST_FOLLOW}) + %{ + // this.unput(this.matches[2]); + + s = this.matches[1]; + s2 = parser.dedupQuotedString(s, '"'); + yytext = (new Visyond.FormulaParser.ASTvalue(s2, FKW_VALUE | FT_STRING | FU_STRING)) + .setNotationAttributes(FKA_DELIMITERS_DOUBLEQUOTE) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()); + return 'STRING'; + %} + + + + + +/* + * Comment parsing + * --------------- + */ + + +[^\/\*\)\}#!\u203c\u258c\u2590]+ + %{ /* * / ) | # ! ‼ ▌ ▐ */ + /* keep it all; we haven't hit an end-of-comment marker starting character yet! */ + this.more(); + %} + +. + %{ + for (rv = 0, len = this.inline_comment_end_markers.length; rv < len; rv++) { + s2 = this.inline_comment_end_markers[rv]; + if (s2[0] === this.matches[0]) { + // we got a POTENTIAL MATCH; let's see if we need more: + if (s2.length > 1) { + // when yes, test the next rule! + this.reject(); + return false; + } else { + /* + * Full match! end of comment reached. + * + * Remove this last bit from the parsed text and strip leading / trailing whitespace. + * + * > ### Notes + * > + * > Since returning actual tokens for any inline comments would + * > break the LALR(1) grammar most severely, we concatenate + * > comments and attach them to the next token. + * > + * > Since the 'next token' MAY be `EOF`, we need the parser + * > to check if there's any leech called `comment` hanging + * > off that EOF it might've got dropped in the in-box... + */ + parser.pushComment(); + this.popState(); + return false; + } + } + } + // collect input until we hit something we know: + this.more(); + %} + +.. + %{ + /* + * We only hit this rule when the previous one was `reject()`-ed + * as that rule will match anything that's the start of this one. + * + * Hence we know we have a partial match on a comment terminator, + * but we need to make sure. + * + * We also know that our longest 'end markers' are 2 characters wide, + * so this solution is sufficient and complete. + * + * Now all we have to do is scan the longer-than-1-character + * comment markers against what we've got here and if there's + * NO MATCH, we need to keep in mind that nasty people can write + * comments like `{***}` and we have a hit on `**}` so we may only + * consume one character here in that case. + */ + for (rv = 0, len = this.inline_comment_end_markers.length; rv < len; rv++) { + s2 = this.inline_comment_end_markers[rv]; + if (s2 === this.matches[0]) { + /* + * Full match! end of comment reached. + * + * Remove this last bit from the parsed text and strip leading/trailing whitespace. + * + * Since returning actual tokens for any inline comments would + * break the LALR(1) grammar most severely, we concatenate + * comments and attach them to the next token. + * + * Since the 'next token' MAY be `EOF`, we need the parser + * to check if there's any leech called `comment` hanging + * of that EOF it might've got dropped in the in-box... 
+ */ + parser.pushComment(); + this.popState(); + return false; + } + } + // we may only consume a single character, so we `unput()` the last one: + this.less(1); + + // collect input until we hit something we know: + this.more(); + %} + +<> + %{ + // Check if this is a comment type which does not have to be 'terminated': + for (rv = 0, len = this.inline_comment_end_markers.length; rv < len; rv++) { + s2 = this.inline_comment_end_markers[rv]; + if (s2 === "") { + /* + * Full match! end of comment reached. + * + * Remove this last bit from the parsed text and strip leading / trailing whitespace. + * + * > ### Notes + * > + * > Since returning actual tokens for any inline comments would + * > break the LALR(1) grammar most severely, we concatenate + * > comments and attach them to the next token. + * > + * > Since the 'next token' MAY be `EOF`, we need the parser + * > to check if there's any leech called `comment` hanging + * > off that EOF it might've got dropped in the in-box... + */ + parser.pushComment(); + this.popState(); + return false; + } + } + + // Otherwise, flag this as an unterminated and thus illegal comment chunk. + parser.pushComment(); + + yytext = (new Visyond.FormulaParser.ASTerror(FERR_UNTERMINATED_INLINE_COMMENT, "Unterminated inline comment.")) + .setErrorArguments(this.inline_comment_end_markers) + .setLocationInfo(yylloc) + .setCommentsIndex(parser.getNextCommentIndex()) + .setLexedText(yytext); + return 'error'; + %} + + + + + +'=' return '='; +'-' return '-'; +'+' return '+'; +'*' return '*'; +'/' return '/'; +'^' return 'POWER'; /* Exponentiation */ +'(' return '('; +')' return ')'; +',' return ','; +'!' return '!'; +'%' return '%'; + + +[\r\n]+ return 'NL'; + +[^\S\r\n]+ // ignore whitespace + +\/\/.* // skip comments +\/\*.*?\*\/ // skip comments + +<> return 'EOF'; +. return 'INVALID'; + + + + +/* End of grammar */ + + +%% + diff --git a/examples/with-includes.action1.js b/examples/with-includes.action1.js new file mode 100644 index 0000000..24de476 --- /dev/null +++ b/examples/with-includes.action1.js @@ -0,0 +1,3 @@ + // ................. action #1 + +return 666; diff --git a/examples/with-includes.jison b/examples/with-includes.jison new file mode 100644 index 0000000..ddf0947 --- /dev/null +++ b/examples/with-includes.jison @@ -0,0 +1,47 @@ + +/* + * description: Grammar showing the `%include` feature in both lexer and parser. + * The grammar itself is a copy of the precedence grammar which shows precedence operators + * and semantic actions. + */ + + + +// This chunk will be injected before everything else that's generated by JISON: +%code required %include with-includes.prelude.top.js + +// ... and this chunk will land before the parser and parser tables... +%code init %include with-includes.prelude.init.js + + + +%options ranges + + +DIGITS [0-9] +ALPHA [a-zA-Z]|{DIGITS} +SPACE " " +WHITESPACE \s + + +%include with-includes.prelude1.js + +%% + +{WHITESPACE}+ {/* skip whitespace */} +[{DIGITS}]+ /* leading comment */ + %include "with-includes.returnNAT.js" // demonstrate the ACTION block include and the ability to comment on it right here. +[{DIGITS}{ALPHA}]+ + %{ console.log("buggerit millenium hands and shrimp!"); %} + +"+" {return '+';} +"-" {return '-';} +"*" {return '*';} +<> {return 'EOF';} + +%% + +%include with-includes.prelude2.js + + +%include with-includes.main.js // demonstrate the trailing code block include and the ability to comment on it right here. 
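
For readers unfamiliar with the `%include` mechanism demonstrated above: the included action body
ends up inlined in the generated lexer's rule-dispatch switch. The following sketch is illustrative
only -- the concrete rule index and the numeric value of the `NAT` token ID are assumptions, not
taken from real generator output -- but it shows the effect described in `with-includes.returnNAT.js`
further below:

    // inside the generated lexer's performAction() switch:
    case 1:
        /*! Rule:: [0-9]+ */
        // body spliced in from with-includes.returnNAT.js, with the string
        // token 'NAT' rewritten to its numeric token ID by the generator:
        return 5; // 'NAT'
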
diff --git a/examples/with-includes.main.js b/examples/with-includes.main.js new file mode 100644 index 0000000..25dd9bc --- /dev/null +++ b/examples/with-includes.main.js @@ -0,0 +1,69 @@ + +parser.main = function (args) { + if (!args[1]) { + console.log('Usage: ' + args[0] + ' FILE'); + process.exit(1); + } + + var tty = require('tty'); + if (tty.isatty(process.stdout.fd)) { + console.log('not redirected'); + } + else { + console.log('redirected'); + } + + var input_chunks = []; + + function process_one_line(source) { + try { + var rv = parser.parse(source); + + process.stdout.write(JSON.stringify(rv, null, 2) + '\n'); + } catch (ex) { + process.stdout.write("Parse error:\n" + JSON.stringify(ex, null, 2) + "\nfor input:\n" + source + '\n'); + } + } + + function act() { + // see if we got an entire line's worth from stdin already? + var source = input_chunks.join("").split('\n'); + while (source.length > 1) { + process_one_line(source[0]); + source.shift(); + } + input_chunks = source; + } + + if (args[1] === '-') { + // read from stdin, echo output to stdout + process.stdin.setEncoding('utf8'); + + process.stdin.on('readable', function() { + var chunk = process.stdin.read(); + //console.log("chunk:", JSON.stringify(chunk, null, 2)); + if (chunk !== null) { + input_chunks.push(chunk); + act(); + } + }); + + process.stdin.on('end', function() { + input_chunks.push('\n'); + act(); + process.exit(0); + }); + } else { + try { + var source = require('fs').readFileSync(require('path').normalize(args[1]), 'utf8'); + var rv = parser.parse(source); + + process.stdout.write(JSON.stringify(rv, null, 2)); + return +rv || 0; + } catch (ex) { + process.stdout.write("Parse error:\n" + JSON.stringify(ex, null, 2) + "\nfor input file:\n" + args[1]); + return 66; + } + } +}; + diff --git a/examples/with-includes.prelude.init.js b/examples/with-includes.prelude.init.js new file mode 100644 index 0000000..dbc75a7 --- /dev/null +++ b/examples/with-includes.prelude.init.js @@ -0,0 +1,2 @@ +// ................. include INIT + diff --git a/examples/with-includes.prelude.top.js b/examples/with-includes.prelude.top.js new file mode 100644 index 0000000..1bf1f0f --- /dev/null +++ b/examples/with-includes.prelude.top.js @@ -0,0 +1,2 @@ +// ................. include TOP + diff --git a/examples/with-includes.prelude1.js b/examples/with-includes.prelude1.js new file mode 100644 index 0000000..4b4f1c8 --- /dev/null +++ b/examples/with-includes.prelude1.js @@ -0,0 +1,2 @@ + // ................. include #1 + diff --git a/examples/with-includes.prelude2.js b/examples/with-includes.prelude2.js new file mode 100644 index 0000000..43af226 --- /dev/null +++ b/examples/with-includes.prelude2.js @@ -0,0 +1,2 @@ + // ................. 
include #2 + diff --git a/examples/with-includes.returnNAT.js b/examples/with-includes.returnNAT.js new file mode 100644 index 0000000..ede4d99 --- /dev/null +++ b/examples/with-includes.returnNAT.js @@ -0,0 +1,5 @@ +// the lexer generator code will look at this action block and correctly replace the string token +// return by a nice & fast numeric token ID, just like would've happened if this code had been sitting +// inside a `%{...%}` or `{...}` ACTION block: + +return 'NAT'; diff --git a/examples/with-includes.test.lex b/examples/with-includes.test.lex new file mode 100644 index 0000000..37048b9 --- /dev/null +++ b/examples/with-includes.test.lex @@ -0,0 +1,29 @@ + +%options ranges + + +DIGITS [0-9] +ALPHA [a-zA-Z]|{DIGITS} +SPACE " " +WHITESPACE \s + + +%include with-includes.prelude1.js + +%% + +{WHITESPACE}+ {/* skip whitespace */} +[{DIGITS}]+ /* leading comment */ + %include "with-includes.action1.js" // demonstrate the ACTION block include and the ability to comment on it right here. +[{DIGITS}{ALPHA}]+ + %{ console.log("buggerit millenium hands and shrimp!"); %} + +"+" {return '+';} +"-" {return '-';} +"*" {return '*';} +<> {return 'EOF';} + +%% + +%include with-includes.prelude2.js + diff --git a/examples/with_custom_lexer.jison b/examples/with_custom_lexer.jison new file mode 100644 index 0000000..5907f7f --- /dev/null +++ b/examples/with_custom_lexer.jison @@ -0,0 +1,64 @@ + +/* + * description: One way to provide a custom lexer with a jison grammar. + * + * The grammar itself is a copy of the precedence grammar which shows precedence operators + * and semantic actions. + */ + + +%options ranges + +%include with-includes.prelude1.js + +%{ + // When you set up a custom lexer, this is the minimum example for one: + // + // your lexer class/object must provide these interface methods and constants at least: + // + // - setInput(string) + // - lex() -> token + // - EOF = 1 + // - ERROR = 2 + // + // and your lexer must have a `options` member set up as a hash table, i.e. JS object: + // + // - options: {} + // + // Your lexer must be named `lexer` as shown below. + + var input = ""; + var input_offset = 0; + + var lexer = { + EOF: 1, + ERROR: 2, + + options: {}, + + lex: function () { + if (input.length > input_offset) { + return input[input_offset++]; + } else { + return this.EOF; + } + }, + + setInput: function (inp) { + input = inp; + input_offset = 0; + } + }; +%} + +%% + +// no rules = zero rules: this signals jison to expect a *custom* lexer, provided through +// either a `%{...%}` action block above or pulled in via an `%include` statement. + +%% + +%include with-includes.prelude2.js + + +%include with-includes.main.js diff --git a/jison-lexer-error-code.js b/jison-lexer-error-code.js new file mode 100644 index 0000000..95d6284 --- /dev/null +++ b/jison-lexer-error-code.js @@ -0,0 +1,56 @@ +/** + * See also: + * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508 + * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility + * with userland code which might access the derived class in a 'classic' way. 
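+ *
+ * (In practice this means `err instanceof JisonLexerError`, `err instanceof Error`,
+ * `err.name` and `err.constructor` all behave as expected, whether or not the
+ * engine supports `Object.setPrototypeOf` or `Error.captureStackTrace`; both
+ * code paths are handled below.)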
+ * + * @public + * @constructor + * @nocollapse + */ +function JisonLexerError(msg, hash) { + Object.defineProperty(this, 'name', { + enumerable: false, + writable: false, + value: 'JisonLexerError' + }); + + if (msg == null) msg = '???'; + + Object.defineProperty(this, 'message', { + enumerable: false, + writable: true, + value: msg + }); + + this.hash = hash; + + var stacktrace; + if (hash && hash.exception instanceof Error) { + var ex2 = hash.exception; + this.message = ex2.message || msg; + stacktrace = ex2.stack; + } + if (!stacktrace) { + if (Error.hasOwnProperty('captureStackTrace')) { // V8 + Error.captureStackTrace(this, this.constructor); + } else { + stacktrace = (new Error(msg)).stack; + } + } + if (stacktrace) { + Object.defineProperty(this, 'stack', { + enumerable: false, + writable: false, + value: stacktrace + }); + } +} + +if (typeof Object.setPrototypeOf === 'function') { + Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype); +} else { + JisonLexerError.prototype = Object.create(Error.prototype); +} +JisonLexerError.prototype.constructor = JisonLexerError; +JisonLexerError.prototype.name = 'JisonLexerError'; diff --git a/jison-lexer-kernel.js b/jison-lexer-kernel.js new file mode 100644 index 0000000..7c88b95 --- /dev/null +++ b/jison-lexer-kernel.js @@ -0,0 +1,1058 @@ +// Full-featured Lexer Run-Time Class core (to be included in every generated lexer) +// Zachary Carter +// MIT Licensed + +{ + EOF: 1, + ERROR: 2, + + // JisonLexerError: JisonLexerError, /// <-- injected by the code generator + + // options: {}, /// <-- injected by the code generator + + // yy: ..., /// <-- injected by setInput() + + __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state + + __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup + + __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been 'unfolded' completely and is now ready for use + + done: false, /// INTERNAL USE ONLY + _backtrack: false, /// INTERNAL USE ONLY + _input: '', /// INTERNAL USE ONLY + _more: false, /// INTERNAL USE ONLY + _signaled_error_token: false, /// INTERNAL USE ONLY + + conditionStack: [], /// INTERNAL USE ONLY; managed via `pushState()`, `popState()`, `topState()` and `stateStackSize()` + + match: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. `match` is identical to `yytext` except that this one still contains the matched input string after `lexer.performAction()` has been invoked, where userland code MAY have changed/replaced the `yytext` value entirely! + matched: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far + matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt + yytext: '', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the 'token value' when the parser consumes the lexer token produced through a call to the `lex()` API. + offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the 'cursor position' in the input string, i.e. 
the number of characters matched so far
+    yyleng: 0,                                  /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (`yytext`)
+    yylineno: 0,                                /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: 'line number' at which the token under construction is located
+    yylloc: null,                               /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction
+
+    /**
+     * INTERNAL USE: construct a suitable error info hash object instance for `parseError`.
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) {
+        msg = '' + msg;
+
+        // heuristic to determine if the error message already contains a (partial) source code dump
+        // as produced by either `showPosition()` or `prettyPrintRange()`:
+        if (show_input_position == undefined) {
+            show_input_position = !(msg.indexOf('\n') > 0 && msg.indexOf('^') > 0);
+        }
+        if (this.yylloc && show_input_position) {
+            if (typeof this.prettyPrintRange === 'function') {
+                var pretty_src = this.prettyPrintRange(this.yylloc);
+
+                if (!/\n\s*$/.test(msg)) {
+                    msg += '\n';
+                }
+                msg += '\n  Erroneous area:\n' + pretty_src;
+            } else if (typeof this.showPosition === 'function') {
+                var pos_str = this.showPosition();
+                if (pos_str) {
+                    if (msg.length && msg[msg.length - 1] !== '\n' && pos_str[0] !== '\n') {
+                        msg += '\n' + pos_str;
+                    } else {
+                        msg += pos_str;
+                    }
+                }
+            }
+        }
+        /** @constructor */
+        var pei = {
+            errStr: msg,
+            recoverable: !!recoverable,
+            text: this.match,           // This one MAY be empty; userland code should use the `upcomingInput` API to obtain more text which follows the 'lexer cursor position'...
+            token: null,
+            line: this.yylineno,
+            loc: this.yylloc,
+            yy: this.yy,
+            lexer: this,
+
+            /**
+             * Make sure the error info doesn't stay alive due to potential
+             * ref cycles set up via userland code manipulations.
+             * These would otherwise all be memory leak opportunities!
+             *
+             * Note that only array and object references are nuked as those
+             * constitute the set of elements which can produce a cyclic ref.
+             * The rest of the members are kept intact as they are harmless.
+             *
+             * @public
+             * @this {LexErrorInfo}
+             */
+            destroy: function destructLexErrorInfo() {
+                // remove cyclic references added to error info:
+                // info.yy = null;
+                // info.lexer = null;
+                // ...
+                var rec = !!this.recoverable;
+                for (var key in this) {
+                    // only nuke the object/array valued members, as per the note above:
+                    if (this.hasOwnProperty(key) && typeof this[key] === 'object') {
+                        this[key] = undefined;
+                    }
+                }
+                this.recoverable = rec;
+            }
+        };
+        // track this instance so we can `destroy()` it once we deem it superfluous and ready for garbage collection!
+        this.__error_infos.push(pei);
+        return pei;
+    },
+
+    /**
+     * handler which is invoked when a lexer error occurs.
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    parseError: function lexer_parseError(str, hash, ExceptionClass) {
+        if (!ExceptionClass) {
+            ExceptionClass = this.JisonLexerError;
+        }
+        if (this.yy) {
+            if (this.yy.parser && typeof this.yy.parser.parseError === 'function') {
+                return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;
+            } else if (typeof this.yy.parseError === 'function') {
+                return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR;
+            }
+        }
+        throw new ExceptionClass(str, hash);
+    },
+
+    /**
+     * method which implements `yyerror(str, ...args)` functionality for use inside lexer actions.
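+     *
+     * A sketch of typical use inside a lexer rule's action code (the extra
+     * argument shown is illustrative; any extra arguments are collected in
+     * the error hash as `extra_error_attributes`):
+     *
+     *     // inside a rule action:
+     *     this.yyerror('unterminated string literal', this.yytext);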
+ * + * @public + * @this {RegExpLexer} + */ + yyerror: function yyError(str /*, ...args */) { + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': ' + str, this.options.lexerErrorsAreRecoverable); + + // Add any extra args to the hash under the name `extra_error_attributes`: + var args = Array.prototype.slice.call(arguments, 1); + if (args.length) { + p.extra_error_attributes = args; + } + + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + }, + + /** + * final cleanup function for when we have completed lexing the input; + * make it an API so that external code can use this one once userland + * code has decided it's time to destroy any lingering lexer error + * hash object instances and the like: this function helps to clean + * up these constructs, which *may* carry cyclic references which would + * otherwise prevent the instances from being properly and timely + * garbage-collected, i.e. this function helps prevent memory leaks! + * + * @public + * @this {RegExpLexer} + */ + cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) { + // prevent lingering circular references from causing memory leaks: + this.setInput('', {}); + + // nuke the error hash info instances created during this run. + // Userland code must COPY any data/references + // in the error hash instance(s) it is more permanently interested in. + if (!do_not_nuke_errorinfos) { + for (var i = this.__error_infos.length - 1; i >= 0; i--) { + var el = this.__error_infos[i]; + if (el && typeof el.destroy === 'function') { + el.destroy(); + } + } + this.__error_infos.length = 0; + } + + return this; + }, + + /** + * clear the lexer token context; intended for internal use only + * + * @public + * @this {RegExpLexer} + */ + clear: function lexer_clear() { + this.yytext = ''; + this.yyleng = 0; + this.match = ''; + // - DO NOT reset `this.matched` + this.matches = false; + this._more = false; + this._backtrack = false; + + var col = (this.yylloc ? this.yylloc.last_column : 0); + this.yylloc = { + first_line: this.yylineno + 1, + first_column: col, + last_line: this.yylineno + 1, + last_column: col, + + range: [this.offset, this.offset] + }; + }, + + /** + * resets the lexer, sets new input + * + * @public + * @this {RegExpLexer} + */ + setInput: function lexer_setInput(input, yy) { + this.yy = yy || this.yy || {}; + + // also check if we've fully initialized the lexer instance, + // including expansion work to be done to go from a loaded + // lexer to a usable lexer: + if (!this.__decompressed) { + // step 1: decompress the regex list: + var rules = this.rules; + for (var i = 0, len = rules.length; i < len; i++) { + var rule_re = rules[i]; + + // compression: is the RE an xref to another RE slot in the rules[] table? + if (typeof rule_re === 'number') { + rules[i] = rules[rule_re]; + } + } + + // step 2: unfold the conditions[] set to make these ready for use: + var conditions = this.conditions; + for (var k in conditions) { + var spec = conditions[k]; + + var rule_ids = spec.rules; + + var len = rule_ids.length; + var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in `lexer_next()` fast and simple! 
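+                // (illustrative) a condition spec of `{ rules: [3, 7] }` is thus unfolded into
+                // `{ rules: [ , 3, 7 ], __rule_regexes: [ , rules[3], rules[7] ], __rule_count: 2 }`,
+                // letting `lexer_next()` index both arrays directly with `1 <= i <= len`.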
+                var rule_new_ids = new Array(len + 1);
+
+                for (var i = 0; i < len; i++) {
+                    var idx = rule_ids[i];
+                    var rule_re = rules[idx];
+                    rule_regexes[i + 1] = rule_re;
+                    rule_new_ids[i + 1] = idx;
+                }
+
+                spec.rules = rule_new_ids;
+                spec.__rule_regexes = rule_regexes;
+                spec.__rule_count = len;
+            }
+
+            this.__decompressed = true;
+        }
+
+        this._input = input || '';
+        this.clear();
+        this._signaled_error_token = false;
+        this.done = false;
+        this.yylineno = 0;
+        this.matched = '';
+        this.conditionStack = ['INITIAL'];
+        this.__currentRuleSet__ = null;
+        this.yylloc = {
+            first_line: 1,
+            first_column: 0,
+            last_line: 1,
+            last_column: 0,
+
+            range: [0, 0]
+        };
+        this.offset = 0;
+        return this;
+    },
+
+    /**
+     * edit the remaining input via user-specified callback.
+     * This can be used to forward-adjust the input-to-parse,
+     * e.g. inserting macro expansions and the like in the
+     * input which has yet to be lexed.
+     * The behaviour of this API contrasts with the `unput()` et al.
+     * APIs as those act on the *consumed* input, while this
+     * one allows one to manipulate the future, without impacting
+     * the current `yylloc` cursor location or any history.
+     *
+     * Use this API to help implement C-preprocessor-like
+     * `#include` statements, etc.
+     *
+     * The provided callback must be synchronous and is
+     * expected to return the edited input (string).
+     *
+     * The `cpsArg` argument value is passed to the callback
+     * as-is.
+     *
+     * `callback` interface:
+     * `function callback(input, cpsArg)`
+     *
+     * - `input` will carry the remaining-input-to-lex string
+     *   from the lexer.
+     * - `cpsArg` is the `cpsArg` passed into this API.
+     *
+     * The `this` reference for the callback will be set to
+     * reference this lexer instance so that userland code
+     * in the callback can easily and quickly access any lexer
+     * API.
+     *
+     * When the callback returns a non-string-type falsey value,
+     * we assume the callback did not edit the input and we
+     * will use the input as-is.
+     *
+     * When the callback returns a non-string-type truthy value, it
+     * is converted to a string for lexing via the `"" + retval`
+     * operation. (See also why: http://2ality.com/2012/03/converting-to-string.html
+     * -- that way any returned object's `valueOf()` and `toString()`
+     * methods will be invoked in a proper/desirable order.)
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) {
+        var rv = callback.call(this, this._input, cpsArg);
+        if (typeof rv !== 'string') {
+            if (rv) {
+                this._input = '' + rv;
+            }
+            // else: keep `this._input` as is.
+        } else {
+            this._input = rv;
+        }
+        return this;
+    },
+
+    /**
+     * consumes and returns one char from the input
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    input: function lexer_input() {
+        if (!this._input) {
+            //this.done = true;    -- don't set `done` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <<EOF>> tokens and perform user action code for a <<EOF>> match, but only does so *once*)
+            return null;
+        }
+        var ch = this._input[0];
+        this.yytext += ch;
+        this.yyleng++;
+        this.offset++;
+        this.match += ch;
+        this.matched += ch;
+        // Count the linenumber up when we hit the LF (or a stand-alone CR).
+        // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo
+        // and we advance immediately past the LF as well, returning both together as if
+        // it was all a single 'character' only.
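+        // For example (illustrative): with remaining input '\r\nX', a single input()
+        // call returns the two-character string '\r\n', increments `yylineno` once and
+        // leaves just 'X' in the buffer; a lone '\n' or '\r' behaves the same way but
+        // returns a single character.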
+ var slice_len = 1; + var lines = false; + if (ch === '\n') { + lines = true; + } else if (ch === '\r') { + lines = true; + var ch2 = this._input[1]; + if (ch2 === '\n') { + slice_len++; + ch += ch2; + this.yytext += ch2; + this.yyleng++; + this.offset++; + this.match += ch2; + this.matched += ch2; + this.yylloc.range[1]++; + } + } + if (lines) { + this.yylineno++; + this.yylloc.last_line++; + this.yylloc.last_column = 0; + } else { + this.yylloc.last_column++; + } + this.yylloc.range[1]++; + + this._input = this._input.slice(slice_len); + return ch; + }, + + /** + * unshifts one char (or an entire string) into the input + * + * @public + * @this {RegExpLexer} + */ + unput: function lexer_unput(ch) { + var len = ch.length; + var lines = ch.split(/(?:\r\n?|\n)/g); + + this._input = ch + this._input; + this.yytext = this.yytext.substr(0, this.yytext.length - len); + this.yyleng = this.yytext.length; + this.offset -= len; + this.match = this.match.substr(0, this.match.length - len); + this.matched = this.matched.substr(0, this.matched.length - len); + + if (lines.length > 1) { + this.yylineno -= lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + + // Get last entirely matched line into the `pre_lines[]` array's + // last index slot; we don't mind when other previously + // matched lines end up in the array too. + var pre = this.match; + var pre_lines = pre.split(/(?:\r\n?|\n)/g); + if (pre_lines.length === 1) { + pre = this.matched; + pre_lines = pre.split(/(?:\r\n?|\n)/g); + } + this.yylloc.last_column = pre_lines[pre_lines.length - 1].length; + } else { + this.yylloc.last_column -= len; + } + + this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng; + + this.done = false; + return this; + }, + + /** + * cache matched text and append it on next action + * + * @public + * @this {RegExpLexer} + */ + more: function lexer_more() { + this._more = true; + return this; + }, + + /** + * signal the lexer that this rule fails to match the input, so the + * next matching rule (regex) should be tested instead. + * + * @public + * @this {RegExpLexer} + */ + reject: function lexer_reject() { + if (this.options.backtrack_lexer) { + this._backtrack = true; + } else { + // when the `parseError()` call returns, we MUST ensure that the error is registered. + // We accomplish this by signaling an 'error' token to be produced for the current + // `.lex()` run. + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).', false); + this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + return this; + }, + + /** + * retain first n characters of the match + * + * @public + * @this {RegExpLexer} + */ + less: function lexer_less(n) { + return this.unput(this.match.slice(n)); + }, + + /** + * return (part of the) already matched input, i.e. for error + * messages. + * + * Limit the returned string length to `maxSize` (default: 20). + * + * Limit the returned string to the `maxLines` number of lines of + * input (default: 1). + * + * Negative limit values equal *unlimited*. 
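+     *
+     * Example (illustrative): when the text 'hello\nworld' has already been
+     * consumed and the current match is empty, `pastInput(20, 1)` returns
+     * 'world' (one line, clipped to at most 20 characters) while
+     * `pastInput(-1, -1)` returns the full 'hello\nworld'.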
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    pastInput: function lexer_pastInput(maxSize, maxLines) {
+        var past = this.matched.substring(0, this.matched.length - this.match.length);
+        if (maxSize < 0)
+            maxSize = past.length;
+        else if (!maxSize)
+            maxSize = 20;
+        if (maxLines < 0)
+            maxLines = past.length;         // can't ever have more input lines than this!
+        else if (!maxLines)
+            maxLines = 1;
+        // `substr` anticipation: treat \r\n as a single character and take a little
+        // more than necessary so that we can still properly check against maxSize
+        // after we've transformed and limited the newLines in here:
+        past = past.substr(-maxSize * 2 - 2);
+        // now that we have a significantly reduced string to process, transform the newlines
+        // and chop them, then limit them:
+        var a = past.replace(/\r\n|\r/g, '\n').split('\n');
+        a = a.slice(-maxLines);
+        past = a.join('\n');
+        // When, after limiting to maxLines, we still have too much to return,
+        // do add an ellipsis prefix...
+        if (past.length > maxSize) {
+            past = '...' + past.substr(-maxSize);
+        }
+        return past;
+    },
+
+    /**
+     * return (part of the) upcoming input, i.e. for error messages.
+     *
+     * Limit the returned string length to `maxSize` (default: 20).
+     *
+     * Limit the returned string to the `maxLines` number of lines of input (default: 1).
+     *
+     * Negative limit values equal *unlimited*.
+     *
+     * > ### NOTE ###
+     * >
+     * > *"upcoming input"* is defined as the whole of both
+     * > the *currently lexed* input and any remaining input
+     * > following it. *"currently lexed"* input is the input
+     * > already recognized by the lexer but not yet returned with
+     * > the lexer token. This happens when you are invoking this API
+     * > from inside any lexer rule action code block.
+     * >
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    upcomingInput: function lexer_upcomingInput(maxSize, maxLines) {
+        var next = this.match;
+        if (maxSize < 0)
+            maxSize = next.length + this._input.length;
+        else if (!maxSize)
+            maxSize = 20;
+        if (maxLines < 0)
+            maxLines = maxSize;         // can't ever have more input lines than this!
+        else if (!maxLines)
+            maxLines = 1;
+        // `substring` anticipation: treat \r\n as a single character and take a little
+        // more than necessary so that we can still properly check against maxSize
+        // after we've transformed and limited the newLines in here:
+        if (next.length < maxSize * 2 + 2) {
+            next += this._input.substring(0, maxSize * 2 + 2);  // substring is faster on Chrome/V8
+        }
+        // now that we have a significantly reduced string to process, transform the newlines
+        // and chop them, then limit them:
+        var a = next.replace(/\r\n|\r/g, '\n').split('\n');
+        a = a.slice(0, maxLines);
+        next = a.join('\n');
+        // When, after limiting to maxLines, we still have too much to return,
+        // do add an ellipsis postfix...
+        if (next.length > maxSize) {
+            next = next.substring(0, maxSize) + '...';
+        }
+        return next;
+    },
+
+    /**
+     * return a string which displays the character position where the
+     * lexing error occurred, i.e. for error messages
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    showPosition: function lexer_showPosition(maxPrefix, maxPostfix) {
+        var pre = this.pastInput(maxPrefix).replace(/\s/g, ' ');
+        var c = new Array(pre.length + 1).join('-');
+        return pre + this.upcomingInput(maxPostfix).replace(/\s/g, ' ') + '\n' + c + '^';
+    },
+
+    /**
+     * return a string which displays the lines & columns of input which are referenced
+     * by the given location info range, plus a few lines of context.
+     *
+     * This function pretty-prints the indicated section of the input, with line numbers
+     * and everything!
+     *
+     * This function is very useful to provide highly readable error reports, while
+     * the location range may be specified in various flexible ways:
+     *
+     * - `loc` is the location info object which references the area which should be
+     *   displayed and 'marked up': these lines & columns of text are marked up by `^`
+     *   characters below each character in the entire input range.
+     *
+     * - `context_loc` is the *optional* location info object which instructs this
+     *   pretty-printer how much *leading* context should be displayed alongside
+     *   the area referenced by `loc`. This can help provide context for the displayed
+     *   error, etc.
+     *
+     *   When this location info is not provided, a default context of 3 lines is
+     *   used.
+     *
+     * - `context_loc2` is another *optional* location info object, which serves
+     *   a similar purpose to `context_loc`: it specifies the amount of *trailing*
+     *   context lines to display in the pretty-print output.
+     *
+     *   When this location info is not provided, a default context of 1 line only is
+     *   used.
+     *
+     * Special Notes:
+     *
+     * - when the `loc`-indicated range is very large (about 5 lines or more), then
+     *   only the first and last few lines of this block are printed while a
+     *   `...continued...` message will be printed between them.
+     *
+     *   This serves the purpose of not printing a huge amount of text when the `loc`
+     *   range happens to be huge: this way a manageable & readable output results
+     *   for arbitrarily large ranges.
+     *
+     * - this function can display lines of input which have not yet been lexed.
+     *   `prettyPrintRange()` can access the entire input!
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {
+        var error_size = loc.last_line - loc.first_line;
+        const CONTEXT = 3;
+        const CONTEXT_TAIL = 1;
+        const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;
+        var input = this.matched + this._input;
+        var lines = input.split('\n');
+        //var show_context = (error_size < 5 || context_loc);
+        var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));
+        var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));
+        var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);
+        var ws_prefix = new Array(lineno_display_width).join(' ');
+        var nonempty_line_indexes = [];
+        var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) {
+            var lno = index + l0;
+            var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width);
+            var rv = lno_pfx + ': ' + line;
+            var errpfx = (new Array(lineno_display_width + 1)).join('^');
+            var offset = 2 + 1;
+            var len = 0;
+
+            if (lno === loc.first_line) {
+                offset += loc.first_column;
+
+                len = Math.max(
+                    2,
+                    ((lno === loc.last_line ?
loc.last_column : line.length)) - loc.first_column + 1 + ); + } else if (lno === loc.last_line) { + len = Math.max(2, loc.last_column + 1); + } else if (lno > loc.first_line && lno < loc.last_line) { + len = Math.max(2, line.length + 1); + } + + if (len) { + var lead = new Array(offset).join('.'); + var mark = new Array(len).join('^'); + rv += '\n' + errpfx + lead + mark; + + if (line.trim().length > 0) { + nonempty_line_indexes.push(index); + } + } + + rv = rv.replace(/\t/g, ' '); + return rv; + }); + + // now make sure we don't print an overly large amount of error area: limit it + // to the top and bottom line count: + if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) { + var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1; + var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1; + + var intermediate_line = (new Array(lineno_display_width + 1)).join(' ') + ' (...continued...)'; + intermediate_line += '\n' + (new Array(lineno_display_width + 1)).join('-') + ' (---------------)'; + rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line); + } + return rv.join('\n'); + }, + + /** + * helper function, used to produce a human readable description as a string, given + * the input `yylloc` location object. + * + * Set `display_range_too` to TRUE to include the string character index position(s) + * in the description if the `yylloc.range` is available. + * + * @public + * @this {RegExpLexer} + */ + describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) { + var l1 = yylloc.first_line; + var l2 = yylloc.last_line; + var c1 = yylloc.first_column; + var c2 = yylloc.last_column; + var dl = l2 - l1; + var dc = c2 - c1; + var rv; + if (dl === 0) { + rv = 'line ' + l1 + ', '; + if (dc <= 1) { + rv += 'column ' + c1; + } else { + rv += 'columns ' + c1 + ' .. ' + c2; + } + } else { + rv = 'lines ' + l1 + '(column ' + c1 + ') .. ' + l2 + '(column ' + c2 + ')'; + } + if (yylloc.range && display_range_too) { + var r1 = yylloc.range[0]; + var r2 = yylloc.range[1] - 1; + if (r2 <= r1) { + rv += ' {String Offset: ' + r1 + '}'; + } else { + rv += ' {String Offset range: ' + r1 + ' .. ' + r2 + '}'; + } + } + return rv; + }, + + /** + * test the lexed token: return FALSE when not a match, otherwise return token. + * + * `match` is supposed to be an array coming out of a regex match, i.e. `match[0]` + * contains the actually matched text string. 
+ * + * Also move the input cursor forward and update the match collectors: + * + * - `yytext` + * - `yyleng` + * - `match` + * - `matches` + * - `yylloc` + * - `offset` + * + * @public + * @this {RegExpLexer} + */ + test_match: function lexer_test_match(match, indexed_rule) { + var token, + lines, + backup, + match_str, + match_str_len; + + if (this.options.backtrack_lexer) { + // save context + backup = { + yylineno: this.yylineno, + yylloc: { + first_line: this.yylloc.first_line, + last_line: this.yylloc.last_line, + first_column: this.yylloc.first_column, + last_column: this.yylloc.last_column, + + range: this.yylloc.range.slice(0) + }, + yytext: this.yytext, + match: this.match, + matches: this.matches, + matched: this.matched, + yyleng: this.yyleng, + offset: this.offset, + _more: this._more, + _input: this._input, + //_signaled_error_token: this._signaled_error_token, + yy: this.yy, + conditionStack: this.conditionStack.slice(0), + done: this.done + }; + } + + match_str = match[0]; + match_str_len = match_str.length; + // if (match_str.indexOf('\n') !== -1 || match_str.indexOf('\r') !== -1) { + lines = match_str.split(/(?:\r\n?|\n)/g); + if (lines.length > 1) { + this.yylineno += lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + this.yylloc.last_column = lines[lines.length - 1].length; + } else { + this.yylloc.last_column += match_str_len; + } + // } + this.yytext += match_str; + this.match += match_str; + this.matched += match_str; + this.matches = match; + this.yyleng = this.yytext.length; + this.yylloc.range[1] += match_str_len; + + // previous lex rules MAY have invoked the `more()` API rather than producing a token: + // those rules will already have moved this `offset` forward matching their match lengths, + // hence we must only add our own match length now: + this.offset += match_str_len; + this._more = false; + this._backtrack = false; + this._input = this._input.slice(match_str_len); + + // calling this method: + // + // function lexer__performAction(yy, yyrulenumber, YY_START) {...} + token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */); + // otherwise, when the action codes are all simple return token statements: + //token = this.simpleCaseActionClusters[indexed_rule]; + + if (this.done && this._input) { + this.done = false; + } + if (token) { + return token; + } else if (this._backtrack) { + // recover context + for (var k in backup) { + this[k] = backup[k]; + } + this.__currentRuleSet__ = null; + return false; // rule action called reject() implying the next rule should be tested instead. + } else if (this._signaled_error_token) { + // produce one 'error' token as `.parseError()` in `reject()` + // did not guarantee a failure signal by throwing an exception! + token = this._signaled_error_token; + this._signaled_error_token = false; + return token; + } + return false; + }, + + /** + * return next match in input + * + * @public + * @this {RegExpLexer} + */ + next: function lexer_next() { + if (this.done) { + this.clear(); + return this.EOF; + } + if (!this._input) { + this.done = true; + } + + var token, + match, + tempMatch, + index; + if (!this._more) { + this.clear(); + } + var spec = this.__currentRuleSet__; + if (!spec) { + // Update the ruleset cache as we apparently encountered a state change or just started lexing. 
// The cache is set up for fast lookup -- we assume a lexer will switch states much less often than it will
+            // invoke the `lex()` token-producing API and related APIs, hence caching the set for direct access helps
+            // speed up those activities a tiny bit.
+            spec = this.__currentRuleSet__ = this._currentRules();
+            // Check whether a *sane* condition has been pushed before: this makes the lexer robust against
+            // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19
+            if (!spec || !spec.rules) {
+                var lineno_msg = '';
+                if (this.options.trackPosition) {
+                    lineno_msg = ' on line ' + (this.yylineno + 1);
+                }
+                var p = this.constructLexErrorInfo('Internal lexer engine error' + lineno_msg + ': The lex grammar programmer pushed a non-existing condition name "' + this.topState() + '"; this is a fatal error and should be reported to the application programmer team!', false);
+                // produce one 'error' token until this situation has been resolved, most probably by parse termination!
+                return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);
+            }
+        }
+
+        var rule_ids = spec.rules;
+        var regexes = spec.__rule_regexes;
+        var len = spec.__rule_count;
+
+        // Note: the arrays are 1-based, while `len` itself is a valid index,
+        // hence the non-standard less-or-equal check in the next loop condition!
+        for (var i = 1; i <= len; i++) {
+            tempMatch = this._input.match(regexes[i]);
+            if (tempMatch && (!match || tempMatch[0].length > match[0].length)) {
+                match = tempMatch;
+                index = i;
+                if (this.options.backtrack_lexer) {
+                    token = this.test_match(tempMatch, rule_ids[i]);
+                    if (token !== false) {
+                        return token;
+                    } else if (this._backtrack) {
+                        match = undefined;
+                        continue;       // rule action called reject() implying a rule MISmatch.
+                    } else {
+                        // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)
+                        return false;
+                    }
+                } else if (!this.options.flex) {
+                    break;
+                }
+            }
+        }
+        if (match) {
+            token = this.test_match(match, rule_ids[index]);
+            if (token !== false) {
+                return token;
+            }
+            // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)
+            return false;
+        }
+        if (!this._input) {
+            this.done = true;
+            this.clear();
+            return this.EOF;
+        } else {
+            var lineno_msg = '';
+            if (this.options.trackPosition) {
+                lineno_msg = ' on line ' + (this.yylineno + 1);
+            }
+            var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': Unrecognized text.', this.options.lexerErrorsAreRecoverable);
+
+            var pendingInput = this._input;
+            var activeCondition = this.topState();
+            var conditionStackDepth = this.conditionStack.length;
+
+            token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR);
+            if (token === this.ERROR) {
+                // we can try to recover from a lexer error that `parseError()` did not 'recover' for us
+                // by moving forward at least one character at a time IFF the (user-specified?) `parseError()`
+                // has not consumed/modified any pending input or changed state in the error handler:
+                if (!this.matches &&
+                    // and make sure the input has NOT been modified/consumed ...
+                    pendingInput === this._input &&
+                    // ...nor has the lexer state been modified significantly enough
+                    // to merit a non-consuming error handling action right now.
activeCondition === this.topState() &&
+                    conditionStackDepth === this.conditionStack.length
+                ) {
+                    this.input();
+                }
+            }
+            return token;
+        }
+    },
+
+    /**
+     * return next match that has a token
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    lex: function lexer_lex() {
+        var r;
+        // allow the PRE/POST handlers to set/modify the return token for maximum flexibility of the generated lexer:
+        if (typeof this.options.pre_lex === 'function') {
+            r = this.options.pre_lex.call(this);
+        }
+
+        while (!r) {
+            r = this.next();
+        }
+
+        if (typeof this.options.post_lex === 'function') {
+            // (also account for a userdef function which does not return any value: keep the token as is)
+            r = this.options.post_lex.call(this, r) || r;
+        }
+        return r;
+    },
+
+    /**
+     * backwards compatible alias for `pushState()`;
+     * the latter is symmetrical with `popState()` and we advise using
+     * those APIs in any modern lexer code, rather than `begin()`.
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    begin: function lexer_begin(condition) {
+        return this.pushState(condition);
+    },
+
+    /**
+     * activates a new lexer condition state (pushes the new lexer
+     * condition state onto the condition stack)
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    pushState: function lexer_pushState(condition) {
+        this.conditionStack.push(condition);
+        this.__currentRuleSet__ = null;
+        return this;
+    },
+
+    /**
+     * pop the previously active lexer condition state off the condition
+     * stack
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    popState: function lexer_popState() {
+        var n = this.conditionStack.length - 1;
+        if (n > 0) {
+            this.__currentRuleSet__ = null;
+            return this.conditionStack.pop();
+        } else {
+            return this.conditionStack[0];
+        }
+    },
+
+    /**
+     * return the currently active lexer condition state; when an index
+     * argument is provided it produces the N-th previous condition state,
+     * if available
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    topState: function lexer_topState(n) {
+        n = this.conditionStack.length - 1 - Math.abs(n || 0);
+        if (n >= 0) {
+            return this.conditionStack[n];
+        } else {
+            return 'INITIAL';
+        }
+    },
+
+    /**
+     * (internal) determine the lexer rule set which is active for the
+     * currently active lexer condition state
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    _currentRules: function lexer__currentRules() {
+        if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) {
+            return this.conditions[this.conditionStack[this.conditionStack.length - 1]];
+        } else {
+            return this.conditions['INITIAL'];
+        }
+    },
+
+    /**
+     * return the number of states currently on the stack
+     *
+     * @public
+     * @this {RegExpLexer}
+     */
+    stateStackSize: function lexer_stateStackSize() {
+        return this.conditionStack.length;
+    }
+}
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 0000000..451c3ac
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,2378 @@
+{
+  "name": "@gerhobbelt/jison-lex",
+  "version": "0.6.1-205",
+  "lockfileVersion": 1,
+  "dependencies": {
+    "@gerhobbelt/ast-types": {
+      "version": "0.9.13-4",
+      "resolved": "https://registry.npmjs.org/@gerhobbelt/ast-types/-/ast-types-0.9.13-4.tgz",
+      "integrity": "sha512-V8UIj1XN6XOP014fPpecxEa7AlAB9kaTOB/wF9UbguuwIMWCHDmdA9i03JDK9zXyVDVaLWCYh42JK8F9f27AtA=="
+    },
+    "@gerhobbelt/ast-util": {
+      "version": "0.6.1-4",
+      "resolved": "https://registry.npmjs.org/@gerhobbelt/ast-util/-/ast-util-0.6.1-4.tgz",
+      "integrity": "sha512-NP7YZh7rR6CNiMLyKTF+qb2Epx0r5x/zKQ3Z14TgXl73YJurC8WkMkFM9nDj8cRXb6R+f+BEu4DqAvvYKMxbqg=="
+    },
+
"@gerhobbelt/json5": { + "version": "0.5.1-20", + "resolved": "https://registry.npmjs.org/@gerhobbelt/json5/-/json5-0.5.1-20.tgz", + "integrity": "sha512-4YEkF451JFUdt3Y54l+BLvbGz5sCVYbIVvrkt+NshIsmDKHZXefkBRznsf5prdmxbxXiAfMoVgtbVD/5V5rVWw==" + }, + "@gerhobbelt/lex-parser": { + "version": "0.6.1-205", + "resolved": "https://registry.npmjs.org/@gerhobbelt/lex-parser/-/lex-parser-0.6.1-205.tgz", + "integrity": "sha512-U+i43wcYKj+JX43o6nhQnK94BJBEku7Sd326C1sU576VxoVlRcmpFwQE5i0G4tiCvgLv0SL3Cxbsm46FBT+xjQ==", + "dependencies": { + "@gerhobbelt/ast-types": { + "version": "0.9.13-7", + "resolved": "https://registry.npmjs.org/@gerhobbelt/ast-types/-/ast-types-0.9.13-7.tgz", + "integrity": "sha512-OKLyvezcD1X9WHXsKfDm2nLhwt1ybNRvErTqVeM5wlq6vQvNMkWKG6SLwG3Y08gkseZWKfe7enhPiJWoJORf3A==" + }, + "@gerhobbelt/recast": { + "version": "0.12.7-11", + "resolved": "https://registry.npmjs.org/@gerhobbelt/recast/-/recast-0.12.7-11.tgz", + "integrity": "sha512-vjk3AMqq8bgg8Wf5B6n2OdWmpa9iyBYX+/N5+vTf9mz/+etm0YUHcgGdzX98f8tSTCUl+LEdMKNN4vteLbUsxg==" + }, + "jison-helpers-lib": { + "version": "0.6.1-203", + "resolved": "https://registry.npmjs.org/jison-helpers-lib/-/jison-helpers-lib-0.6.1-203.tgz", + "integrity": "sha512-Pc8JW2rGm3ZpFtcYD3+uoZdVRmnyBPwzZc2SaPvriWbSPwsQpLOZjSGOq5WK6fuPZH0FhifHwr0YwHwiXS3hWw==" + } + } + }, + "@gerhobbelt/linewrap": { + "version": "0.2.2-3", + "resolved": "https://registry.npmjs.org/@gerhobbelt/linewrap/-/linewrap-0.2.2-3.tgz", + "integrity": "sha512-u2eUbXgNtqckBI4gxds/uiUNoytT+qIqpePmVDI5isW8A18uB3Qz1P+UxAHgFafGOZWJNrpR0IKnZhl7QhaUng==" + }, + "@gerhobbelt/nomnom": { + "version": "1.8.4-24", + "resolved": "https://registry.npmjs.org/@gerhobbelt/nomnom/-/nomnom-1.8.4-24.tgz", + "integrity": "sha512-spzyz2vHd1BhYNSUMXjqJOwk4AjnOIzZz3cYCOryUCzMvlqz01/+SAPEy/pjT47CrOGdWd0JgemePjru1aLYgQ==" + }, + "@gerhobbelt/recast": { + "version": "0.12.7-14", + "resolved": "https://registry.npmjs.org/@gerhobbelt/recast/-/recast-0.12.7-14.tgz", + "integrity": "sha512-U1PM+EXUYDXWxLYZiEdd+y5Gk4XHBiAjxolWeCviq3kbxobZiQJI7DWWjG72Ptow3gpXZYi7tMSeumOkoxnPwQ==", + "dependencies": { + "@gerhobbelt/ast-types": { + "version": "0.9.14-9", + "resolved": "https://registry.npmjs.org/@gerhobbelt/ast-types/-/ast-types-0.9.14-9.tgz", + "integrity": "sha512-5TmMhHOh6OE5VbGJuKnbQ2LEzN5z15CB1zGpA3hUYb00jN+G6qk/Z0ZhRFubS8GTp0h+JJaqnxUIbxneoNnTIQ==" + }, + "private": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/private/-/private-0.1.8.tgz", + "integrity": "sha512-VvivMrbvd2nKkiG38qjULzlc+4Vx4wm/whI9pQD35YrARNnhxeiRktSOhSukRLFNlzg6Br/cJPet5J/u19r/mg==" + } + } + }, + "@gerhobbelt/xregexp": { + "version": "3.2.0-22", + "resolved": "https://registry.npmjs.org/@gerhobbelt/xregexp/-/xregexp-3.2.0-22.tgz", + "integrity": "sha512-TRu38Z67VxFSMrBP3z/ORiJVQqp56ulidZirbobtmJnVGBWLdo4GbHtihgIJFGieIZuk+LxmPkK45SY+SQsR3A==" + }, + "ansi-regex": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", + "integrity": "sha1-w7M6te42DYbg5ijwRorn7yfWVN8=" + }, + "ansi-styles": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.0.tgz", + "integrity": "sha512-NnSOmMEYtVR2JVMIGTzynRkkaxtiq1xnFBcdQD/DnNCYPoEPsVJhM98BDyaoNOQIi7p4okdi3E27eN7GQbsUug==" + }, + "anymatch": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-1.3.2.tgz", + "integrity": "sha512-0XNayC8lTHQ2OI8aljNCN3sSx6hsr/1+rlcDAotXJR7C1oZZHCNsfpbKwMjRA3Uqb5tF1Rae2oloTr4xpq+WjA==", + "dev": true, + "optional": true + }, + 
"arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "optional": true + }, + "arr-flatten": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/arr-flatten/-/arr-flatten-1.1.0.tgz", + "integrity": "sha512-L3hKV5R/p5o81R7O02IGnwpDmkp6E982XhtbuwSe3O4qOtMMMtodicASA1Cny2U+aCXcNpml+m4dPsvsJ3jatg==", + "dev": true, + "optional": true + }, + "array-union": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/array-union/-/array-union-1.0.2.tgz", + "integrity": "sha1-mjRBDk9OPaI96jdb5b5w8kd47Dk=", + "dev": true + }, + "array-uniq": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/array-uniq/-/array-uniq-1.0.3.tgz", + "integrity": "sha1-r2rId6Jcx/dOBYiUdThY39sk/bY=", + "dev": true + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true, + "optional": true + }, + "assertion-error": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.0.2.tgz", + "integrity": "sha1-E8pRXYYgbaC6xm6DTdOX2HWBCUw=", + "dev": true + }, + "async-each": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/async-each/-/async-each-1.0.1.tgz", + "integrity": "sha1-GdOGodntxufByF04iu28xW0zYC0=", + "dev": true, + "optional": true + }, + "babel-cli": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-cli/-/babel-cli-6.26.0.tgz", + "integrity": "sha1-UCq1SHTX24itALiHoGODzgPQAvE=", + "dev": true, + "dependencies": { + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=", + "dev": true + } + } + }, + "babel-code-frame": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-code-frame/-/babel-code-frame-6.26.0.tgz", + "integrity": "sha1-Y/1D99weO7fONZR9uP42mj9Yx0s=", + "dev": true, + "dependencies": { + "ansi-styles": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", + "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", + "dev": true + }, + "chalk": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", + "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", + "dev": true + }, + "supports-color": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", + "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", + "dev": true + } + } + }, + "babel-core": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-core/-/babel-core-6.26.0.tgz", + "integrity": "sha1-rzL3izGm/O8RnIew/Y2XU/A6C7g=", + "dev": true, + "dependencies": { + "json5": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/json5/-/json5-0.5.1.tgz", + "integrity": "sha1-Hq3nrMASA0rYTiOWdn6tn6VJWCE=", + "dev": true + }, + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=", + "dev": true + } + } + }, + "babel-generator": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-generator/-/babel-generator-6.26.0.tgz", + "integrity": "sha1-rBriAHC3n248odMmlhMFN3TyDcU=", + "dev": true, + "dependencies": { + "source-map": { + "version": "0.5.7", + "resolved": 
"https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=", + "dev": true + } + } + }, + "babel-helper-builder-binary-assignment-operator-visitor": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-builder-binary-assignment-operator-visitor/-/babel-helper-builder-binary-assignment-operator-visitor-6.24.1.tgz", + "integrity": "sha1-zORReto1b0IgvK6KAsKzRvmlZmQ=", + "dev": true + }, + "babel-helper-call-delegate": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-call-delegate/-/babel-helper-call-delegate-6.24.1.tgz", + "integrity": "sha1-7Oaqzdx25Bw0YfiL/Fdb0Nqi340=", + "dev": true + }, + "babel-helper-define-map": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-helper-define-map/-/babel-helper-define-map-6.26.0.tgz", + "integrity": "sha1-pfVtq0GiX5fstJjH66ypgZ+Vvl8=", + "dev": true + }, + "babel-helper-explode-assignable-expression": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-explode-assignable-expression/-/babel-helper-explode-assignable-expression-6.24.1.tgz", + "integrity": "sha1-8luCz33BBDPFX3BZLVdGQArCLKo=", + "dev": true + }, + "babel-helper-function-name": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-function-name/-/babel-helper-function-name-6.24.1.tgz", + "integrity": "sha1-00dbjAPtmCQqJbSDUasYOZ01gKk=", + "dev": true + }, + "babel-helper-get-function-arity": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-get-function-arity/-/babel-helper-get-function-arity-6.24.1.tgz", + "integrity": "sha1-j3eCqpNAfEHTqlCQj4mwMbG2hT0=", + "dev": true + }, + "babel-helper-hoist-variables": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-hoist-variables/-/babel-helper-hoist-variables-6.24.1.tgz", + "integrity": "sha1-HssnaJydJVE+rbyZFKc/VAi+enY=", + "dev": true + }, + "babel-helper-optimise-call-expression": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-optimise-call-expression/-/babel-helper-optimise-call-expression-6.24.1.tgz", + "integrity": "sha1-96E0J7qfc/j0+pk8VKl4gtEkQlc=", + "dev": true + }, + "babel-helper-regex": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-helper-regex/-/babel-helper-regex-6.26.0.tgz", + "integrity": "sha1-MlxZ+QL4LyS3T6zu0DY5VPZJXnI=", + "dev": true + }, + "babel-helper-remap-async-to-generator": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-remap-async-to-generator/-/babel-helper-remap-async-to-generator-6.24.1.tgz", + "integrity": "sha1-XsWBgnrXI/7N04HxySg5BnbkVRs=", + "dev": true + }, + "babel-helper-replace-supers": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-replace-supers/-/babel-helper-replace-supers-6.24.1.tgz", + "integrity": "sha1-v22/5Dk40XNpohPKiov3S2qQqxo=", + "dev": true + }, + "babel-helpers": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helpers/-/babel-helpers-6.24.1.tgz", + "integrity": "sha1-NHHenK7DiOXIUOWX5Yom3fN2ArI=", + "dev": true + }, + "babel-messages": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-messages/-/babel-messages-6.23.0.tgz", + "integrity": "sha1-8830cDhYA1sqKVHG7F7fbGLyYw4=", + "dev": true + }, + "babel-plugin-check-es2015-constants": { + "version": "6.22.0", + "resolved": 
"https://registry.npmjs.org/babel-plugin-check-es2015-constants/-/babel-plugin-check-es2015-constants-6.22.0.tgz", + "integrity": "sha1-NRV7EBQm/S/9PaP3XH0ekYNbv4o=", + "dev": true + }, + "babel-plugin-syntax-async-functions": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/babel-plugin-syntax-async-functions/-/babel-plugin-syntax-async-functions-6.13.0.tgz", + "integrity": "sha1-ytnK0RkbWtY0vzCuCHI5HgZHvpU=", + "dev": true + }, + "babel-plugin-syntax-exponentiation-operator": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/babel-plugin-syntax-exponentiation-operator/-/babel-plugin-syntax-exponentiation-operator-6.13.0.tgz", + "integrity": "sha1-nufoM3KQ2pUoggGmpX9BcDF4MN4=", + "dev": true + }, + "babel-plugin-syntax-object-rest-spread": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/babel-plugin-syntax-object-rest-spread/-/babel-plugin-syntax-object-rest-spread-6.13.0.tgz", + "integrity": "sha1-/WU28rzhODb/o6VFjEkDpZe7O/U=", + "dev": true + }, + "babel-plugin-syntax-trailing-function-commas": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-syntax-trailing-function-commas/-/babel-plugin-syntax-trailing-function-commas-6.22.0.tgz", + "integrity": "sha1-ugNgk3+NBuQBgKQ/4NVhb/9TLPM=", + "dev": true + }, + "babel-plugin-transform-async-to-generator": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-async-to-generator/-/babel-plugin-transform-async-to-generator-6.24.1.tgz", + "integrity": "sha1-ZTbjeK/2yx1VF6wOQOs+n8jQh2E=", + "dev": true + }, + "babel-plugin-transform-es2015-arrow-functions": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-arrow-functions/-/babel-plugin-transform-es2015-arrow-functions-6.22.0.tgz", + "integrity": "sha1-RSaSy3EdX3ncf4XkQM5BufJE0iE=", + "dev": true + }, + "babel-plugin-transform-es2015-block-scoped-functions": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-block-scoped-functions/-/babel-plugin-transform-es2015-block-scoped-functions-6.22.0.tgz", + "integrity": "sha1-u8UbSflk1wy42OC5ToICRs46YUE=", + "dev": true + }, + "babel-plugin-transform-es2015-block-scoping": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-block-scoping/-/babel-plugin-transform-es2015-block-scoping-6.26.0.tgz", + "integrity": "sha1-1w9SmcEwjQXBL0Y4E7CgnnOxiV8=", + "dev": true + }, + "babel-plugin-transform-es2015-classes": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-classes/-/babel-plugin-transform-es2015-classes-6.24.1.tgz", + "integrity": "sha1-WkxYpQyclGHlZLSyo7+ryXolhNs=", + "dev": true + }, + "babel-plugin-transform-es2015-computed-properties": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-computed-properties/-/babel-plugin-transform-es2015-computed-properties-6.24.1.tgz", + "integrity": "sha1-b+Ko0WiV1WNPTNmZttNICjCBWbM=", + "dev": true + }, + "babel-plugin-transform-es2015-destructuring": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-destructuring/-/babel-plugin-transform-es2015-destructuring-6.23.0.tgz", + "integrity": "sha1-mXux8auWf2gtKwh2/jWNYOdlxW0=", + "dev": true + }, + "babel-plugin-transform-es2015-duplicate-keys": { + "version": "6.24.1", + "resolved": 
"https://registry.npmjs.org/babel-plugin-transform-es2015-duplicate-keys/-/babel-plugin-transform-es2015-duplicate-keys-6.24.1.tgz", + "integrity": "sha1-c+s9MQypaePvnskcU3QabxV2Qj4=", + "dev": true + }, + "babel-plugin-transform-es2015-for-of": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-for-of/-/babel-plugin-transform-es2015-for-of-6.23.0.tgz", + "integrity": "sha1-9HyVsrYT3x0+zC/bdXNiPHUkhpE=", + "dev": true + }, + "babel-plugin-transform-es2015-function-name": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-function-name/-/babel-plugin-transform-es2015-function-name-6.24.1.tgz", + "integrity": "sha1-g0yJhTvDaxrw86TF26qU/Y6sqos=", + "dev": true + }, + "babel-plugin-transform-es2015-literals": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-literals/-/babel-plugin-transform-es2015-literals-6.22.0.tgz", + "integrity": "sha1-T1SgLWzWbPkVKAAZox0xklN3yi4=", + "dev": true + }, + "babel-plugin-transform-es2015-modules-amd": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-amd/-/babel-plugin-transform-es2015-modules-amd-6.24.1.tgz", + "integrity": "sha1-Oz5UAXI5hC1tGcMBHEvS8AoA0VQ=", + "dev": true + }, + "babel-plugin-transform-es2015-modules-commonjs": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-commonjs/-/babel-plugin-transform-es2015-modules-commonjs-6.26.0.tgz", + "integrity": "sha1-DYOUApt9xqvhqX7xgeAHWN0uXYo=", + "dev": true + }, + "babel-plugin-transform-es2015-modules-systemjs": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-systemjs/-/babel-plugin-transform-es2015-modules-systemjs-6.24.1.tgz", + "integrity": "sha1-/4mhQrkRmpBhlfXxBuzzBdlAfSM=", + "dev": true + }, + "babel-plugin-transform-es2015-modules-umd": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-umd/-/babel-plugin-transform-es2015-modules-umd-6.24.1.tgz", + "integrity": "sha1-rJl+YoXNGO1hdq22B9YCNErThGg=", + "dev": true + }, + "babel-plugin-transform-es2015-object-super": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-object-super/-/babel-plugin-transform-es2015-object-super-6.24.1.tgz", + "integrity": "sha1-JM72muIcuDp/hgPa0CH1cusnj40=", + "dev": true + }, + "babel-plugin-transform-es2015-parameters": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-parameters/-/babel-plugin-transform-es2015-parameters-6.24.1.tgz", + "integrity": "sha1-V6w1GrScrxSpfNE7CfZv3wpiXys=", + "dev": true + }, + "babel-plugin-transform-es2015-shorthand-properties": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-shorthand-properties/-/babel-plugin-transform-es2015-shorthand-properties-6.24.1.tgz", + "integrity": "sha1-JPh11nIch2YbvZmkYi5R8U3jiqA=", + "dev": true + }, + "babel-plugin-transform-es2015-spread": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-spread/-/babel-plugin-transform-es2015-spread-6.22.0.tgz", + "integrity": "sha1-1taKmfia7cRTbIGlQujdnxdG+NE=", + "dev": true + }, + "babel-plugin-transform-es2015-sticky-regex": { + "version": "6.24.1", + "resolved": 
"https://registry.npmjs.org/babel-plugin-transform-es2015-sticky-regex/-/babel-plugin-transform-es2015-sticky-regex-6.24.1.tgz", + "integrity": "sha1-AMHNsaynERLN8M9hJsLta0V8zbw=", + "dev": true + }, + "babel-plugin-transform-es2015-template-literals": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-template-literals/-/babel-plugin-transform-es2015-template-literals-6.22.0.tgz", + "integrity": "sha1-qEs0UPfp+PH2g51taH2oS7EjbY0=", + "dev": true + }, + "babel-plugin-transform-es2015-typeof-symbol": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-typeof-symbol/-/babel-plugin-transform-es2015-typeof-symbol-6.23.0.tgz", + "integrity": "sha1-3sCfHN3/lLUqxz1QXITfWdzOs3I=", + "dev": true + }, + "babel-plugin-transform-es2015-unicode-regex": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-unicode-regex/-/babel-plugin-transform-es2015-unicode-regex-6.24.1.tgz", + "integrity": "sha1-04sS9C6nMj9yk4fxinxa4frrNek=", + "dev": true + }, + "babel-plugin-transform-exponentiation-operator": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-exponentiation-operator/-/babel-plugin-transform-exponentiation-operator-6.24.1.tgz", + "integrity": "sha1-KrDJx/MJj6SJB3cruBP+QejeOg4=", + "dev": true + }, + "babel-plugin-transform-object-rest-spread": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-object-rest-spread/-/babel-plugin-transform-object-rest-spread-6.26.0.tgz", + "integrity": "sha1-DzZpLVD+9rfi1LOsFHgTepY7ewY=", + "dev": true + }, + "babel-plugin-transform-regenerator": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-regenerator/-/babel-plugin-transform-regenerator-6.26.0.tgz", + "integrity": "sha1-4HA2lvveJ/Cj78rPi03KL3s6jy8=", + "dev": true + }, + "babel-plugin-transform-strict-mode": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-strict-mode/-/babel-plugin-transform-strict-mode-6.24.1.tgz", + "integrity": "sha1-1fr3qleKZbvlkc9e2uBKDGcCB1g=", + "dev": true + }, + "babel-polyfill": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-polyfill/-/babel-polyfill-6.26.0.tgz", + "integrity": "sha1-N5k3q8Z9eJWXCtxiHyhM2WbPIVM=", + "dev": true, + "dependencies": { + "regenerator-runtime": { + "version": "0.10.5", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.10.5.tgz", + "integrity": "sha1-M2w+/BIgrc7dosn6tntaeVWjNlg=", + "dev": true + } + } + }, + "babel-preset-env": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/babel-preset-env/-/babel-preset-env-1.6.1.tgz", + "integrity": "sha512-W6VIyA6Ch9ePMI7VptNn2wBM6dbG0eSz25HEiL40nQXCsXGTGZSTZu1Iap+cj3Q0S5a7T9+529l/5Bkvd+afNA==", + "dev": true + }, + "babel-preset-modern-browsers": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/babel-preset-modern-browsers/-/babel-preset-modern-browsers-10.0.1.tgz", + "integrity": "sha512-OwJlaopcYWBjgw4jLkPRXaArpFzpdAdgn7ZDQdY6a284uAjpKGsFP3eRo7rxrXsvmDMcXXQu1CsQzg09IUQelQ==", + "dev": true + }, + "babel-register": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-register/-/babel-register-6.26.0.tgz", + "integrity": "sha1-btAhFz4vy0htestFxgCahW9kcHE=", + "dev": true + }, + "babel-runtime": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-runtime/-/babel-runtime-6.26.0.tgz", + 
"integrity": "sha1-llxwWGaOgrVde/4E/yM3vItWR/4=", + "dev": true + }, + "babel-template": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-template/-/babel-template-6.26.0.tgz", + "integrity": "sha1-3gPi0WOWsGn0bdn/+FIfsaDjXgI=", + "dev": true + }, + "babel-traverse": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-traverse/-/babel-traverse-6.26.0.tgz", + "integrity": "sha1-RqnL1+3MYsjlwGTi0tjQ9ANXZu4=", + "dev": true + }, + "babel-types": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-types/-/babel-types-6.26.0.tgz", + "integrity": "sha1-o7Bz+Uq0nrb6Vc1lInozQ4BjJJc=", + "dev": true + }, + "babylon": { + "version": "6.18.0", + "resolved": "https://registry.npmjs.org/babylon/-/babylon-6.18.0.tgz", + "integrity": "sha512-q/UEjfGJ2Cm3oKV71DJz9d25TPnq5rhBVL2Q4fA5wcC3jcrdn7+SssEybFIxwAvvP+YCsCYNKughoF33GxgycQ==", + "dev": true + }, + "balanced-match": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", + "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=", + "dev": true + }, + "binary-extensions": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-1.10.0.tgz", + "integrity": "sha1-muuabF6IY4qtFx4Wf1kAq+JINdA=", + "dev": true, + "optional": true + }, + "brace-expansion": { + "version": "1.1.8", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.8.tgz", + "integrity": "sha1-wHshHHyVLsH479Uad+8NHTmQopI=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "optional": true + }, + "browser-resolve": { + "version": "1.11.2", + "resolved": "https://registry.npmjs.org/browser-resolve/-/browser-resolve-1.11.2.tgz", + "integrity": "sha1-j/CbCixCFxihBRwmCzLkj0QpOM4=", + "dev": true, + "dependencies": { + "resolve": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.1.7.tgz", + "integrity": "sha1-IDEU2CrSxe2ejgQRs5ModeiJ6Xs=", + "dev": true + } + } + }, + "browser-stdout": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/browser-stdout/-/browser-stdout-1.3.0.tgz", + "integrity": "sha1-81HTKWnTL6XXpVZxVCY9korjvR8=", + "dev": true + }, + "browserslist": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-2.5.1.tgz", + "integrity": "sha512-jAvM2ku7YDJ+leAq3bFH1DE0Ylw+F+EQDq4GkqZfgPEqpWYw9ofQH85uKSB9r3Tv7XDbfqVtE+sdvKJW7IlPJA==", + "dev": true + }, + "builtin-modules": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/builtin-modules/-/builtin-modules-1.1.1.tgz", + "integrity": "sha1-Jw8HbFpywC9bZaR9+Uxf46J4iS8=" + }, + "camelcase": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=" + }, + "caniuse-lite": { + "version": "1.0.30000749", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30000749.tgz", + "integrity": "sha1-L/OChlrq2MyjXaz7qwT1jv+kwBw=", + "dev": true + }, + "chai": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chai/-/chai-4.1.2.tgz", + "integrity": "sha1-D2RYS6ZC8PKs4oBiefTwbKI61zw=", + "dev": true + }, + "chalk": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.1.0.tgz", + "integrity": 
"sha512-LUHGS/dge4ujbXMJrnihYMcL4AoOweGnw9Tp3kQuqy1Kx5c1qKjqvMJZ6nVJPMWJtKCTN72ZogH3oeSO9g9rXQ==" + }, + "check-error": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.2.tgz", + "integrity": "sha1-V00xLt2Iu13YkS6Sht1sCu1KrII=", + "dev": true + }, + "chokidar": { + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz", + "integrity": "sha1-eY5ol3gVHIB2tLNg5e3SjNortGg=", + "dev": true, + "optional": true + }, + "cliui": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-3.2.0.tgz", + "integrity": "sha1-EgYBU3qRbSmUD5NNo7SNWFo5IT0=", + "dependencies": { + "string-width": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", + "integrity": "sha1-EYvfW4zcUaKn5w0hHgfisLmxB9M=" + } + } + }, + "code-point-at": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/code-point-at/-/code-point-at-1.1.0.tgz", + "integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=" + }, + "color-convert": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.0.tgz", + "integrity": "sha1-Gsz5fdc5uYO/mU1W/sj5WFNkG3o=" + }, + "color-name": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", + "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=" + }, + "commander": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.11.0.tgz", + "integrity": "sha512-b0553uYA5YAEGgyYIGYROzKQ7X5RAqedkfjiZxwi0kL1g3bOaBNNZfYkzt/CL0umgD5wc9Jec2FbB98CjkMRvQ==", + "dev": true + }, + "concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=", + "dev": true + }, + "convert-source-map": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-1.5.0.tgz", + "integrity": "sha1-ms1whRxtXf3ZPZKC5e35SgP/RrU=", + "dev": true + }, + "core-js": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.5.1.tgz", + "integrity": "sha1-rmh03GaTd4m4B1T/VCjfZoGcpQs=" + }, + "core-util-is": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", + "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=", + "dev": true, + "optional": true + }, + "cross-spawn": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-5.1.0.tgz", + "integrity": "sha1-6L0O/uWPz/b4+UUQoKVUu/ojVEk=" + }, + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true + }, + "decamelize": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", + "integrity": "sha1-9lNNFRSCabIDUue+4m9QH5oZEpA=" + }, + "deep-eql": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-3.0.1.tgz", + "integrity": "sha512-+QeIQyN5ZuO+3Uk5DYh6/1eKO0m0YmJFGNmFHGACpf1ClL1nmlV/p4gNgbl2pJGxgXb4faqo6UE+M5ACEMyVcw==", + "dev": true + }, + "detect-indent": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/detect-indent/-/detect-indent-4.0.0.tgz", + "integrity": "sha1-920GQ1LN9Docts5hnE7jqUdd4gg=", + "dev": true + }, + "diff": { + "version": "3.3.1", + "resolved": 
"https://registry.npmjs.org/diff/-/diff-3.3.1.tgz", + "integrity": "sha512-MKPHZDMB0o6yHyDryUOScqZibp914ksXwAMYMTHj6KO8UeKsRYNJD3oNCKjTqZon+V488P7N/HzXF8t7ZR95ww==", + "dev": true + }, + "electron-to-chromium": { + "version": "1.3.27", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.3.27.tgz", + "integrity": "sha1-eOy4o5kGYYe7N07t412ccFZagD0=", + "dev": true + }, + "error-ex": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.1.tgz", + "integrity": "sha1-+FWobOYa3E6GIcPNoh56dhLDqNw=" + }, + "escape-string-regexp": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", + "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=" + }, + "esprima": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.0.tgz", + "integrity": "sha512-oftTcaMu/EGrEIu904mWteKIv8vMuOgGYo7EhVJJN00R/EED9DCua/xxHRdYnKtcECzVg7xOWhflvJMnqcFZjw==" + }, + "esutils": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.2.tgz", + "integrity": "sha1-Cr9PHKpbyx96nYrMbepPqqBLrJs=", + "dev": true + }, + "execa": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-0.7.0.tgz", + "integrity": "sha1-lEvs00zEHuMqY6n68nrVpl/Fl3c=" + }, + "exit": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/exit/-/exit-0.1.2.tgz", + "integrity": "sha1-BjJjj42HfMghB9MKD/8aF8uhzQw=" + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "optional": true + }, + "expand-range": { + "version": "1.8.2", + "resolved": "https://registry.npmjs.org/expand-range/-/expand-range-1.8.2.tgz", + "integrity": "sha1-opnv/TNf4nIeuujiV+x5ZE/IUzc=", + "dev": true, + "optional": true + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "optional": true + }, + "filename-regex": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/filename-regex/-/filename-regex-2.0.1.tgz", + "integrity": "sha1-wcS5vuPglyXdsQa3XB4wH+LxiyY=", + "dev": true, + "optional": true + }, + "fill-range": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-2.2.3.tgz", + "integrity": "sha1-ULd9/X5Gm8dJJHCWNpn+eoSFpyM=", + "dev": true, + "optional": true + }, + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=" + }, + "for-in": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", + "integrity": "sha1-gQaNKVqBQuwKxybG4iAMMPttXoA=", + "dev": true, + "optional": true + }, + "for-own": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", + "integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=", + "dev": true, + "optional": true + }, + "fs-readdir-recursive": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs-readdir-recursive/-/fs-readdir-recursive-1.0.0.tgz", + "integrity": "sha1-jNF0XItPiinIyuw5JHaSG6GV9WA=", + "dev": true + }, + "fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", + "dev": true + }, + 
"fsevents": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-1.1.2.tgz", + "integrity": "sha512-Sn44E5wQW4bTHXvQmvSHwqbuiXtduD6Rrjm2ZtUEGbyrig+nUH3t/QD4M4/ZXViY556TBpRgZkHLDx3JxPwxiw==", + "dev": true, + "optional": true, + "dependencies": { + "abbrev": { + "version": "1.1.0", + "bundled": true, + "dev": true, + "optional": true + }, + "ajv": { + "version": "4.11.8", + "bundled": true, + "dev": true, + "optional": true + }, + "ansi-regex": { + "version": "2.1.1", + "bundled": true, + "dev": true + }, + "aproba": { + "version": "1.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "are-we-there-yet": { + "version": "1.1.4", + "bundled": true, + "dev": true, + "optional": true + }, + "asn1": { + "version": "0.2.3", + "bundled": true, + "dev": true, + "optional": true + }, + "assert-plus": { + "version": "0.2.0", + "bundled": true, + "dev": true, + "optional": true + }, + "asynckit": { + "version": "0.4.0", + "bundled": true, + "dev": true, + "optional": true + }, + "aws-sign2": { + "version": "0.6.0", + "bundled": true, + "dev": true, + "optional": true + }, + "aws4": { + "version": "1.6.0", + "bundled": true, + "dev": true, + "optional": true + }, + "balanced-match": { + "version": "0.4.2", + "bundled": true, + "dev": true + }, + "bcrypt-pbkdf": { + "version": "1.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "block-stream": { + "version": "0.0.9", + "bundled": true, + "dev": true + }, + "boom": { + "version": "2.10.1", + "bundled": true, + "dev": true + }, + "brace-expansion": { + "version": "1.1.7", + "bundled": true, + "dev": true + }, + "buffer-shims": { + "version": "1.0.0", + "bundled": true, + "dev": true + }, + "caseless": { + "version": "0.12.0", + "bundled": true, + "dev": true, + "optional": true + }, + "co": { + "version": "4.6.0", + "bundled": true, + "dev": true, + "optional": true + }, + "code-point-at": { + "version": "1.1.0", + "bundled": true, + "dev": true + }, + "combined-stream": { + "version": "1.0.5", + "bundled": true, + "dev": true + }, + "concat-map": { + "version": "0.0.1", + "bundled": true, + "dev": true + }, + "console-control-strings": { + "version": "1.1.0", + "bundled": true, + "dev": true + }, + "core-util-is": { + "version": "1.0.2", + "bundled": true, + "dev": true + }, + "cryptiles": { + "version": "2.0.5", + "bundled": true, + "dev": true, + "optional": true + }, + "dashdash": { + "version": "1.14.1", + "bundled": true, + "dev": true, + "optional": true, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + } + } + }, + "debug": { + "version": "2.6.8", + "bundled": true, + "dev": true, + "optional": true + }, + "deep-extend": { + "version": "0.4.2", + "bundled": true, + "dev": true, + "optional": true + }, + "delayed-stream": { + "version": "1.0.0", + "bundled": true, + "dev": true + }, + "delegates": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "ecc-jsbn": { + "version": "0.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "extend": { + "version": "3.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "extsprintf": { + "version": "1.0.2", + "bundled": true, + "dev": true + }, + "forever-agent": { + "version": "0.6.1", + "bundled": true, + "dev": true, + "optional": true + }, + "form-data": { + "version": "2.1.4", + "bundled": true, + "dev": true, + "optional": true + }, + "fs.realpath": { + "version": "1.0.0", + "bundled": true, + "dev": true + }, + 
"fstream": { + "version": "1.0.11", + "bundled": true, + "dev": true + }, + "fstream-ignore": { + "version": "1.0.5", + "bundled": true, + "dev": true, + "optional": true + }, + "gauge": { + "version": "2.7.4", + "bundled": true, + "dev": true, + "optional": true + }, + "getpass": { + "version": "0.1.7", + "bundled": true, + "dev": true, + "optional": true, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + } + } + }, + "glob": { + "version": "7.1.2", + "bundled": true, + "dev": true + }, + "graceful-fs": { + "version": "4.1.11", + "bundled": true, + "dev": true + }, + "har-schema": { + "version": "1.0.5", + "bundled": true, + "dev": true, + "optional": true + }, + "har-validator": { + "version": "4.2.1", + "bundled": true, + "dev": true, + "optional": true + }, + "has-unicode": { + "version": "2.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "hawk": { + "version": "3.1.3", + "bundled": true, + "dev": true, + "optional": true + }, + "hoek": { + "version": "2.16.3", + "bundled": true, + "dev": true + }, + "http-signature": { + "version": "1.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "inflight": { + "version": "1.0.6", + "bundled": true, + "dev": true + }, + "inherits": { + "version": "2.0.3", + "bundled": true, + "dev": true + }, + "ini": { + "version": "1.3.4", + "bundled": true, + "dev": true, + "optional": true + }, + "is-fullwidth-code-point": { + "version": "1.0.0", + "bundled": true, + "dev": true + }, + "is-typedarray": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "isarray": { + "version": "1.0.0", + "bundled": true, + "dev": true + }, + "isstream": { + "version": "0.1.2", + "bundled": true, + "dev": true, + "optional": true + }, + "jodid25519": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "jsbn": { + "version": "0.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "json-schema": { + "version": "0.2.3", + "bundled": true, + "dev": true, + "optional": true + }, + "json-stable-stringify": { + "version": "1.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "json-stringify-safe": { + "version": "5.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "jsonify": { + "version": "0.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "jsprim": { + "version": "1.4.0", + "bundled": true, + "dev": true, + "optional": true, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + } + } + }, + "mime-db": { + "version": "1.27.0", + "bundled": true, + "dev": true + }, + "mime-types": { + "version": "2.1.15", + "bundled": true, + "dev": true + }, + "minimatch": { + "version": "3.0.4", + "bundled": true, + "dev": true + }, + "minimist": { + "version": "0.0.8", + "bundled": true, + "dev": true + }, + "mkdirp": { + "version": "0.5.1", + "bundled": true, + "dev": true + }, + "ms": { + "version": "2.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "node-pre-gyp": { + "version": "0.6.36", + "bundled": true, + "dev": true, + "optional": true + }, + "nopt": { + "version": "4.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "npmlog": { + "version": "4.1.0", + "bundled": true, + "dev": true, + "optional": true + }, + "number-is-nan": { + "version": "1.0.1", + "bundled": true, + "dev": true + }, + "oauth-sign": { + "version": "0.8.2", + "bundled": true, + "dev": true, + "optional": 
true + }, + "object-assign": { + "version": "4.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "once": { + "version": "1.4.0", + "bundled": true, + "dev": true + }, + "os-homedir": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "os-tmpdir": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "osenv": { + "version": "0.1.4", + "bundled": true, + "dev": true, + "optional": true + }, + "path-is-absolute": { + "version": "1.0.1", + "bundled": true, + "dev": true + }, + "performance-now": { + "version": "0.2.0", + "bundled": true, + "dev": true, + "optional": true + }, + "process-nextick-args": { + "version": "1.0.7", + "bundled": true, + "dev": true + }, + "punycode": { + "version": "1.4.1", + "bundled": true, + "dev": true, + "optional": true + }, + "qs": { + "version": "6.4.0", + "bundled": true, + "dev": true, + "optional": true + }, + "rc": { + "version": "1.2.1", + "bundled": true, + "dev": true, + "optional": true, + "dependencies": { + "minimist": { + "version": "1.2.0", + "bundled": true, + "dev": true, + "optional": true + } + } + }, + "readable-stream": { + "version": "2.2.9", + "bundled": true, + "dev": true + }, + "request": { + "version": "2.81.0", + "bundled": true, + "dev": true, + "optional": true + }, + "rimraf": { + "version": "2.6.1", + "bundled": true, + "dev": true + }, + "safe-buffer": { + "version": "5.0.1", + "bundled": true, + "dev": true + }, + "semver": { + "version": "5.3.0", + "bundled": true, + "dev": true, + "optional": true + }, + "set-blocking": { + "version": "2.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "signal-exit": { + "version": "3.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "sntp": { + "version": "1.0.9", + "bundled": true, + "dev": true, + "optional": true + }, + "sshpk": { + "version": "1.13.0", + "bundled": true, + "dev": true, + "optional": true, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + } + } + }, + "string_decoder": { + "version": "1.0.1", + "bundled": true, + "dev": true + }, + "string-width": { + "version": "1.0.2", + "bundled": true, + "dev": true + }, + "stringstream": { + "version": "0.0.5", + "bundled": true, + "dev": true, + "optional": true + }, + "strip-ansi": { + "version": "3.0.1", + "bundled": true, + "dev": true + }, + "strip-json-comments": { + "version": "2.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "tar": { + "version": "2.2.1", + "bundled": true, + "dev": true + }, + "tar-pack": { + "version": "3.4.0", + "bundled": true, + "dev": true, + "optional": true + }, + "tough-cookie": { + "version": "2.3.2", + "bundled": true, + "dev": true, + "optional": true + }, + "tunnel-agent": { + "version": "0.6.0", + "bundled": true, + "dev": true, + "optional": true + }, + "tweetnacl": { + "version": "0.14.5", + "bundled": true, + "dev": true, + "optional": true + }, + "uid-number": { + "version": "0.0.6", + "bundled": true, + "dev": true, + "optional": true + }, + "util-deprecate": { + "version": "1.0.2", + "bundled": true, + "dev": true + }, + "uuid": { + "version": "3.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "verror": { + "version": "1.3.6", + "bundled": true, + "dev": true, + "optional": true + }, + "wide-align": { + "version": "1.1.2", + "bundled": true, + "dev": true, + "optional": true + }, + "wrappy": { + "version": "1.0.2", + "bundled": true, + "dev": true + } + } + }, + "get-caller-file": 
{ + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz", + "integrity": "sha1-9wLmMSfn4jHBYKgMFVSstw1QR+U=" + }, + "get-func-name": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz", + "integrity": "sha1-6td0q+5y4gQJQzoGY2YCPdaIekE=", + "dev": true + }, + "get-stream": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-3.0.0.tgz", + "integrity": "sha1-jpQ9E1jcN1VQVOy+LtsFqhdO3hQ=" + }, + "glob": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", + "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "dev": true + }, + "glob-base": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/glob-base/-/glob-base-0.3.0.tgz", + "integrity": "sha1-27Fk9iIbHAscz4Kuoyi0l98Oo8Q=", + "dev": true, + "optional": true + }, + "glob-parent": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", + "integrity": "sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", + "dev": true + }, + "globals": { + "version": "9.18.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-9.18.0.tgz", + "integrity": "sha512-S0nG3CLEQiY/ILxqtztTWH/3iRRdyBLw6KMDxnKMchrtbj2OFmehVh0WUCfW3DUrIgx/qFrJPICrq4Z4sTR9UQ==", + "dev": true + }, + "globby": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/globby/-/globby-6.1.0.tgz", + "integrity": "sha1-9abXDoOV4hyFj7BInWTfAkJNUGw=", + "dev": true + }, + "graceful-fs": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.11.tgz", + "integrity": "sha1-Dovf5NHduIVNZOBOp8AOKgJuVlg=" + }, + "growl": { + "version": "1.10.3", + "resolved": "https://registry.npmjs.org/growl/-/growl-1.10.3.tgz", + "integrity": "sha512-hKlsbA5Vu3xsh1Cg3J7jSmX/WaW6A5oBeqzM88oNbCRQFz+zUaXm6yxS4RVytp1scBoJzSYl4YAEOQIt6O8V1Q==", + "dev": true + }, + "has-ansi": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/has-ansi/-/has-ansi-2.0.0.tgz", + "integrity": "sha1-NPUEnOHs3ysGSa8+8k5F7TVBbZE=", + "dev": true + }, + "has-flag": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-2.0.0.tgz", + "integrity": "sha1-6CB68cx7MNRGzHC3NLXovhj4jVE=" + }, + "he": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/he/-/he-1.1.1.tgz", + "integrity": "sha1-k0EP0hsAlzUVH4howvJx80J+I/0=", + "dev": true + }, + "home-or-tmp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/home-or-tmp/-/home-or-tmp-2.0.0.tgz", + "integrity": "sha1-42w/LSyufXRqhX440Y1fMqeILbg=", + "dev": true + }, + "hosted-git-info": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-2.5.0.tgz", + "integrity": "sha512-pNgbURSuab90KbTqvRPsseaTxOJCZBD0a7t+haSN33piP9cCM4l0CqdzAif2hUqm716UovKB2ROmiabGAKVXyg==" + }, + "inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", + "dev": true + }, + "inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=", + "dev": true + }, + "invariant": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/invariant/-/invariant-2.2.2.tgz", + "integrity": "sha1-nh9WrArNtr8wMwbzOL47IErmA2A=", + "dev": true + }, + "invert-kv": { + "version": 
"1.0.0", + "resolved": "https://registry.npmjs.org/invert-kv/-/invert-kv-1.0.0.tgz", + "integrity": "sha1-EEqOSqym09jNFXqO+L+rLXo//bY=" + }, + "is-arrayish": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", + "integrity": "sha1-d8mYQFJ6qOyxqLppe4BkWnqSap0=" + }, + "is-binary-path": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-1.0.1.tgz", + "integrity": "sha1-dfFmQrSA8YenEcgUFh/TpKdlWJg=", + "dev": true, + "optional": true + }, + "is-buffer": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.5.tgz", + "integrity": "sha1-Hzsm72E7IUuIy8ojzGwB2Hlh7sw=", + "dev": true + }, + "is-builtin-module": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-builtin-module/-/is-builtin-module-1.0.0.tgz", + "integrity": "sha1-VAVy0096wxGfj3bDDLwbHgN6/74=" + }, + "is-dotfile": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/is-dotfile/-/is-dotfile-1.0.3.tgz", + "integrity": "sha1-pqLzL/0t+wT1yiXs0Pa4PPeYoeE=", + "dev": true, + "optional": true + }, + "is-equal-shallow": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/is-equal-shallow/-/is-equal-shallow-0.1.3.tgz", + "integrity": "sha1-IjgJj8Ih3gvPpdnqxMRdY4qhxTQ=", + "dev": true, + "optional": true + }, + "is-extendable": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", + "integrity": "sha1-YrEQ4omkcUGOPsNqYX1HLjAd/Ik=", + "dev": true, + "optional": true + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-finite": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-finite/-/is-finite-1.0.2.tgz", + "integrity": "sha1-zGZ3aVYCvlUO8R6LSqYwU0K20Ko=", + "dev": true + }, + "is-fullwidth-code-point": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-1.0.0.tgz", + "integrity": "sha1-754xOG8DGn8NZDr4L95QxFfvAMs=" + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true + }, + "is-module": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-module/-/is-module-1.0.0.tgz", + "integrity": "sha1-Mlj7afeMFNW4FdZkM2tM/7ZEFZE=", + "dev": true + }, + "is-number": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-2.1.0.tgz", + "integrity": "sha1-Afy7s5NGOlSPL0ZszhbezknbkI8=", + "dev": true, + "optional": true + }, + "is-posix-bracket": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-posix-bracket/-/is-posix-bracket-0.1.1.tgz", + "integrity": "sha1-MzTceXdDaOkvAW5vvAqI9c1ua8Q=", + "dev": true, + "optional": true + }, + "is-primitive": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-primitive/-/is-primitive-2.0.0.tgz", + "integrity": "sha1-IHurkWOEmcB7Kt8kCkGochADRXU=", + "dev": true, + "optional": true + }, + "is-stream": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz", + "integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=" + }, + "isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", + "dev": true + }, + "isexe": { + "version": 
"2.0.0", + "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", + "integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=" + }, + "isobject": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", + "integrity": "sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", + "dev": true, + "optional": true + }, + "jison-helpers-lib": { + "version": "0.6.1-205", + "resolved": "https://registry.npmjs.org/jison-helpers-lib/-/jison-helpers-lib-0.6.1-205.tgz", + "integrity": "sha512-b4iWlapl1cAU0/pZJmIDeJnEUXKMnt7NkwnNahG7gMZWQKV3ogaQOl3ByGWyThYQKQLgGWO4rTUDUlzwgrv4SQ==" + }, + "js-tokens": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-3.0.2.tgz", + "integrity": "sha1-mGbfOVECEw449/mWvOtlRDIJwls=", + "dev": true + }, + "jsesc": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-1.3.0.tgz", + "integrity": "sha1-RsP+yMGJKxKwgz25vHYiF226s0s=", + "dev": true + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true + }, + "lcid": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/lcid/-/lcid-1.0.0.tgz", + "integrity": "sha1-MIrMr6C8SDo4Z7S28rlQYlHRuDU=" + }, + "load-json-file": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-2.0.0.tgz", + "integrity": "sha1-eUfkIUmvgNaWy/eXvKq8/h/inKg=" + }, + "locate-path": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-2.0.0.tgz", + "integrity": "sha1-K1aLJl7slExtnA3pw9u7ygNUzY4=" + }, + "lodash": { + "version": "4.17.4", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.4.tgz", + "integrity": "sha1-eCA6TRwyiuHYbcpkYONptX9AVa4=", + "dev": true + }, + "loose-envify": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.3.1.tgz", + "integrity": "sha1-0aitM/qc4OcT1l/dCsi3SNR4yEg=", + "dev": true + }, + "lru-cache": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.1.tgz", + "integrity": "sha512-q4spe4KTfsAS1SUHLO0wz8Qiyf1+vMIAgpRYioFYDMNqKfHQbg+AVDH3i4fvpl71/P1L0dBl+fQi+P37UYf0ew==" + }, + "mem": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/mem/-/mem-1.1.0.tgz", + "integrity": "sha1-Xt1StIXKHZAP5kiVUFOZoN+kX3Y=" + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "optional": true + }, + "mimic-fn": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-1.1.0.tgz", + "integrity": "sha1-5md4PZLonb00KBi1IwudYqZyrRg=" + }, + "minimatch": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", + "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "dev": true + }, + "minimist": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", + "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=", + "dev": true + }, + "mkdirp": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", + "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", + "dev": true + }, + "mocha": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/mocha/-/mocha-4.0.1.tgz", + "integrity": 
"sha512-evDmhkoA+cBNiQQQdSKZa2b9+W2mpLoj50367lhy+Klnx9OV8XlCIhigUnn1gaTFLQCa0kdNhEGDr0hCXOQFDw==", + "dev": true, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true + }, + "supports-color": { + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-4.4.0.tgz", + "integrity": "sha512-rKC3+DyXWgK0ZLKwmRsrkyHVZAjNkfzeehuFWdGGcqGDTZFH73+RH6S/RDAAxl9GusSjZSUWYLmT9N5pzXFOXQ==", + "dev": true + } + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + }, + "nan": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/nan/-/nan-2.7.0.tgz", + "integrity": "sha1-2Vv3IeyHfgjbJ27T/G63j5CDrUY=", + "dev": true, + "optional": true + }, + "normalize-package-data": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/normalize-package-data/-/normalize-package-data-2.4.0.tgz", + "integrity": "sha512-9jjUFbTPfEy3R/ad/2oNbKtW9Hgovl5O1FvFWKkKblNXoN/Oou6+9+KKohPK13Yc3/TyunyWhJp6gvRNR/PPAw==" + }, + "normalize-path": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-2.1.1.tgz", + "integrity": "sha1-GrKLVW4Zg2Oowab35vogE3/mrtk=", + "dev": true + }, + "npm-run-path": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-2.0.2.tgz", + "integrity": "sha1-NakjLfo11wZ7TLLd8jV7GHFTbF8=" + }, + "number-is-nan": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", + "integrity": "sha1-CXtgK1NCKlIsGvuHkDGDNpQaAR0=" + }, + "object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM=", + "dev": true + }, + "object.omit": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/object.omit/-/object.omit-2.0.1.tgz", + "integrity": "sha1-Gpx0SCnznbuFjHbKNXmuKlTr0fo=", + "dev": true, + "optional": true + }, + "once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", + "dev": true + }, + "os-homedir": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/os-homedir/-/os-homedir-1.0.2.tgz", + "integrity": "sha1-/7xJiDNuDoM94MFox+8VISGqf7M=", + "dev": true + }, + "os-locale": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-2.1.0.tgz", + "integrity": "sha512-3sslG3zJbEYcaC4YVAvDorjGxc7tv6KVATnLPZONiljsUncvihe9BQoVCEs0RZ1kmf4Hk9OBqlZfJZWI4GanKA==" + }, + "os-tmpdir": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", + "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=", + "dev": true + }, + "output-file-sync": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/output-file-sync/-/output-file-sync-1.1.2.tgz", + "integrity": "sha1-0KM+7+YaIF+suQCS6CZZjVJFznY=", + "dev": true + }, + "p-finally": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": "sha1-P7z7FbiZpEEjs0ttzBi3JDNqLK4=" + }, + "p-limit": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-1.1.0.tgz", + "integrity": 
"sha1-sH/y2aXYi+yAYDWJWiurZqJ5iLw=" + }, + "p-locate": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-2.0.0.tgz", + "integrity": "sha1-IKAQOyIqcMj9OcwuWAaA893l7EM=" + }, + "parse-glob": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/parse-glob/-/parse-glob-3.0.4.tgz", + "integrity": "sha1-ssN2z7EfNVE7rdFz7wu246OIORw=", + "dev": true, + "optional": true + }, + "parse-json": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-2.2.0.tgz", + "integrity": "sha1-9ID0BDTvgHQfhGkJn43qGPVaTck=" + }, + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=" + }, + "path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", + "dev": true + }, + "path-key": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-2.0.1.tgz", + "integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=" + }, + "path-parse": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.5.tgz", + "integrity": "sha1-PBrfhx6pzWyUMbbqK9dKD/BVxME=", + "dev": true + }, + "path-type": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-2.0.0.tgz", + "integrity": "sha1-8BLMuEFbcJb8LaoQVMPXI4lZTHM=" + }, + "pathval": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/pathval/-/pathval-1.1.0.tgz", + "integrity": "sha1-uULm1L3mUwBe9rcTYd74cn0GReA=", + "dev": true + }, + "pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=" + }, + "pinkie": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/pinkie/-/pinkie-2.0.4.tgz", + "integrity": "sha1-clVrgM+g1IqXToDnckjoDtT3+HA=", + "dev": true + }, + "pinkie-promise": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/pinkie-promise/-/pinkie-promise-2.0.1.tgz", + "integrity": "sha1-ITXW36ejWMBprJsXh3YogihFD/o=", + "dev": true + }, + "preserve": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/preserve/-/preserve-0.2.0.tgz", + "integrity": "sha1-gV7R9uvGWSb4ZbMQwHE7yzMVzks=", + "dev": true, + "optional": true + }, + "private": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/private/-/private-0.1.7.tgz", + "integrity": "sha1-aM5eih7woju1cMwoU3tTMqumPvE=" + }, + "process-nextick-args": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", + "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=", + "dev": true, + "optional": true + }, + "pseudomap": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", + "integrity": "sha1-8FKijacOYYkX7wqKw0wa5aaChrM=" + }, + "randomatic": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/randomatic/-/randomatic-1.1.7.tgz", + "integrity": "sha512-D5JUjPyJbaJDkuAazpVnSfVkLlpeO3wDlPROTMLGKG1zMFNFRgrciKo1ltz/AzNTkqE0HzDx655QOL51N06how==", + "dev": true, + "optional": true, + "dependencies": { + "is-number": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", + "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", + "dev": true, + "optional": true, + "dependencies": { + "kind-of": { + "version": "3.2.2", + 
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "optional": true + } + } + }, + "kind-of": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-4.0.0.tgz", + "integrity": "sha1-IIE989cSkosgc3hpGkUGb65y3Vc=", + "dev": true, + "optional": true + } + } + }, + "read-pkg": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-2.0.0.tgz", + "integrity": "sha1-jvHAYjxqbbDcZxPEv6xGMysjaPg=" + }, + "read-pkg-up": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-2.0.0.tgz", + "integrity": "sha1-a3KoBImE4MQeeVEP1en6mbO1Sb4=" + }, + "readable-stream": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.3.tgz", + "integrity": "sha512-m+qzzcn7KUxEmd1gMbchF+Y2eIUbieUaxkWtptyHywrX0rE8QEYqPC07Vuy4Wm32/xE16NcdBctb8S0Xe/5IeQ==", + "dev": true, + "optional": true + }, + "readdirp": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-2.1.0.tgz", + "integrity": "sha1-TtCtBg3zBzMAxIRANz9y0cxkLXg=", + "dev": true, + "optional": true + }, + "regenerate": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/regenerate/-/regenerate-1.3.3.tgz", + "integrity": "sha512-jVpo1GadrDAK59t/0jRx5VxYWQEDkkEKi6+HjE3joFVLfDOh9Xrdh0dF1eSq+BI/SwvTQ44gSscJ8N5zYL61sg==", + "dev": true + }, + "regenerator-runtime": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.11.0.tgz", + "integrity": "sha512-/aA0kLeRb5N9K0d4fw7ooEbI+xDe+DKD499EQqygGqeS8N3xto15p09uY2xj7ixP81sNPXvRLnAQIqdVStgb1A==", + "dev": true + }, + "regenerator-transform": { + "version": "0.10.1", + "resolved": "https://registry.npmjs.org/regenerator-transform/-/regenerator-transform-0.10.1.tgz", + "integrity": "sha512-PJepbvDbuK1xgIgnau7Y90cwaAmO/LCLMI2mPvaXq2heGMR3aWW5/BQvYrhJ8jgmQjXewXvBjzfqKcVOmhjZ6Q==", + "dev": true + }, + "regex-cache": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/regex-cache/-/regex-cache-0.4.4.tgz", + "integrity": "sha512-nVIZwtCjkC9YgvWkpM55B5rBhBYRZhAaJbgcFYXXsHnbZ9UZI9nnVWYZpBlCqv9ho2eZryPnWrZGsOdPwVWXWQ==", + "dev": true, + "optional": true + }, + "regexpu-core": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/regexpu-core/-/regexpu-core-2.0.0.tgz", + "integrity": "sha1-SdA4g3uNz4v6W5pCE5k45uoq4kA=", + "dev": true + }, + "regjsgen": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/regjsgen/-/regjsgen-0.2.0.tgz", + "integrity": "sha1-bAFq3qxVT3WCP+N6wFuS1aTtsfc=", + "dev": true + }, + "regjsparser": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/regjsparser/-/regjsparser-0.1.5.tgz", + "integrity": "sha1-fuj4Tcb6eS0/0K4ijSS9lJ6tIFw=", + "dev": true, + "dependencies": { + "jsesc": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-0.5.0.tgz", + "integrity": "sha1-597mbjXW/Bb3EP6R1c9p9w8IkR0=", + "dev": true + } + } + }, + "remove-trailing-separator": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/remove-trailing-separator/-/remove-trailing-separator-1.1.0.tgz", + "integrity": "sha1-wkvOKig62tW8P1jg1IJJuSN52O8=", + "dev": true + }, + "repeat-element": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/repeat-element/-/repeat-element-1.1.2.tgz", + "integrity": "sha1-7wiaF40Ug7quTZPrmLT55OEdmQo=", + "dev": true + }, + "repeat-string": { + "version": "1.6.1", + 
"resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz", + "integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=", + "dev": true, + "optional": true + }, + "repeating": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/repeating/-/repeating-2.0.1.tgz", + "integrity": "sha1-UhTFOpJtNVJwdSf7q0FdvAjQbdo=", + "dev": true + }, + "require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": "sha1-jGStX9MNqxyXbiNE/+f3kqam30I=" + }, + "require-main-filename": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/require-main-filename/-/require-main-filename-1.0.1.tgz", + "integrity": "sha1-l/cXtp1IeE9fUmpsWqj/3aBVpNE=" + }, + "resolve": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.4.0.tgz", + "integrity": "sha512-aW7sVKPufyHqOmyyLzg/J+8606v5nevBgaliIlV7nUpVMsDnoBGV/cbSLNjZAg9q0Cfd/+easKVKQ8vOu8fn1Q==", + "dev": true + }, + "rollup": { + "version": "0.50.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-0.50.0.tgz", + "integrity": "sha512-7RqCBQ9iwsOBPkjYgoIaeUij606mSkDMExP0NT7QDI3bqkHYQHrQ83uoNIXwPcQm/vP2VbsUz3kiyZZ1qPlLTQ==", + "dev": true + }, + "rollup-plugin-node-resolve": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/rollup-plugin-node-resolve/-/rollup-plugin-node-resolve-3.0.0.tgz", + "integrity": "sha1-i4l8TDAw1QASd7BRSyXSygloPuA=", + "dev": true + }, + "safe-buffer": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.1.tgz", + "integrity": "sha512-kKvNJn6Mm93gAczWVJg7wH+wGYWNrDHdWvpUmHyEsgCtIwwo3bqPtV4tR5tuPaUhTOo/kvhVwd8XwwOllGYkbg==", + "dev": true + }, + "semver": { + "version": "5.4.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.4.1.tgz", + "integrity": "sha512-WfG/X9+oATh81XtllIo/I8gOiY9EXRdv1cQdyykeXK17YcUW3EXUAi2To4pcH6nZtJPr7ZOpM5OMyWJZm+8Rsg==" + }, + "set-blocking": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", + "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=" + }, + "set-immediate-shim": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/set-immediate-shim/-/set-immediate-shim-1.0.1.tgz", + "integrity": "sha1-SysbJ+uAip+NzEgaWOXlb1mfP2E=", + "dev": true, + "optional": true + }, + "shebang-command": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz", + "integrity": "sha1-RKrGW2lbAzmJaMOfNj/uXer98eo=" + }, + "shebang-regex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-1.0.0.tgz", + "integrity": "sha1-2kL0l0DAtC2yypcoVxyxkMmO/qM=" + }, + "signal-exit": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.2.tgz", + "integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=" + }, + "slash": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-1.0.0.tgz", + "integrity": "sha1-xB8vbDn8FtHNF61LXYlhFK5HDVU=", + "dev": true + }, + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==" + }, + "source-map-support": { + "version": "0.4.18", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.4.18.tgz", + "integrity": 
"sha512-try0/JqxPLF9nOjvSta7tVondkP5dwgyLDjVoyMDlmjugT2lRZ1OfsrYTkCd2hkDnJTKRbO/Rl3orm8vlsUzbA==", + "dev": true, + "dependencies": { + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=", + "dev": true + } + } + }, + "spdx-correct": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-1.0.2.tgz", + "integrity": "sha1-SzBz2TP/UfORLwOsVRlJikFQ20A=" + }, + "spdx-expression-parse": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-1.0.4.tgz", + "integrity": "sha1-m98vIOH0DtRH++JzJmGR/O1RYmw=" + }, + "spdx-license-ids": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-1.2.2.tgz", + "integrity": "sha1-yd96NCRZSt5r0RkA1ZZpbcBrrFc=" + }, + "string_decoder": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.0.3.tgz", + "integrity": "sha512-4AH6Z5fzNNBcH+6XDMfA/BTt87skxqJlO0lAh3Dker5zThcAxG6mKz+iGu308UKoPPQ8Dcqx/4JhujzltRa+hQ==", + "dev": true, + "optional": true + }, + "string-width": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", + "integrity": "sha512-nOqH59deCq9SRHlxq1Aw85Jnt4w6KvLKqWVik6oA9ZklXLNIOlqg4F2yrT1MVaTjAqvVwdfeZ7w7aCvJD7ugkw==", + "dependencies": { + "ansi-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-3.0.0.tgz", + "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=" + }, + "is-fullwidth-code-point": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", + "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=" + }, + "strip-ansi": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", + "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=" + } + } + }, + "strip-ansi": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", + "integrity": "sha1-ajhfuIU9lS1f8F0Oiq+UJ43GPc8=" + }, + "strip-bom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", + "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=" + }, + "strip-eof": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz", + "integrity": "sha1-u0P/VZim6wXYm1n80SnJgzE2Br8=" + }, + "supports-color": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-4.5.0.tgz", + "integrity": "sha1-vnoN5ITexcXN34s9WRJQRJEvY1s=" + }, + "to-fast-properties": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-1.0.3.tgz", + "integrity": "sha1-uDVx+k2MJbguIxsG46MFXeTKGkc=", + "dev": true + }, + "trim-right": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/trim-right/-/trim-right-1.0.1.tgz", + "integrity": "sha1-yy4SAwZ+DI3h9hQJS5/kVwTqYAM=", + "dev": true + }, + "type-detect": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.3.tgz", + "integrity": "sha1-Dj8mcLRAmbC0bChNE2p+9Jx0wuo=", + "dev": true + }, + "user-home": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/user-home/-/user-home-1.1.1.tgz", + "integrity": "sha1-K1viOjK2Onyd640PKNSFcko98ZA=", + "dev": true + }, + "util-deprecate": { + "version": "1.0.2", + 
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=", + "dev": true, + "optional": true + }, + "v8flags": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/v8flags/-/v8flags-2.1.1.tgz", + "integrity": "sha1-qrGh+jDUX4jdMhFIh1rALAtV5bQ=", + "dev": true + }, + "validate-npm-package-license": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.1.tgz", + "integrity": "sha1-KAS6vnEq0zeUWaz74kdGqywwP7w=" + }, + "which": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/which/-/which-1.3.0.tgz", + "integrity": "sha512-xcJpopdamTuY5duC/KnTTNBraPK54YwpenP4lzxU8H91GudWpFv38u0CKjclE1Wi2EH2EDz5LRcHcKbCIzqGyg==" + }, + "which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=" + }, + "wrap-ansi": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", + "integrity": "sha1-2Pw9KE3QV5T+hJc8rs3Rz4JP3YU=", + "dependencies": { + "string-width": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", + "integrity": "sha1-EYvfW4zcUaKn5w0hHgfisLmxB9M=" + } + } + }, + "wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", + "dev": true + }, + "y18n": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-3.2.1.tgz", + "integrity": "sha1-bRX7qITAhnnA136I53WegR4H+kE=" + }, + "yallist": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-2.1.2.tgz", + "integrity": "sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=" + }, + "yargs": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-8.0.2.tgz", + "integrity": "sha1-YpmpBVsc78lp/355wdkY3Osiw2A=" + }, + "yargs-parser": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-7.0.0.tgz", + "integrity": "sha1-jQrELxbqVd69MyyvTEA4s+P139k=" + } + } +} diff --git a/package.json b/package.json index dea1800..62cc130 100644 --- a/package.json +++ b/package.json @@ -1,8 +1,13 @@ { - "author": "Zach Carter (http://zaa.ch)", - "name": "jison-lex", + "author": { + "name": "Zach Carter", + "email": "zach@carter.name", + "url": "http://zaa.ch" + }, + "name": "@gerhobbelt/jison-lex", "description": "lexical analyzer generator used by jison", - "version": "0.3.4", + "license": "MIT", + "version": "0.6.1-208", "keywords": [ "jison", "parser", @@ -13,26 +18,42 @@ ], "repository": { "type": "git", - "url": "git://github.com/zaach/jison-lex.git" + "url": "git://github.com/GerHobbelt/jison-lex.git" }, "bugs": { "email": "jison@librelist.com", "url": "http://github.com/zaach/jison-lex/issues" }, - "main": "regexp-lexer", - "bin": "cli.js", + "main": "dist/regexp-lexer-cjs-es5.js", + "module": "regexp-lexer.js", + "bin": "dist/cli-cjs-es5.js", "engines": { - "node": ">=0.4" + "node": ">=4.0" }, "dependencies": { - "lex-parser": "0.1.x", - "nomnom": "1.5.2" + "@gerhobbelt/ast-util": "0.6.1-4", + "@gerhobbelt/json5": "0.5.1-20", + "@gerhobbelt/lex-parser": "0.6.1-208", + "@gerhobbelt/nomnom": "1.8.4-24", + "@gerhobbelt/recast": "0.12.7-14", + "@gerhobbelt/xregexp": "3.2.0-22", + "jison-helpers-lib": "0.6.1-208" }, "devDependencies": { - "test": "0.4.4" + "babel-cli": "6.26.0", + "babel-core": 
"6.26.0", + "babel-preset-env": "1.6.1", + "babel-preset-modern-browsers": "10.0.1", + "chai": "4.1.2", + "globby": "6.1.0", + "mocha": "4.0.1", + "rollup-plugin-node-resolve": "3.0.0", + "rollup": "0.50.0" }, "scripts": { - "test": "node tests/all-tests.js" + "test": "make test", + "build": "make", + "pub": "echo '### WARNING/NOTICE: publish from the jison monorepo! ###' && false" }, "directories": { "lib": "lib", diff --git a/regexp-lexer.js b/regexp-lexer.js index cc68c65..9f1fc94 100644 --- a/regexp-lexer.js +++ b/regexp-lexer.js @@ -1,283 +1,2003 @@ // Basic Lexer implemented using JavaScript regular expressions +// Zachary Carter // MIT Licensed -"use strict"; +import XRegExp from '@gerhobbelt/xregexp'; +import json5 from '@gerhobbelt/json5'; +import lexParser from '@gerhobbelt/lex-parser'; +import setmgmt from './regexp-set-management.js'; +import helpers from 'jison-helpers-lib'; +var rmCommonWS = helpers.rmCommonWS; +var camelCase = helpers.camelCase; +var code_exec = helpers.exec; +// import recast from '@gerhobbelt/recast'; +// import astUtils from '@gerhobbelt/ast-util'; +import assert from 'assert'; + +var version = '0.6.1-208'; // require('./package.json').version; + + + + +const XREGEXP_UNICODE_ESCAPE_RE = setmgmt.XREGEXP_UNICODE_ESCAPE_RE; // Matches the XRegExp Unicode escape braced part, e.g. `{Number}` +const CHR_RE = setmgmt.CHR_RE; +const SET_PART_RE = setmgmt.SET_PART_RE; +const NOTHING_SPECIAL_RE = setmgmt.NOTHING_SPECIAL_RE; +const UNICODE_BASE_PLANE_MAX_CP = setmgmt.UNICODE_BASE_PLANE_MAX_CP; + +// The expanded regex sets which are equivalent to the given `\\{c}` escapes: +// +// `/\s/`: +const WHITESPACE_SETSTR = setmgmt.WHITESPACE_SETSTR; +// `/\d/`: +const DIGIT_SETSTR = setmgmt.DIGIT_SETSTR; +// `/\w/`: +const WORDCHAR_SETSTR = setmgmt.WORDCHAR_SETSTR; + +// WARNING: this regex MUST match the regex for `ID` in ebnf-parser::bnf.l jison language lexer spec! (`ID = [{ALPHA}]{ALNUM}*`) +// +// This is the base XRegExp ID regex used in many places; this should match the ID macro definition in the EBNF/BNF parser et al as well! +const ID_REGEX_BASE = '[\\p{Alphabetic}_][\\p{Alphabetic}_\\p{Number}]*'; + + + + +// see also ./lib/cli.js +/** +@public +@nocollapse +*/ +const defaultJisonLexOptions = { + moduleType: 'commonjs', + debug: false, + enableDebugLogs: false, + json: false, + main: false, // CLI: not:(--main option) + dumpSourceCodeOnFailure: true, + throwErrorOnCompileFailure: true, + + moduleName: undefined, + defaultModuleName: 'lexer', + file: undefined, + outfile: undefined, + inputPath: undefined, + inputFilename: undefined, + warn_cb: undefined, // function(msg) | true (= use Jison.Print) | false (= throw Exception) + + xregexp: false, + lexerErrorsAreRecoverable: false, + flex: false, + backtrack_lexer: false, + ranges: false, // track position range, i.e. start+end indexes in the input string + trackPosition: true, // track line+column position in the input string + caseInsensitive: false, + showSource: false, + exportSourceCode: false, + exportAST: false, + prettyCfg: true, + pre_lex: undefined, + post_lex: undefined, +}; + + +// Merge sets of options. +// +// Convert alternative jison option names to their base option. +// +// The *last* option set which overrides the default wins, where 'override' is +// defined as specifying a not-undefined value which is not equal to the +// default value. +// +// When the FIRST argument is STRING "NODEFAULT", then we MUST NOT mix the +// default values avialable in Jison.defaultJisonOptions. 
+/** @public */
+function mkStdOptions(/*...args*/) {
+    var h = Object.prototype.hasOwnProperty;
+
+    var opts = {};
+    var args = [].concat.apply([], arguments);
+    // clone defaults, so we do not modify those constants?
+    if (args[0] !== "NODEFAULT") {
+        args.unshift(defaultJisonLexOptions);
+    } else {
+        args.shift();
+    }
+
+    for (var i = 0, len = args.length; i < len; i++) {
+        var o = args[i];
+        if (!o) continue;
+
+        // clone input (while camel-casing the options), so we do not modify those either.
+        var o2 = {};
+
+        for (var p in o) {
+            if (typeof o[p] !== 'undefined' && h.call(o, p)) {
+                o2[camelCase(p)] = o[p];
+            }
+        }
+
+        // now clean those options up:
+        if (typeof o2.main !== 'undefined') {
+            o2.noMain = !o2.main;
+        }
+
+        delete o2.main;
+
+        // special check for `moduleName` to ensure we detect the 'default' moduleName entering from the CLI,
+        // so that it does NOT override the moduleName set in the grammar definition file via an `%options` entry:
+        if (o2.moduleName === o2.defaultModuleName) {
+            delete o2.moduleName;
+        }
+
+        // now see if we have an overriding option here:
+        for (var p in o2) {
+            if (h.call(o2, p)) {
+                if (typeof o2[p] !== 'undefined') {
+                    opts[p] = o2[p];
+                }
+            }
+        }
+    }
+
+    return opts;
+}
+
+// set up export/output attributes of the `options` object instance
+function prepExportStructures(options) {
+    // set up the 'option' `exportSourceCode` as a hash object for returning
+    // all generated source code chunks to the caller
+    var exportSourceCode = options.exportSourceCode;
+    if (!exportSourceCode || typeof exportSourceCode !== 'object') {
+        exportSourceCode = {
+            enabled: !!exportSourceCode
+        };
+    } else if (typeof exportSourceCode.enabled !== 'boolean') {
+        exportSourceCode.enabled = true;
+    }
+    options.exportSourceCode = exportSourceCode;
+}
+
+// Autodetect if the input lexer spec is in JSON or JISON
+// format when the `options.json` flag is `true`.
+//
+// Produce the JSON lexer spec result when it is JSON formatted already, as that
+// saves us the trouble of doing this again anywhere else in the JISON
+// compiler/generator.
+//
+// Otherwise return the *parsed* lexer spec as it has
+// been processed through LexParser.
+function autodetectAndConvertToJSONformat(lexerSpec, options) {
+    var chk_l = null;
+    var ex1, err;
+
+    if (typeof lexerSpec === 'string') {
+        if (options.json) {
+            try {
+                chk_l = json5.parse(lexerSpec);
+
+                // When JSON5-based parsing of the lexer spec succeeds, this implies the lexer spec is specified in `JSON mode`
+                // *OR* there's a JSON/JSON5 format error in the input:
+            } catch (e) {
+                ex1 = e;
+            }
+        }
+        if (!chk_l) {
+            // // WARNING: the lexer may receive options specified in the **grammar spec file**,
+            // // hence we should mix the options to ensure the lexParser always
+            // // receives the full set!
+            // //
+            // // make sure all options are 'standardized' before we go and mix them together:
+            // options = mkStdOptions(grammar.options, options);
+            try {
+                chk_l = lexParser.parse(lexerSpec, options);
+            } catch (e) {
+                if (options.json) {
+                    err = new Error('Could not parse lexer spec in JSON AUTODETECT mode\nError: ' + ex1.message + ' (' + e.message + ')');
+                    err.secondary_exception = e;
+                    err.stack = ex1.stack;
+                } else {
+                    err = new Error('Could not parse lexer spec\nError: ' + e.message);
+                    err.stack = e.stack;
+                }
+                throw err;
+            }
+        }
+    } else {
+        chk_l = lexerSpec;
+    }
+
+    // Save time! Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary:
+
+    return chk_l;
+}
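+
+// For example (an illustrative sketch; the inline spec string below is a
+// hypothetical minimal lexer grammar, not taken from this repository):
+//
+//     // A JISON-format string fails the (optional) JSON5 parse and is then
+//     // handed to lexParser, so we get back the *parsed* spec object:
+//     var spec = autodetectAndConvertToJSONformat('%%\n[0-9]+  return "NUMBER";', { json: true });
+//
+//     // An already-parsed spec object is passed through unchanged:
+//     assert(autodetectAndConvertToJSONformat(spec, {}) === spec);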
Don't reparse the entire lexer spec *again* inside the code generators when that's not necessary: + + return chk_l; +} -var lexParser = require('lex-parser'); -var version = require('./package.json').version; // expand macros and convert matchers to RegExp's -function prepareRules(rules, macros, actions, tokens, startConditions, caseless) { - var m,i,k,action,conditions, - newRules = []; +function prepareRules(dict, actions, caseHelper, tokens, startConditions, opts) { + var m, i, k, rule, action, conditions, + active_conditions, + rules = dict.rules || [], + newRules = [], + macros = {}, + regular_rule_count = 0, + simple_rule_count = 0; + + // Assure all options are camelCased: + assert(typeof opts.options['case-insensitive'] === 'undefined'); + + if (!tokens) { + tokens = {}; + } - if (macros) { - macros = prepareMacros(macros); + // Depending on the location within the regex we need different expansions of the macros: + // one expansion for when a macro is *inside* a `[...]` and another expansion when a macro + // is anywhere else in a regex: + if (dict.macros) { + macros = prepareMacros(dict.macros, opts); } - function tokenNumberReplacement (str, token) { - return "return " + (tokens[token] || "'" + token + "'"); + function tokenNumberReplacement(str, token) { + return 'return ' + (tokens[token] || '\'' + token.replace(/'/g, '\\\'') + '\''); } - actions.push('switch($avoiding_name_collisions) {'); + // Make sure a comment does not contain any embedded '*/' end-of-comment marker + // as that would break the generated code + function postprocessComment(str) { + if (Array.isArray(str)) { + str = str.join(' '); + } + str = str.replace(/\*\//g, '*\\/'); // destroy any inner `*/` comment terminator sequence. + return str; + } + + actions.push('switch(yyrulenumber) {'); + + for (i = 0; i < rules.length; i++) { + rule = rules[i]; + m = rule[0]; - for (i=0;i < rules.length; i++) { - if (Object.prototype.toString.apply(rules[i][0]) !== '[object Array]') { + active_conditions = []; + if (Object.prototype.toString.apply(m) !== '[object Array]') { // implicit add to all inclusive start conditions for (k in startConditions) { if (startConditions[k].inclusive) { + active_conditions.push(k); startConditions[k].rules.push(i); } } - } else if (rules[i][0][0] === '*') { + } else if (m[0] === '*') { // Add to ALL start conditions + active_conditions.push('*'); for (k in startConditions) { startConditions[k].rules.push(i); } - rules[i].shift(); + rule.shift(); + m = rule[0]; } else { // Add to explicit start conditions - conditions = rules[i].shift(); - for (k=0;k= 0); + + se = '[' + se + ']'; + + if (!has_expansions && se.length < s1.length) { + s1 = se; + } + rv.push(s1); + break; + + // XRegExp Unicode escape, e.g. `\\p{Number}`: + case '\\p': + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Either a range expression or the start of a macro reference: `.{1,3}` or `{NAME}`. + // Treat it as a macro reference and see if it will expand to anything: + case '{': + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + var c3 = s[0]; + s = s.substr(c3.length); + if (c3 === '}') { + // possibly a macro name in there... 
Expand if possible: + c2 = c1 + c2 + c3; + if (expandAllMacrosElsewhere_cb) { + c2 = expandAllMacrosElsewhere_cb(c2); + assert(c2); + if (c2 instanceof Error) { + return new Error(errinfo() + ': ' + c2.message); + } + } + } else { + // not a well-terminated macro reference or something completely different: + // we do not even attempt to expand this as there's guaranteed nothing to expand + // in this bit. + c2 = c1 + c2 + c3; + } + rv.push(c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + + // Recognize some other regex elements, but there's no need to understand them all. + // + // We are merely interested in any chunks now which do *not* include yet another regex set `[...]` + // nor any `{MACRO}` reference: + default: + // non-set character or word: see how much of this there is for us and then see if there + // are any macros still lurking inside there: + c2 = s.match(NOTHING_SPECIAL_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + + // nothing to expand. + rv.push(c1 + c2); + } else { + // nothing to stretch this match, hence nothing to expand. + rv.push(c1); + } + break; + } + } + + s = rv.join(''); + + // When this result is suitable for use in a set, then we should be able to compile + // it as a regex; that way we can easily validate whether macro X is fit to be used + // inside a regex set: + try { + var re; + re = new XRegExp(s); + re.test(s[0]); + } catch (ex) { + // make sure we produce a regex expression which will fail badly when it is used + // in actual code: + return new Error(errinfo() + ': expands to an invalid regex: /' + s + '/'); + } + + assert(s); + return s; +} + + +// expand macros within macros and cache the result +function prepareMacros(dict_macros, opts) { + var macros = {}; + + // expand a `{NAME}` macro which exists inside a `[...]` set: + function expandMacroInSet(i) { + var k, a, m; + if (!macros[i]) { + m = dict_macros[i]; + + if (m.indexOf('{') >= 0) { + // set up our own record so we can detect definition loops: + macros[i] = { + in_set: false, + elsewhere: null, + raw: dict_macros[i] + }; + + for (k in dict_macros) { + if (dict_macros.hasOwnProperty(k) && i !== k) { + // it doesn't matter if the lexer recognized that the inner macro(s) + // were sitting inside a `[...]` set or not: the fact that they are used + // here in macro `i` which itself sits in a set, makes them *all* live in + // a set so all of them get the same treatment: set expansion style. + // + // Note: make sure we don't try to expand any XRegExp `\p{...}` or `\P{...}` + // macros here: + if (XRegExp._getUnicodeProperty(k)) { + // Work-around so that you can use `\p{ascii}` for an XRegExp slug, a.k.a. + // Unicode 'General Category' Property cf. http://unicode.org/reports/tr18/#Categories, + // while using `\p{ASCII}` as a *macro expansion* of the `ASCII` + // macro: + if (k.toUpperCase() !== k) { + m = new Error('Cannot use name "' + k + '" as a macro name as it clashes with the same XRegExp "\\p{..}" Unicode \'General Category\' Property name. Use all-uppercase macro names, e.g. name your macro "' + k.toUpperCase() + '" to work around this issue or give your offending macro a different name.'); + break; + } + } + + a = m.split('{' + k + '}'); + if (a.length > 1) { + var x = expandMacroInSet(k); + assert(x); + if (x instanceof Error) { + m = x; + break; + } + m = a.join(x); + } + } + } + } + + var mba = setmgmt.reduceRegexToSetBitArray(m, i, opts); + + var s1; + + // propagate deferred exceptions = error reports.
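// Illustrative sketch (editorial example, not from the original source): set-style
// expansion reduces a macro to its bare set content, so that a hypothetical macro
// `DIGIT` defined as `[0-9]` lets the rule fragment `[{DIGIT}a-f]` expand into the
// single, valid set `[0-9a-f]`.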
+ if (mba instanceof Error) { + s1 = mba; + } else { + s1 = setmgmt.bitarray2set(mba, false); + + m = s1; + } + + macros[i] = { + in_set: s1, + elsewhere: null, + raw: dict_macros[i] + }; + } else { + m = macros[i].in_set; + + if (m instanceof Error) { + // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + return new Error(m.message); + } + + // detect definition loop: + if (m === false) { + return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.'); + } + } + + return m; + } + + function expandMacroElsewhere(i) { + var k, a, m; + + if (macros[i].elsewhere == null) { + m = dict_macros[i]; + + // set up our own record so we can detect definition loops: + macros[i].elsewhere = false; + + // the macro MAY contain other macros which MAY be inside a `[...]` set in this + // macro or elsewhere, hence we must parse the regex: + m = reduceRegex(m, i, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (m instanceof Error) { + return m; + } + + macros[i].elsewhere = m; + } else { + m = macros[i].elsewhere; + + if (m instanceof Error) { + // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + return m; + } + + // detect definition loop: + if (m === false) { + return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.'); + } + } + + return m; + } + + function expandAllMacrosInSet(s) { + var i, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroInSet(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in set [' + s + ']: ' + x.message); + } + s = a.join(x); + } + + // stop the brute-force expansion attempt when we've done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, x; + + // When we process the remaining macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'd + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15!
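// Illustrative sketch (hypothetical macros, not from any real grammar): given
//
//     DIGIT    [0-9]
//     NUM      {DIGIT}+
//
// a lexer rule regex `{NUM}` expands to `((?:[0-9])+)`: the macro used directly
// in the rule becomes capture group 1, while the `DIGIT` submacro is wrapped in
// a non-capturing `(?:...)` group, keeping backreference indexes stable.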
+ if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + // These are all submacro expansions, hence non-capturing grouping is applied: + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = expandMacroElsewhere(i); + assert(x); + if (x instanceof Error) { + return new Error('failure to expand the macro [' + i + '] in regex /' + s + '/: ' + x.message); + } + s = a.join('(?:' + x + ')'); + } + + // stop the brute-force expansion attempt when we've done 'em all: + if (s.indexOf('{') === -1) { + break; + } } } } + + return s; } + + + var m, i; + + if (opts.debug) console.log('\n############## RAW macros: ', dict_macros); + + // first we create the part of the dictionary which is targeting the use of macros + // *inside* `[...]` sets; once we have completed that half of the expansions work, + // we then go and expand the macros for when they are used elsewhere in a regex: + // iff we then encounter submacros which are used *inside* a set, we can use that + // first half dictionary to speed things up a bit as we can use those expansions + // straight away! + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroInSet(i); + } + } + + for (i in dict_macros) { + if (dict_macros.hasOwnProperty(i)) { + expandMacroElsewhere(i); + } + } + + if (opts.debug) console.log('\n############### expanded macros: ', macros); + return macros; } -function prepareStartConditions (conditions) { + + +// expand macros in a regex; expands them recursively +function expandMacros(src, macros, opts) { + var expansion_count = 0; + + // By the time we call this function `expandMacros` we MUST have expanded and cached all macros already! + // Hence things should be easy in there: + + function expandAllMacrosInSet(s) { + var i, m, x; + + // process *all* the macros inside [...] set: + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + x = m.in_set; + + assert(x); + if (x instanceof Error) { + // this turns out to be a macro with 'issues' and it is used, so the 'issues' do matter: bombs away! + throw x; + } + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e. its definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join(x); + expansion_count++; + } + + // stop the brute-force expansion attempt when we've done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + function expandAllMacrosElsewhere(s) { + var i, m, x; + + // When we process the main macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will expand any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'd + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + if (s.indexOf('{') >= 0) { + for (i in macros) { + if (macros.hasOwnProperty(i)) { + m = macros[i]; + + var a = s.split('{' + i + '}'); + if (a.length > 1) { + // These are all main macro expansions, hence CAPTURING grouping is applied: + x = m.elsewhere; + assert(x); + + // detect definition loop: + if (x === false) { + return new Error('Macro name "' + i + '" has an illegal, looping definition, i.e.
its definition references itself, either directly or indirectly, via other macros.'); + } + + s = a.join('(' + x + ')'); + expansion_count++; + } + + // stop the brute-force expansion attempt when we've done 'em all: + if (s.indexOf('{') === -1) { + break; + } + } + } + } + + return s; + } + + + // When we process the macro occurrences in the regex + // every macro used in a lexer rule will become its own capture group. + // + // Meanwhile the cached expansion will have expanded any submacros into + // *NON*-capturing groups so that the backreference indexes remain as you'd + // expect and using macros doesn't require you to know exactly what your + // used macro will expand into, i.e. which and how many submacros it has. + // + // This is a BREAKING CHANGE from vanilla jison 0.4.15! + var s2 = reduceRegex(src, null, opts, expandAllMacrosInSet, expandAllMacrosElsewhere); + // propagate deferred exceptions = error reports. + if (s2 instanceof Error) { + throw s2; + } + + // only when we did expand some actual macros do we take the re-interpreted/optimized/regenerated regex from reduceRegex() + // in order to keep our test cases simple and rules recognizable. This assumes the user can code good regexes on his own, + // as long as no macros are involved... + // + // Also pick the reduced regex when there (potentially) are XRegExp extensions in the original, e.g. `\\p{Number}`, + // unless the `xregexp` output option has been enabled. + if (expansion_count > 0 || (src.indexOf('\\p{') >= 0 && !opts.options.xregexp)) { + src = s2; + } else { + // Check if the reduced regex is smaller in size; when it is, we still go with the new one! + if (s2.length < src.length) { + src = s2; + } + } + + return src; +} + +function prepareStartConditions(conditions) { var sc, hash = {}; - for (sc in conditions) if (conditions.hasOwnProperty(sc)) { - hash[sc] = {rules:[],inclusive:!!!conditions[sc]}; + for (sc in conditions) { + if (conditions.hasOwnProperty(sc)) { + hash[sc] = {rules:[], inclusive: !conditions[sc]}; + } } return hash; } -function buildActions (dict, tokens) { - var actions = [dict.actionInclude || '', "var YYSTATE=YY_START;"]; +function buildActions(dict, tokens, opts) { + var actions = [dict.actionInclude || '', 'var YYSTATE = YY_START;']; var tok; var toks = {}; + var caseHelper = []; + // tokens: map/array of token numbers to token names for (tok in tokens) { - toks[tokens[tok]] = tok; + var idx = parseInt(tok); + if (idx && idx > 0) { + toks[tokens[tok]] = idx; + } } - if (dict.options && dict.options.flex) { - dict.rules.push([".", "console.log(yytext);"]); + if (opts.options.flex && dict.rules) { + dict.rules.push(['.', 'console.log("", yytext); /* \`flex\` lexing mode: the last resort rule!
*/']); } - this.rules = prepareRules(dict.rules, dict.macros, actions, tokens && toks, this.conditions, this.options["case-insensitive"]); - var fun = actions.join("\n"); - "yytext yyleng yylineno yylloc".split(' ').forEach(function (yy) { - fun = fun.replace(new RegExp("\\b(" + yy + ")\\b", "g"), "yy_.$1"); + var gen = prepareRules(dict, actions, caseHelper, tokens && toks, opts.conditions, opts); + + var fun = actions.join('\n'); + 'yytext yyleng yylineno yylloc yyerror'.split(' ').forEach(function (yy) { + fun = fun.replace(new RegExp('\\b(' + yy + ')\\b', 'g'), 'yy_.$1'); }); - return "function anonymous(yy,yy_,$avoiding_name_collisions,YY_START) {" + fun + "\n}"; + return { + caseHelperInclude: '{\n' + caseHelper.join(',') + '\n}', + + actions: `function lexer__performAction(yy, yyrulenumber, YY_START) { + var yy_ = this; + + ${fun} + }`, + + rules: gen.rules, + macros: gen.macros, // propagate these for debugging/diagnostic purposes + + regular_rule_count: gen.regular_rule_count, + simple_rule_count: gen.simple_rule_count, + }; } -function RegExpLexer (dict, input, tokens) { - var opts = processGrammar(dict, tokens); - var source = generateModuleBody(opts); - var lexer = eval(source); +// +// NOTE: this is *almost* a copy of the JisonParserError producing code in +// jison/lib/jison.js @ line 2304:lrGeneratorMixin.generateErrorClass +// +function generateErrorClass() { + // --- START lexer error class --- + +var prelude = `/** + * See also: + * http://stackoverflow.com/questions/1382107/whats-a-good-way-to-extend-error-in-javascript/#35881508 + * but we keep the prototype.constructor and prototype.name assignment lines too for compatibility + * with userland code which might access the derived class in a 'classic' way. + * + * @public + * @constructor + * @nocollapse + */ +function JisonLexerError(msg, hash) { + Object.defineProperty(this, 'name', { + enumerable: false, + writable: false, + value: 'JisonLexerError' + }); + + if (msg == null) msg = '???'; - lexer.yy = {}; - if (input) { - lexer.setInput(input); + Object.defineProperty(this, 'message', { + enumerable: false, + writable: true, + value: msg + }); + + this.hash = hash; + + var stacktrace; + if (hash && hash.exception instanceof Error) { + var ex2 = hash.exception; + this.message = ex2.message || msg; + stacktrace = ex2.stack; + } + if (!stacktrace) { + if (Error.hasOwnProperty('captureStackTrace')) { // V8 + Error.captureStackTrace(this, this.constructor); + } else { + stacktrace = (new Error(msg)).stack; + } } + if (stacktrace) { + Object.defineProperty(this, 'stack', { + enumerable: false, + writable: false, + value: stacktrace + }); + } +} + +if (typeof Object.setPrototypeOf === 'function') { + Object.setPrototypeOf(JisonLexerError.prototype, Error.prototype); +} else { + JisonLexerError.prototype = Object.create(Error.prototype); +} +JisonLexerError.prototype.constructor = JisonLexerError; +JisonLexerError.prototype.name = 'JisonLexerError';`; + + // --- END lexer error class --- + + return prelude; +} + + +const jisonLexerErrorDefinition = generateErrorClass(); - lexer.generate = function () { return generateFromOpts(opts); }; - lexer.generateModule = function () { return generateModule(opts); }; - lexer.generateCommonJSModule = function () { return generateCommonJSModule(opts); }; - lexer.generateAMDModule = function () { return generateAMDModule(opts); }; + +function generateFakeXRegExpClassSrcCode() { + return rmCommonWS` + var __hacky_counter__ = 0; + + /** + * @constructor + * @nocollapse + */ + function 
XRegExp(re, f) { + this.re = re; + this.flags = f; + this._getUnicodeProperty = function (k) {}; + var fake = /./; // WARNING: this exact 'fake' is also depended upon by the xregexp unit test! + __hacky_counter__++; + fake.__hacky_backy__ = __hacky_counter__; + return fake; + } + `; +} + + + +/** @constructor */ +function RegExpLexer(dict, input, tokens, build_options) { + var opts; + var dump = false; + + function test_me(tweak_cb, description, src_exception, ex_callback) { + opts = processGrammar(dict, tokens, build_options); + opts.__in_rules_failure_analysis_mode__ = false; + prepExportStructures(opts); + assert(opts.options); + if (tweak_cb) { + tweak_cb(); + } + var source = generateModuleBody(opts); + try { + // The generated code will always have the `lexer` variable declared at local scope + // as `eval()` will use the local scope. + // + // The compiled code will look something like this: + // + // ``` + // var lexer; + // bla bla... + // ``` + // + // or + // + // ``` + // var lexer = { bla... }; + // ``` + var testcode = [ + '// provide a local version for test purposes:', + jisonLexerErrorDefinition, + '', + generateFakeXRegExpClassSrcCode(), + '', + source, + '', + 'return lexer;'].join('\n'); + var lexer = code_exec(testcode, function generated_code_exec_wrapper_regexp_lexer(sourcecode) { + //console.log("===============================LEXER TEST CODE\n", sourcecode, "\n=====================END====================\n"); + var lexer_f = new Function('', sourcecode); + return lexer_f(); + }, opts.options, "lexer"); + + if (!lexer) { + throw new Error('no lexer defined *at all*?!'); + } + if (typeof lexer.options !== 'object' || lexer.options == null) { + throw new Error('your lexer class MUST have an .options member object or it won\'t fly!'); + } + if (typeof lexer.setInput !== 'function') { + throw new Error('your lexer class MUST have a .setInput function member or it won\'t fly!'); + } + if (lexer.EOF !== 1 && lexer.ERROR !== 2) { + throw new Error('your lexer class MUST have these constants defined: lexer.EOF = 1 and lexer.ERROR = 2 or it won\'t fly!'); + } + + // When we do NOT crash, we found/killed the problem area just before this call! + if (src_exception && description) { + src_exception.message += '\n (' + description + ')'; + } + + // patch the pre and post handlers in there, now that we have some live code to work with: + if (opts.options) { + var pre = opts.options.pre_lex; + var post = opts.options.post_lex; + // since JSON cannot encode functions, we'll have to do it manually now: + if (typeof pre === 'function') { + lexer.options.pre_lex = pre; + } + if (typeof post === 'function') { + lexer.options.post_lex = post; + } + } + + if (opts.options.showSource) { + if (typeof opts.options.showSource === 'function') { + opts.options.showSource(lexer, source, opts); + } else { + console.log("\nGenerated lexer sourcecode:\n----------------------------------------\n", source, "\n----------------------------------------\n"); + } + } + return lexer; + } catch (ex) { + // if (src_exception) { + // src_exception.message += '\n (' + description + ': ' + ex.message + ')'; + // } + + if (ex_callback) { + ex_callback(ex); + } else if (dump) { + console.log('source code:\n', source); + } + return false; + } + } + + /** @constructor */ + var lexer = test_me(null, null, null, function (ex) { + // When we get an exception here, it means some part of the user-specified lexer is botched. 
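// (For example, an illustrative summary of the probes below: the culprit may be a
// missing XRegExp library import while `%option xregexp` is set, a botched lexer
// start condition name, or a syntax error inside one of the rules' action code
// blocks.)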
+ // + // Now we go and try to narrow down the problem area/category: + assert(opts.options); + assert(opts.options.xregexp !== undefined); + var orig_xregexp_opt = !!opts.options.xregexp; + if (!test_me(function () { + assert(opts.options.xregexp !== undefined); + opts.options.xregexp = false; + opts.showSource = false; + }, 'When you have specified %option xregexp, you must also properly IMPORT the XRegExp library in the generated lexer.', ex, null)) { + if (!test_me(function () { + // restore xregexp option setting: the trouble wasn't caused by the xregexp flag i.c.w. incorrect XRegExp library importing! + opts.options.xregexp = orig_xregexp_opt; + + opts.conditions = []; + opts.showSource = false; + }, ((dict.rules && dict.rules.length > 0) ? + 'One or more of your lexer state names are possibly botched?' : + 'Your custom lexer is somehow botched.'), ex, null)) { + if (!test_me(function () { + // opts.conditions = []; + opts.rules = []; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + }, 'One or more of your lexer rules are possibly botched?', ex, null)) { + // kill each rule action block, one at a time and test again after each 'edit': + var rv = false; + for (var i = 0, len = (dict.rules ? dict.rules.length : 0); i < len; i++) { + dict.rules[i][1] = '{ /* nada */ }'; + rv = test_me(function () { + // opts.conditions = []; + // opts.rules = []; + // opts.__in_rules_failure_analysis_mode__ = true; + }, 'Your lexer rule "' + dict.rules[i][0] + '" action code block is botched?', ex, null); + if (rv) { + break; + } + } + if (!rv) { + test_me(function () { + opts.conditions = []; + opts.rules = []; + opts.performAction = 'null'; + // opts.options = {}; + // opts.caseHelperInclude = '{}'; + opts.showSource = false; + opts.__in_rules_failure_analysis_mode__ = true; + + dump = false; + }, 'One or more of your lexer rule action code block(s) are possibly botched?', ex, null); + } + } + } + } + throw ex; + }); + + lexer.setInput(input); + + /** @public */ + lexer.generate = function () { + return generateFromOpts(opts); + }; + /** @public */ + lexer.generateModule = function () { + return generateModule(opts); + }; + /** @public */ + lexer.generateCommonJSModule = function () { + return generateCommonJSModule(opts); + }; + /** @public */ + lexer.generateESModule = function () { + return generateESModule(opts); + }; + /** @public */ + lexer.generateAMDModule = function () { + return generateAMDModule(opts); + }; + + // internal APIs to aid testing: + /** @public */ + lexer.getExpandedMacros = function () { + return opts.macros; + }; return lexer; } -RegExpLexer.prototype = { +// code stripping performance test for very simple grammar: +// +// - removing backtracking parser code branches: 730K -> 750K rounds +// - removing all location info tracking: yylineno, yylloc, etc.: 750K -> 900K rounds +// - no `yyleng`: 900K -> 905K rounds +// - no `this.done` as we cannot have a NULL `_input` anymore: 905K -> 930K rounds +// - `simpleCaseActionClusters` as array instead of hash object: 930K -> 940K rounds +// - lexers which have only return stmts, i.e. only a +// `simpleCaseActionClusters` lookup table to produce +// lexer tokens: *inline* the `performAction` call: 940K -> 950K rounds +// - given all the above, you can *inline* what's left of +// `lexer_next()`: 950K -> 955K rounds (? this stuff becomes hard to measure; inaccuracy abounds!) 
+// +// Total gain when we forget about very minor (and tough to nail) *inlining* `lexer_next()` gains: +// +// 730 -> 950 ~ 30% performance gain. +// + +// As a function can be reproduced in source-code form by any JavaScript engine, we're going to wrap this chunk +// of code in a function so that we can easily get it including it comments, etc.: +/** +@public +@nocollapse +*/ +function getRegExpLexerPrototype() { + // --- START lexer kernel --- +return `{ EOF: 1, - parseError: function parseError(str, hash) { - if (this.yy.parser) { - this.yy.parser.parseError(str, hash); - } else { - throw new Error(str); + ERROR: 2, + + // JisonLexerError: JisonLexerError, /// <-- injected by the code generator + + // options: {}, /// <-- injected by the code generator + + // yy: ..., /// <-- injected by setInput() + + __currentRuleSet__: null, /// INTERNAL USE ONLY: internal rule set cache for the current lexer state + + __error_infos: [], /// INTERNAL USE ONLY: the set of lexErrorInfo objects created since the last cleanup + + __decompressed: false, /// INTERNAL USE ONLY: mark whether the lexer instance has been 'unfolded' completely and is now ready for use + + done: false, /// INTERNAL USE ONLY + _backtrack: false, /// INTERNAL USE ONLY + _input: '', /// INTERNAL USE ONLY + _more: false, /// INTERNAL USE ONLY + _signaled_error_token: false, /// INTERNAL USE ONLY + + conditionStack: [], /// INTERNAL USE ONLY; managed via \`pushState()\`, \`popState()\`, \`topState()\` and \`stateStackSize()\` + + match: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction. \`match\` is identical to \`yytext\` except that this one still contains the matched input string after \`lexer.performAction()\` has been invoked, where userland code MAY have changed/replaced the \`yytext\` value entirely! + matched: '', /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks entire input which has been matched so far + matches: false, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks RE match result for last (successful) match attempt + yytext: '', /// ADVANCED USE ONLY: tracks input which has been matched so far for the lexer token under construction; this value is transferred to the parser as the 'token value' when the parser consumes the lexer token produced through a call to the \`lex()\` API. + offset: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks the 'cursor position' in the input string, i.e. the number of characters matched so far + yyleng: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: length of matched input for the token under construction (\`yytext\`) + yylineno: 0, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: 'line number' at which the token under construction is located + yylloc: null, /// READ-ONLY EXTERNAL ACCESS - ADVANCED USE ONLY: tracks location info (lines + columns) for the token under construction + + /** + * INTERNAL USE: construct a suitable error info hash object instance for \`parseError\`. 
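 * (Editorial sketch of the hash constructed below, for orientation:
 * \`{ errStr, recoverable, text, token, line, loc, yy, lexer, destroy() }\`.)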
+ * + * @public + * @this {RegExpLexer} + */ + constructLexErrorInfo: function lexer_constructLexErrorInfo(msg, recoverable, show_input_position) { + msg = '' + msg; + + // heuristic to determine if the error message already contains a (partial) source code dump + // as produced by either \`showPosition()\` or \`prettyPrintRange()\`: + if (show_input_position == undefined) { + show_input_position = !(msg.indexOf('\\n') > 0 && msg.indexOf('^') > 0); + } + if (this.yylloc && show_input_position) { + if (typeof this.prettyPrintRange === 'function') { + var pretty_src = this.prettyPrintRange(this.yylloc); + + if (!/\\n\\s*$/.test(msg)) { + msg += '\\n'; + } + msg += '\\n Erroneous area:\\n' + this.prettyPrintRange(this.yylloc); + } else if (typeof this.showPosition === 'function') { + var pos_str = this.showPosition(); + if (pos_str) { + if (msg.length && msg[msg.length - 1] !== '\\n' && pos_str[0] !== '\\n') { + msg += '\\n' + pos_str; + } else { + msg += pos_str; + } + } + } + } + /** @constructor */ + var pei = { + errStr: msg, + recoverable: !!recoverable, + text: this.match, // This one MAY be empty; userland code should use the \`upcomingInput\` API to obtain more text which follows the 'lexer cursor position'... + token: null, + line: this.yylineno, + loc: this.yylloc, + yy: this.yy, + lexer: this, + + /** + * and make sure the error info doesn't stay due to potential + * ref cycle via userland code manipulations. + * These would otherwise all be memory leak opportunities! + * + * Note that only array and object references are nuked as those + * constitute the set of elements which can produce a cyclic ref. + * The rest of the members is kept intact as they are harmless. + * + * @public + * @this {LexErrorInfo} + */ + destroy: function destructLexErrorInfo() { + // remove cyclic references added to error info: + // info.yy = null; + // info.lexer = null; + // ... + var rec = !!this.recoverable; + for (var key in this) { + if (this.hasOwnProperty(key) && typeof key === 'object') { + this[key] = undefined; + } + } + this.recoverable = rec; + } + }; + // track this instance so we can \`destroy()\` it once we deem it superfluous and ready for garbage collection! + this.__error_infos.push(pei); + return pei; + }, + + /** + * handler which is invoked when a lexer error occurs. + * + * @public + * @this {RegExpLexer} + */ + parseError: function lexer_parseError(str, hash, ExceptionClass) { + if (!ExceptionClass) { + ExceptionClass = this.JisonLexerError; + } + if (this.yy) { + if (this.yy.parser && typeof this.yy.parser.parseError === 'function') { + return this.yy.parser.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } else if (typeof this.yy.parseError === 'function') { + return this.yy.parseError.call(this, str, hash, ExceptionClass) || this.ERROR; + } + } + throw new ExceptionClass(str, hash); + }, + + /** + * method which implements \`yyerror(str, ...args)\` functionality for use inside lexer actions. 
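 * E.g. (illustrative usage): a rule's action code may call
 * \`yyerror('unterminated string constant')\`; any extra arguments passed after
 * the message string are delivered in the error hash as \`extra_error_attributes\`.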
+ * + * @public + * @this {RegExpLexer} + */ + yyerror: function yyError(str /*, ...args */) { + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': ' + str, this.options.lexerErrorsAreRecoverable); + + // Add any extra args to the hash under the name \`extra_error_attributes\`: + var args = Array.prototype.slice.call(arguments, 1); + if (args.length) { + p.extra_error_attributes = args; } + + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + }, + + /** + * final cleanup function for when we have completed lexing the input; + * make it an API so that external code can use this one once userland + * code has decided it's time to destroy any lingering lexer error + * hash object instances and the like: this function helps to clean + * up these constructs, which *may* carry cyclic references which would + * otherwise prevent the instances from being properly and timely + * garbage-collected, i.e. this function helps prevent memory leaks! + * + * @public + * @this {RegExpLexer} + */ + cleanupAfterLex: function lexer_cleanupAfterLex(do_not_nuke_errorinfos) { + // prevent lingering circular references from causing memory leaks: + this.setInput('', {}); + + // nuke the error hash info instances created during this run. + // Userland code must COPY any data/references + // in the error hash instance(s) it is more permanently interested in. + if (!do_not_nuke_errorinfos) { + for (var i = this.__error_infos.length - 1; i >= 0; i--) { + var el = this.__error_infos[i]; + if (el && typeof el.destroy === 'function') { + el.destroy(); + } + } + this.__error_infos.length = 0; + } + + return this; }, - // resets the lexer, sets new input - setInput: function (input, yy) { + /** + * clear the lexer token context; intended for internal use only + * + * @public + * @this {RegExpLexer} + */ + clear: function lexer_clear() { + this.yytext = ''; + this.yyleng = 0; + this.match = ''; + // - DO NOT reset \`this.matched\` + this.matches = false; + this._more = false; + this._backtrack = false; + + var col = (this.yylloc ? this.yylloc.last_column : 0); + this.yylloc = { + first_line: this.yylineno + 1, + first_column: col, + last_line: this.yylineno + 1, + last_column: col, + + range: [this.offset, this.offset] + }; + }, + + /** + * resets the lexer, sets new input + * + * @public + * @this {RegExpLexer} + */ + setInput: function lexer_setInput(input, yy) { this.yy = yy || this.yy || {}; - this._input = input; - this._more = this._backtrack = this.done = false; - this.yylineno = this.yyleng = 0; - this.yytext = this.matched = this.match = ''; + + // also check if we've fully initialized the lexer instance, + // including expansion work to be done to go from a loaded + // lexer to a usable lexer: + if (!this.__decompressed) { + // step 1: decompress the regex list: + var rules = this.rules; + for (var i = 0, len = rules.length; i < len; i++) { + var rule_re = rules[i]; + + // compression: is the RE an xref to another RE slot in the rules[] table? 
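 // E.g. (illustrative sketch): a compressed table \`[/\\s+/, /\\d+/, 1]\` decompresses
 // to \`[/\\s+/, /\\d+/, /\\d+/]\`: slot 2 is replaced by the regex it cross-references
 // in slot 1.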
+ if (typeof rule_re === 'number') { + rules[i] = rules[rule_re]; + } + } + + // step 2: unfold the conditions[] set to make these ready for use: + var conditions = this.conditions; + for (var k in conditions) { + var spec = conditions[k]; + + var rule_ids = spec.rules; + + var len = rule_ids.length; + var rule_regexes = new Array(len + 1); // slot 0 is unused; we use a 1-based index approach here to keep the hottest code in \`lexer_next()\` fast and simple! + var rule_new_ids = new Array(len + 1); + + for (var i = 0; i < len; i++) { + var idx = rule_ids[i]; + var rule_re = rules[idx]; + rule_regexes[i + 1] = rule_re; + rule_new_ids[i + 1] = idx; + } + + spec.rules = rule_new_ids; + spec.__rule_regexes = rule_regexes; + spec.__rule_count = len; + } + + this.__decompressed = true; + } + + this._input = input || ''; + this.clear(); + this._signaled_error_token = false; + this.done = false; + this.yylineno = 0; + this.matched = ''; this.conditionStack = ['INITIAL']; + this.__currentRuleSet__ = null; this.yylloc = { first_line: 1, first_column: 0, last_line: 1, - last_column: 0 + last_column: 0, + + range: [0, 0] }; - if (this.options.ranges) { - this.yylloc.range = [0,0]; - } this.offset = 0; return this; }, - // consumes and returns one char from the input - input: function () { + /** + * edit the remaining input via user-specified callback. + * This can be used to forward-adjust the input-to-parse, + * e.g. inserting macro expansions and the like into the + * input which has yet to be lexed. + * The behaviour of this API contrasts with the \`unput()\` et al. + * APIs as those act on the *consumed* input, while this + * one allows one to manipulate the future, without impacting + * the current \`yyloc\` cursor location or any history. + * + * Use this API to help implement C-preprocessor-like + * \`#include\` statements, etc. + * + * The provided callback must be synchronous and is + * expected to return the edited input (string). + * + * The \`cpsArg\` argument value is passed to the callback + * as-is. + * + * \`callback\` interface: + * \`function callback(input, cpsArg)\` + * + * - \`input\` will carry the remaining-input-to-lex string + * from the lexer. + * - \`cpsArg\` is \`cpsArg\` passed into this API. + * + * The \`this\` reference for the callback will be set to + * reference this lexer instance so that userland code + * in the callback can easily and quickly access any lexer + * API. + * + * When the callback returns a non-string-type falsey value, + * we assume the callback did not edit the input and we + * will use the input as-is. + * + * When the callback returns a truthy non-string-type value, it + * is converted to a string for lexing via the \`"" + retval\` + * operation. (See also why: http://2ality.com/2012/03/converting-to-string.html + * -- that way any returned object's \`toValue()\` and \`toString()\` + * methods will be invoked in a proper/desirable order.) + * + * @public + * @this {RegExpLexer} + */ + editRemainingInput: function lexer_editRemainingInput(callback, cpsArg) { + var rv = callback.call(this, this._input, cpsArg); + if (typeof rv !== 'string') { + if (rv) { + this._input = '' + rv; + } + // else: keep \`this._input\` as is.
+ } else { + this._input = rv; + } + return this; + }, + + /** + * consumes and returns one char from the input + * + * @public + * @this {RegExpLexer} + */ + input: function lexer_input() { + if (!this._input) { + //this.done = true; -- don't set \`done\` as we want the lex()/next() API to be able to produce one custom EOF token match after this anyhow. (lexer can match special <> tokens and perform user action code for a <> match, but only does so *once*) + return null; + } var ch = this._input[0]; this.yytext += ch; this.yyleng++; this.offset++; this.match += ch; this.matched += ch; - var lines = ch.match(/(?:\r\n?|\n).*/g); + // Count the linenumber up when we hit the LF (or a stand-alone CR). + // On CRLF, the linenumber is incremented when you fetch the CR or the CRLF combo + // and we advance immediately past the LF as well, returning both together as if + // it was all a single 'character' only. + var slice_len = 1; + var lines = false; + if (ch === '\\n') { + lines = true; + } else if (ch === '\\r') { + lines = true; + var ch2 = this._input[1]; + if (ch2 === '\\n') { + slice_len++; + ch += ch2; + this.yytext += ch2; + this.yyleng++; + this.offset++; + this.match += ch2; + this.matched += ch2; + this.yylloc.range[1]++; + } + } if (lines) { this.yylineno++; this.yylloc.last_line++; + this.yylloc.last_column = 0; } else { this.yylloc.last_column++; } - if (this.options.ranges) { - this.yylloc.range[1]++; - } + this.yylloc.range[1]++; - this._input = this._input.slice(1); + this._input = this._input.slice(slice_len); return ch; }, - // unshifts one char (or a string) into the input - unput: function (ch) { + /** + * unshifts one char (or an entire string) into the input + * + * @public + * @this {RegExpLexer} + */ + unput: function lexer_unput(ch) { var len = ch.length; - var lines = ch.split(/(?:\r\n?|\n)/g); + var lines = ch.split(/(?:\\r\\n?|\\n)/g); this._input = ch + this._input; this.yytext = this.yytext.substr(0, this.yytext.length - len); - //this.yyleng -= len; + this.yyleng = this.yytext.length; this.offset -= len; - var oldLines = this.match.split(/(?:\r\n?|\n)/g); - this.match = this.match.substr(0, this.match.length - 1); - this.matched = this.matched.substr(0, this.matched.length - 1); + this.match = this.match.substr(0, this.match.length - len); + this.matched = this.matched.substr(0, this.matched.length - len); - if (lines.length - 1) { + if (lines.length > 1) { this.yylineno -= lines.length - 1; - } - var r = this.yylloc.range; - this.yylloc = { - first_line: this.yylloc.first_line, - last_line: this.yylineno + 1, - first_column: this.yylloc.first_column, - last_column: lines ? - (lines.length === oldLines.length ? this.yylloc.first_column : 0) - + oldLines[oldLines.length - lines.length].length - lines[0].length : - this.yylloc.first_column - len - }; + this.yylloc.last_line = this.yylineno + 1; - if (this.options.ranges) { - this.yylloc.range = [r[0], r[0] + this.yyleng - len]; + // Get last entirely matched line into the \`pre_lines[]\` array's + // last index slot; we don't mind when other previously + // matched lines end up in the array too. 
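 // E.g. (illustrative sketch): when the text matched so far is 'ab\\ncd' and the
 // action code calls \`unput('\\ncd')\`, \`yylineno\` is rolled back by one and the
 // last fully matched line is 'ab', hence \`last_column\` is recomputed as 2.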
+ var pre = this.match; + var pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + if (pre_lines.length === 1) { + pre = this.matched; + pre_lines = pre.split(/(?:\\r\\n?|\\n)/g); + } + this.yylloc.last_column = pre_lines[pre_lines.length - 1].length; + } else { + this.yylloc.last_column -= len; } - this.yyleng = this.yytext.length; + + this.yylloc.range[1] = this.yylloc.range[0] + this.yyleng; + + this.done = false; return this; }, - // When called from action, caches matched text and appends it on next action - more: function () { + /** + * cache matched text and append it on next action + * + * @public + * @this {RegExpLexer} + */ + more: function lexer_more() { this._more = true; return this; }, - // When called from action, signals the lexer that this rule fails to match the input, so the next matching rule (regex) should be tested instead. - reject: function () { + /** + * signal the lexer that this rule fails to match the input, so the + * next matching rule (regex) should be tested instead. + * + * @public + * @this {RegExpLexer} + */ + reject: function lexer_reject() { if (this.options.backtrack_lexer) { this._backtrack = true; } else { - return this.parseError('Lexical error on line ' + (this.yylineno + 1) + '. You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).\n' + this.showPosition(), { - text: "", - token: null, - line: this.yylineno - }); - + // when the \`parseError()\` call returns, we MUST ensure that the error is registered. + // We accomplish this by signaling an 'error' token to be produced for the current + // \`.lex()\` run. + var lineno_msg = ''; + if (this.yylloc) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).', false); + this._signaled_error_token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); } return this; }, - // retain first n characters of the match - less: function (n) { - this.unput(this.match.slice(n)); + /** + * retain first n characters of the match + * + * @public + * @this {RegExpLexer} + */ + less: function lexer_less(n) { + return this.unput(this.match.slice(n)); }, - // displays already matched input, i.e. for error messages - pastInput: function () { - var past = this.matched.substr(0, this.matched.length - this.match.length); - return (past.length > 20 ? '...':'') + past.substr(-20).replace(/\n/g, ""); + /** + * return (part of the) already matched input, i.e. for error + * messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of + * input (default: 1). + * + * Negative limit values equal *unlimited*. + * + * @public + * @this {RegExpLexer} + */ + pastInput: function lexer_pastInput(maxSize, maxLines) { + var past = this.matched.substring(0, this.matched.length - this.match.length); + if (maxSize < 0) + maxSize = past.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = past.length; // can't ever have more input lines than this! 
+ else if (!maxLines) + maxLines = 1; + // \`substr\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + past = past.substr(-maxSize * 2 - 2); + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = past.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(-maxLines); + past = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis prefix... + if (past.length > maxSize) { + past = '...' + past.substr(-maxSize); + } + return past; }, - // displays upcoming input, i.e. for error messages - upcomingInput: function () { + /** + * return (part of the) upcoming input, i.e. for error messages. + * + * Limit the returned string length to \`maxSize\` (default: 20). + * + * Limit the returned string to the \`maxLines\` number of lines of input (default: 1). + * + * Negative limit values equal *unlimited*. + * + * > ### NOTE ### + * > + * > *"upcoming input"* is defined as the whole of both + * > the *currently lexed* input, together with any remaining input + * > following that. *"currently lexed"* input is the input + * > already recognized by the lexer but not yet returned with + * > the lexer token. This happens when you are invoking this API + * > from inside any lexer rule action code block. + * > + * + * @public + * @this {RegExpLexer} + */ + upcomingInput: function lexer_upcomingInput(maxSize, maxLines) { + var next = this.match; + if (maxSize < 0) + maxSize = next.length + this._input.length; + else if (!maxSize) + maxSize = 20; + if (maxLines < 0) + maxLines = maxSize; // can't ever have more input lines than this! + else if (!maxLines) + maxLines = 1; + // \`substring\` anticipation: treat \\r\\n as a single character and take a little + // more than necessary so that we can still properly check against maxSize + // after we've transformed and limited the newLines in here: + if (next.length < maxSize * 2 + 2) { + next += this._input.substring(0, maxSize * 2 + 2); // substring is faster on Chrome/V8 + } + // now that we have a significantly reduced string to process, transform the newlines + // and chop them, then limit them: + var a = next.replace(/\\r\\n|\\r/g, '\\n').split('\\n'); + a = a.slice(0, maxLines); + next = a.join('\\n'); + // When, after limiting to maxLines, we still have too much to return, + // do add an ellipsis postfix... + if (next.length > maxSize) { + next = next.substring(0, maxSize) + '...'; + } + return next; }, - // displays the character position where the lexing error occurred, i.e. for error messages - showPosition: function () { - var pre = this.pastInput(); - var c = new Array(pre.length + 1).join("-"); - return pre + this.upcomingInput() + "\n" + c + "^"; + /** + * return a string which displays the character position where the + * lexing error occurred, i.e.
for error messages + * + * @public + * @this {RegExpLexer} + */ + showPosition: function lexer_showPosition(maxPrefix, maxPostfix) { + var pre = this.pastInput(maxPrefix).replace(/\\s/g, ' '); + var c = new Array(pre.length + 1).join('-'); + return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, ' ') + '\\n' + c + '^'; + }, + /** + * return a string which displays the lines & columns of input which are referenced + * by the given location info range, plus a few lines of context. + * + * This function pretty-prints the indicated section of the input, with line numbers + * and everything! + * + * This function is very useful to provide highly readable error reports, while + * the location range may be specified in various flexible ways: + * + * - \`loc\` is the location info object which references the area which should be + * displayed and 'marked up': these lines & columns of text are marked up by \`^\` + * characters below each character in the entire input range. + * + * - \`context_loc\` is the *optional* location info object which instructs this + * pretty-printer how much *leading* context should be displayed alongside + * the area referenced by \`loc\`. This can help provide context for the displayed + * error, etc. + * + * When this location info is not provided, a default context of 3 lines is + * used. + * + * - \`context_loc2\` is another *optional* location info object, which serves + * a similar purpose to \`context_loc\`: it specifies the amount of *trailing* + * context lines to display in the pretty-print output. + * + * When this location info is not provided, a default context of 1 line only is + * used. + * + * Special Notes: + * + * - when the \`loc\`-indicated range is very large (about 5 lines or more), then + * only the first and last few lines of this block are printed while a + * \`...continued...\` message will be printed between them. + * + * This serves the purpose of not printing a huge amount of text when the \`loc\` + * range happens to be huge: this way a manageable & readable output results + * for arbitrarily large ranges. + * + * - this function can display lines of input which have not yet been lexed. + * \`prettyPrintRange()\` can access the entire input! + * + * @public + * @this {RegExpLexer} + */ + prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) { + var error_size = loc.last_line - loc.first_line; + const CONTEXT = 3; + const CONTEXT_TAIL = 1; + const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2; + var input = this.matched + this._input; + var lines = input.split('\\n'); + //var show_context = (error_size < 5 || context_loc); + var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT)); + var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL)); + var lineno_display_width = (1 + Math.log10(l1 | 1) | 0); + var ws_prefix = new Array(lineno_display_width).join(' '); + var nonempty_line_indexes = []; + var rv = lines.slice(l0 - 1, l1 + 1).map(function injectLineNumber(line, index) { + var lno = index + l0; + var lno_pfx = (ws_prefix + lno).substr(-lineno_display_width); + var rv = lno_pfx + ': ' + line; + var errpfx = (new Array(lineno_display_width + 1)).join('^'); + var offset = 2 + 1; + var len = 0; + + if (lno === loc.first_line) { + offset += loc.first_column; + + len = Math.max( + 2, + ((lno === loc.last_line ?
loc.last_column : line.length)) - loc.first_column + 1 + ); + } else if (lno === loc.last_line) { + len = Math.max(2, loc.last_column + 1); + } else if (lno > loc.first_line && lno < loc.last_line) { + len = Math.max(2, line.length + 1); + } + + if (len) { + var lead = new Array(offset).join('.'); + var mark = new Array(len).join('^'); + rv += '\\n' + errpfx + lead + mark; + + if (line.trim().length > 0) { + nonempty_line_indexes.push(index); + } + } + + rv = rv.replace(/\\t/g, ' '); + return rv; + }); + + // now make sure we don't print an overly large amount of error area: limit it + // to the top and bottom line count: + if (nonempty_line_indexes.length > 2 * MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT) { + var clip_start = nonempty_line_indexes[MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT - 1] + 1; + var clip_end = nonempty_line_indexes[nonempty_line_indexes.length - MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT] - 1; + + var intermediate_line = (new Array(lineno_display_width + 1)).join(' ') + ' (...continued...)'; + intermediate_line += '\\n' + (new Array(lineno_display_width + 1)).join('-') + ' (---------------)'; + rv.splice(clip_start, clip_end - clip_start + 1, intermediate_line); + } + return rv.join('\\n'); + }, + + /** + * helper function, used to produce a human readable description as a string, given + * the input \`yylloc\` location object. + * + * Set \`display_range_too\` to TRUE to include the string character index position(s) + * in the description if the \`yylloc.range\` is available. + * + * @public + * @this {RegExpLexer} + */ + describeYYLLOC: function lexer_describe_yylloc(yylloc, display_range_too) { + var l1 = yylloc.first_line; + var l2 = yylloc.last_line; + var c1 = yylloc.first_column; + var c2 = yylloc.last_column; + var dl = l2 - l1; + var dc = c2 - c1; + var rv; + if (dl === 0) { + rv = 'line ' + l1 + ', '; + if (dc <= 1) { + rv += 'column ' + c1; + } else { + rv += 'columns ' + c1 + ' .. ' + c2; + } + } else { + rv = 'lines ' + l1 + '(column ' + c1 + ') .. ' + l2 + '(column ' + c2 + ')'; + } + if (yylloc.range && display_range_too) { + var r1 = yylloc.range[0]; + var r2 = yylloc.range[1] - 1; + if (r2 <= r1) { + rv += ' {String Offset: ' + r1 + '}'; + } else { + rv += ' {String Offset range: ' + r1 + ' .. ' + r2 + '}'; + } + } + return rv; + }, + + /** + * test the lexed token: return FALSE when not a match, otherwise return token. + * + * \`match\` is supposed to be an array coming out of a regex match, i.e. \`match[0]\` + * contains the actually matched text string. 
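 * (E.g., illustrative: with a rule regex \`/[0-9]+/\` lexing the input '42;', the
 * incoming \`match\` would be the regex match array and \`match[0]\` the string '42'.)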
+ * + * Also move the input cursor forward and update the match collectors: + * + * - \`yytext\` + * - \`yyleng\` + * - \`match\` + * - \`matches\` + * - \`yylloc\` + * - \`offset\` + * + * @public + * @this {RegExpLexer} + */ + test_match: function lexer_test_match(match, indexed_rule) { var token, lines, - backup; + backup, + match_str, + match_str_len; if (this.options.backtrack_lexer) { // save context @@ -285,9 +2005,11 @@ RegExpLexer.prototype = { yylineno: this.yylineno, yylloc: { first_line: this.yylloc.first_line, - last_line: this.last_line, + last_line: this.yylloc.last_line, first_column: this.yylloc.first_column, - last_column: this.yylloc.last_column + last_column: this.yylloc.last_column, + + range: this.yylloc.range.slice(0) }, yytext: this.yytext, match: this.match, @@ -297,39 +2019,48 @@ RegExpLexer.prototype = { offset: this.offset, _more: this._more, _input: this._input, + //_signaled_error_token: this._signaled_error_token, yy: this.yy, conditionStack: this.conditionStack.slice(0), done: this.done }; - if (this.options.ranges) { - backup.yylloc.range = this.yylloc.range.slice(0); - } } - lines = match[0].match(/(?:\r\n?|\n).*/g); - if (lines) { - this.yylineno += lines.length; - } - this.yylloc = { - first_line: this.yylloc.last_line, - last_line: this.yylineno + 1, - first_column: this.yylloc.last_column, - last_column: lines ? - lines[lines.length - 1].length - lines[lines.length - 1].match(/\r?\n?/)[0].length : - this.yylloc.last_column + match[0].length - }; - this.yytext += match[0]; - this.match += match[0]; + match_str = match[0]; + match_str_len = match_str.length; + // if (match_str.indexOf('\\n') !== -1 || match_str.indexOf('\\r') !== -1) { + lines = match_str.split(/(?:\\r\\n?|\\n)/g); + if (lines.length > 1) { + this.yylineno += lines.length - 1; + + this.yylloc.last_line = this.yylineno + 1; + this.yylloc.last_column = lines[lines.length - 1].length; + } else { + this.yylloc.last_column += match_str_len; + } + // } + this.yytext += match_str; + this.match += match_str; + this.matched += match_str; this.matches = match; this.yyleng = this.yytext.length; - if (this.options.ranges) { - this.yylloc.range = [this.offset, this.offset += this.yyleng]; - } + this.yylloc.range[1] += match_str_len; + + // previous lex rules MAY have invoked the \`more()\` API rather than producing a token: + // those rules will already have moved this \`offset\` forward matching their match lengths, + // hence we must only add our own match length now: + this.offset += match_str_len; this._more = false; this._backtrack = false; - this._input = this._input.slice(match[0].length); - this.matched += match[0]; - token = this.performAction.call(this, this.yy, this, indexed_rule, this.conditionStack[this.conditionStack.length - 1]); + this._input = this._input.slice(match_str_len); + + // calling this method: + // + // function lexer__performAction(yy, yyrulenumber, YY_START) {...} + token = this.performAction.call(this, this.yy, indexed_rule, this.conditionStack[this.conditionStack.length - 1] /* = YY_START */); + // otherwise, when the action codes are all simple return token statements: + //token = this.simpleCaseActionClusters[indexed_rule]; + if (this.done && this._input) { this.done = false; } @@ -340,14 +2071,27 @@ RegExpLexer.prototype = { for (var k in backup) { this[k] = backup[k]; } + this.__currentRuleSet__ = null; return false; // rule action called reject() implying the next rule should be tested instead. 
+ } else if (this._signaled_error_token) { + // produce one 'error' token as \`.parseError()\` in \`reject()\` + // did not guarantee a failure signal by throwing an exception! + token = this._signaled_error_token; + this._signaled_error_token = false; + return token; } return false; }, - // return next match in input - next: function () { + /** + * return next match in input + * + * @public + * @this {RegExpLexer} + */ + next: function lexer_next() { if (this.done) { + this.clear(); return this.EOF; } if (!this._input) { @@ -359,21 +2103,45 @@ RegExpLexer.prototype = { tempMatch, index; if (!this._more) { - this.yytext = ''; - this.match = ''; + this.clear(); } - var rules = this._currentRules(); - for (var i = 0; i < rules.length; i++) { - tempMatch = this._input.match(this.rules[rules[i]]); + var spec = this.__currentRuleSet__; + if (!spec) { + // Update the ruleset cache as we apparently encountered a state change or just started lexing. + // The cache is set up for fast lookup -- we assume a lexer will switch states much less often than it will + // invoke the \`lex()\` token-producing API and related APIs, hence caching the set for direct access helps + // speed up those activities a tiny bit. + spec = this.__currentRuleSet__ = this._currentRules(); + // Check whether a *sane* condition has been pushed before: this makes the lexer robust against + // user-programmer bugs such as https://github.com/zaach/jison-lex/issues/19 + if (!spec || !spec.rules) { + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Internal lexer engine error' + lineno_msg + ': The lex grammar programmer pushed a non-existing condition name "' + this.topState() + '"; this is a fatal error and should be reported to the application programmer team!', false); + // produce one 'error' token until this situation has been resolved, most probably by parse termination! + return (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + } + } + + var rule_ids = spec.rules; + var regexes = spec.__rule_regexes; + var len = spec.__rule_count; + + // Note: the arrays are 1-based, while \`len\` itself is a valid index, + // hence the non-standard less-or-equal check in the next loop condition! + for (var i = 1; i <= len; i++) { + tempMatch = this._input.match(regexes[i]); if (tempMatch && (!match || tempMatch[0].length > match[0].length)) { match = tempMatch; index = i; if (this.options.backtrack_lexer) { - token = this.test_match(tempMatch, rules[i]); + token = this.test_match(tempMatch, rule_ids[i]); if (token !== false) { return token; } else if (this._backtrack) { - match = false; + match = undefined; continue; // rule action called reject() implying a rule MISmatch. } else { // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) @@ -385,219 +2153,972 @@ RegExpLexer.prototype = { } } if (match) { - token = this.test_match(match, rules[index]); + token = this.test_match(match, rule_ids[index]); if (token !== false) { return token; } // else: this is a lexer rule which consumes input without producing a token (e.g. whitespace) return false; } - if (this._input === "") { + if (!this._input) { + this.done = true; + this.clear(); return this.EOF; } else { - return this.parseError('Lexical error on line ' + (this.yylineno + 1) + '. 
Unrecognized text.\n' + this.showPosition(), { - text: "", - token: null, - line: this.yylineno - }); + var lineno_msg = ''; + if (this.options.trackPosition) { + lineno_msg = ' on line ' + (this.yylineno + 1); + } + var p = this.constructLexErrorInfo('Lexical error' + lineno_msg + ': Unrecognized text.', this.options.lexerErrorsAreRecoverable); + + var pendingInput = this._input; + var activeCondition = this.topState(); + var conditionStackDepth = this.conditionStack.length; + + token = (this.parseError(p.errStr, p, this.JisonLexerError) || this.ERROR); + if (token === this.ERROR) { + // we can try to recover from a lexer error that \`parseError()\` did not 'recover' for us + // by moving forward at least one character at a time IFF the (user-specified?) \`parseError()\` + // has not consumed/modified any pending input or changed state in the error handler: + if (!this.matches && + // and make sure the input has been modified/consumed ... + pendingInput === this._input && + // ...or the lexer state has been modified significantly enough + // to merit a non-consuming error handling action right now. + activeCondition === this.topState() && + conditionStackDepth === this.conditionStack.length + ) { + this.input(); + } + } + return token; } }, - // return next match that has a token - lex: function lex () { - var r = this.next(); - if (r) { - return r; - } else { - return this.lex(); + /** + * return next match that has a token + * + * @public + * @this {RegExpLexer} + */ + lex: function lexer_lex() { + var r; + // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer: + if (typeof this.options.pre_lex === 'function') { + r = this.options.pre_lex.call(this); } + + while (!r) { + r = this.next(); + } + + if (typeof this.options.post_lex === 'function') { + // (also account for a userdef function which does not return any value: keep the token as is) + r = this.options.post_lex.call(this, r) || r; + } + return r; }, - // activates a new lexer condition state (pushes the new lexer condition state onto the condition stack) - begin: function begin (condition) { + /** + * backwards compatible alias for \`pushState()\`; + * the latter is symmetrical with \`popState()\` and we advise to use + * those APIs in any modern lexer code, rather than \`begin()\`. 
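+ *
+ * Example (hypothetical start condition name), as used from within a rule action:
+ *
+ *     this.pushState('comment');    // enter the 'comment' condition state
+ *     // ... and in the rule which terminates the comment:
+ *     this.popState();              // drop back to the previous condition state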
+ * + * @public + * @this {RegExpLexer} + */ + begin: function lexer_begin(condition) { + return this.pushState(condition); + }, + + /** + * activates a new lexer condition state (pushes the new lexer + * condition state onto the condition stack) + * + * @public + * @this {RegExpLexer} + */ + pushState: function lexer_pushState(condition) { this.conditionStack.push(condition); + this.__currentRuleSet__ = null; + return this; }, - // pop the previously active lexer condition state off the condition stack - popState: function popState () { + /** + * pop the previously active lexer condition state off the condition + * stack + * + * @public + * @this {RegExpLexer} + */ + popState: function lexer_popState() { var n = this.conditionStack.length - 1; if (n > 0) { + this.__currentRuleSet__ = null; return this.conditionStack.pop(); } else { return this.conditionStack[0]; } }, - // produce the lexer rule set which is active for the currently active lexer condition state - _currentRules: function _currentRules () { - if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) { - return this.conditions[this.conditionStack[this.conditionStack.length - 1]].rules; - } else { - return this.conditions["INITIAL"].rules; - } - }, - - // return the currently active lexer condition state; when an index argument is provided it produces the N-th previous condition state, if available - topState: function topState (n) { + /** + * return the currently active lexer condition state; when an index + * argument is provided it produces the N-th previous condition state, + * if available + * + * @public + * @this {RegExpLexer} + */ + topState: function lexer_topState(n) { n = this.conditionStack.length - 1 - Math.abs(n || 0); if (n >= 0) { return this.conditionStack[n]; } else { - return "INITIAL"; + return 'INITIAL'; } }, - // alias for begin(condition) - pushState: function pushState (condition) { - this.begin(condition); + /** + * (internal) determine the lexer rule set which is active for the + * currently active lexer condition state + * + * @public + * @this {RegExpLexer} + */ + _currentRules: function lexer__currentRules() { + if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) { + return this.conditions[this.conditionStack[this.conditionStack.length - 1]]; + } else { + return this.conditions['INITIAL']; + } }, - // return the number of states pushed - stateStackSize: function stateStackSize() { + /** + * return the number of states currently on the stack + * + * @public + * @this {RegExpLexer} + */ + stateStackSize: function lexer_stateStackSize() { return this.conditionStack.length; } -}; +}`; + // --- END lexer kernel --- +} + +RegExpLexer.prototype = (new Function(rmCommonWS` + return ${getRegExpLexerPrototype()}; +`))(); + + +// The lexer code stripper, driven by optimization analysis settings and +// lexer options, which cannot be changed at run-time. +function stripUnusedLexerCode(src, opt) { + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ 
${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + + var ast = helpers.parseCodeChunkToAST(src, opt); + var new_src = helpers.prettyPrintAST(ast, opt); + +if (0) { + this.actionsUseYYLENG = analyzeFeatureUsage(this.performAction, /\byyleng\b/g, 1); + this.actionsUseYYLINENO = analyzeFeatureUsage(this.performAction, /\byylineno\b/g, 1); + this.actionsUseYYTEXT = analyzeFeatureUsage(this.performAction, /\byytext\b/g, 1); + this.actionsUseYYLOC = analyzeFeatureUsage(this.performAction, /\byyloc\b/g, 1); + this.actionsUseParseError = analyzeFeatureUsage(this.performAction, /\.parseError\b/g, 0); + + this.actionsUseValueTracking = analyzeFeatureUsage(this.performAction, /\byyvstack\b/g, 1); + // Ditto for the specific case where we are assigning a value to `$$`, i.e. `this.$`: + this.actionsUseValueAssignment = analyzeFeatureUsage(this.performAction, /\bthis\.\$[^\w]/g, 0); + // Ditto for the expansion of `@name`, `@$` and `@n` to its `yylstack[n]` index expression: + this.actionsUseLocationTracking = analyzeFeatureUsage(this.performAction, /\byylstack\b/g, 1); + // Ditto for the specific case where we are assigning a value to `@$`, i.e. `this._$`: + this.actionsUseLocationAssignment = analyzeFeatureUsage(this.performAction, /\bthis\._\$[^\w]/g, 0); + // Note that the `#name`, `#$` and `#n` constructs are expanded directly to their symbol number without + // the need to use yystack! Hence yystack is only there for very special use action code.) + + + if (devDebug || this.DEBUG) { + console.log('Optimization analysis: ', { + actionsUseYYLENG: this.actionsUseYYLENG, + actionsUseYYLINENO: this.actionsUseYYLINENO, + actionsUseYYTEXT: this.actionsUseYYTEXT, + actionsUseYYLOC: this.actionsUseYYLOC, + actionsUseParseError: this.actionsUseParseError, + actionsUseYYERROR: this.actionsUseYYERROR, + actionsUseYYRECOVERING: this.actionsUseYYRECOVERING, + actionsUseYYERROK: this.actionsUseYYERROK, + actionsUseYYCLEARIN: this.actionsUseYYCLEARIN, + actionsUseValueTracking: this.actionsUseValueTracking, + actionsUseValueAssignment: this.actionsUseValueAssignment, + actionsUseLocationTracking: this.actionsUseLocationTracking, + actionsUseLocationAssignment: this.actionsUseLocationAssignment, + actionsUseYYSTACK: this.actionsUseYYSTACK, + actionsUseYYSSTACK: this.actionsUseYYSSTACK, + actionsUseYYSTACKPOINTER: this.actionsUseYYSTACKPOINTER, + hasErrorRecovery: this.hasErrorRecovery, + hasErrorReporting: this.hasErrorReporting, + defaultActionMode: this.options.defaultActionMode, + noTryCatch: this.options.noTryCatch, + }); + } + + + function analyzeFeatureUsage(sourcecode, feature, threshold) { + var found = sourcecode.match(feature); + return !!(found && found.length > threshold); + } +} + + + + + + + + + // inject analysis report now: + new_src = new_src.replace(/\/\*\s*JISON-LEX-ANALYTICS-REPORT\s*\*\//g, rmCommonWS` + // Code Generator Information Report + // --------------------------------- + // + // Options: + // + // backtracking: .................... ${opt.options.backtrack_lexer} + // location.ranges: ................. ${opt.options.ranges} + // location line+column tracking: ... 
${opt.options.trackPosition} + // + // + // Forwarded Parser Analysis flags: + // + // uses yyleng: ..................... ${opt.parseActionsUseYYLENG} + // uses yylineno: ................... ${opt.parseActionsUseYYLINENO} + // uses yytext: ..................... ${opt.parseActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.parseActionsUseYYLOC} + // uses lexer values: ............... ${opt.parseActionsUseValueTracking} / ${opt.parseActionsUseValueAssignment} + // location tracking: ............... ${opt.parseActionsUseLocationTracking} + // location assignment: ............. ${opt.parseActionsUseLocationAssignment} + // + // + // Lexer Analysis flags: + // + // uses yyleng: ..................... ${opt.lexerActionsUseYYLENG} + // uses yylineno: ................... ${opt.lexerActionsUseYYLINENO} + // uses yytext: ..................... ${opt.lexerActionsUseYYTEXT} + // uses yylloc: ..................... ${opt.lexerActionsUseYYLOC} + // uses ParseError API: ............. ${opt.lexerActionsUseParseError} + // uses yyerror: .................... ${opt.lexerActionsUseYYERROR} + // uses location tracking & editing: ${opt.lexerActionsUseLocationTracking} + // uses more() API: ................. ${opt.lexerActionsUseMore} + // uses unput() API: ................ ${opt.lexerActionsUseUnput} + // uses reject() API: ............... ${opt.lexerActionsUseReject} + // uses less() API: ................. ${opt.lexerActionsUseLess} + // uses display APIs pastInput(), upcomingInput(), showPosition(): + // ............................. ${opt.lexerActionsUseDisplayAPIs} + // uses describeYYLLOC() API: ....... ${opt.lexerActionsUseDescribeYYLOC} + // + // --------- END OF REPORT ----------- + + `); + + return new_src; +} + + + // generate lexer source from a grammar -function generate (dict, tokens) { - var opt = processGrammar(dict, tokens); +/** @public */ +function generate(dict, tokens, build_options) { + var opt = processGrammar(dict, tokens, build_options); return generateFromOpts(opt); } // process the grammar and build final data structures and functions -function processGrammar(dict, tokens) { - var opts = {}; - if (typeof dict === 'string') { - dict = lexParser.parse(dict); - } - dict = dict || {}; +/** @public */ +function processGrammar(dict, tokens, build_options) { + build_options = build_options || {}; + var opts = { + // include the knowledge passed through `build_options` about which lexer + // features will actually be *used* by the environment (which in 99.9% + // of cases is a jison *parser*): + // + // (this stuff comes straight from the jison Optimization Analysis.) 
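+        // For example (hypothetical analysis outcome): when the parser's action code
+        // never touches `yylineno`, the parser generator hands us
+        // `build_options.parseActionsUseYYLINENO === false` and the
+        // `stripUnusedLexerCode()` pass can elide the matching line-tracking code.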
+ // + parseActionsUseYYLENG: build_options.parseActionsUseYYLENG, + parseActionsUseYYLINENO: build_options.parseActionsUseYYLINENO, + parseActionsUseYYTEXT: build_options.parseActionsUseYYTEXT, + parseActionsUseYYLOC: build_options.parseActionsUseYYLOC, + parseActionsUseParseError: build_options.parseActionsUseParseError, + parseActionsUseYYERROR: build_options.parseActionsUseYYERROR, + parseActionsUseYYERROK: build_options.parseActionsUseYYERROK, + parseActionsUseYYRECOVERING: build_options.parseActionsUseYYRECOVERING, + parseActionsUseYYCLEARIN: build_options.parseActionsUseYYCLEARIN, + parseActionsUseValueTracking: build_options.parseActionsUseValueTracking, + parseActionsUseValueAssignment: build_options.parseActionsUseValueAssignment, + parseActionsUseLocationTracking: build_options.parseActionsUseLocationTracking, + parseActionsUseLocationAssignment: build_options.parseActionsUseLocationAssignment, + parseActionsUseYYSTACK: build_options.parseActionsUseYYSTACK, + parseActionsUseYYSSTACK: build_options.parseActionsUseYYSSTACK, + parseActionsUseYYSTACKPOINTER: build_options.parseActionsUseYYSTACKPOINTER, + parseActionsUseYYRULELENGTH: build_options.parseActionsUseYYRULELENGTH, + parserHasErrorRecovery: build_options.parserHasErrorRecovery, + parserHasErrorReporting: build_options.parserHasErrorReporting, + + lexerActionsUseYYLENG: '???', + lexerActionsUseYYLINENO: '???', + lexerActionsUseYYTEXT: '???', + lexerActionsUseYYLOC: '???', + lexerActionsUseParseError: '???', + lexerActionsUseYYERROR: '???', + lexerActionsUseLocationTracking: '???', + lexerActionsUseMore: '???', + lexerActionsUseUnput: '???', + lexerActionsUseReject: '???', + lexerActionsUseLess: '???', + lexerActionsUseDisplayAPIs: '???', + lexerActionsUseDescribeYYLOC: '???', + }; + + dict = autodetectAndConvertToJSONformat(dict, build_options) || {}; + + // Feed the possibly reprocessed 'dictionary' above back to the caller + // (for use by our error diagnostic assistance code) + opts.lex_rule_dictionary = dict; + + // Always provide the lexer with an options object, even if it's empty! 
+ // Make sure to camelCase all options: + opts.options = mkStdOptions(build_options, dict.options); - opts.options = dict.options || {}; opts.moduleType = opts.options.moduleType; opts.moduleName = opts.options.moduleName; opts.conditions = prepareStartConditions(dict.startConditions); - opts.conditions.INITIAL = {rules:[],inclusive:true}; + opts.conditions.INITIAL = { + rules: [], + inclusive: true + }; + + var code = buildActions(dict, tokens, opts); + opts.performAction = code.actions; + opts.caseHelperInclude = code.caseHelperInclude; + opts.rules = code.rules; + opts.macros = code.macros; + + opts.regular_rule_count = code.regular_rule_count; + opts.simple_rule_count = code.simple_rule_count; - opts.performAction = buildActions.call(opts, dict, tokens); opts.conditionStack = ['INITIAL']; - opts.moduleInclude = (dict.moduleInclude || '').trim(); + opts.actionInclude = (dict.actionInclude || ''); + opts.moduleInclude = (opts.moduleInclude || '') + (dict.moduleInclude || '').trim(); + return opts; } // Assemble the final source from the processed grammar -function generateFromOpts (opt) { - var code = ""; +/** @public */ +function generateFromOpts(opt) { + var code = ''; - if (opt.moduleType === 'commonjs') { - code = generateCommonJSModule(opt); - } else if (opt.moduleType === 'amd') { - code = generateAMDModule(opt); - } else { + switch (opt.moduleType) { + case 'js': code = generateModule(opt); + break; + case 'amd': + code = generateAMDModule(opt); + break; + case 'es': + code = generateESModule(opt); + break; + case 'commonjs': + default: + code = generateCommonJSModule(opt); + break; } return code; } -function generateModuleBody (opt) { - var functionDescriptions = { - setInput: "resets the lexer, sets new input", - input: "consumes and returns one char from the input", - unput: "unshifts one char (or a string) into the input", - more: "When called from action, caches matched text and appends it on next action", - reject: "When called from action, signals the lexer that this rule fails to match the input, so the next matching rule (regex) should be tested instead.", - less: "retain first n characters of the match", - pastInput: "displays already matched input, i.e. for error messages", - upcomingInput: "displays upcoming input, i.e. for error messages", - showPosition: "displays the character position where the lexing error occurred, i.e. 
for error messages", - test_match: "test the lexed token: return FALSE when not a match, otherwise return token", - next: "return next match in input", - lex: "return next match that has a token", - begin: "activates a new lexer condition state (pushes the new lexer condition state onto the condition stack)", - popState: "pop the previously active lexer condition state off the condition stack", - _currentRules: "produce the lexer rule set which is active for the currently active lexer condition state", - topState: "return the currently active lexer condition state; when an index argument is provided it produces the N-th previous condition state, if available", - pushState: "alias for begin(condition)", - stateStackSize: "return the number of states currently on the stack" - }; - var out = "({\n"; - var p = []; - var descr; - for (var k in RegExpLexer.prototype) { - if (RegExpLexer.prototype.hasOwnProperty(k) && k.indexOf("generate") === -1) { - // copy the function description as a comment before the implementation; supports multi-line descriptions - descr = "\n"; - if (functionDescriptions[k]) { - descr += "// " + functionDescriptions[k].replace(/\n/g, "\n\/\/ ") + "\n"; +function generateRegexesInitTableCode(opt) { + var a = opt.rules; + var print_xregexp = opt.options && opt.options.xregexp; + var id_display_width = (1 + Math.log10(a.length | 1) | 0); + var ws_prefix = new Array(id_display_width).join(' '); + var b = a.map(function generateXRegExpInitCode(re, idx) { + var idx_str = (ws_prefix + idx).substr(-id_display_width); + + if (re instanceof XRegExp) { + // When we don't need the special XRegExp sauce at run-time, we do with the original + // JavaScript RegExp instance a.k.a. 'native regex': + if (re.xregexp.isNative || !print_xregexp) { + return `/* ${idx_str}: */ ${re}`; } - p.push(descr + k + ":" + (RegExpLexer.prototype[k].toString() || '""')); + // And make sure to escape the regex to make it suitable for placement inside a *string* + // as it is passed as a string argument to the XRegExp constructor here. + var re_src = re.xregexp.source.replace(/[\\"]/g, '\\$&'); + return `/* ${idx_str}: */ new XRegExp("${re_src}", "${re.xregexp.flags}")`; + } else { + return `/* ${idx_str}: */ ${re}`; } - } - out += p.join(",\n"); + }); + return b.join(',\n'); +} - if (opt.options) { - out += ",\noptions: " + JSON.stringify(opt.options); +function generateModuleBody(opt) { + // make the JSON output look more like JavaScript: + function cleanupJSON(str) { + str = str.replace(/ "rules": \[/g, ' rules: ['); + str = str.replace(/ "inclusive": /g, ' inclusive: '); + return str; } - out += ",\nperformAction: " + String(opt.performAction); - out += ",\nrules: [" + opt.rules + "]"; - out += ",\nconditions: " + JSON.stringify(opt.conditions); - out += "\n})"; + function produceOptions(opts) { + var obj = {}; + var do_not_pass = { + debug: !opts.debug, // do not include this item when it is FALSE as there's no debug tracing built into the generated grammar anyway! 
+ enableDebugLogs: 1, + json: 1, + _: 1, + noMain: 1, + dumpSourceCodeOnFailure: 1, + throwErrorOnCompileFailure: 1, + reportStats: 1, + file: 1, + outfile: 1, + inputPath: 1, + inputFilename: 1, + defaultModuleName: 1, + moduleName: 1, + moduleType: 1, + lexerErrorsAreRecoverable: 0, + flex: 0, + backtrack_lexer: 0, + caseInsensitive: 0, + showSource: 1, + exportAST: 1, + exportAllTables: 1, + exportSourceCode: 1, + prettyCfg: 1, + parseActionsUseYYLENG: 1, + parseActionsUseYYLINENO: 1, + parseActionsUseYYTEXT: 1, + parseActionsUseYYLOC: 1, + parseActionsUseParseError: 1, + parseActionsUseYYERROR: 1, + parseActionsUseYYRECOVERING: 1, + parseActionsUseYYERROK: 1, + parseActionsUseYYCLEARIN: 1, + parseActionsUseValueTracking: 1, + parseActionsUseValueAssignment: 1, + parseActionsUseLocationTracking: 1, + parseActionsUseLocationAssignment: 1, + parseActionsUseYYSTACK: 1, + parseActionsUseYYSSTACK: 1, + parseActionsUseYYSTACKPOINTER: 1, + parseActionsUseYYRULELENGTH: 1, + parserHasErrorRecovery: 1, + parserHasErrorReporting: 1, + lexerActionsUseYYLENG: 1, + lexerActionsUseYYLINENO: 1, + lexerActionsUseYYTEXT: 1, + lexerActionsUseYYLOC: 1, + lexerActionsUseParseError: 1, + lexerActionsUseYYERROR: 1, + lexerActionsUseLocationTracking: 1, + lexerActionsUseMore: 1, + lexerActionsUseUnput: 1, + lexerActionsUseReject: 1, + lexerActionsUseLess: 1, + lexerActionsUseDisplayAPIs: 1, + lexerActionsUseDescribeYYLOC: 1, + }; + for (var k in opts) { + if (!do_not_pass[k] && opts[k] != null && opts[k] !== false) { + // make sure numeric values are encoded as numeric, the rest as boolean/string. + if (typeof opts[k] === 'string') { + var f = parseFloat(opts[k]); + if (f == opts[k]) { + obj[k] = f; + continue; + } + } + obj[k] = opts[k]; + } + } - return out; -} + // And now some options which should receive some special processing: + var pre = obj.pre_lex; + var post = obj.post_lex; + // since JSON cannot encode functions, we'll have to do it manually at run-time, i.e. later on: + if (pre) { + obj.pre_lex = true; + } + if (post) { + obj.post_lex = true; + } -function generateModule(opt) { - opt = opt || {}; + var js = JSON.stringify(obj, null, 2); - var out = "/* generated by jison-lex " + version + " */"; - var moduleName = opt.moduleName || "lexer"; + js = js.replace(new XRegExp(` "(${ID_REGEX_BASE})": `, 'g'), ' $1: '); + js = js.replace(/^( +)pre_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'pre_lex: ' + String(pre) + (tc || ''); + }); + js = js.replace(/^( +)post_lex: true(,)?$/gm, function (m, ls, tc) { + return ls + 'post_lex: ' + String(post) + (tc || ''); + }); + return js; + } - out += "\nvar " + moduleName + " = (function(){\nvar lexer = " - + generateModuleBody(opt); - if (opt.moduleInclude) { - out += ";\n" + opt.moduleInclude; - } + var out; + if (opt.rules.length > 0 || opt.__in_rules_failure_analysis_mode__) { + // we don't mind that the `test_me()` code above will have this `lexer` variable re-defined: + // JavaScript is fine with that. 
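+    // The net effect of the `produceOptions()` helper above, sketched on a
+    // hypothetical input:
+    //
+    //     produceOptions({ ranges: true, moduleName: 'calc', pre_lex: function () {} })
+    //
+    // drops the blacklisted `moduleName`, serializes the rest to JSON with
+    // `pre_lex: true` as a placeholder, and then splices the original function
+    // source back in (since `JSON.stringify()` cannot encode functions), yielding:
+    //
+    //     {
+    //       ranges: true,
+    //       pre_lex: function () {}
+    //     }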
+ var code = [rmCommonWS` + var lexer = { + `, '/*JISON-LEX-ANALYTICS-REPORT*/' /* slot #1: placeholder for analysis report further below */ + ]; + + // get the RegExpLexer.prototype in source code form: + var protosrc = getRegExpLexerPrototype(); + // and strip off the surrounding bits we don't want: + protosrc = protosrc + .replace(/^[\s\r\n]*\{/, '') + .replace(/\s*\}[\s\r\n]*$/, '') + .trim(); + code.push(protosrc + ',\n'); + + assert(opt.options); + // Assure all options are camelCased: + assert(typeof opt.options['case-insensitive'] === 'undefined'); + + code.push(' options: ' + produceOptions(opt.options)); + + var performActionCode = String(opt.performAction); + var simpleCaseActionClustersCode = String(opt.caseHelperInclude); + var rulesCode = generateRegexesInitTableCode(opt); + var conditionsCode = cleanupJSON(JSON.stringify(opt.conditions, null, 2)); + code.push(rmCommonWS`, + JisonLexerError: JisonLexerError, + performAction: ${performActionCode}, + simpleCaseActionClusters: ${simpleCaseActionClustersCode}, + rules: [ + ${rulesCode} + ], + conditions: ${conditionsCode} + }; + `); + + opt.is_custom_lexer = false; - out += ";\nreturn lexer;\n})();"; + out = code.join(''); + } else { + // We're clearly looking at a custom lexer here as there's no lexer rules at all. + // + // We are re-purposing the `%{...%}` `actionInclude` code block here as it serves no purpose otherwise. + // + // Meanwhile we make sure we have the `lexer` variable declared in *local scope* no matter + // what crazy stuff (or lack thereof) the userland code is pulling in the `actionInclude` chunk. + out = 'var lexer;\n'; + + assert(opt.regular_rule_count === 0); + assert(opt.simple_rule_count === 0); + opt.is_custom_lexer = true; + + if (opt.actionInclude) { + out += opt.actionInclude + (!opt.actionInclude.match(/;[\s\r\n]*$/) ? ';' : '') + '\n'; + } + } + // The output of this function is guaranteed to read something like this: + // + // ``` + // var lexer; + // + // bla bla bla bla ... lotsa bla bla; + // ``` + // + // and that should work nicely as an `eval()`-able piece of source code. return out; } -function generateAMDModule(opt) { - var out = "/* generated by jison-lex " + version + " */"; +function generateGenericHeaderComment() { + var out = rmCommonWS` + /* lexer generated by jison-lex ${version} */ + + /* + * Returns a Lexer object of the following structure: + * + * Lexer: { + * yy: {} The so-called "shared state" or rather the *source* of it; + * the real "shared state" \`yy\` passed around to + * the rule actions, etc. is a direct reference! + * + * This "shared context" object was passed to the lexer by way of + * the \`lexer.setInput(str, yy)\` API before you may use it. + * + * This "shared context" object is passed to the lexer action code in \`performAction()\` + * so userland code in the lexer actions may communicate with the outside world + * and/or other lexer rules' actions in more or less complex ways. + * + * } + * + * Lexer.prototype: { + * EOF: 1, + * ERROR: 2, + * + * yy: The overall "shared context" object reference. + * + * JisonLexerError: function(msg, hash), + * + * performAction: function lexer__performAction(yy, yyrulenumber, YY_START), + * + * The function parameters and \`this\` have the following value/meaning: + * - \`this\` : reference to the \`lexer\` instance. + * \`yy_\` is an alias for \`this\` lexer instance reference used internally. 
+ *
+ * - \`yy\` : a reference to the \`yy\` "shared state" object which was passed to the lexer
+ *   by way of the \`lexer.setInput(str, yy)\` API before.
+ *
+ *   Note:
+ *   The extra arguments you specified in the \`%parse-param\` statement in your
+ *   **parser** grammar definition file are passed to the lexer via this object
+ *   reference as member variables.
+ *
+ * - \`yyrulenumber\` : index of the matched lexer rule (regex), used internally.
+ *
+ * - \`YY_START\`: the current lexer "start condition" state.
+ *
+ * parseError: function(str, hash, ExceptionClass),
+ *
+ * constructLexErrorInfo: function(error_message, is_recoverable),
+ *     Helper function.
+ *     Produces a new errorInfo \'hash object\' which can be passed into \`parseError()\`.
+ *     See its use in this lexer kernel in many places; example usage:
+ *
+ *         var infoObj = lexer.constructLexErrorInfo(\'fail!\', true);
+ *         var retVal = lexer.parseError(infoObj.errStr, infoObj, lexer.JisonLexerError);
+ *
+ * options: { ... lexer %options ... },
+ *
+ * lex: function(),
+ *     Produce one token of lexed input, which was passed in earlier via the \`lexer.setInput()\` API.
+ *     You MAY use the additional \`args...\` parameters as per \`%parse-param\` spec of the **lexer** grammar:
+ *     these extra \`args...\` are added verbatim to the \`yy\` object reference as member variables.
+ *
+ *     WARNING:
+ *     Lexer's additional \`args...\` parameters (via lexer's \`%parse-param\`) MAY conflict with
+ *     any attributes already added to \`yy\` by the **parser** or the jison run-time;
+ *     when such a collision is detected an exception is thrown to prevent the generated run-time
+ *     from silently accepting this confusing and potentially hazardous situation!
+ *
+ * cleanupAfterLex: function(do_not_nuke_errorinfos),
+ *     Helper function.
+ *
+ *     This helper API is invoked when the **parse process** has completed: it is the responsibility
+ *     of the **parser** (or the calling userland code) to invoke this method once cleanup is desired.
+ *
+ *     This helper may be invoked by user code to ensure the internal lexer gets properly garbage collected.
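+ *
+ *     Typical usage of the \`setInput()\` / \`lex()\` APIs listed below
+ *     (hypothetical input; the tokens produced depend entirely on your grammar):
+ *
+ *         lexer.setInput(\'2 + 3\', {});     // prime the lexer: input + fresh shared state
+ *         var tok = lexer.lex();            // produce tokens one at a time...
+ *         while (tok !== lexer.EOF) {       // ...until the EOF sentinel is returned
+ *             tok = lexer.lex();
+ *         }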
+ * + * setInput: function(input, [yy]), + * + * + * input: function(), + * + * + * unput: function(str), + * + * + * more: function(), + * + * + * reject: function(), + * + * + * less: function(n), + * + * + * pastInput: function(n), + * + * + * upcomingInput: function(n), + * + * + * showPosition: function(), + * + * + * test_match: function(regex_match_array, rule_index), + * + * + * next: function(), + * + * + * begin: function(condition), + * + * + * pushState: function(condition), + * + * + * popState: function(), + * + * + * topState: function(), + * + * + * _currentRules: function(), + * + * + * stateStackSize: function(), + * + * + * performAction: function(yy, yy_, yyrulenumber, YY_START), + * + * + * rules: [...], + * + * + * conditions: {associative list: name ==> set}, + * } + * + * + * token location info (\`yylloc\`): { + * first_line: n, + * last_line: n, + * first_column: n, + * last_column: n, + * range: [start_number, end_number] + * (where the numbers are indexes into the input string, zero-based) + * } + * + * --- + * + * The \`parseError\` function receives a \'hash\' object with these members for lexer errors: + * + * { + * text: (matched text) + * token: (the produced terminal token, if any) + * token_id: (the produced terminal token numeric ID, if any) + * line: (yylineno) + * loc: (yylloc) + * recoverable: (boolean: TRUE when the parser MAY have an error recovery rule + * available for this particular error) + * yy: (object: the current parser internal "shared state" \`yy\` + * as is also available in the rule actions; this can be used, + * for instance, for advanced error analysis and reporting) + * lexer: (reference to the current lexer instance used by the parser) + * } + * + * while \`this\` will reference the current lexer instance. + * + * When \`parseError\` is invoked by the lexer, the default implementation will + * attempt to invoke \`yy.parser.parseError()\`; when this callback is not provided + * it will try to invoke \`yy.parseError()\` instead. When that callback is also not + * provided, a \`JisonLexerError\` exception will be thrown containing the error + * message and \`hash\`, as constructed by the \`constructLexErrorInfo()\` API. + * + * Note that the lexer\'s \`JisonLexerError\` error class is passed via the + * \`ExceptionClass\` argument, which is invoked to construct the exception + * instance to be thrown, so technically \`parseError\` will throw the object + * produced by the \`new ExceptionClass(str, hash)\` JavaScript expression. + * + * --- + * + * You can specify lexer options by setting / modifying the \`.options\` object of your Lexer instance. + * These options are available: + * + * (Options are permanent.) + * + * yy: { + * parseError: function(str, hash, ExceptionClass) + * optional: overrides the default \`parseError\` function. + * } + * + * lexer.options: { + * pre_lex: function() + * optional: is invoked before the lexer is invoked to produce another token. + * \`this\` refers to the Lexer object. + * post_lex: function(token) { return token; } + * optional: is invoked when the lexer has produced a token \`token\`; + * this function can override the returned token value by returning another. + * When it does not return any (truthy) value, the lexer will return + * the original \`token\`. + * \`this\` refers to the Lexer object. + * + * WARNING: the next set of options are not meant to be changed. They echo the abilities of + * the lexer as per when it was compiled! 
+ *
+ *  ranges: boolean
+ *      optional: \`true\` ==> token location info will include a .range[] member.
+ *  flex: boolean
+ *      optional: \`true\` ==> flex-like lexing behaviour where the rules are tested
+ *      exhaustively to find the longest match.
+ *  backtrack_lexer: boolean
+ *      optional: \`true\` ==> lexer regexes are tested in order and for each matching
+ *      regex the action code is invoked; the lexer terminates the scan when a token
+ *      is returned by the action code.
+ *  xregexp: boolean
+ *      optional: \`true\` ==> lexer rule regexes are "extended regex format" requiring the
+ *      \`XRegExp\` library. When this %option has not been specified at compile time, all lexer
+ *      rule regexes have been written as standard JavaScript RegExp expressions.
+ * }
+ */
+    `;
+
+    return out;
+}
+
+function prepareOptions(opt) {
+    opt = opt || {};
+
+    // check for illegal identifier
+    if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) {
+        if (opt.moduleName) {
+            var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.';
+            if (typeof opt.warn_cb === 'function') {
+                opt.warn_cb(msg);
+            } else {
+                // do not treat as warning; barf hairball instead so that this oddity gets noticed right away!
+                throw new Error(msg);
+            }
+        }
+        opt.moduleName = 'lexer';
+    }
+
+    prepExportStructures(opt);
+
+    return opt;
+}
+
+function generateModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var ' + opt.moduleName + ' = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '})();'
+    ];
+
+    var src = out.join('\n') + '\n';
+    src = stripUnusedLexerCode(src, opt);
+    opt.exportSourceCode.all = src;
+    return src;
+}
+
+function generateAMDModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'define([], function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '});'
+    ];
+
+    var src = out.join('\n') + '\n';
+    src = stripUnusedLexerCode(src, opt);
+    opt.exportSourceCode.all = src;
+    return src;
+}
+
+function generateESModule(opt) {
+    opt = prepareOptions(opt);
+
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var lexer = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ?
opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();', + '', + 'function yylex() {', + ' return lexer.lex.apply(lexer, arguments);', + '}', + rmCommonWS` + export { + lexer, + yylex as lex + }; + ` + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; +} + +function generateCommonJSModule(opt) { + opt = prepareOptions(opt); + + var out = [ + generateGenericHeaderComment(), + '', + 'var ' + opt.moduleName + ' = (function () {', + jisonLexerErrorDefinition, + '', + generateModuleBody(opt), + '', + (opt.moduleInclude ? opt.moduleInclude + ';' : ''), + '', + 'return lexer;', + '})();', + '', + 'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {', + ' exports.lexer = ' + opt.moduleName + ';', + ' exports.lex = function () {', + ' return ' + opt.moduleName + '.lex.apply(lexer, arguments);', + ' };', + '}' + ]; + + var src = out.join('\n') + '\n'; + src = stripUnusedLexerCode(src, opt); + opt.exportSourceCode.all = src; + return src; } RegExpLexer.generate = generate; -module.exports = RegExpLexer; +RegExpLexer.version = version; +RegExpLexer.defaultJisonLexOptions = defaultJisonLexOptions; +RegExpLexer.mkStdOptions = mkStdOptions; +RegExpLexer.camelCase = camelCase; +RegExpLexer.autodetectAndConvertToJSONformat = autodetectAndConvertToJSONformat; + + +export default RegExpLexer; diff --git a/regexp-set-management.js b/regexp-set-management.js new file mode 100644 index 0000000..4802ce4 --- /dev/null +++ b/regexp-set-management.js @@ -0,0 +1,999 @@ +// +// Helper library for set definitions +// +// MIT Licensed +// +// +// This code is intended to help parse regex set expressions and mix them +// together, i.e. to answer questions like this: +// +// what is the resulting regex set expression when we mix the regex set +// `[a-z]` with the regex set `[^\s]` where with 'mix' we mean that any +// input which matches either input regex should match the resulting +// regex set. (a.k.a. Full Outer Join, see also http://www.diffen.com/difference/Inner_Join_vs_Outer_Join) +// + +'use strict'; + +import XRegExp from '@gerhobbelt/xregexp'; +import assert from 'assert'; + + + + +const XREGEXP_UNICODE_ESCAPE_RE = /^\{[A-Za-z0-9 \-\._]+\}/; // Matches the XRegExp Unicode escape braced part, e.g. 
`{Number}`
+const CHR_RE = /^(?:[^\\]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})/;
+const SET_PART_RE = /^(?:[^\\\]]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/;
+const NOTHING_SPECIAL_RE = /^(?:[^\\\[\]\(\)\|^\{\}]|\\[^cxu0-9]|\\[0-9]{1,3}|\\c[A-Z]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\})+/;
+const SET_IS_SINGLE_PCODE_RE = /^\\[dDwWsS]$|^\\p\{[A-Za-z0-9 \-\._]+\}$/;
+
+const UNICODE_BASE_PLANE_MAX_CP = 65535;
+
+// The expanded regex sets which are equivalent to the given `\\{c}` escapes:
+//
+// `/\s/`:
+const WHITESPACE_SETSTR = ' \f\n\r\t\v\u00a0\u1680\u180e\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff';
+// `/\d/`:
+const DIGIT_SETSTR = '0-9';
+// `/\w/`:
+const WORDCHAR_SETSTR = 'A-Za-z0-9_';
+
+
+// Helper for `bitarray2set()`: convert character code to a representation string suitable for use in a regex
+function i2c(i) {
+    var c, x;
+
+    switch (i) {
+    case 10:
+        return '\\n';
+
+    case 13:
+        return '\\r';
+
+    case 9:
+        return '\\t';
+
+    case 8:
+        return '\\b';
+
+    case 12:
+        return '\\f';
+
+    case 11:
+        return '\\v';
+
+    case 45:        // ASCII/Unicode for '-' dash
+        return '\\-';
+
+    case 91:        // '['
+        return '\\[';
+
+    case 92:        // '\\'
+        return '\\\\';
+
+    case 93:        // ']'
+        return '\\]';
+
+    case 94:        // '^' caret
+        return '\\^';
+    }
+    if (i < 32
+            || i > 0xFFF0 /* Unicode Specials, also in UTF16 */
+            || (i >= 0xD800 && i <= 0xDFFF) /* UTF-16 surrogates, used to address the Unicode Supplementary Planes; we're TOAST in JavaScript as we're NOT UTF-16 but UCS-2! */
+            || String.fromCharCode(i).match(/[\u2028\u2029]/) /* Code compilation via `new Function()` does not like to see these, or rather: treats them as just another form of CRLF, which breaks your generated regex code! */
+    ) {
+        // Detail about a detail:
+        // U+2028 and U+2029 are part of the `\s` regex escape code (`\s` and `[\s]` match either of these) and when placed in a JavaScript
+        // source file verbatim (without escaping it as a `\uNNNN` item) then JavaScript will interpret it as such and consequently report
+        // a b0rked generated parser, as the generated code would include this regex right here.
+        // Hence we MUST escape these buggers everywhere we go...
+        x = i.toString(16);
+        if (x.length >= 1 && i <= 0xFFFF) {
+            c = '0000' + x;
+            return '\\u' + c.substr(c.length - 4);
+        } else {
+            return '\\u{' + x + '}';
+        }
+    }
+    return String.fromCharCode(i);
+}
+
+
+// Helper collection for `bitarray2set()`: we have expanded all these cached `\\p{NAME}` regex sets when creating
+// this bitarray and now we should look at these expansions again to see if `bitarray2set()` can produce a
+// `\\p{NAME}` shorthand to represent [part of] the bitarray:
+var Pcodes_bitarray_cache = {};
+var Pcodes_bitarray_cache_test_order = [];
+
+// Helper collection for `bitarray2set()` for minifying special cases of result sets which can be represented by
+// a single regex 'escape', e.g. `\d` for digits 0-9.
+var EscCode_bitarray_output_refs;
+
+// now initialize the EscCodes_... table above:
+init_EscCode_lookup_table();
+
+function init_EscCode_lookup_table() {
+    var s, bitarr, set2esc = {}, esc2bitarr = {};
+
+    // patch global lookup tables for the time being, while we calculate their *real* content in this function:
+    EscCode_bitarray_output_refs = {
+        esc2bitarr: {},
+        set2esc: {}
+    };
+    Pcodes_bitarray_cache_test_order = [];
+
+    // `/\S`:
+    bitarr = [];
+    set2bitarray(bitarr, '^' + WHITESPACE_SETSTR);
+    s = bitarray2set(bitarr);
+    esc2bitarr['S'] = bitarr;
+    set2esc[s] = 'S';
+    // set2esc['^' + s] = 's';
+    Pcodes_bitarray_cache['\\S'] = bitarr;
+
+    // `/\s`:
+    bitarr = [];
+    set2bitarray(bitarr, WHITESPACE_SETSTR);
+    s = bitarray2set(bitarr);
+    esc2bitarr['s'] = bitarr;
+    set2esc[s] = 's';
+    // set2esc['^' + s] = 'S';
+    Pcodes_bitarray_cache['\\s'] = bitarr;
+
+    // `/\D`:
+    bitarr = [];
+    set2bitarray(bitarr, '^' + DIGIT_SETSTR);
+    s = bitarray2set(bitarr);
+    esc2bitarr['D'] = bitarr;
+    set2esc[s] = 'D';
+    // set2esc['^' + s] = 'd';
+    Pcodes_bitarray_cache['\\D'] = bitarr;
+
+    // `/\d`:
+    bitarr = [];
+    set2bitarray(bitarr, DIGIT_SETSTR);
+    s = bitarray2set(bitarr);
+    esc2bitarr['d'] = bitarr;
+    set2esc[s] = 'd';
+    // set2esc['^' + s] = 'D';
+    Pcodes_bitarray_cache['\\d'] = bitarr;
+
+    // `/\W`:
+    bitarr = [];
+    set2bitarray(bitarr, '^' + WORDCHAR_SETSTR);
+    s = bitarray2set(bitarr);
+    esc2bitarr['W'] = bitarr;
+    set2esc[s] = 'W';
+    // set2esc['^' + s] = 'w';
+    Pcodes_bitarray_cache['\\W'] = bitarr;
+
+    // `/\w`:
+    bitarr = [];
+    set2bitarray(bitarr, WORDCHAR_SETSTR);
+    s = bitarray2set(bitarr);
+    esc2bitarr['w'] = bitarr;
+    set2esc[s] = 'w';
+    // set2esc['^' + s] = 'W';
+    Pcodes_bitarray_cache['\\w'] = bitarr;
+
+    EscCode_bitarray_output_refs = {
+        esc2bitarr: esc2bitarr,
+        set2esc: set2esc
+    };
+
+    updatePcodesBitarrayCacheTestOrder();
+}
+
+function updatePcodesBitarrayCacheTestOrder(opts) {
+    var t = new Array(UNICODE_BASE_PLANE_MAX_CP + 1);
+    var l = {};
+    var user_has_xregexp = opts && opts.options && opts.options.xregexp;
+    var i, j, k, ba;
+
+    // mark every character with which regex pcodes they are part of:
+    for (k in Pcodes_bitarray_cache) {
+        ba = Pcodes_bitarray_cache[k];
+
+        if (!user_has_xregexp && k.indexOf('\\p{') >= 0) {
+            continue;
+        }
+
+        var cnt = 0;
+        for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP; i++) {
+            if (ba[i]) {
+                cnt++;
+                if (!t[i]) {
+                    t[i] = [k];
+                } else {
+                    t[i].push(k);
+                }
+            }
+        }
+        l[k] = cnt;
+    }
+
+    // now dig out the unique ones: only need one per pcode.
+    //
+    // We ASSUME every \\p{NAME} 'pcode' has at least ONE character
+    // in it that is ONLY matched by that particular pcode.
+    // If this assumption fails, nothing is lost, but our 'regex set
+    // optimized representation' will be sub-optimal as then this pcode
+    // won't be tested during optimization.
+    //
+    // Now that would be a pity, so the assumption better holds...
+    // Turns out it already fails for the six standard escapes: every
+    // character is matched by exactly three of \d, \D, \s, \S, \w and \W,
+    // so none of them has a uniquely identifying character. So we have to
+    // look for markers which match multiple escapes/pcodes for those
+    // ones where a unique item isn't available...
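+    // A quick standalone probe of the claim above, using native regexes only:
+    //
+    //     ['a', '0', ' '].map(function (c) {
+    //         return [/\d/, /\D/, /\s/, /\S/, /\w/, /\W/].filter(function (re) {
+    //             return re.test(c);
+    //         }).length;
+    //     });
+    //     // --> [3, 3, 3]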
+    var lut = [];
+    var done = {};
+    var keys = Object.keys(Pcodes_bitarray_cache);
+
+    for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP; i++) {
+        k = t[i][0];
+        if (t[i].length === 1 && !done[k]) {
+            assert(l[k] > 0);
+            lut.push([i, k]);
+            done[k] = true;
+        }
+    }
+
+    for (j = 0; keys[j]; j++) {
+        k = keys[j];
+
+        if (!user_has_xregexp && k.indexOf('\\p{') >= 0) {
+            continue;
+        }
+
+        if (!done[k]) {
+            assert(l[k] > 0);
+            // find a minimum span character to mark this one:
+            var w = Infinity;
+            var rv;
+            ba = Pcodes_bitarray_cache[k];
+            for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP; i++) {
+                if (ba[i]) {
+                    var tl = t[i].length;
+                    if (tl > 1 && tl < w) {
+                        assert(l[k] > 0);
+                        rv = [i, k];
+                        w = tl;
+                    }
+                }
+            }
+            if (rv) {
+                done[k] = true;
+                lut.push(rv);
+            }
+        }
+    }
+
+    // order from large set to small set so that small sets don't gobble
+    // characters also represented by overlapping larger set pcodes.
+    //
+    // Again we assume something: that finding the large regex pcode sets
+    // before the smaller, more specialized ones, will produce a more
+    // optimal minification of the regex set expression.
+    //
+    // This is a guestimate/heuristic only!
+    lut.sort(function (a, b) {
+        var k1 = a[1];
+        var k2 = b[1];
+        var ld = l[k2] - l[k1];
+        if (ld) {
+            return ld;
+        }
+        // and for same-size sets, order from high to low unique identifier.
+        return b[0] - a[0];
+    });
+
+    Pcodes_bitarray_cache_test_order = lut;
+}
+
+
+
+// 'Join' a regex set `[...]` into a Unicode range spanning logic array, flagging every character in the given set.
+function set2bitarray(bitarr, s, opts) {
+    var orig = s;
+    var set_is_inverted = false;
+    var bitarr_orig;
+
+    function mark(d1, d2) {
+        if (d2 == null) d2 = d1;
+        for (var i = d1; i <= d2; i++) {
+            bitarr[i] = true;
+        }
+    }
+
+    function add2bitarray(dst, src) {
+        for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP; i++) {
+            if (src[i]) {
+                dst[i] = true;
+            }
+        }
+    }
+
+    function eval_escaped_code(s) {
+        var c;
+        // decode escaped code? If none, just take the character as-is
+        if (s.indexOf('\\') === 0) {
+            var l = s.substr(0, 2);
+            switch (l) {
+            case '\\c':
+                c = s.charCodeAt(2) - 'A'.charCodeAt(0) + 1;
+                return String.fromCharCode(c);
+
+            case '\\x':
+                s = s.substr(2);
+                c = parseInt(s, 16);
+                return String.fromCharCode(c);
+
+            case '\\u':
+                s = s.substr(2);
+                if (s[0] === '{') {
+                    s = s.substr(1, s.length - 2);
+                }
+                c = parseInt(s, 16);
+                if (c >= 0x10000) {
+                    return new Error('We do NOT support Extended Plane Unicode Codepoints (i.e. CodePoints beyond U:FFFF) in regex set expressions, e.g. \\u{' + s + '}');
+                }
+                return String.fromCharCode(c);
+
+            case '\\0':
+            case '\\1':
+            case '\\2':
+            case '\\3':
+            case '\\4':
+            case '\\5':
+            case '\\6':
+            case '\\7':
+                s = s.substr(1);
+                c = parseInt(s, 8);
+                return String.fromCharCode(c);
+
+            case '\\r':
+                return '\r';
+
+            case '\\n':
+                return '\n';
+
+            case '\\v':
+                return '\v';
+
+            case '\\f':
+                return '\f';
+
+            case '\\t':
+                return '\t';
+
+            case '\\b':
+                return '\b';
+
+            default:
+                // just the character itself:
+                return s.substr(1);
+            }
+        } else {
+            return s;
+        }
+    }
+
+    if (s && s.length) {
+        var c1, c2;
+
+        // inverted set?
+        if (s[0] === '^') {
+            set_is_inverted = true;
+            s = s.substr(1);
+            bitarr_orig = bitarr;
+            bitarr = new Array(UNICODE_BASE_PLANE_MAX_CP + 1);
+        }
+
+        // BITARR collects flags for the characters in the set. Inversion means the
+        // complement set of characters is set instead. This results in an OR
+        // operation when sets are joined/chained.
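+        // For example (hypothetical call): `set2bitarray(ba, '^a-z')` marks 'a'..'z'
+        // in a scratch array first; the inversion step at the bottom of this function
+        // then flips those flags into the caller's `ba`, so every code point EXCEPT
+        // 'a'..'z' ends up set.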
+ + while (s.length) { + c1 = s.match(CHR_RE); + if (!c1) { + // hit an illegal escape sequence? cope anyway! + c1 = s[0]; + } else { + c1 = c1[0]; + // Quick hack for XRegExp escapes inside a regex `[...]` set definition: we *could* try to keep those + // intact but it's easier to unfold them here; this is not nice for when the grammar specifies explicit + // XRegExp support, but alas, we'll get there when we get there... ;-) + switch (c1) { + case '\\p': + s = s.substr(c1.length); + c2 = s.match(XREGEXP_UNICODE_ESCAPE_RE); + if (c2) { + c2 = c2[0]; + s = s.substr(c2.length); + // do we have this one cached already? + var pex = c1 + c2; + var ba4p = Pcodes_bitarray_cache[pex]; + if (!ba4p) { + // expand escape: + var xr = new XRegExp('[' + pex + ']'); // TODO: case-insensitive grammar??? + // rewrite to a standard `[...]` regex set: XRegExp will do this for us via `XRegExp.toString()`: + var xs = '' + xr; + // remove the wrapping `/.../` to get at the (possibly *combined* series of) `[...]` sets inside: + xs = xs.substr(1, xs.length - 2); + + ba4p = reduceRegexToSetBitArray(xs, pex, opts); + + Pcodes_bitarray_cache[pex] = ba4p; + updatePcodesBitarrayCacheTestOrder(opts); + } + // merge bitarrays: + add2bitarray(bitarr, ba4p); + continue; + } + break; + + case '\\S': + case '\\s': + case '\\W': + case '\\w': + case '\\d': + case '\\D': + // these can't participate in a range, but need to be treated special: + s = s.substr(c1.length); + // check for \S, \s, \D, \d, \W, \w and expand them: + var ba4e = EscCode_bitarray_output_refs.esc2bitarr[c1[1]]; + assert(ba4e); + add2bitarray(bitarr, ba4e); + continue; + + case '\\b': + // matches a backspace: https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions#special-backspace + c1 = '\u0008'; + break; + } + } + var v1 = eval_escaped_code(c1); + // propagate deferred exceptions = error reports. + if (v1 instanceof Error) { + return v1; + } + v1 = v1.charCodeAt(0); + s = s.substr(c1.length); + + if (s[0] === '-' && s.length >= 2) { + // we can expect a range like 'a-z': + s = s.substr(1); + c2 = s.match(CHR_RE); + if (!c2) { + // hit an illegal escape sequence? cope anyway! + c2 = s[0]; + } else { + c2 = c2[0]; + } + var v2 = eval_escaped_code(c2); + // propagate deferred exceptions = error reports. + if (v2 instanceof Error) { + return v1; + } + v2 = v2.charCodeAt(0); + s = s.substr(c2.length); + + // legal ranges go UP, not /DOWN! + if (v1 <= v2) { + mark(v1, v2); + } else { + console.warn('INVALID CHARACTER RANGE found in regex: ', { re: orig, start: c1, start_n: v1, end: c2, end_n: v2 }); + mark(v1); + mark('-'.charCodeAt(0)); + mark(v2); + } + continue; + } + mark(v1); + } + + // When we have marked all slots, '^' NEGATES the set, hence we flip all slots. + // + // Since a regex like `[^]` should match everything(?really?), we don't need to check if the MARK + // phase actually marked anything at all: the `^` negation will correctly flip=mark the entire + // range then. + if (set_is_inverted) { + for (var i = 0; i <= UNICODE_BASE_PLANE_MAX_CP; i++) { + if (!bitarr[i]) { + bitarr_orig[i] = true; + } + } + } + } + return false; +} + + +// convert a simple bitarray back into a regex set `[...]` content: +function bitarray2set(l, output_inverted_variant, output_minimized) { + // construct the inverse(?) 
set from the mark-set: + // + // Before we do that, we inject a sentinel so that our inner loops + // below can be simple and fast: + l[UNICODE_BASE_PLANE_MAX_CP + 1] = 1; + // now reconstruct the regex set: + var rv = []; + var i, j, cnt, lut, tn, tspec, match, pcode, ba4pcode, l2; + var bitarr_is_cloned = false; + var l_orig = l; + + if (output_inverted_variant) { + // generate the inverted set, hence all unmarked slots are part of the output range: + cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP; i++) { + if (!l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP + 1) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + // BUT... since we output the INVERTED set, we output the match-all set instead: + return '\\S\\s'; + } + else if (cnt === 0) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + // BUT... since we output the INVERTED set, we output the match-nothing set instead: + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the inverted set: + if (!l[tspec[0]]) { + // check if the pcode is covered by the inverted set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP; j++) { + if (ba4pcode[j]) { + if (!l[j]) { + // match in current inverted bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP; j++) { + l2[j] = l[j] || ba4pcode[j]; // `!(!l[j] && !ba4pcode[j])` + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP; j++) { + l[j] = l[j] || ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP) { + // find first character not in original set: + while (l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; !l[j]; j++) {} /* empty loop */ + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } else { + // generate the non-inverted set, hence all logic checks are inverted here... 
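+        // The two shorthand outputs used by both branches are easy to verify in
+        // isolation with native regexes:
+        //
+        //     /[\S\s]/.test('x');    // --> true:  matches ANY character (incl. newlines)
+        //     /[^\S\s]/.test('x');   // --> false: matches nothing, since every character
+        //                            //     is either whitespace (\s) or non-whitespace (\S)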
+ cnt = 0; + for (i = 0; i <= UNICODE_BASE_PLANE_MAX_CP; i++) { + if (l[i]) { + cnt++; + } + } + if (cnt === UNICODE_BASE_PLANE_MAX_CP + 1) { + // When we find the entire Unicode range is in the output match set, we replace this with + // a shorthand regex: `[\S\s]` + return '\\S\\s'; + } + else if (cnt === 0) { + // When there's nothing in the output we output a special 'match-nothing' regex: `[^\S\s]`. + return '^\\S\\s'; + } + + // Now see if we can replace several bits by an escape / pcode: + if (output_minimized) { + lut = Pcodes_bitarray_cache_test_order; + for (tn = 0; lut[tn]; tn++) { + tspec = lut[tn]; + // check if the uniquely identifying char is in the set: + if (l[tspec[0]]) { + // check if the pcode is covered by the set: + pcode = tspec[1]; + ba4pcode = Pcodes_bitarray_cache[pcode]; + match = 0; + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP; j++) { + if (ba4pcode[j]) { + if (l[j]) { + // match in current bitset, i.e. there's at + // least one 'new' bit covered by this pcode/escape: + match++; + } else if (!l_orig[j]) { + // mismatch! + match = false; + break; + } + } + } + + // We're only interested in matches which actually cover some + // yet uncovered bits: `match !== 0 && match !== false`. + // + // Apply the heuristic that the pcode/escape is only going to be used + // when it covers *more* characters than its own identifier's length: + if (match && match > pcode.length) { + rv.push(pcode); + + // and nuke the bits in the array which match the given pcode: + // make sure these edits are visible outside this function as + // `l` is an INPUT parameter (~ not modified)! + if (!bitarr_is_cloned) { + l2 = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP; j++) { + l2[j] = l[j] && !ba4pcode[j]; + } + // recreate sentinel + l2[UNICODE_BASE_PLANE_MAX_CP + 1] = 1; + l = l2; + bitarr_is_cloned = true; + } else { + for (j = 0; j <= UNICODE_BASE_PLANE_MAX_CP; j++) { + l[j] = l[j] && !ba4pcode[j]; + } + } + } + } + } + } + + i = 0; + while (i <= UNICODE_BASE_PLANE_MAX_CP) { + // find first character not in original set: + while (!l[i]) { + i++; + } + if (i >= UNICODE_BASE_PLANE_MAX_CP + 1) { + break; + } + // find next character not in original set: + for (j = i + 1; l[j]; j++) {} /* empty loop */ + if (j > UNICODE_BASE_PLANE_MAX_CP + 1) { + j = UNICODE_BASE_PLANE_MAX_CP + 1; + } + // generate subset: + rv.push(i2c(i)); + if (j - 1 > i) { + rv.push((j - 2 > i ? '-' : '') + i2c(j - 1)); + } + i = j; + } + } + + assert(rv.length); + var s = rv.join(''); + assert(s); + + // Check if the set is better represented by one of the regex escapes: + var esc4s = EscCode_bitarray_output_refs.set2esc[s]; + if (esc4s) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return '\\' + esc4s; + } + return s; +} + + + + + +// Pretty brutal conversion of 'regex' `s` back to raw regex set content: strip outer [...] when they're there; +// ditto for inner combos of sets, i.e. `]|[` as in `[0-9]|[a-z]`. +function reduceRegexToSetBitArray(s, name, opts) { + var orig = s; + + // propagate deferred exceptions = error reports. + if (s instanceof Error) { + return s; + } + + var l = new Array(UNICODE_BASE_PLANE_MAX_CP + 1); + var internal_state = 0; + var derr; + + while (s.length) { + var c1 = s.match(CHR_RE); + if (!c1) { + // cope with illegal escape sequences too! 
+            return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+        } else {
+            c1 = c1[0];
+        }
+        s = s.substr(c1.length);
+
+        switch (c1) {
+        case '[':
+            // this is starting a set within the regex: scan until end of set!
+            var set_content = [];
+            while (s.length) {
+                var inner = s.match(SET_PART_RE);
+                if (!inner) {
+                    inner = s.match(CHR_RE);
+                    if (!inner) {
+                        // cope with illegal escape sequences too!
+                        return new Error('illegal escape sequence at start of regex part: "' + s + '" of regex "' + orig + '"');
+                    } else {
+                        inner = inner[0];
+                    }
+                    if (inner === ']') break;
+                } else {
+                    inner = inner[0];
+                }
+                set_content.push(inner);
+                s = s.substr(inner.length);
+            }
+
+            // ensure that we hit the terminating ']':
+            var c2 = s.match(CHR_RE);
+            if (!c2) {
+                // cope with illegal escape sequences too!
+                return new Error('regex set expression is broken in regex: "' + orig + '" --> "' + s + '"');
+            } else {
+                c2 = c2[0];
+            }
+            if (c2 !== ']') {
+                return new Error('regex set expression is broken in regex: ' + orig);
+            }
+            s = s.substr(c2.length);
+
+            var se = set_content.join('');
+            if (!internal_state) {
+                derr = set2bitarray(l, se, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                // a set is to be used like a single character in a longer literal phrase,
+                // hence input `[abc]word[def]` would thus produce output `[abc]`:
+                internal_state = 1;
+            }
+            break;
+
+        // Strip unescaped pipes to catch constructs like `\\r|\\n` and turn them into
+        // something ready for use inside a regex set, e.g. `\\r\\n`.
+        //
+        // > Of course, we realize that converting more complex piped constructs this way
+        // > will produce something you might not expect, e.g. `A|WORD2` which
+        // > would end up as the set `[AW]`, which is something entirely different from
+        // > the input.
+        // >
+        // > However, we can only depend on the user (grammar writer) to realize this and
+        // > prevent it from happening by not creating such oddities in the input grammar.
+        case '|':
+            // a|b --> [ab]
+            internal_state = 0;
+            break;
+
+        case '(':
+            // (a) --> a
+            //
+            // TODO - right now we treat this as 'too complex':
+
+            // Strip off some possible outer wrappers which we know how to remove.
+            // We don't worry about 'damaging' the regex as any too-complex regex will be caught
+            // in the validation check at the end; our 'strippers' here would not damage useful
+            // regexes anyway, and if they damage the unacceptable ones, that is fine.
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');     // (?:...) -> ... and (...) -> ...
+            s = s.replace(/^\^?(.*?)\$?$/, '$1');           // ^...$ --> ... (catch these both inside and outside the outer grouping, hence do the ungrouping twice: once before, once after this)
+            s = s.replace(/^\((?:\?:)?(.*?)\)$/, '$1');     // (?:...) -> ... and (...) -> ...
+
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '.':
+        case '*':
+        case '+':
+        case '?':
+            // wildcard
+            //
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        case '{':                       // range, e.g. `x{1,3}`, or macro?
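+            // Why this case must bail out: quantifiers cannot survive inside a set.
+            // For example, folding a macro which expands to `x{1,3}` into a set
+            // would yield `[x{1,3}]`, which matches the *literal* characters
+            // x { 1 , 3 } rather than "one to three x's".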
+            // TODO - right now we treat this as 'too complex':
+            return new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + orig + ']"]');
+
+        default:
+            // literal character or word: take the first character only and ignore the rest, so that
+            // the constructed set for `word|noun` would be `[wn]`:
+            if (!internal_state) {
+                derr = set2bitarray(l, c1, opts);
+                // propagate deferred exceptions = error reports.
+                if (derr instanceof Error) {
+                    return derr;
+                }
+
+                internal_state = 2;
+            }
+            break;
+        }
+    }
+
+    s = bitarray2set(l);
+
+    // When this result is suitable for use in a set, then we should be able to compile
+    // it into a regex; that way we can easily validate whether macro X is fit to be used
+    // inside a regex set:
+    try {
+        var re;
+        assert(s);
+        assert(!(s instanceof Error));
+        re = new XRegExp('[' + s + ']');
+        re.test(s[0]);
+
+        // One thing is apparently *not* caught by the RegExp compile action above: `[a[b]c]`,
+        // so we also check for lingering UNESCAPED brackets in here, as those cannot be legal:
+        if (/[^\\][\[\]]/.exec(s)) {
+            throw new Error('unescaped brackets in set data');
+        }
+    } catch (ex) {
+        // make sure we produce a set range expression which will fail badly when it is used
+        // in actual code:
+        s = new Error('[macro [' + name + '] is unsuitable for use inside regex set expressions: "[' + s + ']"]: ' + ex.message);
+    }
+
+    assert(s);
+    // propagate deferred exceptions = error reports.
+    if (s instanceof Error) {
+        return s;
+    }
+    return l;
+}
+
+
+
+
+// Convert bitarray representing, for example, `'0-9'` to regex string `[0-9]`
+// -- or in this example it can be further optimized to only `\d`!
+function produceOptimizedRegex4Set(bitarr) {
+    // First try to produce a minimum regex from the bitarray directly:
+    var s1 = bitarray2set(bitarr, false, true);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s1.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s1;
+    } else {
+        s1 = '[' + s1 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
+    var s2 = bitarray2set(bitarr, true, true);
+
+    if (s2[0] === '^') {
+        s2 = s2.substr(1);
+        if (s2.match(SET_IS_SINGLE_PCODE_RE)) {
+            // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+            return s2;
+        }
+    } else {
+        s2 = '^' + s2;
+    }
+    s2 = '[' + s2 + ']';
+
+    // Then, as some pcode/escapes still happen to deliver a LARGER regex string in the end,
+    // we also check against the plain, unadulterated regex set expressions:
+    //
+    // First try to produce a minimum regex from the bitarray directly:
+    var s3 = bitarray2set(bitarr, false, false);
+
+    // and when the regex set turns out to match a single pcode/escape, then
+    // use that one as-is:
+    if (s3.match(SET_IS_SINGLE_PCODE_RE)) {
+        // When we hit a special case like this, it is always the shortest notation, hence wins on the spot!
+        return s3;
+    } else {
+        s3 = '[' + s3 + ']';
+    }
+
+    // Now try to produce a minimum regex from the *inverted* bitarray via negation:
+    // Because we look at a negated bitset, there's no use looking for matches with
+    // special cases here.
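+    // (Illustration with hypothetical values, not part of the algorithm: for a
+    // bitarray covering every codepoint except `\n`, the four candidates might
+    // come out roughly as
+    //
+    //     s1 = '[\0-\t\x0B-\uFFFF]'    // direct, pcode-minimized
+    //     s2 = '[^\n]'                 // inverted, pcode-minimized
+    //     s3, s4 = the same two again, but without pcode minimization
+    //
+    // after which the length comparison below picks the shortest one: `[^\n]`.)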
+ var s4 = bitarray2set(bitarr, true, false); + + if (s4[0] === '^') { + s4 = s4.substr(1); + if (s4.match(SET_IS_SINGLE_PCODE_RE)) { + // When we hit a special case like this, it is always the shortest notation, hence wins on the spot! + return s4; + } + } else { + s4 = '^' + s4; + } + s4 = '[' + s4 + ']'; + + if (s2.length < s1.length) { + s1 = s2; + } + if (s3.length < s1.length) { + s1 = s3; + } + if (s4.length < s1.length) { + s1 = s4; + } + + return s1; +} + + + + + + +export default { + XREGEXP_UNICODE_ESCAPE_RE, + CHR_RE, + SET_PART_RE, + NOTHING_SPECIAL_RE, + SET_IS_SINGLE_PCODE_RE, + + UNICODE_BASE_PLANE_MAX_CP, + + WHITESPACE_SETSTR, + DIGIT_SETSTR, + WORDCHAR_SETSTR, + + set2bitarray, + bitarray2set, + produceOptimizedRegex4Set, + reduceRegexToSetBitArray, +}; + diff --git a/rollup.config-cli.js b/rollup.config-cli.js new file mode 100644 index 0000000..941308c --- /dev/null +++ b/rollup.config-cli.js @@ -0,0 +1,62 @@ +// rollup.config.js +import resolve from 'rollup-plugin-node-resolve'; + +export default { + input: 'cli.js', + output: [ + { + file: 'dist/cli-cjs.js', + format: 'cjs' + }, + { + file: 'dist/cli-es6.js', + format: 'es' + }, + { + file: 'dist/cli-umd.js', + name: 'jison-lex', + format: 'umd' + } + ], + plugins: [ + resolve({ + // use "module" field for ES6 module if possible + module: true, // Default: true + + // use "main" field or index.js, even if it's not an ES6 module + // (needs to be converted from CommonJS to ES6 + // see https://github.com/rollup/rollup-plugin-commonjs + main: true, // Default: true + + // not all files you want to resolve are .js files + extensions: [ '.js' ], // Default: ['.js'] + + // whether to prefer built-in modules (e.g. `fs`, `path`) or + // local ones with the same names + preferBuiltins: true, // Default: true + + // If true, inspect resolved files to check that they are + // ES2015 modules + modulesOnly: true, // Default: false + }) + ], + external: [ + '@gerhobbelt/ast-util', + '@gerhobbelt/json5', + '@gerhobbelt/nomnom', + '@gerhobbelt/prettier-miscellaneous', + '@gerhobbelt/recast', + '@gerhobbelt/xregexp', + 'jison-helpers-lib', + '@gerhobbelt/lex-parser', + '@gerhobbelt/jison-lex', + '@gerhobbelt/ebnf-parser', + '@gerhobbelt/jison2json', + '@gerhobbelt/json2jison', + 'jison-gho', + 'assert', + 'fs', + 'path', + 'process', + ] +}; diff --git a/rollup.config.js b/rollup.config.js new file mode 100644 index 0000000..890f242 --- /dev/null +++ b/rollup.config.js @@ -0,0 +1,62 @@ +// rollup.config.js +import resolve from 'rollup-plugin-node-resolve'; + +export default { + input: 'regexp-lexer.js', + output: [ + { + file: 'dist/regexp-lexer-cjs.js', + format: 'cjs' + }, + { + file: 'dist/regexp-lexer-es6.js', + format: 'es' + }, + { + file: 'dist/regexp-lexer-umd.js', + name: 'regexp-lexer', + format: 'umd' + } + ], + plugins: [ + resolve({ + // use "module" field for ES6 module if possible + module: true, // Default: true + + // use "main" field or index.js, even if it's not an ES6 module + // (needs to be converted from CommonJS to ES6 + // see https://github.com/rollup/rollup-plugin-commonjs + main: true, // Default: true + + // not all files you want to resolve are .js files + extensions: [ '.js' ], // Default: ['.js'] + + // whether to prefer built-in modules (e.g. 
`fs`, `path`) or + // local ones with the same names + preferBuiltins: true, // Default: true + + // If true, inspect resolved files to check that they are + // ES2015 modules + modulesOnly: true, // Default: false + }) + ], + external: [ + '@gerhobbelt/ast-util', + '@gerhobbelt/json5', + '@gerhobbelt/nomnom', + '@gerhobbelt/prettier-miscellaneous', + '@gerhobbelt/recast', + '@gerhobbelt/xregexp', + 'jison-helpers-lib', + '@gerhobbelt/lex-parser', + '@gerhobbelt/jison-lex', + '@gerhobbelt/ebnf-parser', + '@gerhobbelt/jison2json', + '@gerhobbelt/json2jison', + 'jison-gho', + 'assert', + 'fs', + 'path', + 'process', + ] +}; diff --git a/tests/all-tests.js b/tests/all-tests.js deleted file mode 100755 index 8a0a4dd..0000000 --- a/tests/all-tests.js +++ /dev/null @@ -1,4 +0,0 @@ -exports.testRegExpLexer = require("./regexplexer"); - -if (require.main === module) - process.exit(require("test").run(exports)); diff --git a/tests/index.html b/tests/index.html new file mode 100644 index 0000000..22b6ca8 --- /dev/null +++ b/tests/index.html @@ -0,0 +1,25 @@ + + + + Lexing Kernel Tests + + + + + +
+ + + + + + + + + + + diff --git a/tests/regexplexer.js b/tests/regexplexer.js index 6128c47..253c194 100644 --- a/tests/regexplexer.js +++ b/tests/regexplexer.js @@ -1,7 +1,16 @@ -var RegExpLexer = require("../regexp-lexer"), - assert = require("assert"); +var assert = require("chai").assert; +var RegExpLexer = require("../dist/regexp-lexer-cjs-es5"); +var XRegExp = require("@gerhobbelt/xregexp"); -exports["test basic matchers"] = function() { +function re2set(re) { + var xr = new XRegExp(re); + var xs = '' + xr; + return xs.substr(2, xs.length - 4); // strip off the wrapping: /[...]/ +} + + +describe("Lexer Kernel", function () { + it("test basic matchers", function() { var dict = { rules: [ ["x", "return 'X';" ], @@ -18,9 +27,422 @@ exports["test basic matchers"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); + + // Before we go and test the API any further, we must make sure + // the used lex grammar parser delivers as expected: + describe("Parsed Lexer Grammar", function () { + it("parses special character escapes correctly", function () { + var dict = [ + "%%", + "'x' {return 'X';}", + "\\n {return 'NL';}", + "\\r {return 'R';}", + "\\v {return 'V';}", + "\\a {return 'A';}", + "\\f {return 'F';}", + "\\b {return 'B';}", + "\\x42 {return 'C';}", + "\\u0043 {return 'D';}", + "\\ {return 'E';}", + "[^] {return this.ERROR;}", + ].join('\n'); + + var lexer = new RegExpLexer(dict); + var JisonLexerError = lexer.JisonLexerError; + assert(JisonLexerError); + + var input = "x\nx\rx\vx\ax\fx\bx\x42x\u0043x xxx\\nx\\rx\\vx\\ax\\fx\\bx\\x42x\\u0043x\\ "; + + // help us monitor/debug lexer output: + var old_lex_f = lexer.lex; + lexer.lex = function () { + try { + var rv = old_lex_f.call(this); + return rv; + } catch (ex) { + //console.error("lex() ERROR EX:", ex.message, ex.stack); + throw ex; + } + }; + + lexer.setInput(input); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'NL'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'R'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'V'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'A'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'F'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), lexer.ERROR); // `\b` is a regex edge marker special, not string value '\b'! + assert.equal(lexer.lex(), 'X'); + + // As the `\b` rule comes before the 'C' rule, it will match at the start-of-word boundary... + assert.equal(lexer.lex(), 'B'); + // ...and since this lexer rule doesn't consume anything at all, it will match indefinitely... 
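+        // A quick plain-JS illustration of why this happens (zero-width match):
+        //
+        //     /\b/.exec('x')               // -> [''] : matches *between* characters
+        //     'x'.match(/\b/)[0].length    // -> 0    : nothing is consumed
+        //
+        // so a rule built from a bare `\b` can keep matching at the very same offset.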
+        for (var cnt = 42; cnt > 0; cnt--) {
+            assert.equal(lexer.lex(), 'B');
+        }
+
+        // ...until we manually NUKE that rule:
+        for (var i = 0, len = lexer.rules.length; i < len; i++) {
+            // find the lexer rule which matches the word boundary:
+            if (lexer.rules[i].test('k') && String(lexer.rules[i]).indexOf('\\b') >= 0) {
+                lexer.rules[i] = /MMMMMMMMM/;
+            }
+        }
+
+        // and verify that our lexer decompression/ruleset-caching results
+        // in the above action not having any effect until we NUKE the
+        // same regex in the condition cache:
+        for (var cnt = 42; cnt > 0; cnt--) {
+            assert.equal(lexer.lex(), 'B');
+        }
+
+        var cond_rules = lexer.__currentRuleSet__.__rule_regexes;
+        for (var i = 0, len = cond_rules.length; i < len; i++) {
+            // find the lexer rule which matches the word boundary:
+            if (cond_rules[i] && cond_rules[i].test('k') && String(cond_rules[i]).indexOf('\\b') >= 0) {
+                cond_rules[i] = /MMMMMMMMM/;
+            }
+        }
+
+        // **POSTSCRIPT**
+        //
+        // Regrettably I don't know of a way to check for this type of lexer regex rule
+        // anomaly in a generic way: the lexer rule may be a compound one, hiding the
+        // non-consuming `\b` in there, while there are other regex constructs
+        // imaginable which share the same problem as this `\b` lexer rule: a regexp
+        // match which matches a boundary, hence **an empty string**, without the
+        // grammar designer **intentionally** doing this.
+
+        assert.equal(lexer.lex(), 'C');
+        assert.equal(lexer.lex(), 'X');
+        assert.equal(lexer.lex(), 'D');
+        assert.equal(lexer.lex(), 'X');
+        assert.equal(lexer.lex(), 'E');
+
+        assert.equal(lexer.lex(), 'X');
+        assert.equal(lexer.lex(), 'X');
+        assert.equal(lexer.lex(), 'X');
+
+        // \\n
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'X');
+        // \\r
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'X');
+        // \\v
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'X');
+        // \\a
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'A');
+        assert.equal(lexer.lex(), 'X');
+        // \\f
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'X');
+        // \\b
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'X');
+        // \\x42
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'X');
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'X');
+        // \\u0043
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'X');
+        // \\_
+        assert.equal(lexer.lex(), lexer.ERROR);
+        assert.equal(lexer.lex(), 'E');
+
+        assert.equal(lexer.lex(), lexer.EOF);
+    });
+
+    it("parses literal rule strings with escapes correctly", function () {
+        var dict = [
+            "%%",
+            "'x' {return 'X';}",
+            "'\\n' {return 'SN';}",
+            "'\\r' {return 'SR';}",
+            "'\\v' {return 'SV';}",
+            "'\\a' {return 'SA';}",
+            "'\\f' {return 'SF';}",
+            "'\\b' {return 'SB';}",
+            "'\\x42' {return 'SC';}",
+            "'\\u0043' {return 'SD';}",
+            "'\\ ' {return 'SE';}",
+            "[^] {return this.ERROR;}",
+        ].join('\n');
+
+        var lexer = new RegExpLexer(dict);
+        var JisonLexerError =
lexer.JisonLexerError; + assert(JisonLexerError); + + var input = "x\nx\rx\vx\ax\fx\bx\x42x\u0043x xxx\\nx\\rx\\vx\\ax\\fx\\bx\\x42x\\u0043x\\ "; + + // help us monitor/debug lexer output: + var old_lex_f = lexer.lex; + lexer.lex = function () { + try { + var rv = old_lex_f.call(this); + return rv; + } catch (ex) { + //console.error("lex() ERROR EX:", ex.message, ex.stack); + throw ex; + } + }; + + lexer.setInput(input); + assert.equal(lexer.lex(), 'X'); + + // \n + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), 'X'); + // \r + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), 'X'); + // \v + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), 'X'); + // \a + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), 'X'); + // \f + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), 'X'); + // \b + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), 'X'); + // \x42 + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), 'X'); + // \u0043 + assert.equal(lexer.lex(), 'SD'); + assert.equal(lexer.lex(), 'X'); + // \_ + assert.equal(lexer.lex(), lexer.ERROR); + + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'X'); + + assert.equal(lexer.lex(), 'SN'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'SR'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'SV'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'SA'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'SF'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'SB'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'SC'); + assert.equal(lexer.lex(), 'X'); + // \\u0043 + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'SE'); + + assert.equal(lexer.lex(), lexer.EOF); + }); + }); + + it("lexer comes with its own JisonLexerError exception/error class", function () { + var dict = [ + "%%", + "'x' {return 'X';}", + ].join('\n'); + + var lexer = new RegExpLexer(dict); + var JisonLexerError = lexer.JisonLexerError; + assert(JisonLexerError); + + var t = new JisonLexerError('test', 42); + assert(t instanceof Error); + assert(t instanceof JisonLexerError); + assert(t.hash === 42); + assert(t.message === 'test'); + assert(t.toString() === 'JisonLexerError: test'); + + var t2 = new Error('a'); + var t3 = new JisonLexerError('test', { exception: t2 }); + assert(t2 instanceof Error); + assert(!(t2 instanceof JisonLexerError)); + assert(t3 instanceof Error); + assert(t3 instanceof JisonLexerError); + assert(!t2.hash); + assert(t3.hash); + assert(t3.hash.exception); + assert(t2.message === 'a'); + assert(t3.message === 'a'); + assert(t2.toString() === 'Error: a'); + assert(t3.toString() === 'JisonLexerError: a'); + }); + + it("lexer errors are thrown using its own JisonLexerError exception/error class", function () { + var dict = [ + "%%", + "'x' {return 'X';}", + ].join('\n'); + + var lexer = new RegExpLexer(dict); + var JisonLexerError = lexer.JisonLexerError; + assert(JisonLexerError); + + var input = "xxyx"; + + lexer.setInput(input); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'X'); + var ex1 = null; + try { + lexer.lex(); + assert(false, 
"should never get here!"); + } catch (ex) { + assert(ex instanceof Error); + assert(ex instanceof JisonLexerError); + assert(/JisonLexerError:[^]*?Unrecognized text\./.test(ex)); + assert(ex.hash); + assert.equal(typeof ex.hash.errStr, 'string'); + assert.equal(typeof ex.message, 'string'); + ex1 = ex; + } + // since the lexer has been using the standard parseError method, + // which throws an exception **AND DOES NOT MOVE THE READ CURSOR FORWARD**, + // we WILL observe the same error again on the next invocation: + try { + lexer.lex(); + assert(false, "should never get here!"); + } catch (ex) { + assert(ex instanceof Error); + assert(ex instanceof JisonLexerError); + assert(/JisonLexerError:[^]*?Unrecognized text\./.test(ex)); + assert(ex.hash); + assert.equal(typeof ex.hash.errStr, 'string'); + assert.equal(typeof ex.message, 'string'); + + assert.strictEqual(ex.message, ex1.message); + var check_items = ['text', 'line', 'loc', 'errStr']; + check_items.forEach(function (item) { + assert.deepEqual(ex[item], ex1[item], "both exceptions should have a matching member '" + item + "'"); + }); + } + // however, when we apply a non-throwing parseError, we MUST shift one character + // forward on error: + lexer.parseError = function (str, hash) { + assert(hash); + assert(str); + // and make sure the `this` reference points right back at the current *lexer* instance! + assert.equal(this, lexer); + }; + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(lexer.yytext, "y"); // the one character shifted on error should end up in the lexer "value", i.e. `yytext`! + + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.yytext, "x"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("lexer run-time errors include a display of the erroneous input context", function () { + var dict = [ + "%%", + "'x' {return 'X';}", + "\\n {return 'NL';}", + ].join('\n'); + + var lexer = new RegExpLexer(dict); + var JisonLexerError = lexer.JisonLexerError; + assert(JisonLexerError); + + var input = "x\nx\nxyzx\nx\ny\nz"; + + // help us monitor/debug lexer output: + var old_lex_f = lexer.lex; + lexer.lex = function () { + try { + var rv = old_lex_f.call(this); + return rv; + } catch (ex) { + //console.error("lex() ERROR EX:", ex.message, ex.stack); + throw ex; + } + }; -exports["test set yy"] = function() { + lexer.setInput(input); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'NL'); + assert.equal(lexer.lex(), 'X'); + + var lastErrorMsg; + var lastErrorHash; + lexer.parseError = function (str, hash) { + assert(hash); + assert(str); + // and make sure the `this` reference points right back at the current *lexer* instance! 
+ assert.equal(this, lexer); + lastErrorHash = hash; + lastErrorMsg = str; + + //hash.lexer = null; // nuke the lexer class in `yy` to keep the debug output leaner and cleaner + //console.error("error: fix?", { + // str, + // hash, + // matched: this.matched, + // match: this.match, + // matches: this.matches, + // yytext: this.yytext + //}); + + // consume at least one character of input as if everything was hunky-dory: + if (!this.matches) { + assert.strictEqual(this.yytext, ''); + this.input(); + assert.ok(this.yytext.length > 0); + } else { + assert.ok(this.yytext.length > 0); + } + return 'FIX_' + String(this.yytext).toUpperCase(); + }; + + assert.equal(lexer.lex(), 'NL'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'FIX_Y'); + assert.equal(lexer.lex(), 'FIX_Z'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'NL'); + assert.equal(lexer.lex(), 'X'); + assert.equal(lexer.lex(), 'NL'); + assert.equal(lexer.lex(), 'FIX_Y'); + assert.equal(lexer.lex(), 'NL'); + assert.equal(lexer.lex(), 'FIX_Z'); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test set yy", function() { var dict = { rules: [ ["x", "return yy.x;" ], @@ -33,10 +455,14 @@ exports["test set yy"] = function() { var lexer = new RegExpLexer(dict); lexer.setInput(input, { x: 'EX' }); - assert.equal(lexer.lex(), "EX"); -}; - -exports["test set input after"] = function() { + assert.equal(lexer.lex(), 'EX'); + assert.equal(lexer.lex(), 'EX'); + assert.equal(lexer.lex(), 'Y'); + assert.equal(lexer.lex(), 'EX'); + assert.equal(lexer.lex(), 'EOF'); + }); + + it("test set input after", function() { var dict = { rules: [ ["x", "return 'X';" ], @@ -55,9 +481,9 @@ exports["test set input after"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test unrecognized char"] = function() { + it("test unrecognized char", function() { var dict = { rules: [ ["x", "return 'X';" ], @@ -69,11 +495,42 @@ exports["test unrecognized char"] = function() { var input = "xa"; var lexer = new RegExpLexer(dict, input); + var JisonLexerError = lexer.JisonLexerError; + assert(JisonLexerError); + assert.equal(lexer.lex(), "X"); - assert.throws(function(){lexer.lex()}, "bad char"); -}; + assert.throws(function () { + lexer.lex(); + }, + JisonLexerError, + /Lexical error on line \d+[^]*?Unrecognized text/, "bad char" + ); + }); + + it("test if lexer continues correctly after having encountered an unrecognized char", function() { + var dict = { + rules: [ + ["x", "return 'X';" ], + ["y", "return 'Y';" ], + ["$", "return 'EOF';" ] + ] + }; + + var input = "xa"; + var err = 0; + + var lexer = new RegExpLexer(dict, input); + lexer.parseError = function (str) { + err++; + }; + assert.equal(lexer.lex(), "X"); + assert.equal(err, 0); + assert.equal(lexer.lex(), lexer.ERROR /* 2 */); + assert.equal(err, 1); + assert.equal(lexer.lex(), "EOF"); + }); -exports["test macro"] = function() { + it("test macro", function() { var dict = { macros: { "digit": "[0-9]" @@ -94,9 +551,9 @@ exports["test macro"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "NAT"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test macro precedence"] = function() { + it("test macro precedence", function() { var dict = { macros: { "hex": "[0-9]|[a-f]" @@ -119,9 +576,9 @@ exports["test macro precedence"] = function() { assert.equal(lexer.lex(), "-"); assert.equal(lexer.lex(), "HEX"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test nested 
macros"] = function () { + it("test nested macros", function () { var dict = { macros: { "digit": "[0-9]", @@ -148,9 +605,9 @@ exports["test nested macros"] = function () { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "NNN"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test nested macro precedence"] = function() { + it("test nested macro precedence", function() { var dict = { macros: { "hex": "[0-9]|[a-f]", @@ -174,9 +631,9 @@ exports["test nested macro precedence"] = function() { assert.equal(lexer.lex(), "-"); assert.equal(lexer.lex(), "HEX"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test action include"] = function() { + it("test action include", function() { var dict = { rules: [ ["x", "return included ? 'Y' : 'N';" ], @@ -190,9 +647,9 @@ exports["test action include"] = function() { var lexer = new RegExpLexer(dict, input); assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test ignored"] = function() { + it("test ignored", function() { var dict = { rules: [ ["x", "return 'X';" ], @@ -210,9 +667,9 @@ exports["test ignored"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test disambiguate"] = function() { + it("test disambiguate", function() { var dict = { rules: [ ["for\\b", "return 'FOR';" ], @@ -231,9 +688,9 @@ exports["test disambiguate"] = function() { assert.equal(lexer.lex(), "FOR"); assert.equal(lexer.lex(), "FOR"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test yytext overwrite"] = function() { + it("test yytext overwrite", function() { var dict = { rules: [ ["x", "yytext = 'hi der'; return 'X';" ] @@ -245,9 +702,9 @@ exports["test yytext overwrite"] = function() { var lexer = new RegExpLexer(dict, input); lexer.lex(); assert.equal(lexer.yytext, "hi der"); -}; + }); -exports["test yylineno"] = function() { + it("test yylineno with test_match", function() { var dict = { rules: [ ["\\s+", "/* skip whitespace */" ], @@ -257,7 +714,6 @@ exports["test yylineno"] = function() { }; var input = "x\nxy\n\n\nx"; - var lexer = new RegExpLexer(dict, input); assert.equal(lexer.yylineno, 0); assert.equal(lexer.lex(), "x"); @@ -267,9 +723,9 @@ exports["test yylineno"] = function() { assert.equal(lexer.yylineno, 1); assert.equal(lexer.lex(), "x"); assert.equal(lexer.yylineno, 4); -}; + }); -exports["test yylloc"] = function() { + it("test yylineno with input", function() { var dict = { rules: [ ["\\s+", "/* skip whitespace */" ], @@ -278,35 +734,187 @@ exports["test yylloc"] = function() { ] }; - var input = "x\nxy\n\n\nx"; + // windows style + var input = "a\r\nb"; + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.yylineno, 0); + assert.equal(lexer.input(), "a"); + assert.equal(lexer.input(), "\r\n"); + assert.equal(lexer.yylineno, 1); + assert.equal(lexer.input(), "b"); + assert.equal(lexer.yylineno, 1); + + // linux style + var input = "a\nb"; + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.yylineno, 0); + assert.equal(lexer.input(), "a"); + assert.equal(lexer.input(), "\n"); + assert.equal(lexer.yylineno, 1); + assert.equal(lexer.input(), "b"); + assert.equal(lexer.yylineno, 1); + + // mac style + var input = "a\rb"; + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.yylineno, 0); + assert.equal(lexer.input(), "a"); + assert.equal(lexer.input(), "\r"); + assert.equal(lexer.yylineno, 1); + assert.equal(lexer.input(), "b"); + assert.equal(lexer.yylineno, 1); + }); + + + it("test 
yylloc, yyleng, and other lexer token parameters", function() { + var dict = { + rules: [ + ["\\s+", "/* skip whitespace */" ], + ["x+", "return 'x';" ], + ["y+", "return 'y';" ] + ] + }; + + var input = "x\nxy\n\n\nx\nyyyy"; + + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.lex(), "x"); + assert.equal(lexer.yytext, "x", "yytext"); + assert.equal(lexer.yyleng, 1, "yyleng"); + assert.equal(lexer.offset, 1, "offset"); + assert.equal(lexer.match, "x", "match"); + assert.equal(lexer.matched, "x", "matched"); + assert.equal(lexer.yylloc.first_line, 1); + assert.equal(lexer.yylloc.last_line, 1); + assert.equal(lexer.yylloc.first_column, 0); + assert.equal(lexer.yylloc.last_column, 1); + //assert.ok(lexer.yylloc.range === undefined); + assert.equal(lexer.lex(), "x"); + assert.equal(lexer.yytext, "x", "yytext"); + assert.equal(lexer.yyleng, 1, "yyleng"); + assert.equal(lexer.offset, 3, "offset"); + assert.equal(lexer.match, "x", "match"); + assert.equal(lexer.matched, "x\nx", "matched"); + assert.equal(lexer.yylloc.first_line, 2); + assert.equal(lexer.yylloc.last_line, 2); + assert.equal(lexer.yylloc.first_column, 0); + assert.equal(lexer.yylloc.last_column, 1); + assert.equal(lexer.lex(), "y"); + assert.equal(lexer.yytext, "y", "yytext"); + assert.equal(lexer.yyleng, 1, "yyleng"); + assert.equal(lexer.offset, 4, "offset"); + assert.equal(lexer.match, "y", "match"); + assert.equal(lexer.matched, "x\nxy", "matched"); + assert.equal(lexer.yylloc.first_line, 2); + assert.equal(lexer.yylloc.last_line, 2); + assert.equal(lexer.yylloc.first_column, 1); + assert.equal(lexer.yylloc.last_column, 2); + assert.equal(lexer.lex(), "x"); + assert.equal(lexer.yytext, "x", "yytext"); + assert.equal(lexer.yyleng, 1, "yyleng"); + assert.equal(lexer.offset, 8, "offset"); + assert.equal(lexer.match, "x", "match"); + assert.equal(lexer.matched, "x\nxy\n\n\nx", "matched"); + assert.equal(lexer.yylloc.first_line, 5); + assert.equal(lexer.yylloc.last_line, 5); + assert.equal(lexer.yylloc.first_column, 0); + assert.equal(lexer.yylloc.last_column, 1); + assert.equal(lexer.lex(), "y"); + assert.equal(lexer.yytext, "yyyy", "yytext"); + assert.equal(lexer.yyleng, 4, "yyleng"); + assert.equal(lexer.offset, 13, "offset"); + assert.equal(lexer.match, "yyyy", "match"); + assert.equal(lexer.matched, "x\nxy\n\n\nx\nyyyy", "matched"); + assert.equal(lexer.yylloc.first_line, 6); + assert.equal(lexer.yylloc.last_line, 6); + assert.equal(lexer.yylloc.first_column, 0); + assert.equal(lexer.yylloc.last_column, 4); + }); + + it("test yylloc with %options ranges", function() { + var dict = { + options: { + ranges: true + }, + rules: [ + ["\\s+", "/* skip whitespace */" ], + ["x+", "return 'x';" ], + ["y+", "return 'y';" ] + ] + }; + + var input = "x\nxy\n\n\nx\nyyyy"; var lexer = new RegExpLexer(dict, input); assert.equal(lexer.lex(), "x"); + assert.equal(lexer.yytext, "x", "yytext"); + assert.equal(lexer.yyleng, 1, "yyleng"); + assert.equal(lexer.offset, 1, "offset"); + assert.equal(lexer.match, "x", "match"); + assert.equal(lexer.matched, "x", "matched"); + assert.equal(lexer.yylloc.first_line, 1); + assert.equal(lexer.yylloc.last_line, 1); assert.equal(lexer.yylloc.first_column, 0); assert.equal(lexer.yylloc.last_column, 1); + assert.ok(lexer.yylloc.range != null); + assert.equal(lexer.yylloc.range[0], 0); + assert.equal(lexer.yylloc.range[1], 1); assert.equal(lexer.lex(), "x"); + assert.equal(lexer.yytext, "x", "yytext"); + assert.equal(lexer.yyleng, 1, "yyleng"); + assert.equal(lexer.offset, 3, "offset"); + 
assert.equal(lexer.match, "x", "match"); + assert.equal(lexer.matched, "x\nx", "matched"); assert.equal(lexer.yylloc.first_line, 2); assert.equal(lexer.yylloc.last_line, 2); assert.equal(lexer.yylloc.first_column, 0); assert.equal(lexer.yylloc.last_column, 1); + assert.equal(lexer.yylloc.range[0], 2); + assert.equal(lexer.yylloc.range[1], 3); assert.equal(lexer.lex(), "y"); + assert.equal(lexer.yytext, "y", "yytext"); + assert.equal(lexer.yyleng, 1, "yyleng"); + assert.equal(lexer.offset, 4, "offset"); + assert.equal(lexer.match, "y", "match"); + assert.equal(lexer.matched, "x\nxy", "matched"); assert.equal(lexer.yylloc.first_line, 2); assert.equal(lexer.yylloc.last_line, 2); assert.equal(lexer.yylloc.first_column, 1); assert.equal(lexer.yylloc.last_column, 2); + assert.equal(lexer.yylloc.range[0], 3); + assert.equal(lexer.yylloc.range[1], 4); assert.equal(lexer.lex(), "x"); + assert.equal(lexer.yytext, "x", "yytext"); + assert.equal(lexer.yyleng, 1, "yyleng"); + assert.equal(lexer.offset, 8, "offset"); + assert.equal(lexer.match, "x", "match"); + assert.equal(lexer.matched, "x\nxy\n\n\nx", "matched"); assert.equal(lexer.yylloc.first_line, 5); assert.equal(lexer.yylloc.last_line, 5); assert.equal(lexer.yylloc.first_column, 0); assert.equal(lexer.yylloc.last_column, 1); -}; + assert.equal(lexer.yylloc.range[0], 7); + assert.equal(lexer.yylloc.range[1], 8); + assert.equal(lexer.lex(), "y"); + assert.equal(lexer.yytext, "yyyy", "yytext"); + assert.equal(lexer.yyleng, 4, "yyleng"); + assert.equal(lexer.offset, 13, "offset"); + assert.equal(lexer.match, "yyyy", "match"); + assert.equal(lexer.matched, "x\nxy\n\n\nx\nyyyy", "matched"); + assert.equal(lexer.yylloc.first_line, 6); + assert.equal(lexer.yylloc.last_line, 6); + assert.equal(lexer.yylloc.first_column, 0); + assert.equal(lexer.yylloc.last_column, 4); + assert.equal(lexer.yylloc.range[0], 9); + assert.equal(lexer.yylloc.range[1], 13); + }); -exports["test more()"] = function() { + it("test more()", function() { var dict = { rules: [ ["x", "return 'X';" ], - ['"[^"]*', function(){ - if(yytext.charAt(yyleng-1) == '\\') { + ['"[^"]*', function () { + if (yytext.charAt(yyleng - 1) === '\\') { this.more(); } else { yytext += this.input(); // swallow end quote @@ -324,9 +932,9 @@ exports["test more()"] = function() { assert.equal(lexer.lex(), "STRING"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test defined token returns"] = function() { + it("test defined token returns", function() { var tokens = {"2":"X", "3":"Y", "4":"EOF"}; var dict = { rules: [ @@ -345,9 +953,9 @@ exports["test defined token returns"] = function() { assert.equal(lexer.lex(), 3); assert.equal(lexer.lex(), 2); assert.equal(lexer.lex(), 4); -}; + }); -exports["test module generator from constructor"] = function() { + it("test module generator from constructor", function() { var dict = { rules: [ ["x", "return 'X';" ], @@ -367,9 +975,9 @@ exports["test module generator from constructor"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test module generator"] = function() { + it("test module generator", function() { var dict = { rules: [ ["x", "return 'X';" ], @@ -390,14 +998,14 @@ exports["test module generator"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test generator with more complex lexer"] = function() { + it("test generator with more complex lexer", 
function() { var dict = { rules: [ ["x", "return 'X';" ], - ['"[^"]*', function(){ - if(yytext.charAt(yyleng-1) == '\\') { + ['"[^"]*', function () { + if (yytext.charAt(yyleng - 1) === '\\') { this.more(); } else { yytext += this.input(); // swallow end quote @@ -419,9 +1027,9 @@ exports["test generator with more complex lexer"] = function() { assert.equal(lexer.lex(), "STRING"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test commonjs module generator"] = function() { + it("test commonjs module generator", function() { var dict = { rules: [ ["x", "return 'X';" ], @@ -443,9 +1051,9 @@ exports["test commonjs module generator"] = function() { assert.equal(exports.lex(), "Y"); assert.equal(exports.lex(), "X"); assert.equal(exports.lex(), "EOF"); -}; + }); -exports["test amd module generator"] = function() { + it("test amd module generator", function() { var dict = { rules: [ ["x", "return 'X';" ], @@ -472,14 +1080,14 @@ exports["test amd module generator"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test DJ lexer"] = function() { + it("test DJ lexer", function() { var dict = { - "lex": { + "lex": { "macros": { "digit": "[0-9]", - "id": "[a-zA-Z][a-zA-Z0-9]*" + "id": "[a-zA-Z_][a-zA-Z0-9_]*" }, "rules": [ @@ -514,11 +1122,11 @@ exports["test DJ lexer"] = function() { ["\\)", "return 'RPAREN';"], [";", "return 'SEMICOLON';"], ["\\s+", "/* skip whitespace */"], - [".", "print('Illegal character');throw 'Illegal character';"], + [".", "print('Illegal character'); throw 'Illegal character';"], ["$", "return 'ENDOFFILE';"] ] - } -}; + } + }; var input = "class Node extends Object { \ var nat value var nat value;\ @@ -577,12 +1185,12 @@ exports["test DJ lexer"] = function() { var lexer = new RegExpLexer(dict.lex); lexer.setInput(input); var tok; - while (tok = lexer.lex(), tok!==1) { + while (tok = lexer.lex(), tok !== 1) { assert.equal(typeof tok, "string"); } -}; + }); -exports["test instantiation from string"] = function() { + it("test instantiation from string", function() { var dict = "%%\n'x' {return 'X';}\n'y' {return 'Y';}\n<> {return 'EOF';}"; var input = "x"; @@ -592,9 +1200,9 @@ exports["test instantiation from string"] = function() { assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test inclusive start conditions"] = function() { + it("test inclusive start conditions", function() { var dict = { startConditions: { "TEST": 0, @@ -618,9 +1226,9 @@ exports["test inclusive start conditions"] = function() { assert.equal(lexer.lex(), "TY"); assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test exclusive start conditions"] = function() { + it("test exclusive start conditions", function() { var dict = { startConditions: { "EAT": 1, @@ -643,9 +1251,9 @@ exports["test exclusive start conditions"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test pop start condition stack"] = function() { + it("test pop start condition stack", function() { var dict = { startConditions: { "EAT": 1, @@ -668,10 +1276,10 @@ exports["test pop start condition stack"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test star start condition"] = function() { + it("test star start condition", function() { var dict = { startConditions: { "EAT": 1, @@ 
-681,7 +1289,7 @@ exports["test star start condition"] = function() { [["EAT"], ".", "" ], ["x", "return 'X';" ], ["y", "return 'Y';" ], - [["*"],"$", "return 'EOF';" ] + [["*"], "$", "return 'EOF';" ] ] }; var input = "xy//yxteadh//stey"; @@ -692,19 +1300,19 @@ exports["test star start condition"] = function() { assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test start condition constants"] = function() { + it("test start condition constants", function() { var dict = { startConditions: { "EAT": 1, }, rules: [ ["\\/\\/", "this.begin('EAT');" ], - [["EAT"], ".", "if (YYSTATE==='EAT') return 'E';" ], - ["x", "if (YY_START==='INITIAL') return 'X';" ], + [["EAT"], ".", "if (YYSTATE === 'EAT') return 'E';" ], + ["x", "if (YY_START === 'INITIAL') return 'X';" ], ["y", "return 'Y';" ], - [["*"],"$", "return 'EOF';" ] + [["*"], "$", "return 'EOF';" ] ] }; var input = "xy//y"; @@ -716,9 +1324,33 @@ exports["test start condition constants"] = function() { assert.equal(lexer.lex(), "Y"); assert.equal(lexer.lex(), "E"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test unicode encoding"] = function() { + it("test start condition & warning", function() { + var dict = { + startConditions: { + "INITIAL": 0, + }, + rules: [ + ["\\/\\/", "this.begin('EAT');" ], + [["EAT"], ".", "if (YYSTATE === 'EAT') return 'E';" ], + ["x", "if (YY_START === 'INITIAL') return 'X';" ], + ["y", "return 'Y';" ], + [["*"], "$", "return 'EOF';" ] + ] + }; + var input = "xy//y"; + + var lexer = new RegExpLexer(dict); + lexer.setInput(input); + + assert.equal(lexer.lex(), "X"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "E"); + assert.equal(lexer.lex(), "EOF"); + }); + + it("test unicode encoding", function() { var dict = { rules: [ ["\\u2713", "return 'CHECK';" ], @@ -734,9 +1366,9 @@ exports["test unicode encoding"] = function() { assert.equal(lexer.lex(), "CHECK"); assert.equal(lexer.lex(), "PI"); assert.equal(lexer.lex(), "Y"); -}; + }); -exports["test unicode"] = function() { + it("test unicode", function() { var dict = { rules: [ ["π", "return 'PI';" ], @@ -750,9 +1382,9 @@ exports["test unicode"] = function() { assert.equal(lexer.lex(), "PI"); assert.equal(lexer.lex(), "Y"); -}; + }); -exports["test longest match returns"] = function() { + it("test longest match returns", function() { var dict = { rules: [ [".", "return 'DOT';" ], @@ -767,9 +1399,9 @@ exports["test longest match returns"] = function() { assert.equal(lexer.lex(), "CAT"); assert.equal(lexer.lex(), "DOT"); -}; + }); -exports["test case insensitivity"] = function() { + it("test case insensitivity", function() { var dict = { rules: [ ["cat", "return 'CAT';" ] @@ -782,9 +1414,26 @@ exports["test case insensitivity"] = function() { lexer.setInput(input); assert.equal(lexer.lex(), "CAT"); -}; + }); -exports["test less"] = function() { + it("test camelCased json options", function() { + var dict = { + rules: [ + ["cat", "return 'CAT';" ] + ], + options: { + caseInsensitive: true + } + }; + var input = "Cat"; + + var lexer = new RegExpLexer(dict); + lexer.setInput(input); + + assert.equal(lexer.lex(), "CAT"); + }); + + it("test less", function() { var dict = { rules: [ ["cat", "this.less(2); return 'CAT';" ], @@ -798,17 +1447,17 @@ exports["test less"] = function() { assert.equal(lexer.lex(), "CAT"); assert.equal(lexer.lex(), "T"); -}; + }); -exports["test EOF unput"] = function() { + it("test EOF unput", function() { var dict = { startConditions: { "UN": 1, }, rules: 
[ ["U", "this.begin('UN');return 'U';" ], - [["UN"],"$", "this.unput('X')" ], - [["UN"],"X", "this.popState();return 'X';" ], + [["UN"], "$", "this.unput('X')" ], + [["UN"], "X", "this.popState();return 'X';" ], ["$", "return 'EOF'" ] ] }; @@ -820,9 +1469,9 @@ exports["test EOF unput"] = function() { assert.equal(lexer.lex(), "U"); assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "EOF"); -}; + }); -exports["test flex mode default rule"] = function() { + it("test flex mode default rule", function() { var dict = { rules: [ ["x", "return 'X';" ] @@ -836,9 +1485,9 @@ exports["test flex mode default rule"] = function() { assert.equal(lexer.lex(), "X"); assert.equal(lexer.lex(), "X"); -}; + }); -exports["test pipe precedence"] = function() { + it("test pipe precedence", function() { var dict = { rules: [ ["x|y", "return 'X_Y';" ], @@ -853,9 +1502,9 @@ exports["test pipe precedence"] = function() { assert.equal(lexer.lex(), "X_Y"); assert.equal(lexer.lex(), "N"); assert.equal(lexer.lex(), "X_Y"); -}; + }); -exports["test ranges"] = function() { + it("test ranges", function() { var dict = { rules: [ ["x+", "return 'X';" ], @@ -870,9 +1519,9 @@ exports["test ranges"] = function() { assert.equal(lexer.lex(), "X"); assert.deepEqual(lexer.yylloc.range, [0, 3]); -}; + }); -exports["test unput location"] = function() { + it("test unput location", function() { var dict = { rules: [ ["x+", "return 'X';" ], @@ -887,7 +1536,6 @@ exports["test unput location"] = function() { var lexer = new RegExpLexer(dict); lexer.setInput(input); - console.log(lexer.rules); assert.equal(lexer.next(), "X"); assert.deepEqual(lexer.yylloc, {first_line: 1, @@ -914,9 +1562,9 @@ exports["test unput location"] = function() { last_column: 1, range: [5, 6]}); -}; + }); -exports["test unput location again"] = function() { + it("test unput location again", function() { var dict = { rules: [ ["x+", "return 'X';" ], @@ -931,7 +1579,6 @@ exports["test unput location again"] = function() { var lexer = new RegExpLexer(dict); lexer.setInput(input); - console.log(lexer.rules); assert.equal(lexer.next(), "X"); assert.deepEqual(lexer.yylloc, {first_line: 1, @@ -958,9 +1605,9 @@ exports["test unput location again"] = function() { last_column: 1, range: [7, 8]}); -}; + }); -exports["test backtracking lexer reject() method"] = function() { + it("test backtracking lexer reject() method", function() { var dict = { rules: [ ["[A-Z]+([0-9]+)", "if (this.matches[1].length) this.reject(); else return 'ID';" ], @@ -976,9 +1623,9 @@ exports["test backtracking lexer reject() method"] = function() { assert.equal(lexer.lex(), "WORD"); assert.equal(lexer.lex(), "NUM"); -}; + }); -exports["test lexer reject() exception when not in backtracking mode"] = function() { + it("test lexer reject() exception when not in backtracking mode", function() { var dict = { rules: [ ["[A-Z]+([0-9]+)", "if (this.matches[1].length) this.reject(); else return 'ID';" ], @@ -990,17 +1637,20 @@ exports["test lexer reject() exception when not in backtracking mode"] = functio var input = "A5"; var lexer = new RegExpLexer(dict); + var JisonLexerError = lexer.JisonLexerError; + assert(JisonLexerError); + lexer.setInput(input); assert.throws(function() { - lexer.lex(); - }, - function(err) { - return (err instanceof Error) && /You can only invoke reject/.test(err); - }); -}; - -exports["test yytext state after unput"] = function() { + lexer.lex(); + }, + JisonLexerError, + /Lexical error on line \d+[^]*?You can only invoke reject\(\) in the lexer when the lexer is of the 
backtracking persuasion/ + ); + }); + + it("test yytext state after unput", function() { var dict = { rules: [ ["cat4", "this.unput('4'); return 'CAT';" ], @@ -1018,4 +1668,1222 @@ exports["test yytext state after unput"] = function() { assert.equal(lexer.yytext, "cat"); assert.equal(lexer.lex(), "NUMBER"); assert.equal(lexer.lex(), "EOF"); -}; + }); + + it("test not blowing up on a sequence of ignored tokens the size of the maximum callstack size", function() { + var dict = { + rules: [ + ["#", "// ignored" ], + ["$", "return 'EOF';"] + ] + }; + + /** + * Crafts a src string of `#`s for our rules the size of the current maximum callstack. + * The lexer used to blow up with a stack overflow error in this case. + */ + var makeStackBlowingHashes = function() { + try { + return "#" + makeStackBlowingHashes(); + } catch (e) { + return "#"; + } + }; + + var input = makeStackBlowingHashes(); + + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.lex(), "EOF"); + }); + + it("test custom parseError handler", function() { + var dict = { + rules: [ + ["x", "return 't';" ] + ] + }; + + var input = "xyz ?"; + + var counter = 0; + + var lexer = new RegExpLexer(dict); + lexer.setInput(input, { + parser: { + parseError: function (str, hash) { + counter++; + } + } + }); + assert.equal(lexer.lex(), "t"); + assert.equal(lexer.yytext, "x"); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(counter, 1); + assert.equal(lexer.yytext, "y"); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(counter, 2); + assert.equal(lexer.yytext, "z"); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(counter, 3); + assert.equal(lexer.yytext, " "); + assert.equal(lexer.lex(), lexer.ERROR); + assert.equal(counter, 4); + assert.equal(lexer.yytext, "?"); + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.yytext, ""); + // and then the lexer keeps on spitting out EOF tokens ad nauseam: + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.yytext, ""); + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.yytext, ""); + }); + + it("test custom parseError handler which produces a replacement token", function() { + var dict = { + rules: [ + ["x", "return 't';" ] + ] + }; + + var input = "xyz ?"; + + var counter = 0; + var c1, c2; + + var lexer = new RegExpLexer(dict); + lexer.setInput(input, { + parser: { + parseError: function (str, hash) { + counter++; + assert.ok(hash.lexer); + // eat two more characters + c1 = hash.lexer.input(); + c2 = hash.lexer.input(); + return 'alt'; + } + } + }); + assert.equal(lexer.lex(), "t"); + assert.equal(lexer.yytext, "x"); + assert.equal(lexer.lex(), 'alt'); + assert.equal(counter, 1); + assert.equal(c1, "y"); + assert.equal(c2, "z"); + assert.equal(lexer.yytext, "yz"); + assert.equal(lexer.lex(), 'alt'); + assert.equal(counter, 2); + assert.equal(c1, " "); + assert.equal(c2, "?"); + assert.equal(lexer.yytext, " ?"); + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.yytext, ""); + // and then the lexer keeps on spitting out EOF tokens ad nauseam: + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.yytext, ""); + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.yytext, ""); + }); + + it("test custom pre and post handlers", function() { + var dict = { + options: { + pre_lex: function () { + counter += 1; + if (counter % 2 === 1) { + return 'PRE'; + } + }, + post_lex: function (tok) { + counter += 2; + return 'a:' + tok; + } + }, + rules: [ + ["[a-z]", "return 't';" ] + ] + }; + + var input = "xyz"; + + var counter = 
0;
+
+        var lexer = new RegExpLexer(dict);
+        lexer.setInput(input);
+        assert.equal(lexer.lex(), "a:PRE");
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 3);
+        assert.equal(lexer.lex(), "a:t");
+        assert.equal(lexer.yytext, "x");
+        assert.equal(counter, 6);
+        assert.equal(lexer.lex(), "a:PRE");
+        // as our PRE handler causes the lexer to produce another token immediately
+        // without entering the lexer proper, `yytext` et al are NOT RESET:
+        assert.equal(lexer.yytext, "x");
+        assert.equal(counter, 9);
+        assert.equal(lexer.lex(), "a:t");
+        assert.equal(lexer.yytext, "y");
+        assert.equal(counter, 12);
+        assert.equal(lexer.lex(), "a:PRE");
+        assert.equal(lexer.yytext, "y");
+        assert.equal(counter, 15);
+        assert.equal(lexer.lex(), "a:t");
+        assert.equal(lexer.yytext, "z");
+        assert.equal(counter, 18);
+        assert.equal(lexer.lex(), "a:PRE");
+        assert.equal(lexer.yytext, "z");
+        assert.equal(counter, 21);
+        assert.equal(lexer.EOF, 1);
+        assert.equal(lexer.lex(), "a:1");
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 24);
+        // and then the lexer keeps on spitting out post-processed EOF tokens ad nauseam,
+        // interleaved with PRE tokens produced by the PRE handler:
+        assert.equal(lexer.lex(), "a:PRE");
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 27);
+        assert.equal(lexer.lex(), "a:1");       // EOF
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 30);
+        assert.equal(lexer.lex(), "a:PRE");
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 33);
+        assert.equal(lexer.lex(), "a:1");
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 36);
+    });
+
+    it("test live replacement of custom pre and post handlers", function() {
+        var dict = {
+            options: {
+                pre_lex: function () {
+                    counter += 1;
+                    if (counter % 2 === 1) {
+                        return 'PRE';
+                    }
+                },
+                post_lex: function (tok) {
+                    counter += 2;
+                    return 'a:' + tok;
+                }
+            },
+            rules: [
+                ["[a-z]", "return 't';" ]
+            ]
+        };
+
+        var input = "xyz";
+
+        var counter = 0;
+
+        var lexer = new RegExpLexer(dict);
+        lexer.setInput(input);
+        assert.equal(lexer.lex(), "a:PRE");
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 3);
+        assert.equal(lexer.lex(), "a:t");
+        assert.equal(lexer.yytext, "x");
+        assert.equal(counter, 6);
+        assert.equal(lexer.lex(), "a:PRE");
+        // as our PRE handler causes the lexer to produce another token immediately
+        // without entering the lexer proper, `yytext` et al are NOT RESET:
+        assert.equal(lexer.yytext, "x");
+        assert.equal(counter, 9);
+
+        lexer.options.pre_lex = null;
+        lexer.options.post_lex = function (tok) {
+            counter--;
+            if (tok !== lexer.EOF) {
+                return 'V2:' + tok;
+            }
+            // default return of undefined/false/0 will have the lexer produce the raw token
+        };
+
+        assert.equal(lexer.lex(), "V2:t");
+        assert.equal(lexer.yytext, "y");
+        assert.equal(counter, 8);
+        assert.equal(lexer.lex(), "V2:t");
+        assert.equal(lexer.yytext, "z");
+        assert.equal(counter, 7);
+        assert.equal(lexer.EOF, 1);
+        assert.equal(lexer.lex(), lexer.EOF);
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 6);
+        // and then the lexer keeps on spitting out EOF tokens ad nauseam; note that
+        // the PRE handler has been nulled above, so no PRE tokens are interleaved
+        // with these any more:
+        assert.equal(lexer.lex(), lexer.EOF);
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 5);
+        assert.equal(lexer.lex(), lexer.EOF);
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 4);
+        assert.equal(lexer.lex(), lexer.EOF);
+        assert.equal(lexer.yytext, "");
+        assert.equal(counter, 3);
+    });
+
+    it("test edge case which could break
documentation comments in the generated lexer", function() { + var dict = { + rules: [ + ["\\*\\/", "return 'X';" ], + ["\"*/\"", "return 'Y';" ], + ["'*/'", "return 'Z';" ] + ] + }; + + var input = "*/"; + + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.lex(), "X"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test yylloc info object must be unique for each token", function() { + var dict = { + rules: [ + ["[a-z]", "return 'X';" ] + ], + options: {ranges: true} + }; + + var input = "xyz"; + var prevloc = null; + + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.lex(), "X"); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 0, + last_line: 1, + last_column: 1, + range: [0, 1]}); + prevloc = lexer.yylloc; + assert.equal(lexer.lex(), "X"); + assert.notStrictEqual(prevloc, lexer.yylloc); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 1, + last_line: 1, + last_column: 2, + range: [1, 2]}); + prevloc = lexer.yylloc; + assert.equal(lexer.lex(), "X"); + assert.notStrictEqual(prevloc, lexer.yylloc); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 2, + last_line: 1, + last_column: 3, + range: [2, 3]}); + prevloc = lexer.yylloc; + assert.equal(lexer.lex(), lexer.EOF); + // not so for EOF: + if (0) { + assert.notStrictEqual(prevloc, lexer.yylloc); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 3, + last_line: 1, + last_column: 3, + range: [3, 3]}); + } + }); + + it("test yylloc info object is not modified by subsequent lex() activity", function() { + var dict = { + rules: [ + ["[a-z]", "return 'X';" ] + ], + options: {ranges: true} + }; + + var input = "xyz"; + var prevloc = null; + + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.lex(), "X"); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 0, + last_line: 1, + last_column: 1, + range: [0, 1]}); + prevloc = lexer.yylloc; + assert.equal(lexer.lex(), "X"); + assert.notStrictEqual(prevloc, lexer.yylloc); + assert.deepEqual(prevloc, {first_line: 1, + first_column: 0, + last_line: 1, + last_column: 1, + range: [0, 1]}); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 1, + last_line: 1, + last_column: 2, + range: [1, 2]}); + prevloc = lexer.yylloc; + assert.equal(lexer.lex(), "X"); + assert.notStrictEqual(prevloc, lexer.yylloc); + assert.deepEqual(prevloc, {first_line: 1, + first_column: 1, + last_line: 1, + last_column: 2, + range: [1, 2]}); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 2, + last_line: 1, + last_column: 3, + range: [2, 3]}); + prevloc = lexer.yylloc; + assert.equal(lexer.lex(), lexer.EOF); + // not so for EOF: + if (0) { + assert.notStrictEqual(prevloc, lexer.yylloc); + assert.deepEqual(prevloc, {first_line: 1, + first_column: 2, + last_line: 1, + last_column: 3, + range: [2, 3]}); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 3, + last_line: 1, + last_column: 3, + range: [3, 3]}); + } + }); + + it("test yylloc info object CAN be modified by subsequent input() activity", function() { + var dict = { + rules: [ + ["[a-z]", "return 'X';" ] + ], + options: {ranges: true} + }; + + var input = "xyz"; + var prevloc = null; + + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.lex(), "X"); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 0, + last_line: 1, + last_column: 1, + range: [0, 1]}); + prevloc = lexer.yylloc; + assert.equal(lexer.lex(), "X"); + assert.notStrictEqual(prevloc, lexer.yylloc); + 
assert.deepEqual(prevloc, {first_line: 1, + first_column: 0, + last_line: 1, + last_column: 1, + range: [0, 1]}); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 1, + last_line: 1, + last_column: 2, + range: [1, 2]}); + prevloc = lexer.yylloc; + assert.equal(lexer.input(), "z"); + // this will modify the existing yylloc: + assert.strictEqual(prevloc, lexer.yylloc); + assert.deepEqual(prevloc, {first_line: 1, + first_column: 1, + last_line: 1, + last_column: 3, + range: [1, 3]}); + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 1, + last_line: 1, + last_column: 3, + range: [1, 3]}); + prevloc = lexer.yylloc; + assert.equal(lexer.lex(), lexer.EOF); + // yylloc on EOF is NOT the same yylloc object as before: EOF is just another token, WITH its own yylloc info... + assert.notStrictEqual(prevloc, lexer.yylloc); + // and this yylloc value set is intuitive because EOF does update yylloc like any other lexed token: + assert.deepEqual(lexer.yylloc, {first_line: 1, + first_column: 3, + last_line: 1, + last_column: 3, + range: [3, 3]}); + }); + + it("test empty rule set with custom lexer", function() { + var src = null; + + // Wrap the custom lexer code in a function so we can String()-dump it: + function customLexerCode() { + var input = ""; + var input_offset = 0; + var lexer = { + EOF: 1, + ERROR: 2, + options: {}, + lex: function () { + if (input.length > input_offset) { + return "a" + input[input_offset++]; + } else { + return this.EOF; + } + }, + setInput: function (inp) { + input = inp; + input_offset = 0; + } + }; + } + + var dict = { + rules: [], + actionInclude: String(customLexerCode).replace(/function [^\{]+\{/, '').replace(/\}$/, ''), + moduleInclude: 'console.log("moduleInclude");', + options: { + foo: 'bar', + showSource: function (lexer, source, opts) { + src = source; + } + } + }; + + var input = "xxyx"; + + var lexer = new RegExpLexer(dict, input); + assert.equal(lexer.lex(), "ax"); + assert.equal(lexer.lex(), "ax"); + assert.equal(lexer.lex(), "ay"); + assert.equal(lexer.lex(), "ax"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test XRegExp option support", function() { + var dict = { + options: { + xregexp: true + }, + rules: [ + ["π", "return 'PI';" ], + ["\\p{Alphabetic}", "return 'Y';" ], + ["[\\p{Number}]", "return 'N';" ] + ] + }; + var input = "πyαε"; + + var lexer = new RegExpLexer(dict); + + // ensure the XRegExp class is invoked for the unicode rules; see also the compilation validation test code + // inside the regexp-lexer.js file for the counterpart of this nasty test: + // + // var __hacky_counter__ = 0; + // function XRegExp(re, f) { + // this.re = re; + // this.flags = f; + // var fake = /./; // WARNING: this exact 'fake' is also depended upon by the xregexp unit test! + // __hacky_counter__++; + // fake.__hacky_backy__ = __hacky_counter__; + // return fake; + // } + // + var generated_ruleset = lexer.rules; + assert(generated_ruleset); + var xregexp_count = 0; + for (var i = 0; i < generated_ruleset.length; i++) { + var rule = generated_ruleset[i]; + assert(rule); + if (rule.__hacky_backy__) { + xregexp_count += rule.__hacky_backy__; + } + } + assert.equal(xregexp_count, 1 + 2); + + // run the lexer and check the tokens produced by it: the faked version will be active but will deliver something + // similar to the real XRegExp for this particular ruleset only! 
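+ // (only the two \\p{..}-based rules get routed through the (faked) XRegExp constructor; + // the fake stamps them with __hacky_backy__ values 1 and 2, which is where the expected + // total of 1 + 2 in the assert above comes from, while the plain 'π' rule compiles as a + // native RegExp and carries no stamp)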
+ + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test support for basic unicode regex compilation via internal xregexp", function() { + var dict = { + options: { + xregexp: false // !!! + }, + rules: [ + ["π", "return 'PI';" ], + ["\\p{Alphabetic}", "return 'Y';" ], + ["[\\p{Number}]", "return 'N';" ] + ] + }; + var input = "πyα1ε"; + + var lexer = new RegExpLexer(dict); + + var generated_ruleset = lexer.rules; + assert(generated_ruleset); + var xregexp_count = 0; + for (var i = 0; i < generated_ruleset.length; i++) { + var rule = generated_ruleset[i]; + assert(rule); + if (rule.__hacky_backy__) { + xregexp_count += rule.__hacky_backy__; + } + } + assert.equal(xregexp_count, 0); + + // run the lexer + + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "N"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test support for unicode macro expansion via internal xregexp", function() { + var dict = { + options: { + xregexp: false // !!! + }, + macros: { + "DIGIT": "[\\p{Number}]" + }, + rules: [ + ["π", "return 'PI';" ], + ["\\p{Alphabetic}", "return 'Y';" ], + ["{DIGIT}+", "return 'N';" ] + ] + }; + var input = "πyα123ε"; + + var lexer = new RegExpLexer(dict); + + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "N"); + assert.equal(lexer.match, "123"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.match, "ε"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test macro expansion in regex set atom", function() { + var dict = { + options: { + xregexp: false // !!! + }, + macros: { + "DIGIT": "[\\p{Number}]" + }, + rules: [ + ["π", "return 'PI';" ], + ["\\p{Alphabetic}", "return 'Y';" ], + ["{DIGIT}+", "return 'N';" ] + ] + }; + var input = "πyα123ε"; + + var lexer = new RegExpLexer(dict); + + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "N"); + assert.equal(lexer.match, "123"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.match, "ε"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test nested macro expansion in xregexp set atoms", function() { + var dict = { + options: { + xregexp: false // !!! 
+ }, + macros: { + "DIGIT": "[\\p{Number}]", + "ALPHA": "[\\p{Alphabetic}]", + "ALNUM": "[{DIGIT}{ALPHA}]" + }, + rules: [ + ["π", "return 'PI';" ], + ["[{ALNUM}]+", "return 'Y';" ], + ["{DIGIT}+", "return 'N';" ] + ] + }; + var input = "πyα123ε"; + + var lexer = new RegExpLexer(dict); + + var expandedMacros = lexer.getExpandedMacros(); + //console.log("MACROS:::::::::::::::", expandedMacros); + // assert.equal(expandedMacros.DIGIT.in_set, re2set('[\\p{Number}]')); + // assert.equal(expandedMacros.ALPHA.in_set, re2set('[\\p{Alphabetic}]')); + // assert.equal(expandedMacros.ALNUM.in_set, re2set('[\\p{Number}\\p{Alphabetic}]')); + // assert.equal(expandedMacros.ALNUM.elsewhere, '[' + re2set('[\\p{Number}\\p{Alphabetic}]') + ']'); + + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.match, "yα123ε"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test macros in regex set atoms are recognized when coming from grammar string", function() { + var dict = [ + "DIGIT [\\p{Number}]", + "ALPHA [\\p{Alphabetic}]", + "ALNUM [{DIGIT}{ALPHA}]", + "", + "%%", + "", + "π return 'PI';", + "[{ALNUM}]+ return 'Y';", + "[{DIGIT}]+ return 'N';", + ].join('\n'); + + var input = "πyα123ε"; + + var lexer = new RegExpLexer(dict); + + var expandedMacros = lexer.getExpandedMacros(); + //console.log("MACROS:::::::::::::::", expandedMacros); + // assert.equal(expandedMacros.DIGIT.in_set, re2set('[\\p{Number}]')); + // assert.equal(expandedMacros.ALPHA.in_set, re2set('[\\p{Alphabetic}]')); + // assert.equal(expandedMacros.ALNUM.in_set, re2set('[\\p{Number}\\p{Alphabetic}]')); + // assert.equal(expandedMacros.ALNUM.elsewhere, '[' + re2set('[\\p{Number}\\p{Alphabetic}]') + ']'); + // assert.equal(expandedMacros.ALNUM.raw, '[{DIGIT}{ALPHA}]'); + + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.match, "yα123ε"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test nested macro expansion in regex set atoms", function() { + var dict = { + options: { + xregexp: false + }, + macros: { + "DIGIT": "[0-9]", + "ALPHA": "[a-zA-Z]", + "ALNUM": "[{DIGIT}{ALPHA}]" + }, + rules: [ + ["π", "return 'PI';" ], + ["[{ALNUM}]+", "return 'Y';" ], + ["{DIGIT}+", "return 'N';" ], + [".", "return '?';" ] + ] + }; + var input = "πyα123εE"; + + var lexer = new RegExpLexer(dict); + + var expandedMacros = lexer.getExpandedMacros(); + //console.log("MACROS:::::::::::::::", expandedMacros); + assert.equal(expandedMacros.DIGIT.in_set, '\\d'); + assert.equal(expandedMacros.ALPHA.in_set, 'A-Za-z'); + assert.equal(expandedMacros.ALNUM.in_set, '0-9A-Za-z'); + assert.equal(expandedMacros.ALNUM.elsewhere, '[^\\W_]'); /* [0-9A-Za-z] */ + + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=y"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=α"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=123"); // (!) not 'N=123' as ALNUM-based rule comes before DIGIT rule. 
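+ // (general note: when more than one rule matches at the current position, the rule + // listed first in the grammar wins, absent a longest-match option; that is why the + // earlier ALNUM-based rule beats the DIGIT rule for "123" above)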
+ assert.equal(lexer.lex() + '=' + lexer.match, "?=ε"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=E"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test nested macro expansion in regex set atoms with negating surrounding set (1 level)", function() { + var dict = { + options: { + xregexp: false + }, + macros: { + "DIGIT": "[0-9]", + "ALPHA": "[a-zA-Z]", + "ALNUM": "[{DIGIT}{ALPHA}]", + "CTRL": "[^{ALNUM}]", + }, + rules: [ + ["π", "return 'PI';" ], + ["{CTRL}+", "return 'C';" ], + ["[{ALNUM}]+", "return 'Y';" ], + ["{DIGIT}+", "return 'N';" ], + [".", "return '?';" ], + ] + }; + var input = "πyα * +123.@_[]εE"; + + var lexer = new RegExpLexer(dict); + + var expandedMacros = lexer.getExpandedMacros(); + //console.log("MACROS:::::::::::::::", expandedMacros); + assert.equal(expandedMacros.DIGIT.in_set, '\\d'); + assert.equal(expandedMacros.ALPHA.in_set, 'A-Za-z'); + assert.equal(expandedMacros.ALNUM.in_set, '0-9A-Za-z'); + assert.equal(expandedMacros.ALNUM.elsewhere, '[^\\W_]'); /* [0-9A-Za-z] */ + // assert.equal(expandedMacros.CTRL.in_inv_set, '0-9A-Za-z'); + assert.equal(expandedMacros.CTRL.elsewhere, '[\\W_]'); /* [^0-9A-Za-z] */ + + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=y"); + assert.equal(lexer.lex() + '=' + lexer.match, "C=α * +"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=123"); // (!) not 'N=123' as ALNUM-based rule comes before DIGIT rule. + assert.equal(lexer.lex() + '=' + lexer.match, "C=.@_[]ε"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=E"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test nested macro expansion in regex set atoms with negating inner set", function() { + var dict = { + options: { + xregexp: false + }, + macros: { + "DIGIT": "[0-9]", + "ALPHA": "[a-zA-Z]", + "ALNUM": "[{DIGIT}{ALPHA}]|[{DIGIT}]", + "CTRL": "[^{ALNUM}]", + "WORD": "[BLUB:]|[^{CTRL}]", + "WORDS": "[{WORD}]+", + "DIGITS":"[{DIGIT}]+", + "WS": "[^\\S\\r\\n]", + "NONE": "[^\\W\\w]", + "ANY": "[\\W\\w]", + }, + rules: [ + ["π", "return 'PI';" ], + ["{CTRL}+", "return 'C';" ], + ["[{WORD}]+", "return 'Y';" ], + ["[{DIGIT}]+", "return 'N';" ], + [".", "return '?';" ], + ] + }; + var input = "πyα * +123.@_[]εE"; + + var lexer = new RegExpLexer(dict); + + var expandedMacros = lexer.getExpandedMacros(); + //console.log("MACROS:::::::::::::::", expandedMacros); + assert.equal(expandedMacros.DIGIT.in_set, '\\d'); + assert.equal(expandedMacros.ALPHA.in_set, 'A-Za-z'); + assert.equal(expandedMacros.ALNUM.in_set, '0-9A-Za-z'); + assert.equal(expandedMacros.ALNUM.elsewhere, '[^\\W_]|\\d'); /* [0-9A-Za-z]|[0-9] */ + assert.equal(expandedMacros.CTRL.in_set, '\\u0000-/:-@\\[-`{-\\uffff' /* '^0-9a-zA-Z' */ ); + assert.equal(expandedMacros.CTRL.elsewhere, '[\\W_]'); /* [^0-9A-Za-z] */ + assert.equal(expandedMacros.WORD.in_set, '0-:A-Za-z'); + assert.equal(expandedMacros.WORD.elsewhere, '[:BLU]|[^\\W_]'); + // Unicode Character 'LINE SEPARATOR' (U+2028) and Unicode Character 'PARAGRAPH SEPARATOR' (U+2029) must be explicitly encoded in \uNNNN + // syntax to prevent crashes when the generated lexer is compiled via `new Function()` as that one doesn't like it when you feed it + // regexes with these two characters embedded as is!
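+ // (to illustrate that hazard with a hypothetical snippet: a RegExp literal may not contain + // a raw line terminator, and JS treats U+2028/U+2029 as line terminators, so + // + //     new Function('return /' + String.fromCharCode(0x2028) + '/;');   // SyntaxError + //     new Function('return /\\u2028/;');                               // compiles fine + // + // hence the \\u2028 / \\u2029 spellings expected in the WS set below)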
+ assert.equal(expandedMacros.WS.in_set, '\\t\\v\\f \u00a0\u1680\u180e\u2000-\u200a\\u2028\\u2029\u202f\u205f\u3000\ufeff'); + assert.equal(expandedMacros.WS.elsewhere, '[^\\S\\n\\r]'); + assert.equal(expandedMacros.ANY.in_set, '\\S\\s'); + assert.equal(expandedMacros.ANY.elsewhere, '[\\S\\s]'); + assert.equal(expandedMacros.NONE.in_set, '^\\S\\s'); + assert.equal(expandedMacros.NONE.elsewhere, '[^\\S\\s]'); + assert.ok(expandedMacros.DIGITS.in_set instanceof Error); + assert.equal(expandedMacros.DIGITS.elsewhere, '\\d+'); + assert.ok(expandedMacros.WORDS.in_set instanceof Error); + assert.equal(expandedMacros.WORDS.elsewhere, '[\\d:A-Za-z]+'); + + lexer.setInput(input); + + assert.equal(lexer.lex(), "PI"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=y"); + assert.equal(lexer.lex() + '=' + lexer.match, "C=α * +"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=123"); // (!) not 'N=123' as ALNUM-based rule comes before DIGIT rule. + assert.equal(lexer.lex() + '=' + lexer.match, "C=.@_[]ε"); + assert.equal(lexer.lex() + '=' + lexer.match, "Y=E"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test Unicode Supplementary Plane detection in regex set atoms - part 1", function() { + var dict = { + options: { + xregexp: false + }, + macros: { + "ISSUE_A": "[\\t\\n\\r\\u0120-\\uD7FF\\uE000\\uFFFD]", // \\u10000-\\u10FFFF + "ISSUE_B": "[\\u001F-\\u002F]", // side test: proper processing of 'dash' as a *character* in a set. + "NOTISSUE": "[^{ISSUE_A}{ISSUE_B}XYZ]", // negating the inner set means we include the U.S.P. in NOTISSUE! + "NOTNOTISSUE": "[^{NOTISSUE}]", // while negating the *negated set* once again *excludes* the U.S.P. in NOTNOTISSUE! + }, + rules: [ + ["{ISSUE_A}+", "return 'A';" ], + ["{ISSUE_B}+", "return 'B';" ], +// ["{NOTISSUE}+", "return 'N';" ], + ["{NOTNOTISSUE}+", "return 'C';" ], + ["[{ISSUE_A}]+", "return 'X';" ], + ["[{ISSUE_B}]+", "return 'Y';" ], +// ["[{NOTISSUE}]+", "return 'W';" ], + ["[{NOTNOTISSUE}]+", "return 'Z';" ], + [".", "return '?';" ], + ] + }; + var input = "πXYZxyzα\u10000\u{0023}\u{1023}\u{10230}ε"; + + var lexer = new RegExpLexer(dict); + + var expandedMacros = lexer.getExpandedMacros(); + //console.log("MACROS:::::::::::::::", expandedMacros); + + // test the calculated regexes -- the 'sollwert' for the test takes `i2c()` encoding particulars into account: + assert.equal(expandedMacros.ISSUE_A.in_set, '\\t\\n\\r\u0120-\uD7FF\uE000\\ufffd'); + assert.equal(expandedMacros.ISSUE_A.elsewhere, '[\\t\\n\\r\u0120-\uD7FF\uE000\\ufffd]'); + assert.equal(expandedMacros.ISSUE_B.in_set, '\\u001f-\u002F'); + assert.equal(expandedMacros.ISSUE_B.elsewhere, '[\\u001f-\u002F]'); + assert.equal(expandedMacros.NOTISSUE.in_set, '\\u0000-\\b\\v\\f\\u000e-\\u001e0-W\\[-\u011f\\ud800-\\udfff\ue001-\\ufffc\\ufffe\\uffff'); + assert.equal(expandedMacros.NOTISSUE.elsewhere, '[^\\t\\n\\r\\u001f-\u002FX-Z\u0120-\uD7FF\uE000\\ufffd]'); + assert.equal(expandedMacros.NOTNOTISSUE.in_set, '\\t\\n\\r\\u001f-\u002FX-Z\u0120-\uD7FF\uE000\\ufffd'); + assert.equal(expandedMacros.NOTNOTISSUE.elsewhere, '[\\t\\n\\r\\u001f-\u002FX-Z\u0120-\uD7FF\uE000\\ufffd]'); + + lexer.setInput(input); + + assert.equal(lexer.lex() + '=' + lexer.match, "A=π"); + assert.equal(lexer.lex() + '=' + lexer.match, "C=XYZ"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=x"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=y"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=z"); + assert.equal(lexer.lex() + '=' + lexer.match, "A=α\u1000"); + assert.equal(lexer.lex() + '=' + lexer.match, 
"?=0"); + assert.equal(lexer.lex() + '=' + lexer.match, "B=\u0023"); + assert.equal(lexer.lex() + '=' + lexer.match, "A=\u1023"); + + // WARNING: as we don't support Extended Plane Unicode Codepoints + // (i.e. any input character beyond U+FFFF), you will + // observe that these characters, when fed to the lexer, MAY + // be split up in their individual UCS2 Character Codes. + // In this example U+10230 === UCS 0xD800 + UCS 0xDE30 + // ('UTF-16' encoding of U+10230) + + //assert.equal(lexer.lex() + '=' + lexer.match, "?=\uD800\uDE30"); // U+10230 + assert.equal(lexer.lex() + '=' + lexer.match, "?=\uD800"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=\uDE30"); + + assert.equal(lexer.lex() + '=' + lexer.match, "A=ε"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("test Unicode Supplementary Plane detection in regex set atoms - part 2 (XRegExp enabled)", function() { + var dict = { + options: { + xregexp: true + }, + macros: { + "ISSUE_A": "[\\t\\n\\r\\u0120-\\uD7FF\\uE000\\uFFFD]", // \\u10000-\\u10FFFF + "ISSUE_B": "[\\u001F-\\u002F]", // side test: proper processing of 'dash' as a *character* in a set. + "NOTISSUE": "[^{ISSUE_A}{ISSUE_B}XYZ]", // negating the inner set means we include the U.S.P. in NOTISSUE! + "NOTNOTISSUE": "[^{NOTISSUE}]", // while negating the *negated set* once again *excludes* the U.S.P. in NOTNOTISSUE! + }, + rules: [ + ["{ISSUE_A}+", "return 'A';" ], + ["{ISSUE_B}+", "return 'B';" ], +// ["{NOTISSUE}+", "return 'N';" ], + ["{NOTNOTISSUE}+", "return 'C';" ], + ["[{ISSUE_A}]+", "return 'X';" ], + ["[{ISSUE_B}]+", "return 'Y';" ], +// ["[{NOTISSUE}]+", "return 'W';" ], + ["[{NOTNOTISSUE}]+", "return 'Z';" ], + [".", "return '?';" ], + ] + }; + var input = "πXYZxyzα\u10000\u{0023}\u{1023}\u{10230}ε"; + + var lexer = new RegExpLexer(dict); + + var expandedMacros = lexer.getExpandedMacros(); + //console.log("MACROS:::::::::::::::", expandedMacros); + + // test the calculated regexes -- the 'sollwert' for the test takes `i2c()` encoding particulars into account: + assert.equal(expandedMacros.ISSUE_A.in_set, '\\t\\n\\r\u0120-\uD7FF\uE000\\ufffd'); + assert.equal(expandedMacros.ISSUE_A.elsewhere, '[\\t\\n\\r\u0120-\uD7FF\uE000\\ufffd]'); + assert.equal(expandedMacros.ISSUE_B.in_set, '\\u001f-\u002F'); + assert.equal(expandedMacros.ISSUE_B.elsewhere, '[\\u001f-\u002F]'); + assert.equal(expandedMacros.NOTISSUE.in_set, '\\u0000-\\b\\v\\f\\u000e-\\u001e0-W\\[-\u011f\\ud800-\\udfff\ue001-\\ufffc\\ufffe\\uffff'); + assert.equal(expandedMacros.NOTISSUE.elsewhere, '[^\\t\\n\\r\\u001f-\u002FX-Z\u0120-\uD7FF\uE000\\ufffd]'); + assert.equal(expandedMacros.NOTNOTISSUE.in_set, '\\t\\n\\r\\u001f-\u002FX-Z\u0120-\uD7FF\uE000\\ufffd'); + assert.equal(expandedMacros.NOTNOTISSUE.elsewhere, '[\\t\\n\\r\\u001f-\u002FX-Z\u0120-\uD7FF\uE000\\ufffd]'); + + lexer.setInput(input); + + assert.equal(lexer.lex() + '=' + lexer.match, "A=π"); + assert.equal(lexer.lex() + '=' + lexer.match, "C=XYZ"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=x"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=y"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=z"); + assert.equal(lexer.lex() + '=' + lexer.match, "A=α\u1000"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=0"); + assert.equal(lexer.lex() + '=' + lexer.match, "B=\u0023"); + assert.equal(lexer.lex() + '=' + lexer.match, "A=\u1023"); + + // WARNING: as we don't support Extended Plane Unicode Codepoints + // (i.e. 
any input character beyond U+FFFF), you will + // observe that these characters, when fed to the lexer, MAY + // be split up in their individual UCS2 Character Codes. + // In this example U+10230 === UCS 0xD800 + UCS 0xDE30 + // ('UTF-16' encoding of U+10230) + + //assert.equal(lexer.lex() + '=' + lexer.match, "?=\uD800\uDE30"); // U+10230 + assert.equal(lexer.lex() + '=' + lexer.match, "?=\uD800"); + assert.equal(lexer.lex() + '=' + lexer.match, "?=\uDE30"); + + assert.equal(lexer.lex() + '=' + lexer.match, "A=ε"); + assert.equal(lexer.lex(), lexer.EOF); + }); + + it("custom '<>' lexer rule must only fire once for end-of-input", function() { + var dict = [ + "%%", + "'x' {return 'X';}", + "<> {return 'CUSTOM_EOF';}", + ". {return yytext;}" + ].join('\n'); + + var input = "x<>"; + + var lexer = new RegExpLexer(dict); + lexer.setInput(input); + + assert.equal(lexer.lex(), "X"); + // side note: this particular input is also constructed to test/ensure + // that the lexer does not inadvertently match the literal '<>' + // input string with the *special* <> lexer rule token! + // + // In other words: if this next lex() call fails, we know we have a + // deep b0rk in the lex compiler (rule parser/recognizer)! + assert.equal(lexer.lex(), "<"); + assert.equal(lexer.lex(), "<"); + assert.equal(lexer.lex(), "E"); + assert.equal(lexer.lex(), "O"); + assert.equal(lexer.lex(), "F"); + assert.equal(lexer.lex(), ">"); + assert.equal(lexer.lex(), ">"); + assert.equal(lexer.lex(), "CUSTOM_EOF"); + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.lex(), lexer.EOF); + assert.equal(lexer.lex(), lexer.EOF); + }); + + // related to https://github.com/GerHobbelt/jison/issues/9 + it("test multiple independent lexer instances", function() { + var dict1 = { + rules: [ + ["x", "return 'X';" ], + ["y", "return 'Y';" ], + ["$", "return 'EOF';" ] + ] + }; + + var dict2 = { + rules: [ + ["a", "return 'A';" ], + ["b", "return 'B';" ], + ["$", "return 'EOF';" ] + ] + }; + + var input1 = "xxyx"; + var input2 = "aaba"; + + var lexer1 = new RegExpLexer(dict1, input1); + var lexer2 = new RegExpLexer(dict2, input2); + assert.equal(lexer1.lex(), "X"); + assert.equal(lexer2.lex(), "A"); + assert.equal(lexer1.lex(), "X"); + assert.equal(lexer2.lex(), "A"); + assert.equal(lexer1.lex(), "Y"); + assert.equal(lexer2.lex(), "B"); + assert.equal(lexer1.lex(), "X"); + assert.equal(lexer2.lex(), "A"); + assert.equal(lexer1.lex(), "EOF"); + assert.equal(lexer2.lex(), "EOF"); + }); + + // related to https://github.com/GerHobbelt/jison/issues/9 + it("test cloned yet independent lexer instances", function() { + var dict = { + rules: [ + ["x", "return 'X';" ], + ["y", "return 'Y';" ], + ["$", "return 'EOF';" ] + ] + }; + + var input1 = "xxyx"; + var input2 = "yyx"; + + var lexerBase = new RegExpLexer(dict /*, input1 */); + function MyLexerClass() { + this.yy = {}; + } + MyLexerClass.prototype = lexerBase; + + function mkLexer() { + return new MyLexerClass(); + } + + var lexer1 = mkLexer(); + lexer1.setInput(input1, { + one: true + }); + + var lexer2 = mkLexer(); + lexer2.setInput(input2, { + two: true + }); + + assert.equal(lexer1.lex(), "X"); + assert.equal(lexer2.lex(), "Y"); + assert.equal(lexer1.lex(), "X"); + assert.equal(lexer2.lex(), "Y"); + assert.equal(lexer1.lex(), "Y"); + assert.equal(lexer2.lex(), "X"); + assert.equal(lexer1.lex(), "X"); + assert.equal(lexer2.lex(), "EOF"); + assert.equal(lexer1.lex(), "EOF"); + // once you've gone 'past' EOF, you get the EOF **ID** returned, rather
than your custom EOF token. + // + // The `EOF` attribute is just a handy constant defined in the lexer prototype... + assert.equal(lexer2.lex(), lexerBase.EOF); + assert.equal(lexer1.lex(), lexerBase.EOF); + assert.equal(lexer1.EOF, lexerBase.EOF); + assert.equal(lexer2.EOF, lexerBase.EOF); + }); +}); + + +// prettyPrintRange() API +describe("prettyPrintRange() API", function () { + it("baseline - not invoking the API via any error report", function () { + var dict = [ + '%%', + '"a" %{ return true; %}', + '"b" %{ return 1; %}', + ].join('\n'); + var lexer = new RegExpLexer(dict); + var JisonLexerError = lexer.JisonLexerError; + assert(JisonLexerError); + + var input = "abab"; + + lexer.setInput(input); + assert.strictEqual(lexer.lex(), true); + assert.strictEqual(lexer.lex(), 1); + assert.strictEqual(lexer.lex(), true); + assert.strictEqual(lexer.lex(), 1); + + assert.strictEqual(lexer.lex(), lexer.EOF); + }); + + it("fails when lexer cannot parse the spec due to faulty indentation", function () { + var dict = [ + '%%', + // rule regex MUST start the line; indentation (incorrectly) indicates this is all 'action code': + ' "a" %{ return true; %}', + ' "b" %{ return 1; %}', + ].join('\n'); + + assert.throws(function () { + var lexer = new RegExpLexer(dict); + }, + Error, + /an error in one or more of your lexer regex rules/ + ); + }); + + it("fails when lexer cannot find the end of a rule's action code block (alt 1)", function () { + var dict = [ + '%%', + // %{...%} action code blocks can contain ANYTHING, so + // we won't find this error until we validate-parse-as-JS + // the collected first action's source code. + '"a" %{ return true; ', + '"b" %{ return 1; %}', + ].join('\n'); + + assert.throws(function () { + var lexer = new RegExpLexer(dict, null, null, { + dumpSourceCodeOnFailure: false + }); + }, + Error, + /The rule\'s action code section does not compile[^]*?\n Erroneous area:\n1: %%\n2: "a" %\{ return true; \n\^\.\.\.\.\.\.\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\n3: "b" %\{ return 1; %\}\n\^\.\.\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^/ + ); + }); + + it("fails when lexer cannot find the end of a rule's action code block (alt 2)", function () { + var dict = [ + '%%', + // %{...%} action code blocks can contain ANYTHING. + // Hence we won't find this error until we validate-parse-as-JS + // the entire generated lexer source code. + '"a" %{ return true; %}', + '"b" %{ return 1; ', + ].join('\n'); + + assert.throws(function () { + var lexer = new RegExpLexer(dict, null, null, { + dumpSourceCodeOnFailure: false + }); + }, + Error, + /Error: Lexical error on line 3:[^]*?missing 1 closing curly braces in lexer rule action block.[^]*?help jison grok more or less complex action code chunks.\n\n Erroneous area:\n1: %%\n2: "a" %{ return true; %}\n3: "b" %{ return 1;\s*\n\^\.\.\.\.\.\.\.\.\.\.\.\.\.\.\.\.\.\.\.\^/ + ); + }); + + it("fails when lexer finds an epilogue that's not parsable as JavaScript", function () { + var dict = [ + '%%', + '"a" %{ return true; %}', + '"b" %{ return 1; %}', + '%%', + '**This is gibberish!**', + ].join('\n'); + + assert.throws(function () { + var lexer = new RegExpLexer(dict, null, null, { + dumpSourceCodeOnFailure: false + }); + }, + Error, + /The extra lexer module code section \(a\.k\.a\.
'epilogue'\) does not compile[^]*?\n Erroneous area:\n1: %%\n2: "a" %\{ return true; %\}\n3: "b" %\{ return 1; %\}\n4: %%\n\^\.\.\.\.\^\n5: \*\*This is gibberish!\*\*\n\^\.\.\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^/ + ); + }); + + it("fails when lexer finds a %code section that's not parsable as JavaScript", function () { + var dict = [ + '%%', + '"a" %{ return true; %}', + '"b" %{ return 1; %}', + '%code bugger %{ **This is gibberish!** %}', + ].join('\n'); + + assert.throws(function () { + var lexer = new RegExpLexer(dict, null, null, { + dumpSourceCodeOnFailure: false + }); + }, + Error, + /There's probably an error in one or more of your lexer regex rules[^]*?\n Erroneous code:\n1: %%\n2: "a" %\{ return true; %\}\n3: "b" %\{ return 1; %\}\n4: %code bugger %\{ \*\*This is gibberish!\*\* %\}\n\^\.\.\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\^\n[^]*?\n Technical error report:\nParse error on line 4:[^]*?Expecting end of input, [^]*? got unexpected "INIT_CODE"/ + ); + }); +});
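+ + +// ---------------------------------------------------------------------------- +// For quick reference, a minimal sketch of the lexer API exercised throughout +// this suite (the grammar and input below are arbitrary examples): +// +//     var lexer = new RegExpLexer({ +//         rules: [ +//             ["[a-z]+", "return 'WORD';" ], +//             ["\\s+",   "/* skip whitespace */" ] +//         ] +//     }); +//     lexer.setInput("hello world"); +//     var tok; +//     while ((tok = lexer.lex()) !== lexer.EOF) { +//         console.log(tok, lexer.yytext, lexer.yylloc); +//     } +// ----------------------------------------------------------------------------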