From a2844c7fd79d88944ffac33e2bcdeb70d8a2c503 Mon Sep 17 00:00:00 2001
From: Ger Hobbelt <ger@hobbelt.com>
Date: Thu, 2 Feb 2017 23:36:50 +0100
Subject: [PATCH] synced the code generators with the ones in the jison tool:
 support all 4 modes: CommonJS, AMD, ES6 and vanilla JS; also taken the
 opportunity to give the lexer its own documentation comment chunk as is
 generated by jison for the parser at large.

---
 cli.js          |   3 +-
 regexp-lexer.js | 300 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 264 insertions(+), 39 deletions(-)

diff --git a/cli.js b/cli.js
index 9e5690e..3a33cfa 100755
--- a/cli.js
+++ b/cli.js
@@ -25,7 +25,8 @@ var opts = require('nomnom')
     abbr: 't',
     default: 'commonjs',
     metavar: 'TYPE',
-    help: 'The type of module to generate (commonjs, js)'
+    choices: ['commonjs', 'amd', 'js', 'es'],
+    help: 'The type of module to generate (commonjs, amd, es, js)'
   })
   .option('version', {
     abbr: 'V',
diff --git a/regexp-lexer.js b/regexp-lexer.js
index e936be5..e0b38e7 100644
--- a/regexp-lexer.js
+++ b/regexp-lexer.js
@@ -2754,12 +2754,20 @@ function processGrammar(dict, tokens, build_options) {
 function generateFromOpts(opt) {
     var code = '';
 
-    if (opt.moduleType === 'commonjs') {
-        code = generateCommonJSModule(opt);
-    } else if (opt.moduleType === 'amd') {
-        code = generateAMDModule(opt);
-    } else {
+    switch (opt.moduleType) {
+    case 'js':
         code = generateModule(opt);
+        break;
+    case 'amd':
+        code = generateAMDModule(opt);
+        break;
+    case 'es':
+        code = generateESModule(opt);
+        break;
+    case 'commonjs':
+    default:
+        code = generateCommonJSModule(opt);
+        break;
     }
 
     return code;
@@ -2887,62 +2895,278 @@ function generateModuleBody(opt) {
     return out;
 }
 
-function generateModule(opt) {
-    opt = opt || {};
+function generateGenericHeaderComment() {
+    var out = '/* lexer generated by jison-lex ' + version + ' */\n'
+        + '/*\n'
+        + ' * Returns a Lexer object of the following structure:\n'
+        + ' *\n'
+        + ' *  Lexer: {\n'
+        + ' *    yy: {}     The so-called "shared state" or rather the *source* of it;\n'
+        + ' *               the real "shared state" `yy` passed around to\n'
+        + ' *               the rule actions, etc. is a derivative/copy of this one,\n'
+        + ' *               not a direct reference!\n'
+        + ' *  }\n'
+        + ' *\n'
+        + ' *  Lexer.prototype: {\n'
+        + ' *    yy: {},\n'
+        + ' *    EOF: 1,\n'
+        + ' *    ERROR: 2,\n'
+        + ' *\n'
+        + ' *    JisonLexerError: function(msg, hash),\n'
+        + ' *\n'
+        + ' *    performAction: function lexer__performAction(yy, yy_, $avoiding_name_collisions, YY_START, ...),\n'
+        + ' *               where `...` denotes the (optional) additional arguments the user passed to\n'
+        + ' *               `lexer.lex(...)` and specified by way of `%parse-param ...` in the **parser** grammar file\n'
+        + ' *\n'
+        + ' *               The function parameters and `this` have the following value/meaning:\n'
+        + ' *               - `this`    : reference to the `lexer` instance.\n'
+        + ' *\n'
+        + ' *               - `yy`      : a reference to the `yy` "shared state" object which was passed to the lexer\n'
+        + ' *                             by way of the `lexer.setInput(str, yy)` API before.\n'
+        + ' *\n'
+        + ' *               - `yy_`     : lexer instance reference used internally.\n'
+        + ' *\n'
+        + ' *               - `$avoiding_name_collisions`   : index of the matched lexer rule (regex), used internally.\n'
+        + ' *\n'
+        + ' *               - `YY_START`: the current lexer "start condition" state.\n'
+        + ' *\n'
+        + ' *               - `...`     : the extra arguments you specified in the `%parse-param` statement in your\n'
+        + ' *                             **parser** grammar definition file and which are passed to the lexer via\n'
+        + ' *                             its `lexer.lex(...)` API.\n'
+        + ' *\n'
+        + ' *    parseError: function(str, hash),\n'
+        + ' *\n'
+        + ' *    constructLexErrorInfo: function(error_message, is_recoverable),\n'
+        + ' *               Helper function.\n'
+        + ' *               Produces a new errorInfo \'hash object\' which can be passed into `parseError()`.\n'
+        + ' *               See its use in this lexer kernel in many places; example usage:\n'
+        + ' *\n'
+        + ' *                   var infoObj = lexer.constructLexErrorInfo(\'fail!\', true);\n'
+        + ' *                   var retVal = lexer.parseError(infoObj.errStr, infoObj);\n'
+        + ' *\n'
+        + ' *    options: { ... lexer %options ... },\n'
+        + ' *\n'
+        + ' *    lex: function([args...]),\n'
+        + ' *               Produce one token of lexed input, which was passed in earlier via the `lexer.setInput()` API.\n'
+        + ' *               You MAY use the additional `args...` parameters as per `%parse-param` spec of the **parser** grammar:\n'
+        + ' *               these extra `args...` are passed verbatim to the lexer rules\' action code.\n'
+        + ' *\n'
+        + ' *    cleanupAfterLex: function(do_not_nuke_errorinfos),\n'
+        + ' *               Helper function.\n'
+        + ' *               This helper API is invoked when the parse process has completed. This helper may\n'
+        + ' *               be invoked by user code to ensure the internal lexer gets properly garbage collected.\n'
+        + ' *\n'
+        + ' *        setInput: function(input, [yy]),\n'
+        + ' *        input: function(),\n'
+        + ' *        unput: function(str),\n'
+        + ' *        more: function(),\n'
+        + ' *        reject: function(),\n'
+        + ' *        less: function(n),\n'
+        + ' *        pastInput: function(n),\n'
+        + ' *        upcomingInput: function(n),\n'
+        + ' *        showPosition: function(),\n'
+        + ' *        test_match: function(regex_match_array, rule_index),\n'
+        + ' *        next: function(...),\n'
+        + ' *        lex: function(...),\n'
+        + ' *        begin: function(condition),\n'
+        + ' *        pushState: function(condition),\n'
+        + ' *        popState: function(),\n'
+        + ' *        topState: function(),\n'
+        + ' *        _currentRules: function(),\n'
+        + ' *        stateStackSize: function(),\n'
+        + ' *\n'
+        + ' *        options: { ... lexer %options ... },\n'
+        + ' *\n'
+        + ' *        performAction: function(yy, yy_, $avoiding_name_collisions, YY_START, ...),\n'
+        + ' *        rules: [...],\n'
+        + ' *        conditions: {associative list: name ==> set},\n'
+        + ' *  }\n'
+        + ' *\n'
+        + ' *\n'
+        + ' *  token location info (`yylloc`): {\n'
+        + ' *    first_line: n,\n'
+        + ' *    last_line: n,\n'
+        + ' *    first_column: n,\n'
+        + ' *    last_column: n,\n'
+        + ' *    range: [start_number, end_number]\n'
+        + ' *               (where the numbers are indexes into the input string, zero-based)\n'
+        + ' *  }\n'
+        + ' *\n'
+        + ' * ---\n'
+        + ' *\n'
+        + ' * The parseError function receives a \'hash\' object with these members for lexer errors:\n'
+        + ' *\n'
+        + ' *  {\n'
+        + ' *    text:        (matched text)\n'
+        + ' *    token:       (the produced terminal token, if any)\n'
+        + ' *    token_id:    (the produced terminal token numeric ID, if any)\n'
+        + ' *    line:        (yylineno)\n'
+        + ' *    loc:         (yylloc)\n'
+        + ' *    recoverable: (boolean: TRUE when the parser MAY have an error recovery rule\n'
+        + ' *                  available for this particular error)\n'
+        + ' *    yy:          (object: the current parser internal "shared state" `yy`\n'
+        + ' *                  as is also available in the rule actions; this can be used,\n'
+        + ' *                  for instance, for advanced error analysis and reporting)\n'
+        + ' *    lexer:       (reference to the current lexer instance used by the parser)\n'
+        + ' *  }\n'
+        + ' *\n'
+        + ' * while `this` will reference the current lexer instance.\n'
+        + ' *\n'
+        + ' * When `parseError` is invoked by the lexer, the default implementation will\n'
+        + ' * attempt to invoke `yy.parser.parseError()`; when this callback is not provided\n'
+        + ' * it will try to invoke `yy.parseError()` instead. When that callback is also not\n'
+        + ' * provided, a `JisonLexerError` exception will be thrown containing the error\n'
+        + ' * message and hash, as constructed by the `constructLexErrorInfo()` API.\n'
+        + ' *\n'
+        + ' * ---\n'
+        + ' *\n'
+        + ' * You can specify lexer options by setting / modifying the `.options` object of your Lexer instance.\n'
+        + ' * These options are available:\n'
+        + ' *\n'
+        + ' * (Options are permanent.)\n'
+        + ' *  \n'
+        + ' *  yy: {\n'
+        + ' *      parseError: function(str, hash)\n'
+        + ' *                 optional: overrides the default `parseError` function.\n'
+        + ' *  }\n'
+        + ' *\n'
+        + ' *  lexer.options: {\n'
+        + ' *      pre_lex:  function()\n'
+        + ' *                 optional: is invoked before the lexer is invoked to produce another token.\n'
+        + ' *                 `this` refers to the Lexer object.\n'
+        + ' *      post_lex: function(token) { return token; }\n'
+        + ' *                 optional: is invoked when the lexer has produced a token `token`;\n'
+        + ' *                 this function can override the returned token value by returning another.\n'
+        + ' *                 When it does not return any (truthy) value, the lexer will return\n'
+        + ' *                 the original `token`.\n'
+        + ' *                 `this` refers to the Lexer object.\n'
+        + ' *\n'
+        + ' * WARNING: the next set of options are not meant to be changed. They echo the abilities of\n'
+        + ' * the lexer as per when it was compiled!\n'
+        + ' *\n'
+        + ' *      ranges: boolean\n'
+        + ' *                 optional: `true` ==> token location info will include a .range[] member.\n'
+        + ' *      flex: boolean\n'
+        + ' *                 optional: `true` ==> flex-like lexing behaviour where the rules are tested\n'
+        + ' *                 exhaustively to find the longest match.\n'
+        + ' *      backtrack_lexer: boolean\n'
+        + ' *                 optional: `true` ==> lexer regexes are tested in order and for each matching regex the action code is invoked;\n'
+        + ' *                 the lexer terminates the scan when a token is returned by the action code.\n'
+        + ' *      xregexp: boolean\n'
+        + ' *                 optional: `true` ==> lexer rule regexes are "extended regex format" requiring the\n'
+        + ' *                 `XRegExp` library. When this %option has not been specified at compile time, all lexer\n'
+        + ' *                 rule regexes have been written as standard JavaScript RegExp expressions.\n'
+        + ' *  }\n'
+        + ' */\n';
 
-    var out = ['/* generated by jison-lex ' + version + ' */'];
-    var moduleName = opt.moduleName || 'lexer';
+    return out;
+}
 
-    out.push('var ' + moduleName + ' = (function () {');
-    out.push(jisonLexerErrorDefinition);
-    out.push(generateModuleBody(opt));
+function prepareOptions(opt) {
+    opt = opt || {};
 
-    if (opt.moduleInclude) {
-        out.push(opt.moduleInclude + ';');
+    // check for illegal identifier
+    if (!opt.moduleName || !opt.moduleName.match(/^[a-zA-Z_$][a-zA-Z0-9_$\.]*$/)) {
+        if (opt.moduleName) {
+            var msg = 'WARNING: The specified moduleName "' + opt.moduleName + '" is illegal (only characters [a-zA-Z0-9_$] and "." dot are accepted); using the default moduleName "lexer" instead.';
+            if (typeof opt.warn_cb === 'function') {
+                opt.warn_cb(msg);
+            } else {
+                // do not treat as warning; barf hairball instead so that this oddity gets noticed right away!
+                throw new Error(msg);
+            }
+        }
+        opt.moduleName = 'lexer';
     }
+    return opt;
+};
+
+function generateModule(opt) {
+    opt = prepareOptions(opt);
 
-    out.push(
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var ' + opt.moduleName + ' = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
         'return lexer;',
         '})();'
-    );
+    ];
 
     return out.join('\n');
 }
 
 function generateAMDModule(opt) {
-    opt = opt || {};
+    opt = prepareOptions(opt);
 
-    var out = ['/* generated by jison-lex ' + version + ' */'];
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'define([], function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '});'
+    ];
 
-    out.push('define([], function () {');
-    out.push(jisonLexerErrorDefinition);
-    out.push(generateModuleBody(opt));
+    return out.join('\n');
+}
 
-    if (opt.moduleInclude) {
-        out.push(opt.moduleInclude + ';');
-    }
+function generateESModule(opt) {
+    opt = prepareOptions(opt);
 
-    out.push(
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var lexer = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
         'return lexer;',
-        '});'
-    );
+        '})();',
+        '',
+        'export {lexer};'
+    ];
 
     return out.join('\n');
-}
+};
 
 function generateCommonJSModule(opt) {
-    opt = opt || {};
+    opt = prepareOptions(opt);
 
-    var out = [];
-    var moduleName = opt.moduleName || 'lexer';
+    var out = [
+        generateGenericHeaderComment(),
+        '',
+        'var ' + opt.moduleName + ' = (function () {',
+        jisonLexerErrorDefinition,
+        '',
+        generateModuleBody(opt),
+        '',
+        (opt.moduleInclude ? opt.moduleInclude + ';' : ''),
+        '',
+        'return lexer;',
+        '})();',
+        '',
+        'if (typeof require !== \'undefined\' && typeof exports !== \'undefined\') {',
+        '  exports.lexer = ' + opt.moduleName + ';',
+        '  exports.lex = function () {',
+        '    return ' + opt.moduleName + '.lex.apply(' + opt.moduleName + ', arguments);',   // `this` = exported instance; a bare `lexer` is out of scope here unless moduleName is 'lexer'
+        '  };',
+        '}'
+    ];
 
-    out.push(
-        generateModule(opt),
-        'exports.lexer = ' + moduleName + ';',
-        'exports.lex = function () {',
-        ' return ' + moduleName + '.lex.apply(lexer, arguments);',
-        '};'
-    );
     return out.join('\n');
 }