working on #29: fix crash in jison when reporting an error on an epsilon rule (which has no location info); add / introduce the `lexer::deriveLocationInfo()` API to help you & us construct a more-or-less useful/sane location info object from the surrounding context when the requested location info itself is not available.
GerHobbelt · Nov 9, 2017 · 7976bab · 7976bab
1 parent eade44d
commit 7976bab
Show file tree

Hide file tree

Showing 2 changed files with 168 additions and 4 deletions.
diff --git a/packages/ebnf-parser/bnf.y b/packages/ebnf-parser/bnf.y
@@ -631,7 +631,7 @@ handle_action
                         You cannot specify a precedence override for an epsilon (a.k.a. empty) rule!
 
                           Erroneous area:
-                        ${yylexer.prettyPrintRange(@handle)}
+                        ${yylexer.prettyPrintRange(@handle, @0, @action /* @handle is very probably NULL! We need this one for some decent location info! */)}
                     `);
                 }
                 $$.push($prec);

diff --git a/packages/jison-lex/regexp-lexer.js b/packages/jison-lex/regexp-lexer.js
@@ -1874,6 +1874,115 @@ return `{
         return pre + this.upcomingInput(maxPostfix).replace(/\\s/g, ' ') + '\\n' + c + '^';
     },
 
+    /**
+     * return an YYLLOC info object derived off the given context (actual, preceding, following, current).
+     * Use this method when the given \`actual\` location is not guaranteed to exist (i.e. when
+     * it MAY be NULL) and you MUST have a valid location info object anyway:
+     * then we take the given context of the \`preceding\` and \`following\` locations, IFF those are available,
+     * and reconstruct the \`actual\` location info from those.
+     * If this fails, the heuristic is to take the \`current\` location, IFF available.
+     * If this fails as well, we assume the sought location is at/around the current lexer position
+     * and then produce that one as a response. DO NOTE that these heuristic/derived location info
+     * values MAY be inaccurate!
+     *
+     * NOTE: \`deriveLocationInfo()\` ALWAYS produces a location info object *copy* of \`actual\`, not just
+     * a *reference* hence all input location objects can be assumed to be 'constant' (function has no side-effects).
+     * 
+     * @public
+     * @this {RegExpLexer}
+     */
+    deriveLocationInfo: function lexer_deriveYYLLOC(actual, preceding, following, current) {
+        var loc = {
+            first_line: 1,
+            first_column: 0,
+            last_line: 1,
+            last_column: 0,
+
+            range: [0, 0]
+        };
+        if (actual) {
+            loc.first_line = actual.first_line | 0;
+            loc.last_line = actual.last_line | 0;
+            loc.first_column = actual.first_column | 0;
+            loc.last_column = actual.last_column | 0;
+
+            if (actual.range) {
+                loc.range[0] = actual.range[0] | 0; 
+                loc.range[1] = actual.range[1] | 0;
+            } 
+        }
+        if (loc.first_line <= 0 || loc.last_line < loc.first_line) {
+            // plan B: heuristic using preceding and following:
+            if (loc.first_line <= 0 && preceding) {
+                loc.first_line = preceding.last_line | 0;
+                loc.first_column = preceding.last_column | 0;
+
+                if (preceding.range) {
+                    loc.range[0] = actual.range[1] | 0; 
+                } 
+            }
+
+            if ((loc.last_line <= 0 || loc.last_line < loc.first_line) && following) {
+                loc.last_line = following.first_line | 0;
+                loc.last_column = following.first_column | 0;
+
+                if (following.range) {
+                    loc.range[1] = actual.range[0] | 0; 
+                } 
+            }
+
+            // plan C?: see if the 'current' location is useful/sane too:
+            if (loc.first_line <= 0 && current && (loc.last_line <= 0 || current.last_line <= loc.last_line)) {
+                loc.first_line = current.first_line | 0;
+                loc.first_column = current.first_column | 0;
+
+                if (current.range) {
+                    loc.range[0] = current.range[0] | 0; 
+                } 
+            }
+
+            if (loc.last_line <= 0 && current && (loc.first_line <= 0 || current.first_line >= loc.first_line)) {
+                loc.last_line = current.last_line | 0;
+                loc.last_column = current.last_column | 0;
+
+                if (current.range) {
+                    loc.range[1] = current.range[1] | 0; 
+                } 
+            }
+        }
+        // sanitize: fix last_line BEFORE we fix first_line as we use the 'raw' value of the latter
+        // or plan D heuristics to produce a 'sensible' last_line value:
+        if (loc.last_line <= 0) {
+            if (loc.first_line <= 0) {
+                loc.first_line = this.yylloc.first_line;
+                loc.last_line = this.yylloc.last_line;
+                loc.first_column = this.yylloc.first_column;
+                loc.last_column = this.yylloc.last_column;
+
+                loc.range[0] = this.yylloc.range[0];
+                loc.range[1] = this.yylloc.range[1];
+            } else {
+                loc.last_line = this.yylloc.last_line;
+                loc.last_column = this.yylloc.last_column;
+
+                loc.range[1] = this.yylloc.range[1];
+            }
+        }
+        if (loc.first_line <= 0) {
+            loc.first_line = loc.last_line;
+            loc.first_column = 0; // loc.last_column;
+
+            loc.range[1] = loc.range[0];
+        }
+        if (loc.first_column < 0) {
+            loc.first_column = 0;
+        }
+        if (loc.last_column < 0) {
+            loc.last_column = (loc.first_column > 0 ? loc.first_column : 80);
+        }
+        return loc;
+    },
+
     /**
      * return a string which displays the lines & columns of input which are referenced 
      * by the given location info range, plus a few lines of context.
@@ -1920,13 +2029,12 @@ return `{
      * @this {RegExpLexer}
      */
     prettyPrintRange: function lexer_prettyPrintRange(loc, context_loc, context_loc2) {
-        var error_size = loc.last_line - loc.first_line;
+        loc = this.deriveLocationInfo(loc, context_loc, context_loc2);        
         const CONTEXT = 3;
         const CONTEXT_TAIL = 1;
         const MINIMUM_VISIBLE_NONEMPTY_LINE_COUNT = 2;
         var input = this.matched + this._input;
         var lines = input.split('\\n');
-        //var show_context = (error_size < 5 || context_loc);
         var l0 = Math.max(1, (context_loc ? context_loc.first_line : loc.first_line - CONTEXT));
         var l1 = Math.max(1, (context_loc2 ? context_loc2.last_line : loc.last_line + CONTEXT_TAIL));
         var lineno_display_width = (1 + Math.log10(l1 | 1) | 0);
@@ -2250,21 +2358,77 @@ return `{
     lex: function lexer_lex() {
         var r;
         // allow the PRE/POST handlers set/modify the return token for maximum flexibility of the generated lexer:
+        if (typeof this.pre_lex === 'function') {
+            r = this.pre_lex.call(this, 0);    // the lexer's own hook goes first and is passed 0 as its argument
+        }
         if (typeof this.options.pre_lex === 'function') {
-            r = this.options.pre_lex.call(this);
+            // (also account for a userdef function which does not return any value: keep the token as is)
+            r = this.options.pre_lex.call(this, r) || r;    // is passed whatever the previous hook produced
+        }
+        if (this.yy && typeof this.yy.pre_lex === 'function') {
+            // (also account for a userdef function which does not return any value: keep the token as is)
+            r = this.yy.pre_lex.call(this, r) || r;    // a truthy \`r\` at this point skips the next() loop below
         }
 
         while (!r) {
             r = this.next();
         }
 
+        if (this.yy && typeof this.yy.post_lex === 'function') {    // post hooks run in the reverse order of the pre hooks: yy, options, lexer
+            // (also account for a userdef function which does not return any value: keep the token as is)
+            r = this.yy.post_lex.call(this, r) || r;
+        }
         if (typeof this.options.post_lex === 'function') {
             // (also account for a userdef function which does not return any value: keep the token as is)
             r = this.options.post_lex.call(this, r) || r;
         }
+        if (typeof this.post_lex === 'function') {
+            // (also account for a userdef function which does not return any value: keep the token as is)
+            r = this.post_lex.call(this, r) || r;
+        }
+        return r;
+    },
+
+    /**
+     * return next match that has a token. Identical to the \`lex()\` API but does not invoke any of the 
+     * \`pre_lex()\` nor any of the \`post_lex()\` callbacks.
+     * 
+     * @public
+     * @this {RegExpLexer}
+     */
+    fastLex: function lexer_fastLex() {
+        var r;
+
+        while (!r) {
+            r = this.next();
+        }
+
         return r;
     },
 
+    /**
+     * return info about the lexer state that can help a parser or other lexer API user to use the
+     * most efficient means available. This API is provided to aid run-time performance for larger
+     * systems which employ this lexer.
+     * 
+     * @public
+     * @this {RegExpLexer}
+     */
+    canIUse: function lexer_canIUse() {
+        var rv = {
+            fast_lex: !(
+                typeof this.pre_lex === 'function' ||
+                typeof this.options.pre_lex === 'function' ||
+                (this.yy && typeof this.yy.pre_lex === 'function') ||
+                (this.yy && typeof this.yy.post_lex === 'function') ||
+                typeof this.options.post_lex === 'function' ||
+                typeof this.post_lex === 'function'
+            ),
+        };
+        return r;
+    },
+
+
     /**
      * backwards compatible alias for \`pushState()\`;
      * the latter is symmetrical with \`popState()\` and we advise to use