From e2c2ca7a0bf64eeccb48fdb4be004ac6fdd91f7b Mon Sep 17 00:00:00 2001 From: Guido van Helvoort Date: Sat, 3 Jun 2017 14:23:18 -0700 Subject: [PATCH 1/2] Feature appendSectionLabelsWithParent option. False by default. When turned on, it the parse function will not just use the header of a section as the key in the map, but if there is a parent header that has no text of itself, the key will be ammended to reflect Parent Header Name : Section Name --- src/index.js | 63 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/src/index.js b/src/index.js index 32a70727..afe11e8a 100644 --- a/src/index.js +++ b/src/index.js @@ -23,7 +23,8 @@ var wtf_wikipedia = (function() { // options var defaultParseOptions = { - ignoreLists: true + ignoreLists: true, + appendSectionLabelsWithParent: false }; //some xml elements are just junk, and demand full inglorious death by regular exp @@ -114,7 +115,28 @@ var wtf_wikipedia = (function() { var output = new Map(); var lines = wiki.replace(/\r/g, '').split(/\n/); var section = 'Intro'; + var sectionStack = []; // only relevant when appendSectionLabelsWithParent === true var number = 1; + // Turns = Intro = into 1, == Summary == into 2 etc; + var countHeaderNumber = function (section) { + var aSection = section.match(/^={1,5}/); + + if (Array.isArray(aSection) && aSection.length !== 0) { + return aSection[0].length; + } else { + return null; + } + } + + var isEmptyParentSection = function (section, potentialParent) { + if (countHeaderNumber(section.sectionNameWithEquals) - 1 === countHeaderNumber(potentialParent.sectionNameWithEquals)) { + return !potentialParent.hasText; + } else { + return false; + } + return + } + lines.forEach(function(part) { if (!section) { return; @@ -147,7 +169,10 @@ var wtf_wikipedia = (function() { //headings var ban_headings = new RegExp('^ ?(' + i18n.sources.join('|') + ') ?$', 'i'); //remove things like 'external links' if (part.match(/^={1,5}[^=]{1,200}={1,5}$/)) { + var sectionNameWithEquals; + section = part.match(/^={1,5}([^=]{1,200}?)={1,5}$/) || []; + sectionNameWithEquals = section[0]; // used to keep track how deep this section is section = section[1] || ''; section = section.replace(/\./g, ' '); // this is necessary for mongo, i'm sorry section = helpers.trim_whitespace(section); @@ -155,19 +180,49 @@ var wtf_wikipedia = (function() { if (section && section.match(ban_headings)) { section = undefined; } + + // helps keep track who the parent section is, in case options.appendSectionLabelsWithParent === true + sectionStack.push({ + sectionNameWithEquals: sectionNameWithEquals, + name: section, + hasText: false + }); + return; } + + var sectionLabel = section; + + // Potential to expand the section label, if the option is turned on and the right circumstances apply + if (options.appendSectionLabelsWithParent === true) { + // We've made it to content text, mark that the last section has text (and will not be used as a parent marker) + if (sectionStack.length > 0) { + sectionStack[sectionStack.length - 1].hasText = true; + } + + // Don't get influenced by siblings, remove the siblings from the stack till we find a parent node + while (sectionStack.length > 1 && countHeaderNumber(sectionStack[sectionStack.length - 1].sectionNameWithEquals) === countHeaderNumber(sectionStack[sectionStack.length - 2].sectionNameWithEquals)) { + sectionStack.splice(-2, 1); + } + + // Check our previous (now) non-sibling node, is it without content text and exactly one level up? Then append the section label with it + if (options.appendSectionLabelsWithParent === true && sectionStack.length > 1 && isEmptyParentSection(sectionStack[sectionStack.length - 1], sectionStack[sectionStack.length - 2])) { + sectionLabel = sectionStack[sectionStack.length - 2].name + " : " + sectionStack[sectionStack.length - 1].name; + } + } + //still alive, add it to the section sentence_parser(part).forEach(function(line) { line = parse_line(line); + if (line && line.text) { // if (!output[section]) { - if (!output.get(section)) { + if (!output.get(sectionLabel)) { // output[section] = []; - output.set(section, []); + output.set(sectionLabel, []); } // output[section].push(line); - output.get(section).push(line); + output.get(sectionLabel).push(line); } }); }); From 2bcd4bd502411bede07afac5ffecbaf5db64b4b0 Mon Sep 17 00:00:00 2001 From: Guido van Helvoort Date: Sat, 3 Jun 2017 15:09:42 -0700 Subject: [PATCH 2/2] Updated readme for appendSectionLabelsWithParent option --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4ca6476c..a1d8760d 100644 --- a/README.md +++ b/README.md @@ -58,9 +58,12 @@ its a combination of [instaview](https://en.wikipedia.org/wiki/User:Pilaf/InstaV ## **.parse(markup, options)** turns wikipedia markup into a nice json object -options is optional. The only option currently supported is 'ignoreLists' which defaults to true. +options is optional. The options supported are +* 'ignoreLists' which defaults to true. +* 'appendSectionLabelsWithParent' which defaults to false. When turned on, the parse function will not just use the header of a section as the key in the map, but if there is a parent header that has no text of itself, the key will be amended to reflect Parent Header Name : Section Name" + ```javascript -wtf_wikipedia.parse(someWikiScript, { ignoreLists: false }) +wtf_wikipedia.parse(someWikiScript, { ignoreLists: false, appendSectionLabelsWithParent: true }) // {text:[...], infobox:{}, categories:[...], images:[] } ```