Skip to content

Commit

Permalink
Merge pull request #55 from moustachedelait/feature/appendSectionLabe…
Browse files Browse the repository at this point in the history
…lsWithParent

Feature/append section labels with parent
  • Loading branch information
spencermountain authored Jun 5, 2017
2 parents ab95743 + 2bcd4bd commit 7f5468a
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 6 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,12 @@ its a combination of [instaview](https://en.wikipedia.org/wiki/User:Pilaf/InstaV
## **.parse(markup, options)**
turns wikipedia markup into a nice json object

options is optional. The only option currently supported is 'ignoreLists' which defaults to true.
options is optional. The supported options are:
* 'ignoreLists' which defaults to true.
* 'appendSectionLabelsWithParent' which defaults to false. When turned on, the parse function does not always use a section's header on its own as the key in the map: if the header directly above it (one level up) has no text of its own, the key becomes "Parent Header Name : Section Name".

```javascript
wtf_wikipedia.parse(someWikiScript, { ignoreLists: false })
wtf_wikipedia.parse(someWikiScript, { ignoreLists: false, appendSectionLabelsWithParent: true })
// {text:[...], infobox:{}, categories:[...], images:[] }
```

Expand Down
63 changes: 59 additions & 4 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ var wtf_wikipedia = (function() {

// options
var defaultParseOptions = {
ignoreLists: true
ignoreLists: true,
appendSectionLabelsWithParent: false
};

//some xml elements are just junk, and demand full inglorious death by regular exp
Expand Down Expand Up @@ -114,7 +115,28 @@ var wtf_wikipedia = (function() {
var output = new Map();
var lines = wiki.replace(/\r/g, '').split(/\n/);
var section = 'Intro';
var sectionStack = []; // only relevant when appendSectionLabelsWithParent === true
var number = 1;
/**
 * Reports the depth of a wiki heading by counting its leading '=' characters,
 * e.g. '= Intro =' gives 1 and '== Summary ==' gives 2.
 * At most five leading '=' are counted.
 *
 * @param {string} section - heading text including its '=' markers
 * @returns {?number} number of leading '=' (1 to 5), or null when the text
 *   does not start with '='
 */
var countHeaderNumber = function (section) {
  var leadingEquals = section.match(/^={1,5}/);

  // a non-global match is either null or an array holding the matched run
  if (!Array.isArray(leadingEquals) || leadingEquals.length === 0) {
    return null;
  }
  return leadingEquals[0].length;
};

/**
 * Tells whether `potentialParent` is a heading exactly one level above
 * `section` that has not been marked as having text.
 *
 * Depth is measured with countHeaderNumber on each entry's
 * `sectionNameWithEquals`. When that returns null for either entry the
 * level comparison fails and the result is false.
 *
 * @param {{sectionNameWithEquals: string}} section - the heading being labelled
 * @param {{sectionNameWithEquals: string, hasText: boolean}} potentialParent -
 *   the heading that may serve as its parent
 * @returns {boolean} true only when the parent is exactly one level up and
 *   its `hasText` flag is falsy
 */
var isEmptyParentSection = function (section, potentialParent) {
  var sectionDepth = countHeaderNumber(section.sectionNameWithEquals);
  var parentDepth = countHeaderNumber(potentialParent.sectionNameWithEquals);

  // anything other than a direct (one level up) parent does not qualify
  if (sectionDepth - 1 !== parentDepth) {
    return false;
  }
  return !potentialParent.hasText;
};

lines.forEach(function(part) {
if (!section) {
return;
Expand Down Expand Up @@ -147,27 +169,60 @@ var wtf_wikipedia = (function() {
//headings
var ban_headings = new RegExp('^ ?(' + i18n.sources.join('|') + ') ?$', 'i'); //remove things like 'external links'
if (part.match(/^={1,5}[^=]{1,200}={1,5}$/)) {
var sectionNameWithEquals;

section = part.match(/^={1,5}([^=]{1,200}?)={1,5}$/) || [];
sectionNameWithEquals = section[0]; // used to keep track how deep this section is
section = section[1] || '';
section = section.replace(/\./g, ' '); // this is necessary for mongo, i'm sorry
section = helpers.trim_whitespace(section);
//ban some sections
if (section && section.match(ban_headings)) {
section = undefined;
}

// helps keep track who the parent section is, in case options.appendSectionLabelsWithParent === true
sectionStack.push({
sectionNameWithEquals: sectionNameWithEquals,
name: section,
hasText: false
});

return;
}

var sectionLabel = section;

// Potential to expand the section label, if the option is turned on and the right circumstances apply
if (options.appendSectionLabelsWithParent === true) {
// We've made it to content text, mark that the last section has text (and will not be used as a parent marker)
if (sectionStack.length > 0) {
sectionStack[sectionStack.length - 1].hasText = true;
}

// Don't get influenced by siblings, remove the siblings from the stack till we find a parent node
while (sectionStack.length > 1 && countHeaderNumber(sectionStack[sectionStack.length - 1].sectionNameWithEquals) === countHeaderNumber(sectionStack[sectionStack.length - 2].sectionNameWithEquals)) {
sectionStack.splice(-2, 1);
}

// Check our previous (now) non-sibling node, is it without content text and exactly one level up? Then append the section label with it
if (options.appendSectionLabelsWithParent === true && sectionStack.length > 1 && isEmptyParentSection(sectionStack[sectionStack.length - 1], sectionStack[sectionStack.length - 2])) {
sectionLabel = sectionStack[sectionStack.length - 2].name + " : " + sectionStack[sectionStack.length - 1].name;
}
}

//still alive, add it to the section
sentence_parser(part).forEach(function(line) {
line = parse_line(line);

if (line && line.text) {
// if (!output[section]) {
if (!output.get(section)) {
if (!output.get(sectionLabel)) {
// output[section] = [];
output.set(section, []);
output.set(sectionLabel, []);
}
// output[section].push(line);
output.get(section).push(line);
output.get(sectionLabel).push(line);
}
});
});
Expand Down

0 comments on commit 7f5468a

Please sign in to comment.