Skip to content

Commit

Permalink
feat(TableParser): Add getCleanMatrix() method (#113)
Browse files Browse the repository at this point in the history
## Problem

As seen in https://github.com/adrienjoly/npm-pdfreader-example/blob/master/parseTable.js, it's complicated to render a table that was parsed from a PDF file using `TableParser`.

The existing `getMatrix()` method returns a 3-dimensional matrix instead of a 2-dimensional one, because there can be more than one textual item per column (e.g. when a word is split into 2 items, for some reason).

## Proposed solution

Add a `getCleanMatrix()` method that returns a 2-dimension matrix that can be used with `console.table()`.

## Example of use

```js
    // the thresholds were determined manually, based on the horizontal position (x) for column headers
    const colThresholds = [6.8, 9.5, 13.3, 16.7, 18.4, 28, 32, 36, Infinity];

    const columnQuantitizer = (item) => {
      return colThresholds.findIndex(
        (colThreshold) => parseFloat(item.x) < colThreshold
      );
    };

    const table = new lib.TableParser();
    new PdfReader().parseFileItems("./test/sample-table.pdf", (err, item) => {
      if (err) console.error(err);
      else if (!item) {
        console.table(table.getCleanMatrix({ collisionSeparator: "" })); // 👈
      } else if (item.text) {
        table.processItem(item, columnQuantitizer(item));
      }
    });
```

## Result

As displayed with `console.table(cleanMatrix)`:

```
┌─────────┬───────────────────┬───────────┬──────────────┬──────────────┬─────────┬──────────────────────────┬─────────────┬─────────────┬────────┐
│ (index) │         0         │     1     │      2       │      3       │    4    │            5             │      6      │      7      │   8    │
├─────────┼───────────────────┼───────────┼──────────────┼──────────────┼─────────┼──────────────────────────┼─────────────┼─────────────┼────────┤
│    0    │     'Version'     │   'LTS'   │    'Date'    │     'V8'     │  'npm'  │ 'NODE_MODULE_VERSION[1]' │             │             │        │
│    1    │ 'Node.js 17.1.0'  │           │ '2021-11-09' │ '9.5.172.25' │ '8.1.2' │          '102'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│    2    │ 'Node.js 17.0.1'  │           │ '2021-10-20' │ '9.5.172.21' │ '8.1.0' │          '102'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│    3    │ 'Node.js 17.0.0'  │           │ '2021-10-19' │ '9.5.172.21' │ '8.1.0' │          '102'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│    4    │ 'Node.js 16.14.2' │ 'Gallium' │ '2022-03-17' │ '9.4.146.24' │ '8.5.0' │           '93'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│    5    │ 'Node.js 16.14.1' │ 'Gallium' │ '2022-03-16' │ '9.4.146.24' │ '8.5.0' │           '93'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│    6    │ 'Node.js 16.14.0' │ 'Gallium' │ '2022-02-08' │ '9.4.146.24' │ '8.3.1' │           '93'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│    7    │ 'Node.js 16.13.2' │ 'Gallium' │ '2022-01-10' │ '9.4.146.24' │ '8.1.2' │           '93'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│    8    │ 'Node.js 16.13.1' │ 'Gallium' │ '2021-12-01' │ '9.4.146.24' │ '8.1.2' │           '93'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│    9    │ 'Node.js 16.13.0' │ 'Gallium' │ '2021-10-26' │ '9.4.146.19' │ '8.1.0' │           '93'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
│   10    │ 'Node.js 16.12.0' │           │ '2021-10-20' │ '9.4.146.19' │ '8.1.0' │           '93'           │ 'Downloads' │ 'Changelog' │ 'Docs' │
└─────────┴───────────────────┴───────────┴──────────────┴──────────────┴─────────┴──────────────────────────┴─────────────┴─────────────┴────────┘
```
  • Loading branch information
adrienjoly authored Mar 25, 2022
1 parent 7dcf2e3 commit 281eb70
Show file tree
Hide file tree
Showing 6 changed files with 12,976 additions and 0 deletions.
14 changes: 14 additions & 0 deletions lib/TableParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ function getSortedXValues(rows) {
return sortAsFloatValues(Object.keys(xSet));
}

/** @returns a 3-dimensional matrix: row -> column -> items_colliding_in_column -> item */
TableParser.prototype.getMatrix = function () {
var rows = this.getRows();
var xValues = getSortedXValues(rows);
Expand All @@ -79,6 +80,19 @@ TableParser.prototype.getMatrix = function () {
});
};

/**
 * Builds a matrix of plain strings from the result of getMatrix(),
 * suitable for passing to console.table().
 * Each cell is the concatenation of the `text` of every item found in that cell.
 * @param {Object} [options]
 * @param {String} [options.collisionSeparator] string inserted between the texts
 *   of items that share a cell; any falsy value (including omission) means ""
 * @returns a 2-dimension matrix: row -> column -> joined text
 */
TableParser.prototype.getCleanMatrix = function ({ collisionSeparator } = {}) {
  // Resolve the separator once; a falsy value falls back to the empty string.
  const separator = collisionSeparator || "";

  // Collapse all the items of one cell into a single string.
  const joinCellTexts = (cellItems) => {
    const texts = cellItems.map((cellItem) => cellItem.text);
    return texts.join(separator);
  };

  const matrix = this.getMatrix();
  return matrix.map((row) => row.map((cellItems) => joinCellTexts(cellItems)));
};

/**
 * Reads the `text` property of an item.
 * @param item object carrying a `text` property
 * @returns the value of `item.text`
 */
function getText(item) {
  const { text } = item;
  return text;
}
Expand Down
Loading

0 comments on commit 281eb70

Please sign in to comment.