From 76b858535e1d264e345a0db23dedf3be2bef4f43 Mon Sep 17 00:00:00 2001 From: Sam Plackett <60177449+samplackett@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:15:59 +0000 Subject: [PATCH] Ignore non-printable characters (#60) --- package.json | 2 +- src/loaders/csvloader.js | 11 ++++++-- test/loaders/csvLoader.test.js | 47 +++++++++++++++++++++++++++++----- 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/package.json b/package.json index f462ba8..72c2037 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ffc-pay-etl-framework", - "version": "1.1.3", + "version": "1.1.4", "publisher": "Defra", "main": "dist/cjs/index.js", "private": false, diff --git a/src/loaders/csvloader.js b/src/loaders/csvloader.js index 6404fa8..670848b 100644 --- a/src/loaders/csvloader.js +++ b/src/loaders/csvloader.js @@ -2,7 +2,6 @@ const fs = require("fs") const { Transform } = require("stream") const { parse } = require("csv-parse") -const { stdout, stderr } = require("process") /** * @@ -24,7 +23,15 @@ function CSVLoader(options){ transform(chunk, _, callback){ chunk["_columns"] = options.columns chunk["_linecount"] = lineCount - lineCount +=1 + lineCount += 1 + + // remove non-printable characters + options.columns.forEach((_column, index) => { + if (chunk[index]) { + chunk[index] = chunk[index].replace(/[\x00-\x1F\x7F-\x9F]/g, '') + } + }) + callback(null, chunk) } }) diff --git a/test/loaders/csvLoader.test.js b/test/loaders/csvLoader.test.js index 8a3c7df..cc125b5 100644 --- a/test/loaders/csvLoader.test.js +++ b/test/loaders/csvLoader.test.js @@ -1,7 +1,6 @@ -const { CSVLoader } = require("../../src/loaders/csvloader") -const { Readable, PassThrough } = require("node:stream") -const fs = require("fs") -const { expect } = require("@jest/globals") +const fs = require('fs') +const { PassThrough } = require('stream') +const { CSVLoader } = require('../../src/loaders/csvloader') jest.mock('fs') @@ -14,7 +13,7 @@ describe('csvLoader tests', () => { ] let lineCount = 1 const testPath = "someRandomPath" - fs.__setMockFileContent(testPath, testData) + fs.__setMockFileContent(testPath, testData.join('')) const uut = CSVLoader({ path: testPath, columns: ["a","b","c"]}) uut .pump(uut) @@ -29,8 +28,8 @@ describe('csvLoader tests', () => { } })) }) + it('should count csv file lines', (done) => { - jest.setTimeout(10000) const testData = [ "column1, column2, column3\n", "1,2,3\n", @@ -53,4 +52,40 @@ describe('csvLoader tests', () => { } })) }) + + it('should remove non-printable characters from CSV data', (done) => { + const testData = [ + "column1,column2,column3\n", + "1,\x00\x1F2,3\n", + "4,5,\x7F\x9F6\n" + ] + const expectedData = [ + "column1,column2,column3\n", + "1,2,3\n", + "4,5,6\n" + ] + let lineCount = 1 + const testPath = "someRandomPath" + fs.__setMockFileContent(testPath, testData.join('')) + const uut = CSVLoader({ path: testPath, columns: ["column1", "column2", "column3"] }) + uut + .pump(uut) + .pipe(new PassThrough({ + objectMode: true, + transform(chunk, _, callback) { + try { + const received = chunk.join(",") + const expected = expectedData[lineCount].replace(/\n/,"") + expect(received).toEqual(expected) + if (lineCount === expectedData.length - 1) { + done() + } + lineCount += 1 + callback(null, chunk) + } catch (error) { + done(error) + } + } + })) + }, 10000) }) \ No newline at end of file