From 93c92969569d1f86f50ca16b01ecccdff6488602 Mon Sep 17 00:00:00 2001 From: Baris Sencan Date: Tue, 9 Mar 2021 00:59:20 +0300 Subject: [PATCH] feat(lib): Allow per-column standardisation (#202) BREAKING CHANGE: standardise does not allow boolean values anymore, mean and variance are no longer returned --- README.md | 20 ++------ jest.config.js | 8 ++-- src/loadCsv.models.ts | 4 +- src/loadCsv.ts | 28 ++++++----- src/standardise.ts | 68 +++++++++++++++++++++++++++ tests/loadCsv.test.ts | 97 +++++++++++---------------------------- tests/standardise.test.ts | 88 +++++++++++++++++++++++++++++++++++ 7 files changed, 211 insertions(+), 102 deletions(-) create mode 100644 src/standardise.ts create mode 100644 tests/standardise.test.ts diff --git a/README.md b/README.md index 296b38d..121d9fd 100644 --- a/README.md +++ b/README.md @@ -53,14 +53,7 @@ Advanced usage: ```js import loadCsv from 'tensorflow-load-csv'; -const { - features, - labels, - testFeatures, - testLabels, - mean, // tensor holding mean of features, ignores testFeatures - variance, // tensor holding variance of features, ignores testFeatures -} = loadCsv('./data.csv', { +const { features, labels, testFeatures, testLabels } = loadCsv('./data.csv', { featureColumns: ['lat', 'lng', 'height'], labelColumns: ['temperature'], mappings: { @@ -68,10 +61,10 @@ const { temperature: (f) => (f < 50 ? [1, 0] : [0, 1]), // cold or hot classification }, // Map values based on which column they are in before they are loaded into tensors. flatten: ['temperature'], // Flattens the array result of a mapping so that each member is a new column. - shuffle: true, // Pass true to shuffle with a fixed seed, or a string to use it as a seed for the shuffling. - splitTest: true, // Splits your data in half. You can also provide a certain row count for the test data, or a percentage string (e.g. 10%). - prependOnes: true, // Prepends a column of 1s to your features and testFeatures tensors, useful for linear regression. 
- standardise: true, // Calculates mean and variance for each feature column using data only in features, then standardises the values in features and testFeatures. Does not touch labels. + shuffle: true, // Pass true to shuffle with a fixed seed, or a string to use as a seed for the shuffling. + splitTest: true, // Splits your data in half. You can also provide a certain row count for the test data, or a percentage string (e.g. '10%'). + standardise: ['height'], // Calculates mean and variance for each feature column using data only in features, then standardises the values in features and testFeatures. Does not touch labels. + prependOnes: true, // Prepends a column of 1s to your features and testFeatures tensors, useful for regression problems. }); features.print(); @@ -79,7 +72,4 @@ labels.print(); testFeatures.print(); testLabels.print(); - -mean.print(); -variance.print(); ``` diff --git a/jest.config.js b/jest.config.js index 919d0ea..09cbc32 100644 --- a/jest.config.js +++ b/jest.config.js @@ -12,10 +12,10 @@ module.exports = { coveragePathIgnorePatterns: ['/node_modules/', '/tests/'], coverageThreshold: { global: { - branches: 90, - functions: 95, - lines: 95, - statements: 95, + branches: 100, + functions: 100, + lines: 100, + statements: 100, }, }, collectCoverageFrom: ['src/*.{js,ts}'], diff --git a/src/loadCsv.models.ts b/src/loadCsv.models.ts index e434c53..a255026 100644 --- a/src/loadCsv.models.ts +++ b/src/loadCsv.models.ts @@ -35,9 +35,9 @@ export interface CsvReadOptions { */ prependOnes?: boolean; /** - * If true, calculates mean and variance for each feature column using data only in features, then standardises the values in features and testFeatures. Does not touch labels. + * Calculates mean and variance for given columns using data only in features, then standardises the values in features and testFeatures. Does not touch labels. 
*/ - standardise?: boolean | string[]; + standardise?: string[]; /** * Useful for classification problems, if you have mapped a column's values to an array using `mappings`, you can choose to flatten it here so that each element becomes a new column. * diff --git a/src/loadCsv.ts b/src/loadCsv.ts index ef02703..aa7a0fe 100644 --- a/src/loadCsv.ts +++ b/src/loadCsv.ts @@ -7,6 +7,7 @@ import filterColumns from './filterColumns'; import splitTestData from './splitTestData'; import applyMappings from './applyMappings'; import shuffle from './shuffle'; +import standardise from './standardise'; const defaultShuffleSeed = 'mncv9340ur'; @@ -31,10 +32,10 @@ const loadCsv = ( featureColumns, labelColumns, mappings = {}, - shuffle: shouldShuffle = false, + shuffle: shouldShuffleOrSeed = false, splitTest, prependOnes = false, - standardise = false, + standardise: columnsToStandardise = [], flatten = [], }: CsvReadOptions ) => { @@ -54,11 +55,13 @@ const loadCsv = ( }; tables.labels.shift(); - tables.features.shift(); + const featureColumnNames = tables.features.shift() as string[]; - if (shouldShuffle) { + if (shouldShuffleOrSeed) { const seed = - typeof shouldShuffle === 'string' ? shouldShuffle : defaultShuffleSeed; + typeof shouldShuffleOrSeed === 'string' + ? 
shouldShuffleOrSeed + : defaultShuffleSeed; tables.features = shuffle(tables.features, seed); tables.labels = shuffle(tables.labels, seed); } @@ -76,11 +79,14 @@ const loadCsv = ( const labels = tf.tensor(tables.labels); const testLabels = tf.tensor(tables.testLabels); - const { mean, variance } = tf.moments(features, 0); - - if (standardise) { - features = features.sub(mean).div(variance.pow(0.5)); - testFeatures = testFeatures.sub(mean).div(variance.pow(0.5)); + if (columnsToStandardise.length > 0) { + const result = standardise( + features, + testFeatures, + featureColumnNames.map((c) => columnsToStandardise.includes(c)) + ); + features = result.features; + testFeatures = result.testFeatures; } if (prependOnes) { @@ -93,8 +99,6 @@ const loadCsv = ( labels, testFeatures, testLabels, - mean, - variance, }; }; diff --git a/src/standardise.ts b/src/standardise.ts new file mode 100644 index 0000000..4a66646 --- /dev/null +++ b/src/standardise.ts @@ -0,0 +1,68 @@ +import * as tf from '@tensorflow/tfjs'; + +const standardise = ( + features: tf.Tensor, + testFeatures: tf.Tensor, + indicesToStandardise: boolean[] +): { + features: tf.Tensor; + testFeatures: tf.Tensor; +} => { + let newFeatures, newTestFeatures; + + if (features.shape.length < 2 || testFeatures.shape.length < 2) { + throw new Error( + 'features and testFeatures must have at least two dimensions' + ); + } + + if (features.shape[1] !== testFeatures.shape[1]) { + throw new Error( + 'Length of the second dimension of features and testFeatures must be the same' + ); + } + + if (features.shape[1] !== indicesToStandardise.length) { + throw new Error( + 'Length of indicesToStandardise must match the length of the second dimension of features' + ); + } + + if (features.shape[1] === 0) { + return { features, testFeatures }; + } + + for (let i = 0; i < features.shape[1]; i++) { + let featureSlice = features.slice([0, i], [features.shape[0], 1]); + let testFeatureSlice = testFeatures.slice( + [0, i], + 
[testFeatures.shape[0], 1] + ); + if (indicesToStandardise[i]) { + const sliceMoments = tf.moments(featureSlice); + featureSlice = featureSlice + .sub(sliceMoments.mean) + .div(sliceMoments.variance.pow(0.5)); + testFeatureSlice = testFeatureSlice + .sub(sliceMoments.mean) + .div(sliceMoments.variance.pow(0.5)); + } + if (!newFeatures) { + newFeatures = featureSlice; + } else { + newFeatures = newFeatures.concat(featureSlice, 1); + } + if (!newTestFeatures) { + newTestFeatures = testFeatureSlice; + } else { + newTestFeatures = newTestFeatures.concat(testFeatureSlice, 1); + } + } + + return { + features: newFeatures as tf.Tensor, + testFeatures: newTestFeatures as tf.Tensor, + }; +}; + +export default standardise; diff --git a/tests/loadCsv.test.ts b/tests/loadCsv.test.ts index 9bbcf7f..009257a 100644 --- a/tests/loadCsv.test.ts +++ b/tests/loadCsv.test.ts @@ -46,95 +46,54 @@ test('Loading with only the required options should work', () => { ]); }); -test('Shuffling should work and preserve feature - label pairs', () => { - const { features, labels } = loadCsv(filePath, { +test('Loading with all extra options should work', () => { + const { features, labels, testFeatures, testLabels } = loadCsv(filePath, { featureColumns: ['lat', 'lng'], labelColumns: ['country'], + mappings: { + country: (name) => (name as string).toUpperCase(), + lat: (lat) => ((lat as number) > 0 ? 
[0, 1] : [1, 0]), // South or North classification + }, + flatten: ['lat'], shuffle: true, + splitTest: true, + prependOnes: true, + standardise: ['lng'], }); // @ts-ignore expect(features.arraySync()).toBeDeepCloseTo( [ - [5, 40.34], - [0.234, 1.47], - [-93.2, 103.34], - [102, -164], + [1, 0, 1, 1], + [1, 0, 1, -1], ], 3 ); - expect(labels.arraySync()).toMatchObject([ - ['Landistan'], - ['SomeCountria'], - ['SomeOtherCountria'], - ['Landotzka'], - ]); -}); - -test('Shuffling with a custom seed should work', () => { - const { features, labels } = loadCsv(filePath, { - featureColumns: ['lat', 'lng'], - labelColumns: ['country'], - shuffle: 'hello-is-it-me-you-are-looking-for', - }); + expect(labels.arraySync()).toMatchObject([['LANDISTAN'], ['SOMECOUNTRIA']]); // @ts-ignore - expect(features.arraySync()).toBeDeepCloseTo( + expect(testFeatures.arraySync()).toBeDeepCloseTo( [ - [-93.2, 103.34], - [102, -164], - [5, 40.34], - [0.234, 1.47], + [1, 1, 0, 4.241], + [1, 0, 1, -9.514], ], 3 ); - expect(labels.arraySync()).toMatchObject([ - ['SomeOtherCountria'], - ['Landotzka'], - ['Landistan'], - ['SomeCountria'], + expect(testLabels.arraySync()).toMatchObject([ + ['SOMEOTHERCOUNTRIA'], + ['LANDOTZKA'], ]); }); -test('Loading with all extra options other than shuffle as true should work', () => { - const { - features, - labels, - testFeatures, - testLabels, - mean, - variance, - } = loadCsv(filePath, { +test('Loading with custom seed should use the custom seed', () => { + const { features } = loadCsv(filePath, { featureColumns: ['lat', 'lng'], labelColumns: ['country'], - mappings: { - country: (name) => (name as string).toUpperCase(), - }, - splitTest: true, - prependOnes: true, - standardise: true, + shuffle: true, + }); + const { features: featuresCustom } = loadCsv(filePath, { + featureColumns: ['lat', 'lng'], + labelColumns: ['country'], + shuffle: 'sdhjhdf', }); // @ts-ignore - expect(features.arraySync()).toBeDeepCloseTo( - [ - [1, 1, -1], - [1, -1, 1], - ], - 3 - 
); - expect(labels.arraySync()).toMatchObject([ - ['SOMECOUNTRIA'], - ['SOMEOTHERCOUNTRIA'], - ]); - // @ts-ignore - expect(testFeatures.arraySync()).toBeDeepCloseTo( - [ - [1, 1.102, -0.236], - [1, 3.178, -4.248], - ], - 3 - ); - expect(testLabels.arraySync()).toMatchObject([['LANDISTAN'], ['LANDOTZKA']]); - // @ts-ignore - expect(mean.arraySync()).toBeDeepCloseTo([-46.482, 52.404], 3); - // @ts-ignore - expect(variance.arraySync()).toBeDeepCloseTo([2182.478, 2594.374], 3); + expect(features).not.toBeDeepCloseTo(featuresCustom, 1); }); diff --git a/tests/standardise.test.ts b/tests/standardise.test.ts new file mode 100644 index 0000000..bac18d9 --- /dev/null +++ b/tests/standardise.test.ts @@ -0,0 +1,88 @@ +import * as tf from '@tensorflow/tfjs'; + +import standardise from '../src/standardise'; + +test('it should throw an error when features does not have more than one dimension', () => { + expect(() => + standardise(tf.tensor([1, 2]), tf.tensor([[1], [2]]), [true]) + ).toThrowError('features and testFeatures must have at least two dimensions'); +}); + +test('it should throw an error when testFeatures does not have more than one dimension', () => { + expect(() => + standardise(tf.tensor([[1], [2]]), tf.tensor([1, 2]), [true]) + ).toThrowError('features and testFeatures must have at least two dimensions'); +}); + +test('it should throw an error when features and testFeatures have different lengths for their second dimensions', () => { + expect(() => + standardise( + tf.tensor([ + [1, 2], + [1, 2], + ]), + tf.tensor([[1], [2]]), + [true, true] + ) + ).toThrowError( + 'Length of the second dimension of features and testFeatures must be the same' + ); +}); + +test('it should throw an error when length of indicesToStandardise does not match the length of the second dimension of features', () => { + expect(() => + standardise( + tf.tensor([ + [1, 2], + [1, 2], + ]), + tf.tensor([ + [3, 4], + [3, 4], + ]), + [true] + ) + ).toThrowError( + 'Length of indicesToStandardise 
must match the length of the second dimension of features' + ); +}); + +test('it should return the exact same tensors as the inputs when there are 0 columns', () => { + const features = tf.tensor([[], []]); + const testFeatures = tf.tensor([[], []]); + const result = standardise(features, testFeatures, []); + expect(result.features).toBe(features); + expect(result.testFeatures).toBe(testFeatures); +}); + +test('it should standardise only the requested second dimension slices', () => { + const result = standardise( + tf.tensor([ + [1, 2, 3], + [0, 0, 4], + ]), + tf.tensor([ + [0, 1, 3], + [0, 0, 4], + ]), + [true, true, false] + ); + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + expect(result.features.arraySync()).toBeDeepCloseTo( + [ + [1, 1, 3], + [-1, -1, 4], + ], + 3 + ); + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + expect(result.testFeatures.arraySync()).toBeDeepCloseTo( + [ + [-1, 0, 3], + [-1, -1, 4], + ], + 3 + ); +});