diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c906ec --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +node_modules +yarn.lock +yarn-error.log diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..48a04af --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Greene Laboratory + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index ee9e046..f4117df 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,239 @@ -# hclust -Agglomerative hierarchical clustering in JavaScript +### hclust +[Agglomerative hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) in JavaScript + +Inspired by the MIT-licensed [hcluster.js](https://github.com/cmpolis/hcluster.js) by [@ChrisPolis](https://twitter.com/chrispolis) + +--- + +### Usage + +#### Browser + +```html + + +``` + +#### Node + +`npm install @greenelab/hclust.git` + +or + +`yarn add @greenelab/hclust.git` + +then + +```javascript +import { clusterData } from 'hclust'; +import { euclideanDistance } from 'hclust'; +import { avgDistance } from 'hclust'; +``` + +--- + +### `clusterData({ data, key, distance, linkage, onProgress })` + +#### Parameters + +**`data`** + +The data you want to cluster, in the format: + +```javascript +[ + ... + [ ... 1, 2, 3 ...], + [ ... 1, 2, 3 ...], + [ ... 1, 2, 3 ...], + ... +] +``` + +or if `key` parameter is specified: + +```javascript +[ + ... + { someKey: [ ... 1, 2, 3 ...] }, + { someKey: [ ... 1, 2, 3 ...] }, + { someKey: [ ... 1, 2, 3 ...] }, + ... +] +``` + +The entries in the outer array can be considered the `rows` and the entries within each `row` array can be considered the `cols`. +Each `row` should have the same number of `cols`. + +*Default value:* `[]` + +**`key`** + +A `string` key to specify which values to extract from the `data` array. +If omitted, `data` is assumed to be an array of arrays. +If specified, `data` is assumed to be array of objects, each with a key that contains the values for that `row`. + +*Default value:* `''` + +**`distance`** + +A function to calculate the distance between two equal-dimension vectors, used in calculating the distance matrix, in the format: + +```javascript +function (arrayA, arrayB) { return someNumber; } +``` + +The function receives two equal-length arrays of numbers (ints or floats) and should return a number (int or float). + +*Default value:* `euclideanDistance` from this `hclust` package + +**`linkage`** + +A function to calculate the distance between pairs of clusters based on a distance matrix, used in determining linkage criterion, in the format: + +```javascript +function (arrayA, arrayB, distanceMatrix) { return someNumber; } +``` + +The function receives two sets of indexes and the distance matrix computed between each datum and every other datum. +The function should return a number (int or float) + +*Default value:* `averageDistance` from this `hclust` package + +**`onProgress`** + +A function that is called several times throughout clustering, and is provided the current progress through the clustering, in the format: + +```javascript +function (progress) { } +``` + +The function receives the percent progress between `0` and `1`. + +*Default value:* an internal function that `console.log`'s the progress + +**Note:** [`postMessage`](https://developer.mozilla.org/en-US/docs/Web/API/Worker/postMessage) is called in the same places as `onProgress`, if the script is running as a [web worker](https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Using_web_workers). + +#### Returns + +```javascript +const { clusters, distances, order, clustersGivenK } = clusterData(...); +``` + +**`clusters`** + +The resulting cluster tree, in the format: + +```javascript +{ + indexes: [ ... Number, Number, Number ... ], + height: Number, + children: [ ... {}, {}, {} ... ] +} + +``` + +**`distances`** + +The computed distance matrix, in the format: + +```javascript +[ + ... + [ ... Number, Number, Number ...], + [ ... Number, Number, Number ...], + [ ... Number, Number, Number ...] + ... +] +``` + +**`order`** + +The new order of the data, in terms of original data array indexes, in the format: + +```javascript +[ ... Number, Number, Number ... ] +``` + +Equivalent to `clusters.indexes` and `clustersGivenK[1]`. + +**`clustersGivenK`** + +A list of tree slices in terms of original data array indexes, where index = K, in the format: + +```javascript +[ + [], // K = 0 + [ [] ], // K = 1 + [ [], [] ], // K = 2 + [ [], [], [] ], // K = 3 + [ [], [], [], [] ], // K = 4 + [ [], [], [], [], [] ] // K = 5 + ... +] +``` + +--- + +### `euclideanDistance(arrayA, arrayB)` + +Calculates the [euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) between two equal-dimension vectors. + +--- + +### `avgDistance(arrayA, arrayB, distanceMatrix)` + +Calculates the average distance between pairs of clusters based on a distance matrix. + +--- + +### Comparison with [hcluster.js](https://github.com/cmpolis/hcluster.js) + +- This package does not duplicate items from the original dataset in the results. +Results are given in terms of indexes, either with respect to the original dataset or the distance matrix. +- This package uses more modern JavaScript syntaxes and practices to make the code cleaner and simpler. +- This package provides an `onProgress` callback and calls `postMessage` for use in [web workers](https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Using_web_workers). +Because clustering can take a long time with large data sets, +you may want to run it as a web worker so the browser doesn't freeze for a long time, and you may need a callback so you can give users visual feedback on its progress. +- This package makes some performance optimizations, such as removing unnecessary loops through big sets. +It has been tested on modern OS's (Windows, Mac, Linux, iOS, Android), devices (desktop, laptop, mobile), browsers (Chrome, Firefox, Safari), contexts (main thread, web worker), and hosting locations (local, online). +The results vary widely, and are likely sensitive to the specifics of hardware, cpu cores, browser implementation, etc. +But in general, this package is more performant than `hcluster.js`, to varying degrees, and is always at least as performant on average. +Chrome seems to see the most performance gains (up to 10x, when the row number is high), while Firefox seems to see no gains. +- This package does not touch the input data object, whereas the `hcluster.js` package does. +D3 often expects you to mutate data objects directly, which is now typically considered bad practice in JavaScript. +Instead, this package returns the useful data from the clustering algorithm (including the distance matrix), and allows you to mutate or not mutate the data object depending on your needs. +In the future, a simple option could be added to instruct the algorithm to mutate the data object, if users can provide good use cases for what information is needed for constructing various D3 visualizations. +- This package leaves out the `minDistance` or `maxDistance` functions that are built into `hcluster.js`, because -- per [this reference](https://onlinelibrary.wiley.com/doi/abs/10.1002/9780470316801.ch5) -- they are not as effective as `averageDistance`. + +--- + +### Making changes to the library + +1. [Install Node](https://nodejs.org/en/download/) +2. [Install Yarn](https://classic.yarnpkg.com/en/docs/install) +3. Clone this repo and navigate to it in your command terminal +4. Run `yarn install` to install this package's dependencies +5. Make desired changes to `./src/hclust.js` +6. Run `npm run test` to automatically rebuild the library and run test suite +7. Run `npm run build` to just rebuild the library, and output the compiled contents to `./build/hclust.min.js` +8. Commit changes to repo if necessary. *Make sure to run the build command before committing; it won't happen automatically.* + +--- + +### Similar libraries + +[cmpolis/hcluster.js](https://github.com/cmpolis/hcluster.js) +[harthur/clustering](https://github.com/harthur/clustering) +[mljs/hclust](https://github.com/mljs/hclust) +[math-utils/hierarchical-clustering](https://github.com/math-utils/hierarchical-clustering) + +--- + +### Further reading + +The [AGNES](https://onlinelibrary.wiley.com/doi/abs/10.1002/9780470316801.ch5) (AGglomerative NESting) method; continuously merge nodes that have the least dissimilarity. \ No newline at end of file diff --git a/babel.config.js b/babel.config.js new file mode 100644 index 0000000..f037a1a --- /dev/null +++ b/babel.config.js @@ -0,0 +1,12 @@ +module.exports = { + presets: [ + [ + '@babel/preset-env', + { + targets: { + node: 'current', + }, + }, + ], + ], +}; diff --git a/build/hclust.min.js b/build/hclust.min.js new file mode 100644 index 0000000..371d003 --- /dev/null +++ b/build/hclust.min.js @@ -0,0 +1,120 @@ +"use strict"; + +Object.defineProperty(exports, "__esModule", { + value: true +}); +exports.clusterData = exports.averageDistance = exports.euclideanDistance = void 0; + +// get euclidean distance between two equal-dimension vectors +const euclideanDistance = (a, b) => { + const size = Math.min(a.length, b.length); + let sum = 0; + + for (let index = 0; index < size; index++) sum += (a[index] - b[index]) * (a[index] - b[index]); + + return Math.sqrt(sum); +}; // get average distance between sets of indexes, given distance matrix + + +exports.euclideanDistance = euclideanDistance; + +const averageDistance = (setA, setB, distances) => { + let distance = 0; + + for (const a of setA) { + for (const b of setB) distance += distances[a][b]; + } + + return distance / setA.length / setB.length; +}; // update progress by calling user onProgress and postMessage for web workers + + +exports.averageDistance = averageDistance; + +const updateProgress = (stepNumber, stepProgress, onProgress) => { + // currently only two distinct steps: computing distance matrix and clustering + const progress = stepNumber / 2 + stepProgress / 2; // if onProgress is defined and is a function, call onProgress + + if (typeof onProgress === 'function') onProgress(progress); // if this script is being run as a web worker, call postMessage + + if (typeof WorkerGlobalScope !== 'undefined' && self instanceof WorkerGlobalScope) postMessage(progress); +}; // default onProgress function. console logs progress + + +const logProgress = progress => console.log('Clustering: ', (progress * 100).toFixed(1) + '%'); // the main clustering function + + +const clusterData = ({ + data = [], + key = '', + distance = euclideanDistance, + linkage = averageDistance, + onProgress = logProgress +}) => { + // extract values from specified key + if (key) data = data.map(datum => datum[key]); // compute distance between each data point and every other data point + // N x N matrix where N = data.length + + const distances = data.map((datum, index) => { + updateProgress(0, index / (data.length - 1), onProgress); // get distance between datum and other datum + + return data.map(otherDatum => distance(datum, otherDatum)); + }); // initialize clusters to match data + + const clusters = data.map((datum, index) => ({ + height: 0, + indexes: [Number(index)] + })); // keep track of all tree slices + + let clustersGivenK = []; // iterate through data + + for (let iteration = 0; iteration < data.length; iteration++) { + updateProgress(1, (iteration + 1) / data.length, onProgress); // add current tree slice + + clustersGivenK.push(clusters.map(cluster => cluster.indexes)); // dont find clusters to merge when only one cluster left + + if (iteration >= data.length - 1) break; // initialize smallest distance + + let nearestDistance = Infinity; + let nearestRow = 0; + let nearestCol = 0; // upper triangular matrix of clusters + + for (let row = 0; row < clusters.length; row++) { + for (let col = row + 1; col < clusters.length; col++) { + // calculate distance between clusters + const distance = linkage(clusters[row].indexes, clusters[col].indexes, distances); // update smallest distance + + if (distance < nearestDistance) { + nearestDistance = distance; + nearestRow = row; + nearestCol = col; + } + } + } // merge nearestRow and nearestCol clusters together + + + const newCluster = { + indexes: [...clusters[nearestRow].indexes, ...clusters[nearestCol].indexes], + height: nearestDistance, + children: [clusters[nearestRow], clusters[nearestCol]] + }; // remove nearestRow and nearestCol clusters + // splice higher index first so it doesn't affect second splice + + clusters.splice(Math.max(nearestRow, nearestCol), 1); + clusters.splice(Math.min(nearestRow, nearestCol), 1); // add new merged cluster + + clusters.push(newCluster); + } // assemble full list of tree slices into array where index = k + + + clustersGivenK = [[], ...clustersGivenK.reverse()]; // return useful information + + return { + clusters: clusters[0], + distances: distances, + order: clusters[0].indexes, + clustersGivenK: clustersGivenK + }; +}; + +exports.clusterData = clusterData; diff --git a/package.json b/package.json new file mode 100644 index 0000000..9392ea7 --- /dev/null +++ b/package.json @@ -0,0 +1,32 @@ +{ + "name": "@greenelab/hclust", + "version": "0.0.0-dev", + "description": "Agglomerative hierarchical clustering in JavaScript", + "keywords": [ + "hierarchy", + "hierarchical", + "cluster", + "clustering", + "agglomerative", + "data", + "tree" + ], + "author": "Vincent Rubinetti", + "license": "MIT", + "repository": "git+https://github.com/greenelab/hclust.git", + "main": "./build/hclust.min.js", + "module": "./build/hclust.min.js", + "scripts": { + "test": "bash ./scripts/build.sh && bash ./scripts/test.sh", + "build": "bash ./scripts/build.sh" + }, + "devDependencies": { + "@babel/cli": "^7.8.4", + "@babel/core": "^7.9.0", + "@babel/preset-env": "^7.9.0", + "babel-jest": "^25.1.0", + "babel-preset-minify": "^0.5.1", + "jest": "^25.1.0" + }, + "browserslist": "> 0.1%, not dead" +} diff --git a/scripts/build.sh b/scripts/build.sh new file mode 100644 index 0000000..9b91876 --- /dev/null +++ b/scripts/build.sh @@ -0,0 +1,3 @@ +rm -rf build +mkdir build +npx babel ./src/hclust.js --out-file ./build/hclust.min.js diff --git a/scripts/test.sh b/scripts/test.sh new file mode 100644 index 0000000..1eee2ef --- /dev/null +++ b/scripts/test.sh @@ -0,0 +1 @@ +jest ./test/test.js --notify diff --git a/src/hclust.js b/src/hclust.js new file mode 100644 index 0000000..6788ac8 --- /dev/null +++ b/src/hclust.js @@ -0,0 +1,135 @@ +// get euclidean distance between two equal-dimension vectors +export const euclideanDistance = (a, b) => { + const size = Math.min(a.length, b.length); + let sum = 0; + for (let index = 0; index < size; index++) + sum += (a[index] - b[index]) * (a[index] - b[index]); + return Math.sqrt(sum); +}; + +// get average distance between sets of indexes, given distance matrix +export const averageDistance = (setA, setB, distances) => { + let distance = 0; + for (const a of setA) { + for (const b of setB) + distance += distances[a][b]; + } + + return distance / setA.length / setB.length; +}; + +// update progress by calling user onProgress and postMessage for web workers +const updateProgress = (stepNumber, stepProgress, onProgress) => { + // currently only two distinct steps: computing distance matrix and clustering + const progress = stepNumber / 2 + stepProgress / 2; + + // if onProgress is defined and is a function, call onProgress + if (typeof onProgress === 'function') + onProgress(progress); + + // if this script is being run as a web worker, call postMessage + if ( + typeof WorkerGlobalScope !== 'undefined' && + self instanceof WorkerGlobalScope + ) + postMessage(progress); +}; + +// default onProgress function. console logs progress +const logProgress = (progress) => + console.log('Clustering: ', (progress * 100).toFixed(1) + '%'); + +// the main clustering function +export const clusterData = ({ + data = [], + key = '', + distance = euclideanDistance, + linkage = averageDistance, + onProgress = logProgress +}) => { + // extract values from specified key + if (key) + data = data.map((datum) => datum[key]); + + // compute distance between each data point and every other data point + // N x N matrix where N = data.length + const distances = data.map((datum, index) => { + updateProgress(0, index / (data.length - 1), onProgress); + + // get distance between datum and other datum + return data.map((otherDatum) => distance(datum, otherDatum)); + }); + + // initialize clusters to match data + const clusters = data.map((datum, index) => ({ + height: 0, + indexes: [Number(index)] + })); + + // keep track of all tree slices + let clustersGivenK = []; + + // iterate through data + for (let iteration = 0; iteration < data.length; iteration++) { + updateProgress(1, (iteration + 1) / data.length, onProgress); + + // add current tree slice + clustersGivenK.push(clusters.map((cluster) => cluster.indexes)); + + // dont find clusters to merge when only one cluster left + if (iteration >= data.length - 1) + break; + + // initialize smallest distance + let nearestDistance = Infinity; + let nearestRow = 0; + let nearestCol = 0; + + // upper triangular matrix of clusters + for (let row = 0; row < clusters.length; row++) { + for (let col = row + 1; col < clusters.length; col++) { + // calculate distance between clusters + const distance = linkage( + clusters[row].indexes, + clusters[col].indexes, + distances + ); + // update smallest distance + if (distance < nearestDistance) { + nearestDistance = distance; + nearestRow = row; + nearestCol = col; + } + } + } + + // merge nearestRow and nearestCol clusters together + const newCluster = { + indexes: [ + ...clusters[nearestRow].indexes, + ...clusters[nearestCol].indexes + ], + height: nearestDistance, + children: [clusters[nearestRow], clusters[nearestCol]] + }; + + // remove nearestRow and nearestCol clusters + // splice higher index first so it doesn't affect second splice + clusters.splice(Math.max(nearestRow, nearestCol), 1); + clusters.splice(Math.min(nearestRow, nearestCol), 1); + + // add new merged cluster + clusters.push(newCluster); + } + + // assemble full list of tree slices into array where index = k + clustersGivenK = [[], ...clustersGivenK.reverse()]; + + // return useful information + return { + clusters: clusters[0], + distances: distances, + order: clusters[0].indexes, + clustersGivenK: clustersGivenK + }; +}; diff --git a/test/chrispolis.hcluster.min.js b/test/chrispolis.hcluster.min.js new file mode 100644 index 0000000..7b68ba9 --- /dev/null +++ b/test/chrispolis.hcluster.min.js @@ -0,0 +1 @@ +(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.hcluster=f()}})(function(){var define,module,exports;return function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;odata.length)throw new Error("n must be less than the size of the dataset");return clustersGivenK[data.length-n].map(function(indexes){return indexes.map(function(ndx){return data[ndx]})})};clust._squareMatrixPairs=function(n){var pairs=[];for(var row=0;row { + expect(hcluster).toBeDefined(); +}); + +test('can import hclust', () => { + expect(clusterData).toBeDefined(); +}); + +test('test dataset 1', () => { + // get received results from this package + const resultsB = clusterData({ + data: dataset1, + key: 'value', + onProgress: null + }); + + // transform order to be in terms of sample name/id + const orderB = resultsB.order + .map((index) => dataset1[index]) + .map((node) => node.sample); + // transform slice to be in terms of sample name/id + const sliceB = resultsB.clustersGivenK[10].map((cluster) => + cluster.map((index) => dataset1[index]).map((node) => node.sample) + ); + + // get "expected" results from hcluster.js + const resultsA = hcluster() + .distance('euclidean') + .linkage('avg') + .posKey('value') + .data(dataset1); + + // transform order to be in terms of sample name/id + const orderA = resultsA.orderedNodes().map((node) => node.sample); + // transform slice to be in terms of sample name/id + const sliceA = resultsA + .getClusters(10) + .map((cluster) => cluster.map((node) => node.sample)); + + console.log('Expected order:', orderA, 'Received order:', orderB); + console.log('Expected slice:', sliceA, 'Received slice:', sliceB); + + expect(orderB).toStrictEqual(orderA); + expect(sliceB).toStrictEqual(sliceA); +}); + +test('test dataset 2', () => { + // get received results from this package + const resultsB = clusterData({ + data: dataset2, + key: 'value', + onProgress: null + }); + + // transform order to be in terms of signature name/id + const orderB = resultsB.order + .map((index) => dataset2[index]) + .map((node) => node.signature); + // transform slice to be in terms of signature name/id + const sliceB = resultsB.clustersGivenK[10].map((cluster) => + cluster.map((index) => dataset2[index]).map((node) => node.signature) + ); + + // get "expected" results from hcluster.js + const resultsA = hcluster() + .distance('euclidean') + .linkage('avg') + .posKey('value') + .data(dataset2); + + // transform order to be in terms of signature name/id + const orderA = resultsA.orderedNodes().map((node) => node.signature); + // transform slice to be in terms of signature name/id + const sliceA = resultsA + .getClusters(10) + .map((cluster) => cluster.map((node) => node.signature)); + + console.log('Expected order:', orderA, 'Received order:', orderB); + console.log('Expected slice:', sliceA, 'Received slice:', sliceB); + + expect(orderB).toStrictEqual(orderA); + expect(sliceB).toStrictEqual(sliceA); +});