Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support Apache Arrow as a normalized data representation #2115

Merged
merged 22 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
"@types/node": "^20.5.0",
"@typescript-eslint/eslint-plugin": "^7.2.0",
"@typescript-eslint/parser": "^7.2.0",
"apache-arrow": "^16.0.2",
"c8": "^9.1.0",
"canvas": "^2.0.0",
"d3-geo-projection": "^4.0.0",
Expand Down
7 changes: 6 additions & 1 deletion src/interactions/pointer.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import {pointer as pointof} from "d3";
import {composeRender} from "../mark.js";
import {isArray} from "../options.js";
import {applyFrameAnchor} from "../style.js";

const states = new WeakMap();
Expand Down Expand Up @@ -126,7 +127,11 @@ function pointerK(kx, ky, {x, y, px, py, maxRadius = 40, channels, render, ...op

// Dispatch the value. When simultaneously exiting this facet and
// entering a new one, prioritize the entering facet.
if (!(i == null && facetState?.size > 1)) context.dispatchValue(i == null ? null : data[i]);
if (!(i == null && facetState?.size > 1)) {
const value = i == null ? null : isArray(data) ? data[i] : data.get(i);
context.dispatchValue(value);
}

return r;
}

Expand Down
1 change: 1 addition & 0 deletions src/mark.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ export type TipPointer = "x" | "y" | "xy";
*
* - an array, typed array, or other iterable
* - an object with a length property and indexed values
* - an Apache Arrow Table
*/
export type Data = Iterable<any> | ArrayLike<any>;
mbostock marked this conversation as resolved.
Show resolved Hide resolved

Expand Down
6 changes: 3 additions & 3 deletions src/mark.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import {channelDomain, createChannels, valueObject} from "./channel.js";
import {defined} from "./defined.js";
import {maybeFacetAnchor} from "./facet.js";
import {maybeClip, maybeNamed, maybeValue} from "./options.js";
import {arrayify, isDomainSort, isObject, isOptions, keyword, range, singleton} from "./options.js";
import {dataify, isDomainSort, isObject, isOptions, keyword, range, singleton} from "./options.js";
import {project} from "./projection.js";
import {maybeClassName, styles} from "./style.js";
import {basic, initializer} from "./transforms/basic.js";
Expand Down Expand Up @@ -89,10 +89,10 @@ export class Mark {
}
}
initialize(facets, facetChannels, plotOptions) {
let data = arrayify(this.data);
let data = dataify(this.data);
if (facets === undefined && data != null) facets = [range(data)];
const originalFacets = facets;
if (this.transform != null) ({facets, data} = this.transform(data, facets, plotOptions)), (data = arrayify(data));
if (this.transform != null) ({facets, data} = this.transform(data, facets, plotOptions)), (data = dataify(data));
if (facets !== undefined) facets.original = originalFacets; // needed to read facetChannels
const channels = createChannels(this.channels, data);
if (this.sort != null) channelDomain(data, facets, channels, facetChannels, this.sort); // mutates facetChannels!
Expand Down
92 changes: 75 additions & 17 deletions src/options.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,36 @@ import {timeInterval, utcInterval} from "./time.js";
export const TypedArray = Object.getPrototypeOf(Uint8Array);
const objectToString = Object.prototype.toString;

// Returns true if the given value is a plain Array or any typed array
// (anything inheriting from TypedArray); false otherwise.
export function isArray(value) {
  if (value instanceof Array) return true;
  return value instanceof TypedArray;
}

// Returns true if the given value is a typed array that is not a BigInt typed
// array (as determined by isBigIntArray).
function isNumberArray(value) {
  if (!(value instanceof TypedArray)) return false;
  return !isBigIntArray(value);
}

// Returns true if the given type is a typed array constructor other than the
// BigInt ones (as determined by isBigIntType). A null or undefined type yields
// false.
function isNumberType(type) {
  const isTypedArrayType = type?.prototype instanceof TypedArray;
  return isTypedArrayType ? !isBigIntType(type) : false;
}

// Returns true if the given value is a BigInt64Array or a BigUint64Array.
function isBigIntArray(value) {
  return [BigInt64Array, BigUint64Array].some((type) => value instanceof type);
}

// Returns true if the given type is exactly the BigInt64Array or the
// BigUint64Array constructor.
function isBigIntType(type) {
  switch (type) {
    case BigInt64Array:
    case BigUint64Array:
      return true;
    default:
      return false;
  }
}

// If a reindex is attached to the data, channel values expressed as arrays will
// be reindexed when the channels are instantiated. See exclusiveFacets.
export const reindex = Symbol("reindex");

export function valueof(data, value, type) {
const valueType = typeof value;
return valueType === "string"
? maybeTypedMap(data, field(value), type)
? isArrowTable(data)
? maybeTypedArrowify(data.getChild(value), type)
: maybeTypedMap(data, field(value), type)
: valueType === "function"
? maybeTypedMap(data, value, type)
: valueType === "number" || value instanceof Date || valueType === "boolean"
Expand All @@ -29,21 +51,25 @@ function maybeTake(values, index) {
}

function maybeTypedMap(data, f, type) {
return map(data, type?.prototype instanceof TypedArray ? floater(f) : f, type);
return map(data, isNumberType(type) ? (d, i) => coerceNumber(f(d, i)) : f, type); // allow conversion from BigInt
mbostock marked this conversation as resolved.
Show resolved Hide resolved
}

function maybeTypedArrayify(data, type) {
return type === undefined
? arrayify(data) // preserve undefined type
: isArrowVector(data)
? maybeTypedArrowify(data, type)
: data instanceof type
? data
: type.prototype instanceof TypedArray && !(data instanceof TypedArray)
? type.from(data, coerceNumber)
: type.from(data);
: type.from(data, isNumberType(type) && !isNumberArray(data) ? coerceNumber : undefined);
}

function floater(f) {
return (d, i) => coerceNumber(f(d, i));
// Converts an Apache Arrow vector to an array of the requested type. A null or
// undefined vector is returned as-is. When no specific type (or a plain Array)
// is requested and the vector has a supported date type, the values are
// coerced to dates; otherwise the vector's array is handed to
// maybeTypedArrayify.
function maybeTypedArrowify(vector, type) {
  if (vector == null) return vector;
  const wantsGenericArray = type === undefined || type === Array;
  if (wantsGenericArray && isArrowDateType(vector.type)) return coerceDates(vector.toArray());
  return maybeTypedArrayify(vector.toArray(), type);
}

export const singleton = [null]; // for data-less decoration marks, e.g. frame
Expand All @@ -70,7 +96,7 @@ export function percentile(reduce) {

// If the values are specified as a typed array of numbers (not BigInt), no coercion is required.
export function coerceNumbers(values) {
return values instanceof TypedArray ? values : map(values, coerceNumber, Float64Array);
return isNumberArray(values) ? values : map(values, coerceNumber, Float64Array);
Copy link
Member Author

@mbostock mbostock Jul 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(This fixes coerceNumbers for BigInt arrays which is important because Apache Arrow uses BigInt64Array for dates… somewhat unnecessarily in my opinion when Float64Array would work better in practice.)

}

// Unlike Mark’s number, here we want to convert null and undefined to NaN since
Expand All @@ -95,7 +121,7 @@ export function coerceDate(x) {
? x
: typeof x === "string"
? isoParse(x)
: x == null || isNaN((x = +x))
: x == null || isNaN((x = Number(x))) // allow conversion from BigInt
? undefined
: new Date(x);
}
Expand Down Expand Up @@ -130,9 +156,15 @@ export function keyword(input, name, allowed) {
return i;
}

// Normalizes mark data: an Apache Arrow Table is returned untouched, while
// anything else is passed through arrayify.
export function dataify(data) {
  if (isArrowTable(data)) return data;
  return arrayify(data);
}

// Promotes the specified data to an array as needed.
export function arrayify(values) {
if (values == null || values instanceof Array || values instanceof TypedArray) return values;
if (values == null || isArray(values)) return values;
if (isArrowVector(values)) return maybeTypedArrowify(values);
switch (values.type) {
case "FeatureCollection":
return values.features;
Expand Down Expand Up @@ -233,22 +265,21 @@ export function maybeZ({z, fill, stroke} = {}) {
return z;
}

// Returns the number of elements in the given data: the length of an array or
// typed array, and otherwise the numRows property (undefined if the data is
// null or undefined).
export function lengthof(data) {
  if (isArray(data)) return data.length;
  return data?.numRows;
}

// Returns a Uint32Array with elements [0, 1, 2, … n - 1], where n is the length of the data (for an Apache Arrow Table, its number of rows).
export function range(data) {
const n = data.length;
const n = lengthof(data);
const r = new Uint32Array(n);
for (let i = 0; i < n; ++i) r[i] = i;
return r;
}

// Returns a filtered range of data given the test function.
export function where(data, test) {
return range(data).filter((i) => test(data[i], i, data));
}
mbostock marked this conversation as resolved.
Show resolved Hide resolved

// Returns an array [values[index[0]], values[index[1]], …].
export function take(values, index) {
return map(index, (i) => values[i], values.constructor);
return isArray(values) ? map(index, (i) => values[i], values.constructor) : map(index, (i) => values.at(i));
}

// If f does not take exactly one argument, wraps it in a function that uses take.
Expand Down Expand Up @@ -575,3 +606,30 @@ export function maybeClip(clip) {
else if (clip != null) clip = keyword(clip, "clip", ["frame", "sphere"]);
return clip;
}

// Duck-type test for an Apache Arrow Table: the value must expose getChild and
// toArray methods and a schema whose fields property is an array. As with the
// equivalent && chain, a falsy value or falsy schema is returned as-is, so the
// result is truthy or falsy rather than strictly boolean.
// https://github.com/observablehq/stdlib/blob/746ca2e69135df6178e4f3a17244def35d8d6b20/src/arrow.js#L4C1-L17C1
function isArrowTable(value) {
  if (!value) return value;
  if (typeof value.getChild !== "function" || typeof value.toArray !== "function") return false;
  const {schema} = value;
  if (!schema) return schema;
  return Array.isArray(schema.fields);
}

// Duck-type test for an Apache Arrow vector: a value with a toArray method and
// a truthy type property. Returns the falsy value itself, false, or the
// value's type, so the result is truthy or falsy rather than strictly boolean.
function isArrowVector(value) {
  if (!value) return value;
  return typeof value.toArray === "function" ? value.type : false;
}

// Apache Arrow now represents dates as numbers. Implicit coercion to
// JavaScript Date objects is currently only supported when those numbers are
// milliseconds since Unix epoch, so this test accepts only date (typeId 8) and
// timestamp (typeId 10) types whose unit is millisecond (unit 1). A falsy type
// is returned as-is.
function isArrowDateType(type) {
  if (!type) return type;
  const {typeId} = type;
  const isDateLike = typeId === 8 || typeId === 10; // date or timestamp
  return isDateLike && type.unit === 1; // millisecond
}
8 changes: 4 additions & 4 deletions src/plot.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import {axisFx, axisFy, axisX, axisY, gridFx, gridFy, gridX, gridY} from "./mark
import {frame} from "./marks/frame.js";
import {tip} from "./marks/tip.js";
import {isColor, isIterable, isNone, isScaleOptions} from "./options.js";
import {arrayify, map, yes, maybeIntervalTransform, subarray} from "./options.js";
import {dataify, lengthof, map, yes, maybeIntervalTransform, subarray} from "./options.js";
import {createProjection, getGeometryChannels, hasProjection} from "./projection.js";
import {createScales, createScaleFunctions, autoScaleRange, exposeScales} from "./scales.js";
import {innerDimensions, outerDimensions} from "./scales.js";
Expand Down Expand Up @@ -459,7 +459,7 @@ function maybeTopFacet(facet, options) {
if (facet == null) return;
const {x, y} = facet;
if (x == null && y == null) return;
const data = arrayify(facet.data);
const data = dataify(facet.data);
if (data == null) throw new Error("missing facet data");
const channels = {};
if (x != null) channels.fx = createChannel(data, {value: x, scale: "fx"});
Expand All @@ -478,7 +478,7 @@ function maybeMarkFacet(mark, topFacetState, options) {
// here with maybeTopFacet that we could reduce.
const {fx, fy} = mark;
if (fx != null || fy != null) {
const data = arrayify(mark.data ?? fx ?? fy);
const data = dataify(mark.data ?? fx ?? fy);
if (data === undefined) throw new Error(`missing facet data in ${mark.ariaLabel}`);
if (data === null) return; // ignore channel definitions if no data is provided TODO this right?
const channels = {};
Expand All @@ -500,7 +500,7 @@ function maybeMarkFacet(mark, topFacetState, options) {
if (
data.length > 0 &&
(groups.size > 1 || (groups.size === 1 && channels.fx && channels.fy && [...groups][0][1].size > 1)) &&
arrayify(mark.data)?.length === data.length
lengthof(dataify(mark.data)) === lengthof(data)
) {
warn(
`Warning: the ${mark.ariaLabel} mark appears to use faceted data, but isn’t faceted. The mark data has the same length as the facet data and the mark facet option is "auto", but the mark data and facet data are distinct. If this mark should be faceted, set the mark facet option to true; otherwise, suppress this warning by setting the mark facet option to false.`
Expand Down
9 changes: 6 additions & 3 deletions src/transforms/basic.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import {randomLcg} from "d3";
import {ascendingDefined, descendingDefined} from "../defined.js";
import {arrayify, isDomainSort, isOptions, maybeValue, valueof} from "../options.js";
import {isArray, isDomainSort, isOptions} from "../options.js";
import {dataify, maybeValue, valueof} from "../options.js";

export function basic({filter: f1, sort: s1, reverse: r1, transform: t1, initializer: i1, ...options} = {}, transform) {
// If both t1 and t2 are defined, returns a composite transform that first
Expand Down Expand Up @@ -40,7 +41,7 @@ function composeTransform(t1, t2) {
if (t2 == null) return t1 === null ? undefined : t1;
return function (data, facets, plotOptions) {
({data, facets} = t1.call(this, data, facets, plotOptions));
return t2.call(this, arrayify(data), facets, plotOptions);
return t2.call(this, dataify(data), facets, plotOptions);
};
}

Expand Down Expand Up @@ -101,7 +102,9 @@ function sortTransform(value) {

function sortData(compare) {
return (data, facets) => {
const compareData = (i, j) => compare(data[i], data[j]);
const compareData = isArray(data)
? (i, j) => compare(data[i], data[j])
: (i, j) => compare(data.get(i), data.get(j));
return {data, facets: facets.map((I) => I.slice().sort(compareData))};
};
}
Expand Down
4 changes: 2 additions & 2 deletions src/transforms/exclusiveFacets.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import {reindex, slice} from "../options.js";
import {lengthof, reindex, slice} from "../options.js";

export function exclusiveFacets(data, facets) {
if (facets.length === 1) return {data, facets}; // only one facet; trivially exclusive

const n = data.length;
const n = lengthof(data);
const O = new Uint8Array(n);
let overlaps = 0;

Expand Down
40 changes: 6 additions & 34 deletions src/transforms/group.js
Original file line number Diff line number Diff line change
@@ -1,37 +1,9 @@
import {
InternSet,
deviation,
group as grouper,
max,
maxIndex,
mean,
median,
min,
minIndex,
mode,
rollup,
sort,
sum,
variance
} from "d3";
import {InternSet, group as grouper, rollup, sort} from "d3";
import {deviation, max, maxIndex, mean, median, min, minIndex, mode, sum, variance} from "d3";
import {ascendingDefined} from "../defined.js";
import {
column,
identity,
isObject,
isTemporal,
labelof,
maybeApplyInterval,
maybeColorChannel,
maybeColumn,
maybeInput,
maybeTuple,
percentile,
range,
second,
take,
valueof
} from "../options.js";
import {maybeApplyInterval, maybeColorChannel, maybeColumn, maybeInput, maybeTuple} from "../options.js";
import {isArray, isObject, isTemporal} from "../options.js";
import {column, identity, labelof, percentile, range, second, take, valueof} from "../options.js";
import {basic} from "./basic.js";

// Group on {z, fill, stroke}.
Expand Down Expand Up @@ -444,7 +416,7 @@ export function find(test) {
if (typeof test !== "function") throw new Error(`invalid test function: ${test}`);
return {
reduceIndex(I, V, {data}) {
return V[I.find((i) => test(data[i], i, data))];
return V[I.find(isArray(data) ? (i) => test(data[i], i, data) : (i) => test(data.get(i), i, data))];
}
};
}
10 changes: 6 additions & 4 deletions src/transforms/stack.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import {InternMap, cumsum, greatest, group, groupSort, max, min, rollup, sum} fr
import {ascendingDefined, descendingDefined} from "../defined.js";
import {withTip} from "../mark.js";
import {maybeApplyInterval, maybeColumn, maybeZ, maybeZero} from "../options.js";
import {column, field, mid, one, range, valueof} from "../options.js";
import {column, field, isArray, lengthof, mid, one, range, valueof} from "../options.js";
import {basic} from "./basic.js";
import {exclusiveFacets} from "./exclusiveFacets.js";

Expand Down Expand Up @@ -91,7 +91,7 @@ function stack(x, y = one, kx, ky, {offset, order, reverse}, options) {
const Y = valueof(data, y, Float64Array);
const Z = valueof(data, z);
const compare = order && order(data, X, Y, Z);
const n = data.length;
const n = lengthof(data);
const Y1 = setY1(new Float64Array(n));
const Y2 = setY2(new Float64Array(n));
const facetstacks = [];
Expand Down Expand Up @@ -252,7 +252,7 @@ function maybeOrder(order, offset, ky) {
return orderAccessor(field(order));
}
if (typeof order === "function") return (order.length === 1 ? orderAccessor : orderComparator)(order);
if (Array.isArray(order)) return orderGiven(order);
if (isArray(order)) return orderGiven(order);
throw new Error(`invalid order: ${order}`);
}

Expand Down Expand Up @@ -327,7 +327,9 @@ function orderAccessor(f) {
}

function orderComparator(f) {
return (data) => (i, j) => f(data[i], data[j]);
return (data) => {
return isArray(data) ? (i, j) => f(data[i], data[j]) : (i, j) => f(data.get(i), data.get(j));
};
}

function orderGiven(domain) {
Expand Down
Loading