Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support Apache Arrow as a normalized data representation #2115

Merged
merged 22 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
"@types/node": "^20.5.0",
"@typescript-eslint/eslint-plugin": "^7.2.0",
"@typescript-eslint/parser": "^7.2.0",
"apache-arrow": "^16.0.2",
"c8": "^9.1.0",
"canvas": "^2.0.0",
"d3-geo-projection": "^4.0.0",
Expand Down
1 change: 1 addition & 0 deletions src/mark.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ export type TipPointer = "x" | "y" | "xy";
*
* - an array, typed array, or other iterable
* - an object with a length property and indexed values
* - an Apache Arrow Table
*/
export type Data = Iterable<any> | ArrayLike<any>;
mbostock marked this conversation as resolved.
Show resolved Hide resolved

Expand Down
10 changes: 7 additions & 3 deletions src/mark.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import {channelDomain, createChannels, valueObject} from "./channel.js";
import {defined} from "./defined.js";
import {maybeFacetAnchor} from "./facet.js";
import {maybeClip, maybeNamed, maybeValue} from "./options.js";
import {arrayify, isDomainSort, isObject, isOptions, keyword, range, singleton} from "./options.js";
import {arrayify, isArrowTable, isDomainSort, isObject, isOptions, keyword, range, singleton} from "./options.js";
import {project} from "./projection.js";
import {styles} from "./style.js";
import {basic, initializer} from "./transforms/basic.js";
Expand Down Expand Up @@ -87,10 +87,10 @@ export class Mark {
}
}
initialize(facets, facetChannels, plotOptions) {
let data = arrayify(this.data);
let data = dataify(this.data);
if (facets === undefined && data != null) facets = [range(data)];
const originalFacets = facets;
if (this.transform != null) ({facets, data} = this.transform(data, facets, plotOptions)), (data = arrayify(data));
if (this.transform != null) ({facets, data} = this.transform(data, facets, plotOptions)), (data = dataify(data));
if (facets !== undefined) facets.original = originalFacets; // needed to read facetChannels
const channels = createChannels(this.channels, data);
if (this.sort != null) channelDomain(data, facets, channels, facetChannels, this.sort); // mutates facetChannels!
Expand Down Expand Up @@ -130,6 +130,10 @@ export class Mark {
}
}

// Normalizes mark data: a value recognized by isArrowTable is returned as-is;
// any other value is passed through arrayify.
function dataify(data) {
  if (isArrowTable(data)) return data;
  return arrayify(data);
}

export function marks(...marks) {
marks.plot = Mark.prototype.plot;
return marks;
Expand Down
87 changes: 70 additions & 17 deletions src/options.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,36 @@ import {timeInterval, utcInterval} from "./time.js";
export const TypedArray = Object.getPrototypeOf(Uint8Array);
const objectToString = Object.prototype.toString;

// Returns true if the given value is an Array or an instance of TypedArray.
function isArray(value) {
  if (value instanceof Array) return true;
  return value instanceof TypedArray;
}

// Returns true if the given value is a typed array other than a BigInt typed
// array (as determined by isBigIntArray).
function isNumberArray(value) {
  if (!(value instanceof TypedArray)) return false;
  return !isBigIntArray(value);
}

// Returns true if the given type is a typed array constructor other than a
// BigInt one (as determined by isBigIntType); null and undefined yield false.
function isNumberType(type) {
  const proto = type == null ? undefined : type.prototype;
  return proto instanceof TypedArray ? !isBigIntType(type) : false;
}

// Returns true if the given value is a BigInt64Array or a BigUint64Array.
function isBigIntArray(value) {
  if (value instanceof BigInt64Array) return true;
  return value instanceof BigUint64Array;
}

// Returns true if the given type is exactly the BigInt64Array or
// BigUint64Array constructor.
function isBigIntType(type) {
  switch (type) {
    case BigInt64Array:
    case BigUint64Array:
      return true;
    default:
      return false;
  }
}

// If a reindex is attached to the data, channel values expressed as arrays will
// be reindexed when the channels are instantiated. See exclusiveFacets.
export const reindex = Symbol("reindex");

export function valueof(data, value, type) {
const valueType = typeof value;
return valueType === "string"
? maybeTypedMap(data, field(value), type)
? isArrowTable(data)
? maybeTypedArrowify(data.getChild(value), type)
: maybeTypedMap(data, field(value), type)
: valueType === "function"
? maybeTypedMap(data, value, type)
: valueType === "number" || value instanceof Date || valueType === "boolean"
Expand All @@ -29,21 +51,25 @@ function maybeTake(values, index) {
}

function maybeTypedMap(data, f, type) {
return map(data, type?.prototype instanceof TypedArray ? floater(f) : f, type);
return map(data, isNumberType(type) ? (d, i) => coerceNumber(f(d, i)) : f, type); // allow conversion from BigInt
mbostock marked this conversation as resolved.
Show resolved Hide resolved
}

function maybeTypedArrayify(data, type) {
return type === undefined
? arrayify(data) // preserve undefined type
: isArrowVector(data)
? maybeTypedArrowify(data, type)
: data instanceof type
? data
: type.prototype instanceof TypedArray && !(data instanceof TypedArray)
? type.from(data, coerceNumber)
: type.from(data);
: type.from(data, isNumberType(type) && !isNumberArray(data) ? coerceNumber : undefined);
}

function floater(f) {
return (d, i) => coerceNumber(f(d, i));
function maybeTypedArrowify(vector, type) {
return vector == null
? vector
: (type === undefined || type === Array) && isArrowDateType(vector.type)
? coerceDates(vector.toArray())
: maybeTypedArrayify(vector.toArray(), type);
}

export const singleton = [null]; // for data-less decoration marks, e.g. frame
Expand All @@ -70,7 +96,7 @@ export function percentile(reduce) {

// If the values are specified as a typed array, no coercion is required.
export function coerceNumbers(values) {
return values instanceof TypedArray ? values : map(values, coerceNumber, Float64Array);
return isNumberArray(values) ? values : map(values, coerceNumber, Float64Array);
Copy link
Member Author

@mbostock mbostock Jul 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(This fixes coerceNumbers for BigInt arrays which is important because Apache Arrow uses BigInt64Array for dates… somewhat unnecessarily in my opinion when Float64Array would work better in practice.)

}

// Unlike Mark’s number, here we want to convert null and undefined to NaN since
Expand All @@ -95,7 +121,7 @@ export function coerceDate(x) {
? x
: typeof x === "string"
? isoParse(x)
: x == null || isNaN((x = +x))
: x == null || isNaN((x = Number(x))) // allow conversion from BigInt
? undefined
: new Date(x);
}
Expand Down Expand Up @@ -132,7 +158,8 @@ export function keyword(input, name, allowed) {

// Promotes the specified data to an array as needed.
export function arrayify(values) {
if (values == null || values instanceof Array || values instanceof TypedArray) return values;
if (values == null || isArray(values)) return values;
if (isArrowVector(values)) return maybeTypedArrowify(values);
switch (values.type) {
case "FeatureCollection":
return values.features;
Expand Down Expand Up @@ -233,22 +260,21 @@ export function maybeZ({z, fill, stroke} = {}) {
return z;
}

// Returns the number of elements in the given data: the length property for
// values accepted by isArray, and the numRows property otherwise.
export function lengthof(data) {
  if (isArray(data)) return data.length;
  return data.numRows;
}

// Returns a Uint32Array with elements [0, 1, 2, … data.length - 1].
export function range(data) {
const n = data.length;
const n = lengthof(data);
const r = new Uint32Array(n);
for (let i = 0; i < n; ++i) r[i] = i;
return r;
}

// Computes the indexes of the data elements that pass the given test
// function; the test receives the element, its index, and the data.
export function where(data, test) {
  const index = range(data);
  return index.filter((i) => test(data[i], i, data));
}
mbostock marked this conversation as resolved.
Show resolved Hide resolved

// Returns an array [values[index[0]], values[index[1]], …].
export function take(values, index) {
return map(index, (i) => values[i], values.constructor);
return isArray(values) ? map(index, (i) => values[i], values.constructor) : map(index, (i) => values.at(i));
}

// If f does not take exactly one argument, wraps it in a function that uses take.
Expand Down Expand Up @@ -575,3 +601,30 @@ export function maybeClip(clip) {
else if (clip != null) clip = keyword(clip, "clip", ["frame", "sphere"]);
return clip;
}

// https://github.com/observablehq/stdlib/blob/746ca2e69135df6178e4f3a17244def35d8d6b20/src/arrow.js#L4C1-L17C1
// Duck-type test for an Apache Arrow Table: the value must have getChild and
// toArray methods and a schema whose fields property is an array. Always
// returns a strict boolean (never the falsy input or an intermediate value),
// so the result is safe to store or compare as well as to branch on.
export function isArrowTable(value) {
  return (
    value != null &&
    typeof value.getChild === "function" &&
    typeof value.toArray === "function" &&
    Array.isArray(value.schema?.fields)
  );
}

// Duck-type test for an Apache Arrow Vector: the value must have a toArray
// method and a truthy type property. Always returns a strict boolean rather
// than the type object or the falsy input itself.
function isArrowVector(value) {
  return value != null && typeof value.toArray === "function" && Boolean(value.type);
}

// Apache Arrow now represents dates as numbers. We currently only support
// implicit coercion to JavaScript Date objects when the numbers represent
// milliseconds since Unix epoch.
//
// Returns true only if the given Arrow data type is a date or timestamp type
// whose unit is milliseconds. Always returns a strict boolean (null and
// undefined types yield false rather than being returned as-is).
function isArrowDateType(type) {
  const DATE = 8; // Arrow type id for date
  const TIMESTAMP = 10; // Arrow type id for timestamp
  const MILLISECOND = 1; // Arrow unit for milliseconds
  if (type == null) return false;
  const {typeId, unit} = type;
  return (typeId === DATE || typeId === TIMESTAMP) && unit === MILLISECOND;
}
4 changes: 2 additions & 2 deletions src/transforms/exclusiveFacets.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import {reindex, slice} from "../options.js";
import {lengthof, reindex, slice} from "../options.js";

export function exclusiveFacets(data, facets) {
if (facets.length === 1) return {data, facets}; // only one facet; trivially exclusive

const n = data.length;
const n = lengthof(data);
const O = new Uint8Array(n);
let overlaps = 0;

Expand Down
4 changes: 2 additions & 2 deletions src/transforms/stack.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import {InternMap, cumsum, greatest, group, groupSort, max, min, rollup, sum} fr
import {ascendingDefined, descendingDefined} from "../defined.js";
import {withTip} from "../mark.js";
import {maybeApplyInterval, maybeColumn, maybeZ, maybeZero} from "../options.js";
import {column, field, mid, one, range, valueof} from "../options.js";
import {column, field, lengthof, mid, one, range, valueof} from "../options.js";
import {basic} from "./basic.js";
import {exclusiveFacets} from "./exclusiveFacets.js";

Expand Down Expand Up @@ -91,7 +91,7 @@ function stack(x, y = one, kx, ky, {offset, order, reverse}, options) {
const Y = valueof(data, y, Float64Array);
const Z = valueof(data, z);
const compare = order && order(data, X, Y, Z);
const n = data.length;
const n = lengthof(data);
const Y1 = setY1(new Float64Array(n));
const Y2 = setY2(new Float64Array(n));
const facetstacks = [];
Expand Down
121 changes: 121 additions & 0 deletions test/output/arrowDates.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading