From 3e6662339f7f043a328331cf76effbf65ff9b29e Mon Sep 17 00:00:00 2001 From: Florian Bernd Date: Fri, 11 Oct 2024 17:00:37 +0200 Subject: [PATCH] Add proper enums for some `DenseVectorProperty` fields --- compiler/package-lock.json | 10 +- output/schema/schema.json | 112 ++++++++++++-- .../_types/mapping/DenseVectorIndexOptions.ts | 27 ---- .../_types/mapping/DenseVectorProperty.ts | 137 ++++++++++++++++++ specification/_types/mapping/Property.ts | 2 +- specification/_types/mapping/complex.ts | 10 -- 6 files changed, 238 insertions(+), 60 deletions(-) delete mode 100644 specification/_types/mapping/DenseVectorIndexOptions.ts create mode 100644 specification/_types/mapping/DenseVectorProperty.ts diff --git a/compiler/package-lock.json b/compiler/package-lock.json index 56c687fd0d..ed183152ca 100644 --- a/compiler/package-lock.json +++ b/compiler/package-lock.json @@ -33,10 +33,6 @@ "node": ">=14" } }, - "../compiler-rs/compiler-wasm-lib/pkg": { - "name": "compiler-wasm-lib", - "version": "0.1.0" - }, "node_modules/@babel/code-frame": { "version": "7.12.11", "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.12.11.tgz", @@ -1544,8 +1540,8 @@ "dev": true }, "node_modules/compiler-wasm-lib": { - "resolved": "../compiler-rs/compiler-wasm-lib/pkg", - "link": true + "version": "0.1.0", + "resolved": "file:../compiler-rs/compiler-wasm-lib/pkg" }, "node_modules/concat-map": { "version": "0.0.1", @@ -6488,7 +6484,7 @@ "dev": true }, "compiler-wasm-lib": { - "version": "file:../compiler-rs/compiler-wasm-lib/pkg" + "version": "0.1.0" }, "concat-map": { "version": "0.0.1", diff --git a/output/schema/schema.json b/output/schema/schema.json index 91a9903ee0..3af0917211 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -72216,7 +72216,7 @@ } } ], - "specLocation": "_types/mapping/complex.ts#L61-L66" + "specLocation": "_types/mapping/complex.ts#L51-L56" }, { "kind": "interface", @@ -72906,6 +72906,28 @@ ], "specLocation": 
"_types/mapping/range.ts#L29-L32" }, + { + "kind": "enum", + "members": [ + { + "description": "Indexes a single bit per dimension. Useful for very high-dimensional vectors or models that specifically support\nbit vectors.\n\nNOTE: when using `bit`, the number of dimensions must be a multiple of `8` and must represent the number of bits.", + "name": "bit" + }, + { + "description": "Indexes a 1-byte integer value per dimension.", + "name": "byte" + }, + { + "description": "Indexes a 4-byte floating-point value per dimension.", + "name": "float" + } + ], + "name": { + "name": "DenseVectorElementType", + "namespace": "_types.mapping" + }, + "specLocation": "_types/mapping/DenseVectorProperty.ts#L32-L48" + }, { "kind": "interface", "name": { @@ -72919,8 +72941,8 @@ "type": { "kind": "instance_of", "type": { - "name": "string", - "namespace": "_builtins" + "name": "DenseVectorIndexOptionsType", + "namespace": "_types.mapping" } } }, @@ -72958,7 +72980,41 @@ } } ], - "specLocation": "_types/mapping/DenseVectorIndexOptions.ts#L22-L27" + "specLocation": "_types/mapping/DenseVectorProperty.ts#L97-L102" + }, + { + "kind": "enum", + "members": [ + { + "description": "This utilizes a brute-force search algorithm for exact kNN search. This supports all `element_type` values.", + "name": "flat" + }, + { + "description": "This utilizes the HNSW algorithm for scalable approximate kNN search. 
This supports all `element_type` values.", + "name": "hnsw" + }, + { + "description": "This utilizes a brute-force search algorithm in addition to automatically half-byte scalar quantization.\nOnly supports `element_type` of `float`.", + "name": "int4_flat" + }, + { + "description": "This utilizes the HNSW algorithm in addition to automatically scalar quantization for scalable approximate kNN\nsearch with `element_type` of `float`.\n\nThis can reduce the memory footprint by 8x at the cost of some accuracy.", + "name": "int4_hnsw" + }, + { + "description": "This utilizes a brute-force search algorithm in addition to automatically scalar quantization. Only supports \n`element_type` of `float`.", + "name": "int8_flat" + }, + { + "description": "The default index type for `float` vectors. This utilizes the HNSW algorithm in addition to automatically scalar \nquantization for scalable approximate kNN search with `element_type` of `float`.\n\nThis can reduce the memory footprint by 4x at the cost of some accuracy.", + "name": "int8_hnsw" + } + ], + "name": { + "name": "DenseVectorIndexOptionsType", + "namespace": "_types.mapping" + }, + "specLocation": "_types/mapping/DenseVectorProperty.ts#L104-L137" }, { "kind": "interface", @@ -72987,8 +73043,8 @@ "type": { "kind": "instance_of", "type": { - "name": "string", - "namespace": "_builtins" + "name": "DenseVectorElementType", + "namespace": "_types.mapping" } } }, @@ -73009,8 +73065,8 @@ "type": { "kind": "instance_of", "type": { - "name": "string", - "namespace": "_builtins" + "name": "DenseVectorSimilarity", + "namespace": "_types.mapping" } } }, @@ -73037,7 +73093,33 @@ } } ], - "specLocation": "_types/mapping/complex.ts#L52-L59" + "specLocation": "_types/mapping/DenseVectorProperty.ts#L23-L30" + }, + { + "kind": "enum", + "members": [ + { + "description": "Computes the cosine similarity. During indexing Elasticsearch automatically normalizes vectors with `cosine`\nsimilarity to unit length. 
This allows to internally use `dot_product` for computing similarity, which is more \nefficient. Original un-normalized vectors can still be accessed through scripts.\n\nThe document `_score` is computed as `(1 + cosine(query, vector)) / 2`.\n\nThe `cosine` similarity does not allow vectors with zero magnitude, since cosine is not defined in this case.", + "name": "cosine" + }, + { + "description": "Computes the dot product of two unit vectors. This option provides an optimized way to perform cosine similarity.\nThe constraints and computed score are defined by `element_type`.\n\nWhen `element_type` is `float`, all vectors must be unit length, including both document and query vectors.\n\nThe document `_score` is computed as `(1 + dot_product(query, vector)) / 2`.\n\nWhen `element_type` is `byte`, all vectors must have the same length including both document and query vectors or\nresults will be inaccurate.\n\nThe document `_score` is computed as `0.5 + (dot_product(query, vector) / (32768 * dims))` where `dims` is the\nnumber of dimensions per vector.", + "name": "dot_product" + }, + { + "description": "Computes similarity based on the `L2` distance (also known as Euclidean distance) between the vectors.\n\nThe document `_score` is computed as `1 / (1 + l2_norm(query, vector)^2)`.\n\nFor `bit` vectors, instead of using `l2_norm`, the `hamming` distance between the vectors is used.\n\nThe `_score` transformation is `(numBits - hamming(a, b)) / numBits`.", + "name": "l2_norm" + }, + { + "description": "Computes the maximum inner product of two vectors. This is similar to `dot_product`, but doesn't require vectors\nto be normalized. This means that each vector’s magnitude can significantly affect the score.\n\nThe document `_score` is adjusted to prevent negative values. For `max_inner_product` values `< 0`, the `_score`\nis `1 / (1 + -1 * max_inner_product(query, vector))`.
For non-negative `max_inner_product` results the `_score`\nis calculated `max_inner_product(query, vector) + 1`.", + "name": "max_inner_product" + } + ], + "name": { + "name": "DenseVectorSimilarity", + "namespace": "_types.mapping" + }, + "specLocation": "_types/mapping/DenseVectorProperty.ts#L50-L95" }, { "kind": "interface", @@ -73880,7 +73962,7 @@ "name": "FieldType", "namespace": "_types.mapping" }, - "specLocation": "_types/mapping/Property.ts#L166-L213" + "specLocation": "_types/mapping/Property.ts#L168-L215" }, { "kind": "interface", @@ -74003,7 +74085,7 @@ } } ], - "specLocation": "_types/mapping/complex.ts#L26-L37" + "specLocation": "_types/mapping/complex.ts#L25-L36" }, { "kind": "interface", @@ -75270,7 +75352,7 @@ } } ], - "specLocation": "_types/mapping/complex.ts#L39-L44" + "specLocation": "_types/mapping/complex.ts#L38-L43" }, { "kind": "interface", @@ -75439,7 +75521,7 @@ } } ], - "specLocation": "_types/mapping/complex.ts#L46-L50" + "specLocation": "_types/mapping/complex.ts#L45-L49" }, { "kind": "enum", @@ -75544,7 +75626,7 @@ "name": "Property", "namespace": "_types.mapping" }, - "specLocation": "_types/mapping/Property.ts#L96-L164", + "specLocation": "_types/mapping/Property.ts#L98-L166", "type": { "kind": "union_of", "items": [ @@ -75999,7 +76081,7 @@ } } ], - "specLocation": "_types/mapping/Property.ts#L84-L94" + "specLocation": "_types/mapping/Property.ts#L86-L96" }, { "kind": "interface", diff --git a/specification/_types/mapping/DenseVectorIndexOptions.ts b/specification/_types/mapping/DenseVectorIndexOptions.ts deleted file mode 100644 index dd05a7ccd1..0000000000 --- a/specification/_types/mapping/DenseVectorIndexOptions.ts +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Licensed to Elasticsearch B.V. under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch B.V.
licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -import { float, integer } from '@_types/Numeric' - -export class DenseVectorIndexOptions { - type: string - m?: integer - ef_construction?: integer - confidence_interval?: float -} diff --git a/specification/_types/mapping/DenseVectorProperty.ts b/specification/_types/mapping/DenseVectorProperty.ts new file mode 100644 index 0000000000..cb2df5a583 --- /dev/null +++ b/specification/_types/mapping/DenseVectorProperty.ts @@ -0,0 +1,137 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +import { float, integer } from '@_types/Numeric' +import { PropertyBase } from './Property' + +export class DenseVectorProperty extends PropertyBase { + type: 'dense_vector' + element_type?: DenseVectorElementType + dims?: integer + similarity?: DenseVectorSimilarity + index?: boolean + index_options?: DenseVectorIndexOptions +} + +export enum DenseVectorElementType { + /** + * Indexes a single bit per dimension. Useful for very high-dimensional vectors or models that specifically support + * bit vectors. + * + * NOTE: when using `bit`, the number of dimensions must be a multiple of `8` and must represent the number of bits. + */ + bit, + /** + * Indexes a 1-byte integer value per dimension. + */ + byte, + /** + * Indexes a 4-byte floating-point value per dimension. + */ + float +} + +export enum DenseVectorSimilarity { + /** + * Computes the cosine similarity. During indexing Elasticsearch automatically normalizes vectors with `cosine` + * similarity to unit length. This allows to internally use `dot_product` for computing similarity, which is more + * efficient. Original un-normalized vectors can still be accessed through scripts. + * + * The document `_score` is computed as `(1 + cosine(query, vector)) / 2`. + * + * The `cosine` similarity does not allow vectors with zero magnitude, since cosine is not defined in this case. + */ + cosine, + /** + * Computes the dot product of two unit vectors. This option provides an optimized way to perform cosine similarity. + * The constraints and computed score are defined by `element_type`. + * + * When `element_type` is `float`, all vectors must be unit length, including both document and query vectors. + * + * The document `_score` is computed as `(1 + dot_product(query, vector)) / 2`. + * + * When `element_type` is `byte`, all vectors must have the same length including both document and query vectors or + * results will be inaccurate.
+ * + * The document `_score` is computed as `0.5 + (dot_product(query, vector) / (32768 * dims))` where `dims` is the + * number of dimensions per vector. + */ + dot_product, + /** + * Computes similarity based on the `L2` distance (also known as Euclidean distance) between the vectors. + * + * The document `_score` is computed as `1 / (1 + l2_norm(query, vector)^2)`. + * + * For `bit` vectors, instead of using `l2_norm`, the `hamming` distance between the vectors is used. + * + * The `_score` transformation is `(numBits - hamming(a, b)) / numBits`. + */ + l2_norm, + /** + * Computes the maximum inner product of two vectors. This is similar to `dot_product`, but doesn't require vectors + * to be normalized. This means that each vector’s magnitude can significantly affect the score. + * + * The document `_score` is adjusted to prevent negative values. For `max_inner_product` values `< 0`, the `_score` + * is `1 / (1 + -1 * max_inner_product(query, vector))`. For non-negative `max_inner_product` results the `_score` + * is calculated `max_inner_product(query, vector) + 1`. + */ + max_inner_product +} + +export class DenseVectorIndexOptions { + type: DenseVectorIndexOptionsType + m?: integer + ef_construction?: integer + confidence_interval?: float +} + +export enum DenseVectorIndexOptionsType { + /** + * This utilizes a brute-force search algorithm for exact kNN search. This supports all `element_type` values. + */ + flat, + /** + * This utilizes the HNSW algorithm for scalable approximate kNN search. This supports all `element_type` values. + */ + hnsw, + /** + * This utilizes a brute-force search algorithm in addition to automatically half-byte scalar quantization. + * Only supports `element_type` of `float`. + */ + int4_flat, + /** + * This utilizes the HNSW algorithm in addition to automatically scalar quantization for scalable approximate kNN + * search with `element_type` of `float`.
+ * + * This can reduce the memory footprint by 8x at the cost of some accuracy. + */ + int4_hnsw, + /** + * This utilizes a brute-force search algorithm in addition to automatically scalar quantization. Only supports + * `element_type` of `float`. + */ + int8_flat, + /** + * The default index type for `float` vectors. This utilizes the HNSW algorithm in addition to automatically scalar + * quantization for scalable approximate kNN search with `element_type` of `float`. + * + * This can reduce the memory footprint by 4x at the cost of some accuracy. + */ + int8_hnsw +} diff --git a/specification/_types/mapping/Property.ts b/specification/_types/mapping/Property.ts index 8cce8819a2..456d835ea4 100644 --- a/specification/_types/mapping/Property.ts +++ b/specification/_types/mapping/Property.ts @@ -36,7 +36,6 @@ import { import { integer } from '@_types/Numeric' import { AggregateMetricDoubleProperty, - DenseVectorProperty, FlattenedProperty, NestedProperty, ObjectProperty @@ -69,6 +68,7 @@ import { VersionProperty, WildcardProperty } from './core' +import { DenseVectorProperty } from './DenseVectorProperty' import { DynamicMapping } from './dynamic-template' import { CompletionProperty, diff --git a/specification/_types/mapping/complex.ts b/specification/_types/mapping/complex.ts index d9b0c70bb3..5fa37f12ee 100644 --- a/specification/_types/mapping/complex.ts +++ b/specification/_types/mapping/complex.ts @@ -20,7 +20,6 @@ import { TimeSeriesMetricType } from '@_types/mapping/TimeSeriesMetricType' import { double, integer } from '@_types/Numeric' import { CorePropertyBase, IndexOptions } from './core' -import { DenseVectorIndexOptions } from './DenseVectorIndexOptions' import { PropertyBase } from './Property' export class FlattenedProperty extends PropertyBase { @@ -49,15 +48,6 @@ export class ObjectProperty extends CorePropertyBase { type?: 'object' } -export class DenseVectorProperty extends PropertyBase { - type: 'dense_vector' - element_type?: string - dims?: 
integer - similarity?: string - index?: boolean - index_options?: DenseVectorIndexOptions -} - export class AggregateMetricDoubleProperty extends PropertyBase { type: 'aggregate_metric_double' default_metric: string