From a9987af505ea0f8f9d2060de16c8077f99c5e54e Mon Sep 17 00:00:00 2001 From: lintool Date: Tue, 3 Dec 2024 19:47:49 -0500 Subject: [PATCH 1/3] Hard-code for Snowflake parquet --- .../anserini/collection/ParquetDenseVectorCollection.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java b/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java index 462d22c8f..ce5db7cec 100644 --- a/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java +++ b/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java @@ -137,16 +137,16 @@ private void initializeParquetReader(java.nio.file.Path path) throws IOException // Read each record from the Parquet file while ((record = reader.read()) != null) { // Extract the docid (String) from the record - String docid = record.getString("docid", 0); + String docid = record.getString("doc_id", 0); ids.add(docid); // Extract the vector (double[]) from the record - Group vectorGroup = record.getGroup("vector", 0); // Access the 'vector' field + Group vectorGroup = record.getGroup("embedding", 0); // Access the 'embedding' field int vectorSize = vectorGroup.getFieldRepetitionCount(0); // Get the number of elements in the vector double[] vector = new double[vectorSize]; for (int i = 0; i < vectorSize; i++) { Group listGroup = vectorGroup.getGroup(0, i); // Access the 'list' group - vector[i] = listGroup.getDouble("element", 0); // Get the double value from the 'element' field + vector[i] = listGroup.getFloat("element", 0); // Get the double value from the 'element' field } vectors.add(vector); } From 66f020c5fe594c59e9dcf67208e1a1a42edc1acc Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Dec 2024 07:15:59 -0500 Subject: [PATCH 2/3] Bumped up to 512g --- bin/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/run.sh b/bin/run.sh index 43bb461cc..79d6e7f24 100755 --- a/bin/run.sh +++ 
b/bin/run.sh @@ -1,3 +1,3 @@ #!/bin/sh -java -cp `ls target/*-fatjar.jar` -Xms512M -Xmx64G --add-modules jdk.incubator.vector $@ \ No newline at end of file +java -cp `ls target/*-fatjar.jar` -Xms512M -Xmx512G --add-modules jdk.incubator.vector $@ From 0283450b453136cad846ae3834e5645dfe0939df Mon Sep 17 00:00:00 2001 From: lintool Date: Thu, 5 Dec 2024 22:07:08 -0500 Subject: [PATCH 3/3] Add normalization. --- .../ParquetDenseVectorCollection.java | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java b/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java index ce5db7cec..b3192bf3a 100644 --- a/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java +++ b/src/main/java/io/anserini/collection/ParquetDenseVectorCollection.java @@ -148,6 +148,7 @@ private void initializeParquetReader(java.nio.file.Path path) throws IOException Group listGroup = vectorGroup.getGroup(0, i); // Access the 'list' group vector[i] = listGroup.getFloat("element", 0); // Get the double value from the 'element' field } + vector = normalizeVector(vector); vectors.add(vector); } @@ -155,6 +156,39 @@ private void initializeParquetReader(java.nio.file.Path path) throws IOException currentIndex = 0; } + /** + * Computes the L2 norm (Euclidean norm) of a vector. + * @param vector the vector to compute the norm of + * @return the L2 norm of the vector + */ + private static double computeL2Norm(double[] vector) { + double sumOfSquares = 0.0; + for (double v : vector) { + sumOfSquares += v * v; + } + return Math.sqrt(sumOfSquares); + } + + /** + * Normalizes a vector to have a norm of 1. 
+ * @param vector the vector to normalize + * @return a new vector that is the normalized version of the input vector + */ + private static double[] normalizeVector(double[] vector) { + double norm = computeL2Norm(vector); + double[] normalizedVector = new double[vector.length]; + + if (norm == 0) { + throw new IllegalArgumentException("Zero vector cannot be normalized."); + } + + for (int i = 0; i < vector.length; i++) { + normalizedVector[i] = vector[i] / norm; + } + + return normalizedVector; + } + /** * Reads the next document in the segment. *