# Benchmark image: PostgreSQL 16 with pgvector built from source and the
# pgvectorscale extension built through cargo-pgrx.
FROM ann-benchmarks

RUN git clone https://github.com/pgvector/pgvector /tmp/pgvector

# Refresh the package index in the same layer as each install so the install
# never depends on an index left behind by an earlier (possibly cached) layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -y install tzdata
RUN apt-get update && apt-get install -y --no-install-recommends build-essential postgresql-common
# Registers the apt.postgresql.org (PGDG) repository non-interactively.
RUN /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y
RUN apt-get update && apt-get install -y --no-install-recommends postgresql-16 postgresql-server-dev-16
# Replace pg_hba.conf with a single rule: trust every local (socket) connection.
RUN sh -c 'echo "local all all trust" > /etc/postgresql/16/main/pg_hba.conf'

# Dynamically set OPTFLAGS based on the architecture, then build and install pgvector.
RUN ARCH=$(uname -m) && \
    if [ "$ARCH" = "aarch64" ]; then \
        OPTFLAGS="-march=native -msve-vector-bits=512"; \
    elif [ "$ARCH" = "x86_64" ]; then \
        OPTFLAGS="-march=native -mprefer-vector-width=512"; \
    else \
        OPTFLAGS="-march=native"; \
    fi && \
    cd /tmp/pgvector && \
    make clean && \
    make OPTFLAGS="$OPTFLAGS" && \
    make install

# Install the dependencies needed for the Rust toolchain and the extension build.
RUN apt-get update && apt-get install -y \
    curl \
    build-essential \
    libssl-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Install Rust and Cargo; refuse anything but HTTPS with TLS >= 1.2 for the installer download.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

# Ensure the cargo bin directory is in the PATH
ENV PATH="/root/.cargo/bin:${PATH}"

# Install pgrx using cargo
# NOTE(review): no cargo-pgrx version is pinned here and pgvectorscale is cloned
# at its default branch; confirm the two stay compatible or pin both.
RUN cargo install --locked cargo-pgrx
# Initialize pgrx with the PostgreSQL version
RUN cargo pgrx init --pg16 pg_config
RUN git clone https://github.com/timescale/pgvectorscale /tmp/pgvectorscale
RUN cd /tmp/pgvectorscale/pgvectorscale && \
    cargo pgrx install --release

# Create the "ann" role and database, enable the extension (CASCADE also
# creates the extensions it depends on) and set the server/user settings.
USER postgres
RUN service postgresql start && \
    psql -c "CREATE USER ann WITH ENCRYPTED PASSWORD 'ann'" && \
    psql -c "CREATE DATABASE ann" && \
    psql -c "GRANT ALL PRIVILEGES ON DATABASE ann TO ann" && \
    psql -d ann -c "GRANT ALL ON SCHEMA public TO ann" && \
    psql -d ann -c "CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE" && \
    psql -c "ALTER USER ann SET maintenance_work_mem = '4GB'" && \
    psql -c "ALTER USER ann SET max_parallel_maintenance_workers = 0" && \
    psql -c "ALTER SYSTEM SET shared_buffers = '4GB'"
USER root

# Quoted so the shell cannot treat the brackets as a glob pattern.
RUN pip install "psycopg[binary]" pgvector
class PGDiskANN(BaseANN):
    """Benchmark adapter for the pgvectorscale ``diskann`` index.

    Vectors are copied into an ``items`` table of the local ``ann`` database,
    indexed with ``USING diskann`` and queried through an
    ``ORDER BY embedding <=> ... LIMIT ...`` statement.
    """

    def __init__(self, metric, method_param):
        """Record the build parameters; no database work happens here.

        Args:
            metric: Metric name supplied by the benchmark configuration. It is
                stored and printed but does not change the SQL that is issued.
            method_param: Mapping providing ``num_neighbors``,
                ``search_list_size`` and ``max_alpha``.

        Raises:
            KeyError: If ``method_param`` lacks one of the three keys.
        """
        print("running constructor")
        self._metric = metric
        self._cur = None
        # Defined up front so __str__ works before set_query_arguments is called.
        self._list_size = None
        # NOTE(review): the distance operator is fixed to `<=>` whatever `metric`
        # is; confirm this is intended for non-angular datasets.
        self._query = "SELECT id FROM items ORDER BY embedding <=> %s LIMIT %s"
        self._num_neighbors = method_param['num_neighbors']
        self._search_list_size = method_param['search_list_size']
        self._max_alpha = method_param['max_alpha']
        print(f"running only {self._metric} and {self._query}")

    def _create_index_sql(self):
        """Return the CREATE INDEX statement for the configured parameters."""
        # max_alpha is fractional in the run groups (1.2, 1.5); formatting it
        # with "%d" would silently truncate it to an integer, so render it as
        # a float instead.
        return (
            "CREATE INDEX ON items USING diskann(embedding) "
            "WITH (num_neighbors = %d, search_list_size = %d, max_alpha = %s)"
            % (self._num_neighbors, self._search_list_size, float(self._max_alpha))
        )

    def fit(self, X):
        """Start PostgreSQL, load ``X`` into ``items`` and build the index.

        Args:
            X: Two-dimensional array-like of vectors; ``X.shape[1]`` is used as
                the column dimension and each row is stored under its
                enumeration index as ``id``.

        Raises:
            subprocess.CalledProcessError: If starting the service fails.
        """
        print("running fit")
        # Constant command, so an argument list is used instead of a shell string.
        subprocess.run(
            ["service", "postgresql", "start"],
            check=True,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        conn = psycopg.connect(user="ann", password="ann", dbname="ann", autocommit=True)
        pgvector.psycopg.register_vector(conn)
        cur = conn.cursor()
        cur.execute("DROP TABLE IF EXISTS items")
        cur.execute("CREATE TABLE items (id int, embedding vector(%d))" % X.shape[1])
        cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN")
        print("copying data...")
        with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy:
            copy.set_types(["int4", "vector"])
            for i, embedding in enumerate(X):
                copy.write_row((i, embedding))
        print("creating index...")
        cur.execute(self._create_index_sql())
        print("done!")
        # The cursor (and through it the connection) is kept for later queries.
        self._cur = cur

    def query(self, v, n):
        """Return the ``id`` values of the ``n`` rows ordered first for ``v``.

        Must be called after ``fit``, which creates the cursor used here.
        """
        self._cur.execute(self._query, (v, n), binary=True, prepare=True)
        # Each fetched row is a 1-tuple holding the id.
        return [row[0] for row in self._cur.fetchall()]

    def set_query_arguments(self, list_size):
        """Apply ``list_size`` as ``diskann.query_search_list_size`` for the session."""
        self._list_size = list_size
        self._cur.execute("SET diskann.query_search_list_size = %d" % list_size)

    def get_memory_usage(self):
        """Return ``pg_relation_size('items_embedding_idx')`` divided by 1024.

        Returns 0 when ``fit`` has not run yet (no cursor available).
        """
        if self._cur is None:
            return 0
        self._cur.execute("SELECT pg_relation_size('items_embedding_idx')")
        return self._cur.fetchone()[0] / 1024

    def __str__(self):
        """Describe the instance including its build and query parameters.

        A constant string would make every configured run group and query
        setting indistinguishable, so the parameters are spelled out.
        """
        return (
            f"PGDiskANN(num_neighbors={self._num_neighbors}, "
            f"search_list_size={self._search_list_size}, "
            f"max_alpha={self._max_alpha}, "
            f"query_search_list_size={self._list_size})"
        )