diff --git a/.github/workflows/assign-github-issue.yml b/.github/workflows/assign-github-issue.yml
new file mode 100644
index 0000000000..6a2833e252
--- /dev/null
+++ b/.github/workflows/assign-github-issue.yml
@@ -0,0 +1,66 @@
+# workflows/assign-github-issue.yml
+#
+# Assign GitHub Issue
+# Automatically assign an issue to the commenter if they use the '/take' command.
+
+name: Assign GitHub Issue
+
+on:
+  issue_comment:
+    types: [created]
+
+# Required to assign the issue to the commenter
+permissions:
+  issues: write
+
+concurrency:
+  group: assign-github-issue-${{ github.workflow }}-${{ github.event.issue.number }}
+  cancel-in-progress: true
+
+jobs:
+  assign-github-issue:
+    name: Assign GitHub Issue to Commenter
+    runs-on: depot-ubuntu-latest-2
+    if: |
+      !github.event.issue.pull_request &&
+      contains(github.event.comment.body, '/take')
+
+    steps:
+      - name: Check if Commenter Can Be Assigned
+        id: check_assignee
+        run: |
+          HTTP_CODE=$(curl -X GET \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            -o /dev/null -w '%{http_code}\n' -s \
+            "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }}")
+
+          if [ "$HTTP_CODE" -eq "204" ]; then
+            echo "can_assign=true" >> $GITHUB_OUTPUT
+          else
+            echo "can_assign=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Assign GitHub Issue
+        if: steps.check_assignee.outputs.can_assign == 'true'
+        run: |
+          curl -X POST \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' \
+            "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees"
+
+          echo "Issue #${{ github.event.issue.number }} assigned to ${{ github.event.comment.user.login }}"
+
+      - name: Notify of Assignment Failure
+        if: steps.check_assignee.outputs.can_assign == 'false'
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: '@${{ github.event.comment.user.login }} Unable to assign this issue to you. You may not have the necessary permissions.'
+            })
diff --git a/.github/workflows/check-pg_search-schema-upgrade.yml b/.github/workflows/check-pg_search-schema-upgrade.yml
new file mode 100644
index 0000000000..a5709e2ae5
--- /dev/null
+++ b/.github/workflows/check-pg_search-schema-upgrade.yml
@@ -0,0 +1,153 @@
+# workflows/check-pg_search-schema-upgrade.yml
+#
+# Check pg_search Schema Upgrade
+# Determine if a commit introduces an extension schema change for pg_search.
+ +name: Check pg_search Schema Upgrade + +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + branches: + - dev + - main + paths: + - ".github/workflows/check-pg_search-schema-upgrade.yml" + - "pg_search/**" + - "!pg_search/README.md" + workflow_dispatch: + +# Required to post a comment to the PR +permissions: + pull-requests: write + +concurrency: + group: check-pg_search-schema-upgrade-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +jobs: + check-pg_search-schema-upgrade: + name: Post Schema Change to PR + runs-on: depot-ubuntu-latest-8 + if: github.event.pull_request.draft == false + env: + pg_version: 13 # Required by pg-schema-diff + + steps: + - name: Checkout Git Repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch the entire history + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + # Caches from base branches are available to PRs, but not across unrelated branches, so we only + # save the cache on the 'dev' branch, but load it on all branches. + - name: Install Rust Cache + uses: Swatinem/rust-cache@v2 + with: + prefix-key: "v1" + shared-key: ${{ runner.os }}-rust-cache-pg_search-${{ HashFiles('Cargo.lock') }} + cache-targets: true + cache-on-failure: true + cache-all-crates: true + save-if: ${{ github.ref == 'refs/heads/dev' }} + + - name: Install & Configure Supported PostgreSQL Version + run: | + wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - + sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' + sudo apt-get update && sudo apt-get install -y postgresql-${{ env.pg_version }} postgresql-server-dev-${{ env.pg_version }} + sudo chown -R $(whoami) /usr/share/postgresql/${{ env.pg_version }}/ /usr/lib/postgresql/${{ env.pg_version }}/ /var/lib/postgresql/${{ env.pg_version }}/ + rustup component add llvm-tools-preview + echo "/usr/lib/postgresql/${{ env.pg_version }}/bin" >> $GITHUB_PATH + + - name: Install pg-schema-diff and its Required Dependencies + run: | + sudo apt install clang llvm diffutils + cargo install --git https://github.com/zombodb/pg-schema-diff.git + + - name: Extract pgrx Version & Install cargo-pgrx + run: | + PGRX_VERSION=$(cargo tree --depth 1 -i pgrx -p pg_search | head -n 1 | cut -f2 -dv) + cargo install -j $(nproc) --locked cargo-pgrx --version ${PGRX_VERSION} + cargo pgrx init "--pg${{ env.pg_version }}=/usr/lib/postgresql/${{ env.pg_version }}/bin/pg_config" + + # Save the pgrx version for comparison later + echo "FIRST_PGRX_VERSION=${PGRX_VERSION}" >> $GITHUB_ENV + + - name: Generate Schema from this git rev + run: cargo pgrx schema -p pg_search pg${{ env.pg_version }} > ~/this.sql + + - name: Switch to Base git rev and Generate Schema Again + run: | + # Switch to the base git rev + git checkout ${{ github.event.pull_request.base.ref }} + + # See if we need a different cargo-pgrx and install it if so + THIS_PGRX_VERSION=$(cargo tree --depth 1 -i pgrx -p pg_search | head -n 1 | cut -f2 -dv) + if [[ "${THIS_PGRX_VERSION}" != "${FIRST_PGRX_VERSION}" ]]; then + # Install cargo-pgrx + cargo install -j $(nproc) --locked cargo-pgrx --version ${THIS_PGRX_VERSION} --force + + # Initialize it (again) -- probably unnecessary, but might as well in case ~/.pgrx/config.toml ever changes + cargo pgrx init "--pg${{ env.pg_version }}=/usr/lib/postgresql/${{ env.pg_version }}/bin/pg_config" + fi + + # Generate schema + cargo pgrx schema -p pg_search pg${{ env.pg_version 
}} > ~/old.sql
+
+      - name: Generate Schema Diffs
+        run: |
+          (pg-schema-diff diff ~/old.sql ~/this.sql | grep -v "^$" > ~/diff.sql) || true
+          (diff ~/old.sql ~/this.sql > ~/diff.patch) || true
+
+      - name: Generate Commit Message
+        id: generate_commit_message
+        run: |
+          if test -s ~/diff.sql; then
+            echo "Generating GitHub comment message"
+            {
+              echo 'DIFF<<EOF'
+              echo 'A schema difference was detected:'
+              echo '```sql'
+              cat ~/diff.sql
+              echo '```'
+              echo 'EOF'
+            } >> "$GITHUB_ENV"
+
+            # Set a flag to indicate a schema difference was detected
+            echo "schema_diff_detected=true" >> $GITHUB_OUTPUT
+          else
+            echo "No schema difference detected"
+            echo "schema_diff_detected=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Attach Schema Diff to PR
+        uses: actions/github-script@v6
+        if: steps.generate_commit_message.outputs.schema_diff_detected == 'true'
+        with:
+          script: |
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: process.env.DIFF
+            })
diff --git a/.prettierignore b/.prettierignore
index 9af0a8b4db..d07eaa05cf 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -15,6 +15,7 @@
 /docs/api-reference/full-text/joins.mdx
 /docs/api-reference/indexing/create_index.mdx
 /docs/api-reference/guides/autocomplete.mdx
+/docs/api-reference/guides/hybrid.mdx
 /docs/api-reference/optimization/joins.mdx
 /docs/ingest/quickstart.mdx
 /pg_search/benchmarks/out/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 78168556c6..d9e8c1fc51 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,7 +1,6 @@
 # **Contributing to ParadeDB**
 
-Welcome! We're excited that you're interested in contributing to ParadeDB and want
-to make the process as smooth as possible.
+Welcome! We're excited that you're interested in contributing to ParadeDB and want to make the process as smooth as possible.
 
 ## Technical Info
 
@@ -10,6 +9,21 @@ conventions to follow when submitting changes. If you have any questions not cov
 in this document, please reach out to us in the
 [ParadeDB Community Slack](https://join.slack.com/t/paradedbcommunity/shared_invite/zt-2lkzdsetw-OiIgbyFeiibd1DG~6wFgTQ) or via [email](mailto:support@paradedb.com).
 
+### Claiming GitHub Issues
+
+This repository has a workflow to automatically assign issues to new contributors. This ensures that you don't need approval
+from a maintainer to pick an issue.
+
+1. Before claiming an issue, ensure that:
+
+- It's not already assigned to someone else
+- There are no comments indicating ongoing work
+
+2. To claim an unassigned issue, simply comment `/take` on the issue. This will automatically assign the issue to you.
+
+If you find yourself unable to make progress, don't hesitate to seek help in the issue comments or in the [ParadeDB Community Slack](https://join.slack.com/t/paradedbcommunity/shared_invite/zt-2lkzdsetw-OiIgbyFeiibd1DG~6wFgTQ). If you no longer wish to
+work on the issue(s) you self-assigned, please use the `unassign me` link at the top of the issue(s) page to release it.
+
 ### Development Workflow
 
 ParadeDB is structured as a monorepo containing our Postgres extensions, our Docker setup, and our development tools for benchmarking and testing.
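Editor's note: for contributors who want to preview the schema comparison locally before opening a PR, the `check-pg_search-schema-upgrade.yml` workflow above can be approximated with a short shell session. This is a minimal sketch and not part of the patch; it assumes Postgres 13, `cargo-pgrx`, and `pg-schema-diff` are installed as in the workflow steps, that `dev` is the base branch, and the `/tmp` paths are illustrative.

```sh
# Generate the schema for the current rev, then for the base rev, and diff them,
# mirroring the workflow's "Generate Schema" and "Generate Schema Diffs" steps.
cargo pgrx schema -p pg_search pg13 > /tmp/this.sql
git checkout dev
cargo pgrx schema -p pg_search pg13 > /tmp/old.sql
git checkout -   # return to the working branch
pg-schema-diff diff /tmp/old.sql /tmp/this.sql
diff /tmp/old.sql /tmp/this.sql > /tmp/diff.patch || true
```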
diff --git a/docker/Dockerfile b/docker/Dockerfile index 0354be2696..731f9fdeb0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -271,14 +271,8 @@ RUN apt-get update && \ apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* -# Change the uid of the postgres user to 26, for CloudNativePG compatibility -RUN usermod -u 26 postgres \ - && chown -R 26:999 /var/lib/postgresql \ - && chown -R 26:999 /var/run/postgresql \ - && chmod -R 700 /var/lib/postgresql - # Switch back to the postgres user, with the new uid -USER 26 +USER postgres # Copy ParadeDB scripts to install extensions, configure postgresql.conf, update extensions, etc. COPY ./docker/bootstrap.sh /docker-entrypoint-initdb.d/10_bootstrap_paradedb.sh diff --git a/docs/api-reference/faceting/metric.mdx b/docs/api-reference/faceting/metric.mdx index b0ac07d216..f25144b4e2 100644 --- a/docs/api-reference/faceting/metric.mdx +++ b/docs/api-reference/faceting/metric.mdx @@ -152,3 +152,27 @@ SELECT search_idx.aggregate('{ The value to use for documents missing the field. By default, missing values are ignored. + +## Cardinality + +A cardinality aggregation estimates the number of unique values in the specified field using the HyperLogLog++ algorithm. +This is useful for understanding the uniqueness of values in a large dataset. + +The cardinality aggregation provides an approximate count, which is accurate within a small error range. +This trade-off allows for efficient computation even on very large datasets. + +```sql +SELECT search_idx.aggregate('{ + "unique_users": { + "cardinality": {"field": "user_id", "missing": "unknown"} + } +}'); +``` + + + The field name to compute the cardinality on. + + + The value to use for documents missing the field. By default, missing values + are ignored. + diff --git a/docs/api-reference/full-text/scoring.mdx b/docs/api-reference/full-text/scoring.mdx index 0d7c7ed843..3bc46c294c 100644 --- a/docs/api-reference/full-text/scoring.mdx +++ b/docs/api-reference/full-text/scoring.mdx @@ -14,27 +14,9 @@ The `score_bm25` function takes a query and returns a table with two columns: th parameters as [`search`](/api-reference/full-text/overview). ```sql -SELECT * FROM .score_bm25(''); +SELECT * FROM search_idx.score_bm25('description:keyboard'); ``` - - -```sql -SELECT * FROM search_idx.score_bm25( - 'description:keyboard', - limit_rows => 10 -); -``` - - - - - The name of the index. - - - The query string. - - ## Joining BM25 Scores Because `score_bm25` generates a new `score_bm25` column, the function does not return all the columns of the original table. diff --git a/docs/api-reference/full-text/sorting.mdx b/docs/api-reference/full-text/sorting.mdx index 3558712c63..45dfeb856f 100644 --- a/docs/api-reference/full-text/sorting.mdx +++ b/docs/api-reference/full-text/sorting.mdx @@ -16,10 +16,10 @@ If `false`, scores are not generated and instead results are returned in an un-d of this is that the results are returned as quickly as possible. This is useful for queries that are known to return many thousands or millions of rows. -`stable_sort` defaults to `false` and can be passed into `search`, `score_bm25`, and `snippet`. +`stable_sort` defaults to `true` and can be passed into `search`, `score_bm25`, and `snippet`. 
```sql -SELECT * FROM search_idx.search('description:keyboard', stable_sort => true); +SELECT * FROM search_idx.search('description:keyboard', stable_sort => false); ``` ## Custom Ordering diff --git a/docs/api-reference/guides/hybrid.mdx b/docs/api-reference/guides/hybrid.mdx index 42580456b9..41b9217bad 100644 --- a/docs/api-reference/guides/hybrid.mdx +++ b/docs/api-reference/guides/hybrid.mdx @@ -2,65 +2,60 @@ title: Hybrid Search --- -## Overview +ParadeDB's full text and similarity search APIs can be combined in the same query to execute [hybrid search](/api-reference/concepts/search#hybrid-search). -Hybrid search, which combines BM25-based full text scores with vector-based -similarity scores, is especially useful in scenarios where you want to match by keywords and semantic meaning. + +This guide uses the [`mock_items` table](/api-reference/introduction#get-started). It assumes that the entire +[quickstart](/welcome/quickstart) tutorial has been completed, including the vector search section. + -## Basic Usage +## Reciprocal Rank Fusion -To calculate a row's hybrid score, ParadeDB introduces a `score_hybrid` function. Under the hood, -this function does the following: +Reciprocal rank fusion is a popular hybrid search algorithm that: -1. Calculates the BM25 and similarity scores for the respective queries -2. Applies minmax normalization to both scores, which sets the lowest score to `0` and the highest score to `1` -3. Calculates the weighted mean of the normalized scores +1. Calculates a BM25 and similarity score for the top `n` documents. +2. Ranks documents by their BM25 and similarity scores separately. The highest-ranked document for each score receives an `r` of `1`. +3. Calculates a reciprocal rank for each score as `1/(k + r)`, where `k` is a constant. `k` is usually set to `60`. +4. Calculates each document's reciprocal rank fusion score as the sum of the BM25 and similarity reciprocal rank scores. + +The following code block implements reciprocal rank fusion over the `mock_items` table. BM25 scores are calculated against the query `description:keyboard` and similarity scores are +calculated against the vector `[1,2,3]`. ```sql -SELECT * FROM search_idx.score_hybrid( - bm25_query => 'description:keyboard OR category:electronics', - similarity_query => '''[1,2,3]'' <-> embedding', - bm25_weight => 0.9, - similarity_weight => 0.1 -); +WITH semantic_search AS ( + SELECT id, RANK () OVER (ORDER BY embedding <=> '[1,2,3]') AS rank + FROM mock_items + ORDER BY embedding <=> '[1,2,3]' + LIMIT 20 +), +bm25_search AS ( + SELECT id, RANK () OVER (ORDER BY score_bm25 DESC) as rank + FROM search_idx.score_bm25('description:keyboard', limit_rows => 20) +) +SELECT + COALESCE(semantic_search.id, bm25_search.id) AS id, + COALESCE(1.0 / (60 + semantic_search.rank), 0.0) + + COALESCE(1.0 / (60 + bm25_search.rank), 0.0) AS score, + mock_items.description, + mock_items.embedding +FROM semantic_search +FULL OUTER JOIN bm25_search ON semantic_search.id = bm25_search.id +JOIN mock_items ON mock_items.id = COALESCE(semantic_search.id, bm25_search.id) +ORDER BY score DESC +LIMIT 5; ``` - - The name of the BM25 index associated with this table. For instance, if you - ran `CREATE INDEX my_index ON my_table USING bm25 ((my_table.*))`, the index - name would be `'my_index'`. - - - The full text search query string. For instance, `'description:keyboard'`. - - - The similarity query string. For instance, `'''[1,2,3]'' <-> embedding'`. 
- Note that double single quotes are used to escape the single quote inside the string. - - - The weight applied to the BM25 score. It is recommended that this weight and the similarity weight - add up to `1`. - - - The weight applied to the similarity score. It is recommended that this weight and the BM25 weight - add up to `1`. - - - The maximum number of rows that are considered for ranking using BM25. - - - The maximum number of rows that are considered for ranking using similarity search. - - -## With Query Builder - -[Query builder](/api-reference/advanced) functions can be passed directly into `bm25_query`: - -```sql -SELECT * FROM search_idx.score_hybrid( - bm25_query => paradedb.disjunction_max(disjuncts => ARRAY[paradedb.parse('description:shoes')]), - similarity_query => '''[1,2,3]'' <-> embedding', - bm25_weight => 0.9, - similarity_weight => 0.1 -); + +```csv + id | score | description | embedding +----+------------------------+--------------------------+----------- + 1 | 0.03062178588125292193 | Ergonomic metal keyboard | [3,4,5] + 2 | 0.02990695613646433318 | Plastic Keyboard | [4,5,6] + 19 | 0.01639344262295081967 | Artistic ceramic vase | [1,2,3] + 39 | 0.01639344262295081967 | Handcrafted wooden frame | [1,2,3] + 29 | 0.01639344262295081967 | Designer wall paintings | [1,2,3] +(5 rows) ``` + + +Here, we see that the top five results either contain `keyboard` in the `description` field or have an `embedding` of `[1,2,3]`. diff --git a/docs/welcome/quickstart.mdx b/docs/welcome/quickstart.mdx index 984e26876c..0f74e5b931 100644 --- a/docs/welcome/quickstart.mdx +++ b/docs/welcome/quickstart.mdx @@ -6,7 +6,6 @@ This guide will walk you through the following steps to get started with ParadeD 1. Full text search 2. Similarity (i.e. vector) search -3. Hybrid search ## Full Text Search @@ -210,76 +209,6 @@ LIMIT 3; ``` -## Hybrid Search - -Finally, let's implement hybrid search, which combines BM25-based full text scores with vector-based -similarity scores. Hybrid search is especially useful in scenarios where you want to match by both -exact keywords and semantic meaning. - -The `score_hybrid` function accepts a BM25 query and a similarity query. It applies minmax normalization to the BM25 and -similarity scores and combines them using a weighted average. 
- -```sql -SELECT * FROM default_idx.score_hybrid( - bm25_query => 'description:keyboard OR category:electronics', - similarity_query => '''[1,2,3]'' <-> embedding', - bm25_weight => 0.9, - similarity_weight => 0.1 -) LIMIT 5; -``` - - -``` csv - id | score_hybrid -----+-------------- - 2 | 0.95714283 - 1 | 0.8487012 - 29 | 0.1 - 39 | 0.1 - 9 | 0.1 -(5 rows) -``` - - -Let's join this result with our `mock_items` table to see the full results of our hybrid search: - -```sql -SELECT m.description, m.category, m.embedding, s.score_hybrid -FROM mock_items m -LEFT JOIN ( - SELECT * FROM default_idx.score_hybrid( - bm25_query => 'description:keyboard OR category:electronics', - similarity_query => '''[1,2,3]'' <-> embedding', - bm25_weight => 0.9, - similarity_weight => 0.1 - ) -) s -ON m.id = s.id -LIMIT 5; -``` - - -``` csv - description | category | embedding | score_hybrid ---------------------------+-------------+-----------+-------------- - Plastic Keyboard | Electronics | [4,5,6] | 0.95714283 - Ergonomic metal keyboard | Electronics | [3,4,5] | 0.8487012 - Designer wall paintings | Home Decor | [1,2,3] | 0.1 - Handcrafted wooden frame | Home Decor | [1,2,3] | 0.1 - Modern wall clock | Home Decor | [1,2,3] | 0.1 -(5 rows) -``` - - -As we can see, results with the word `keyboard` scored higher than results with an embedding of -`[1,2,3]` because we placed a weight of `0.9` on the BM25 scores. - - - Minmax normalization transforms the lowest and highest scores in a dataset to - `0` and `1`, respectively. Because of this, the lowest-ranking BM25 or - similarity score may be overlooked, as it is transformed to `0`. - - ## For Further Assistance The `paradedb.help` function opens a GitHub Discussion that the ParadeDB team will respond to. diff --git a/pg_search/src/bootstrap/create_bm25.rs b/pg_search/src/bootstrap/create_bm25.rs index 738b1b7f47..e14b877955 100644 --- a/pg_search/src/bootstrap/create_bm25.rs +++ b/pg_search/src/bootstrap/create_bm25.rs @@ -531,5 +531,6 @@ extension_sql!( ON sql_drop EXECUTE FUNCTION paradedb.drop_bm25_event_trigger(); "# - name = "create_drop_bm25_event_trigger" + name = "create_drop_bm25_event_trigger", + requires = [ delete_bm25_index_by_oid ] ); diff --git a/pg_search/src/lib.rs b/pg_search/src/lib.rs index 569329c29c..98394de691 100644 --- a/pg_search/src/lib.rs +++ b/pg_search/src/lib.rs @@ -50,7 +50,11 @@ const DEFAULT_STARTUP_COST: f64 = 10.0; pgrx::pg_module_magic!(); -extension_sql!("GRANT ALL ON SCHEMA paradedb TO PUBLIC;" name = "paradedb_grant_all"); +extension_sql!( + "GRANT ALL ON SCHEMA paradedb TO PUBLIC;", + name = "paradedb_grant_all", + finalize +); static mut TRACE_HOOK: shared::trace::TraceHook = shared::trace::TraceHook; @@ -70,6 +74,10 @@ pub fn MyDatabaseId() -> u32 { #[allow(non_snake_case)] #[pg_guard] pub unsafe extern "C" fn _PG_init() { + if !pg_sys::process_shared_preload_libraries_in_progress { + error!("pg_search must be loaded via shared_preload_libraries. 
Add 'pg_search' to shared_preload_libraries in postgresql.conf and restart Postgres."); + } + postgres::options::init(); GUCS.init("pg_search"); diff --git a/pg_search/tests/bm25_search.rs b/pg_search/tests/bm25_search.rs index 2aa0531f41..3cc046d1f3 100644 --- a/pg_search/tests/bm25_search.rs +++ b/pg_search/tests/bm25_search.rs @@ -382,76 +382,6 @@ fn uuid(mut conn: PgConnection) { assert_eq!(rows.len(), 10); } -#[rstest] -fn hybrid(mut conn: PgConnection) { - SimpleProductsTable::setup().execute(&mut conn); - r#" - CREATE EXTENSION vector; - ALTER TABLE paradedb.bm25_search ADD COLUMN embedding vector(3); - - UPDATE paradedb.bm25_search m - SET embedding = ('[' || - ((m.id + 1) % 10 + 1)::integer || ',' || - ((m.id + 2) % 10 + 1)::integer || ',' || - ((m.id + 3) % 10 + 1)::integer || ']')::vector; - - CREATE INDEX on paradedb.bm25_search - USING hnsw (embedding vector_l2_ops)"# - .execute(&mut conn); - - // Test with string query. - let columns: SimpleProductsTableVec = r#" - SELECT m.*, s.score_hybrid - FROM paradedb.bm25_search m - LEFT JOIN ( - SELECT * FROM bm25_search.score_hybrid( - bm25_query => 'description:keyboard OR category:electronics', - similarity_query => '''[1,2,3]'' <-> embedding', - bm25_weight => 0.9, - similarity_weight => 0.1 - ) - ) s ON m.id = s.id - LIMIT 5"# - .fetch_collect(&mut conn); - - assert_eq!(columns.id, vec![2, 1, 29, 39, 9]); - - // New score_hybrid function - // Test with query object. - let columns: SimpleProductsTableVec = r#" - SELECT m.*, s.score_hybrid - FROM paradedb.bm25_search m - LEFT JOIN ( - SELECT * FROM bm25_search.score_hybrid( - bm25_query => paradedb.parse('description:keyboard OR category:electronics'), - similarity_query => '''[1,2,3]'' <-> embedding', - bm25_weight => 0.9, - similarity_weight => 0.1 - ) - ) s ON m.id = s.id - LIMIT 5"# - .fetch_collect(&mut conn); - - assert_eq!(columns.id, vec![2, 1, 29, 39, 9]); - - // Test with string query. - let columns: SimpleProductsTableVec = r#" - SELECT m.*, s.score_hybrid - FROM paradedb.bm25_search m - LEFT JOIN ( - SELECT * FROM bm25_search.score_hybrid( - bm25_query => 'description:keyboard OR category:electronics', - similarity_query => '''[1,2,3]'' <-> embedding', - bm25_weight => 0.9, - similarity_weight => 0.1 - ) - ) s ON m.id = s.id - LIMIT 5"# - .fetch_collect(&mut conn); - - assert_eq!(columns.id, vec![2, 1, 29, 39, 9]); -} - #[rstest] fn multi_tree(mut conn: PgConnection) { SimpleProductsTable::setup().execute(&mut conn); diff --git a/pg_search/tests/hybrid.rs b/pg_search/tests/hybrid.rs new file mode 100644 index 0000000000..28175c9032 --- /dev/null +++ b/pg_search/tests/hybrid.rs @@ -0,0 +1,173 @@ +// Copyright (c) 2023-2024 Retake, Inc. +// +// This file is part of ParadeDB - Postgres for Search and Analytics +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +mod fixtures; + +use fixtures::*; +use pretty_assertions::assert_eq; +use rstest::*; +use sqlx::PgConnection; + +#[rstest] +fn hybrid_deprecated(mut conn: PgConnection) { + SimpleProductsTable::setup().execute(&mut conn); + r#" + CREATE EXTENSION vector; + ALTER TABLE paradedb.bm25_search ADD COLUMN embedding vector(3); + + UPDATE paradedb.bm25_search m + SET embedding = ('[' || + ((m.id + 1) % 10 + 1)::integer || ',' || + ((m.id + 2) % 10 + 1)::integer || ',' || + ((m.id + 3) % 10 + 1)::integer || ']')::vector; + + CREATE INDEX on paradedb.bm25_search + USING hnsw (embedding vector_l2_ops)"# + .execute(&mut conn); + + // Test with string query. + let columns: SimpleProductsTableVec = r#" + SELECT m.*, s.score_hybrid + FROM paradedb.bm25_search m + LEFT JOIN ( + SELECT * FROM bm25_search.score_hybrid( + bm25_query => 'description:keyboard OR category:electronics', + similarity_query => '''[1,2,3]'' <-> embedding', + bm25_weight => 0.9, + similarity_weight => 0.1 + ) + ) s ON m.id = s.id + LIMIT 5"# + .fetch_collect(&mut conn); + + assert_eq!(columns.id, vec![2, 1, 29, 39, 9]); + + // New score_hybrid function + // Test with query object. + let columns: SimpleProductsTableVec = r#" + SELECT m.*, s.score_hybrid + FROM paradedb.bm25_search m + LEFT JOIN ( + SELECT * FROM bm25_search.score_hybrid( + bm25_query => paradedb.parse('description:keyboard OR category:electronics'), + similarity_query => '''[1,2,3]'' <-> embedding', + bm25_weight => 0.9, + similarity_weight => 0.1 + ) + ) s ON m.id = s.id + LIMIT 5"# + .fetch_collect(&mut conn); + + assert_eq!(columns.id, vec![2, 1, 29, 39, 9]); + + // Test with string query. + let columns: SimpleProductsTableVec = r#" + SELECT m.*, s.score_hybrid + FROM paradedb.bm25_search m + LEFT JOIN ( + SELECT * FROM bm25_search.score_hybrid( + bm25_query => 'description:keyboard OR category:electronics', + similarity_query => '''[1,2,3]'' <-> embedding', + bm25_weight => 0.9, + similarity_weight => 0.1 + ) + ) s ON m.id = s.id + LIMIT 5"# + .fetch_collect(&mut conn); + + assert_eq!(columns.id, vec![2, 1, 29, 39, 9]); +} + +#[rstest] +#[allow(clippy::excessive_precision)] +fn reciprocal_rank_fusion(mut conn: PgConnection) { + SimpleProductsTable::setup().execute(&mut conn); + r#" + CREATE EXTENSION vector; + ALTER TABLE paradedb.bm25_search ADD COLUMN embedding vector(3); + + UPDATE paradedb.bm25_search m + SET embedding = ('[' || + ((m.id + 1) % 10 + 1)::integer || ',' || + ((m.id + 2) % 10 + 1)::integer || ',' || + ((m.id + 3) % 10 + 1)::integer || ']')::vector; + + CREATE INDEX on paradedb.bm25_search + USING hnsw (embedding vector_l2_ops)"# + .execute(&mut conn); + + let columns: Vec<(i32, f32, String)> = r#" + WITH semantic AS ( + SELECT id, RANK () OVER (ORDER BY embedding <=> '[1,2,3]') AS rank + FROM paradedb.bm25_search + ORDER BY embedding <=> '[1,2,3]' + LIMIT 20 + ), + bm25 AS ( + SELECT id, RANK () OVER (ORDER BY score_bm25 DESC) as rank + FROM bm25_search.score_bm25('description:keyboard', limit_rows => 20) + ) + SELECT + COALESCE(semantic.id, bm25.id) AS id, + (COALESCE(1.0 / (60 + semantic.rank), 0.0) + + COALESCE(1.0 / (60 + bm25.rank), 0.0))::REAL AS score, + paradedb.bm25_search.description + FROM semantic + FULL OUTER JOIN bm25 ON semantic.id = bm25.id + JOIN paradedb.bm25_search ON paradedb.bm25_search.id = COALESCE(semantic.id, bm25.id) + ORDER BY score DESC + LIMIT 5; + "# + .fetch(&mut conn); + + assert_eq!( + columns[0], + ( + 1, + 0.03062178588125292193, + "Ergonomic metal keyboard".to_string() + ) + ); + assert_eq!( + columns[1], + 
(2, 0.02990695613646433318, "Plastic Keyboard".to_string()) + ); + assert_eq!( + columns[2], + ( + 19, + 0.01639344262295081967, + "Artistic ceramic vase".to_string() + ) + ); + assert_eq!( + columns[3], + ( + 39, + 0.01639344262295081967, + "Handcrafted wooden frame".to_string() + ) + ); + assert_eq!( + columns[4], + ( + 29, + 0.01639344262295081967, + "Designer wall paintings".to_string() + ) + ); +} diff --git a/pg_search/tests/replication.rs b/pg_search/tests/replication.rs index 7897f2ff09..20af4e3543 100644 --- a/pg_search/tests/replication.rs +++ b/pg_search/tests/replication.rs @@ -230,9 +230,13 @@ async fn test_ephemeral_postgres() -> Result<()> { "SELECT description FROM mock_items.search('description:shoes')".fetch(&mut source_conn); // Wait for the replication to complete - std::thread::sleep(std::time::Duration::from_secs(1)); let target_results: Vec<(String,)> = - "SELECT description FROM mock_items.search('description:shoes')".fetch(&mut target_conn); + "SELECT description FROM mock_items.search('description:shoes')".fetch_retry( + &mut target_conn, + 5, + 1000, + |result| !result.is_empty(), + ); assert_eq!(source_results.len(), 1); assert_eq!(target_results.len(), 1); @@ -247,10 +251,13 @@ async fn test_ephemeral_postgres() -> Result<()> { .fetch(&mut source_conn); // Wait for the replication to complete - std::thread::sleep(std::time::Duration::from_secs(1)); let target_results: Vec<(String,)> = - "SELECT description FROM mock_items.search('description:\"running shoes\"')" - .fetch(&mut target_conn); + "SELECT description FROM mock_items.search('description:\"running shoes\"')".fetch_retry( + &mut target_conn, + 5, + 1000, + |result| !result.is_empty(), + ); assert_eq!(source_results.len(), 1); assert_eq!(target_results.len(), 1); @@ -264,10 +271,13 @@ async fn test_ephemeral_postgres() -> Result<()> { "SELECT rating FROM mock_items WHERE description = 'Red sports shoes'" .fetch(&mut source_conn); - std::thread::sleep(std::time::Duration::from_secs(1)); let target_results: Vec<(i32,)> = - "SELECT rating FROM mock_items WHERE description = 'Red sports shoes'" - .fetch(&mut target_conn); + "SELECT rating FROM mock_items WHERE description = 'Red sports shoes'".fetch_retry( + &mut target_conn, + 5, + 1000, + |result| !result.is_empty(), + ); assert_eq!(source_results.len(), 1); assert_eq!(target_results.len(), 1); @@ -281,10 +291,13 @@ async fn test_ephemeral_postgres() -> Result<()> { "SELECT description FROM mock_items WHERE description = 'Red sports shoes'" .fetch(&mut source_conn); - std::thread::sleep(std::time::Duration::from_secs(1)); let target_results: Vec<(String,)> = - "SELECT description FROM mock_items WHERE description = 'Red sports shoes'" - .fetch(&mut target_conn); + "SELECT description FROM mock_items WHERE description = 'Red sports shoes'".fetch_retry( + &mut target_conn, + 5, + 1000, + |result| !result.is_empty(), + ); assert_eq!(source_results.len(), 0); assert_eq!(target_results.len(), 0); @@ -436,12 +449,14 @@ async fn test_replication_with_pg_search_only_on_replica() -> Result<()> { VALUES ('Green hiking shoes', 'Footwear', true, '16:00:00', '2024-07-11', '{}', '2024-07-11 16:00:00', 3)" .execute(&mut source_conn); - // Wait for the replication to complete - std::thread::sleep(std::time::Duration::from_secs(1)); - // Verify the insert is replicated to the target database and can be searched using pg_search let target_results: Vec<(String,)> = - "SELECT description FROM mock_items.search('description:shoes')".fetch(&mut target_conn); + "SELECT 
description FROM mock_items.search('description:shoes')".fetch_retry(
+            &mut target_conn,
+            5,
+            1000,
+            |result| !result.is_empty(),
+        );
 
     assert_eq!(target_results.len(), 1);
     assert_eq!(target_results[0].0, "Green hiking shoes");
diff --git a/shared/src/fixtures/db.rs b/shared/src/fixtures/db.rs
index abf3733120..2a41849690 100644
--- a/shared/src/fixtures/db.rs
+++ b/shared/src/fixtures/db.rs
@@ -16,6 +16,7 @@
 // along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 use super::arrow::schema_to_batch;
+use anyhow::Result;
 use async_std::prelude::Stream;
 use async_std::stream::StreamExt;
 use async_std::task::block_on;
@@ -26,7 +27,7 @@ use sqlx::{
     testing::{TestArgs, TestContext, TestSupport},
     ConnectOptions, Decode, Executor, FromRow, PgConnection, Postgres, Type,
 };
-use std::time::{SystemTime, UNIX_EPOCH};
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 pub struct Db {
     context: TestContext,
@@ -94,6 +95,41 @@ where
         })
     }
 
+    fn fetch_retry<T>(
+        self,
+        connection: &mut PgConnection,
+        retries: u32,
+        delay_ms: u64,
+        validate: fn(&[T]) -> bool,
+    ) -> Vec<T>
+    where
+        T: for<'r> FromRow<'r, <Postgres as sqlx::Database>::Row> + Send + Unpin,
+    {
+        for attempt in 0..retries {
+            match block_on(async {
+                sqlx::query_as::<_, T>(self.as_ref())
+                    .fetch_all(&mut *connection)
+                    .await
+                    .map_err(anyhow::Error::from)
+            }) {
+                Ok(result) => {
+                    if validate(&result) {
+                        return result;
+                    } else if attempt < retries - 1 {
+                        block_on(async_std::task::sleep(Duration::from_millis(delay_ms)));
+                    } else {
+                        return vec![];
+                    }
+                }
+                Err(_) if attempt < retries - 1 => {
+                    block_on(async_std::task::sleep(Duration::from_millis(delay_ms)));
+                }
+                Err(e) => panic!("Fetch attempt {}/{} failed: {}", attempt + 1, retries, e),
+            }
+        }
+        panic!("Exhausted retries for query '{}'", self.as_ref());
+    }
+
     fn fetch_dynamic(self, connection: &mut PgConnection) -> Vec {
         block_on(async {
             sqlx::query(self.as_ref())