Merge branch 'paradedb:dev' into dev

Weijun-H · Sep 21, 2024 · b4ea120 · b4ea120
2 parents c9ac2b9 + ce18cd9
commit b4ea120
Show file tree

Hide file tree

Showing 16 changed files with 563 additions and 242 deletions.
diff --git a/.github/workflows/assign-github-issue.yml b/.github/workflows/assign-github-issue.yml
@@ -0,0 +1,66 @@
+# workflows/assign-github-issue.yml
+#
+# Assign GitHub Issue
+# Automatically assign an issue to the commenter if they use the '/take' command.
+
+name: Assign GitHub Issue
+
+on:
+  issue_comment:
+    types: [created]
+
+# Required to assign the issue to the commenter
+permissions:
+  issues: write
+
+concurrency:
+  group: assign-github-issue-${{ github.workflow }}-${{ github.event.issue.number }}
+  cancel-in-progress: true
+
+jobs:
+  assign-github-issue:
+    name: Assign GitHub Issue to Commenter
+    runs-on: depot-ubuntu-latest-2
+    if: |
+      !github.event.issue.pull_request &&
+      contains(github.event.comment.body, '/take')
+
+    steps:
+      - name: Check if Commenter Can Be Assigned
+        id: check_assignee
+        run: |
+          # Ask the GitHub API whether the commenter is assignable on this issue. Only the
+          # HTTP status code is captured; the response body is discarded.
+          STATUS=$(curl -s -X GET \
+            -o /dev/null -w '%{http_code}\n' \
+            -H "Accept: application/vnd.github.v3+json" \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }}")
+
+          # A 204 status means the user can be assigned; anything else is treated as "no".
+          CAN_ASSIGN=false
+          if [ "$STATUS" -eq "204" ]; then
+            CAN_ASSIGN=true
+          fi
+          echo "can_assign=${CAN_ASSIGN}" >> $GITHUB_OUTPUT
+
+      # Runs only when the check step reported that the commenter can be assigned.
+      - name: Assign GitHub Issue
+        if: steps.check_assignee.outputs.can_assign == 'true'
+        run: |
+          # --fail-with-body makes curl exit non-zero on an HTTP error response (while still
+          # printing the API's error payload), so the step fails instead of logging a
+          # misleading success message when the assignment request is rejected.
+          curl -X POST --fail-with-body \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' \
+            "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees"
+
+          echo "Issue #${{ github.event.issue.number }} assigned to ${{ github.event.comment.user.login }}"
+
+      # Runs only when the check step reported that the commenter cannot be assigned:
+      # posts a comment on the issue explaining that the assignment did not happen.
+      - name: Notify of Assignment Failure
+        if: steps.check_assignee.outputs.can_assign == 'false'
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            // context.repo exposes `owner` and `repo` (there is no `name` property), so the
+            // repository name must be read from context.repo.repo.
+            // Await the request so a failed API call fails this step.
+            await github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: '@${{ github.event.comment.user.login }} Unable to assign this issue to you. You may not have the necessary permissions.'
+            })
diff --git a/.github/workflows/check-pg_search-schema-upgrade.yml b/.github/workflows/check-pg_search-schema-upgrade.yml
@@ -0,0 +1,153 @@
+# workflows/check-pg_search-schema-upgrade.yml
+#
+# Check pg_search Schema Upgrade
+# Determine if a commit introduces an extension schema change for pg_search.
+
+name: Check pg_search Schema Upgrade
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+    branches:
+      - dev
+      - main
+    paths:
+      - ".github/workflows/check-pg_search-schema-upgrade.yml"
+      - "pg_search/**"
+      - "!pg_search/README.md"
+  workflow_dispatch:
+
+# Required to post a comment to the PR
+permissions:
+  pull-requests: write
+
+concurrency:
+  group: check-pg_search-schema-upgrade-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check-pg_search-schema-upgrade:
+    name: Post Schema Change to PR
+    runs-on: depot-ubuntu-latest-8
+    if: github.event.pull_request.draft == false
+    env:
+      pg_version: 13 # Required by pg-schema-diff
+
+    steps:
+      - name: Checkout Git Repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Fetch the entire history
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      # Caches from base branches are available to PRs, but not across unrelated branches, so we only
+      # save the cache on the 'dev' branch, but load it on all branches.
+      - name: Install Rust Cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          prefix-key: "v1"
+          shared-key: ${{ runner.os }}-rust-cache-pg_search-${{ HashFiles('Cargo.lock') }}
+          cache-targets: true
+          cache-on-failure: true
+          cache-all-crates: true
+          save-if: ${{ github.ref == 'refs/heads/dev' }}
+
+      - name: Install & Configure Supported PostgreSQL Version
+        run: |
+          wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
+          sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
+          sudo apt-get update && sudo apt-get install -y postgresql-${{ env.pg_version }} postgresql-server-dev-${{ env.pg_version }}
+          sudo chown -R $(whoami) /usr/share/postgresql/${{ env.pg_version }}/ /usr/lib/postgresql/${{ env.pg_version }}/ /var/lib/postgresql/${{ env.pg_version }}/
+          rustup component add llvm-tools-preview
+          echo "/usr/lib/postgresql/${{ env.pg_version }}/bin" >> $GITHUB_PATH
+
+      - name: Install pg-schema-diff and its Required Dependencies
+        run: |
+          # Use apt-get with -y, matching the PostgreSQL install step: plain `apt install`
+          # has no stable scripting interface and aborts at its confirmation prompt when
+          # it cannot read an answer in a non-interactive CI shell.
+          sudo apt-get install -y clang llvm diffutils
+          cargo install --git https://github.com/zombodb/pg-schema-diff.git
+
+      - name: Extract pgrx Version & Install cargo-pgrx
+        run: |
+          # Take the first line of the inverted dependency tree for pgrx and keep its second
+          # `v`-delimited field -- presumably the version that follows `pgrx v` (TODO confirm
+          # against actual `cargo tree` output).
+          PGRX_VERSION=$(cargo tree --depth 1 -i pgrx -p pg_search | head -n 1 | cut -f2 -dv)
+          # Install exactly that cargo-pgrx version, then initialize it with the pg_config
+          # found under /usr/lib/postgresql/<pg_version>/bin.
+          cargo install -j $(nproc) --locked cargo-pgrx --version ${PGRX_VERSION}
+          cargo pgrx init "--pg${{ env.pg_version }}=/usr/lib/postgresql/${{ env.pg_version }}/bin/pg_config"
+
+          # Save the pgrx version for comparison later
+          echo "FIRST_PGRX_VERSION=${PGRX_VERSION}" >> $GITHUB_ENV
+
+      - name: Generate Schema from this git rev
+        run: cargo pgrx schema -p pg_search pg${{ env.pg_version }} > ~/this.sql
+
+      - name: Switch to Base git rev and Generate Schema Again
+        run: |
+          # Switch to the base git rev
+          git checkout ${{ github.event.pull_request.base.ref }}
+
+          # See if we need a different cargo-pgrx and install it if so
+          THIS_PGRX_VERSION=$(cargo tree --depth 1 -i pgrx -p pg_search | head -n 1 | cut -f2 -dv)
+          if [[ "${THIS_PGRX_VERSION}" != "${FIRST_PGRX_VERSION}" ]]; then
+            # Install cargo-pgrx
+            cargo install -j $(nproc) --locked cargo-pgrx --version ${THIS_PGRX_VERSION} --force
+
+            # Initialize it (again) -- probably unnecessary, but might as well in case ~/.pgrx/config.toml ever changes
+            cargo pgrx init "--pg${{ env.pg_version }}=/usr/lib/postgresql/${{ env.pg_version }}/bin/pg_config"
+          fi
+
+          # Generate schema
+          cargo pgrx schema -p pg_search pg${{ env.pg_version }} > ~/old.sql
+
+      - name: Generate Schema Diffs
+        run: |
+          # ~/diff.sql:   pg-schema-diff output for old -> this, with empty lines removed.
+          # ~/diff.patch: plain textual diff of the two generated schema files.
+          # Each command group is followed by `|| true` so that a non-zero exit status
+          # from the group does not fail this step.
+          (pg-schema-diff diff ~/old.sql ~/this.sql | grep -v "^$" > ~/diff.sql) || true
+          (diff ~/old.sql ~/this.sql > ~/diff.patch) || true
+
+      - name: Generate Commit Message
+        id: generate_commit_message
+        run: |
+          if test -s ~/diff.sql; then
+            echo "Generating GitHub comment message"
+            {
+              echo 'DIFF<<EOF'
+              echo 'A schema difference was detected.'
+              echo
+              echo 'A suggested "upgrade.sql" script entry might be:'
+              echo
+              echo '```sql'
+              cat ~/diff.sql
+              echo '```'
+
+              #
+              # cargo-pgrx schema doesn't output its generated schema in a stable format
+              # so including the entire diff is garbage.  We still generate it, tho, because
+              # cargo-pgrx will improve someday, and when it does we can uncomment these lines
+              #
+
+              # echo
+              # echo 'The full diff between both schemas is:'
+              # echo
+              # echo '```diff'
+              # cat ~/diff.patch
+              # echo '```'
+              echo EOF
+            } >> "$GITHUB_ENV"
+
+            # Set a flag to indicate a schema difference was detected
+            echo "schema_diff_detected=true" >> $GITHUB_OUTPUT
+          else
+            echo "No schema difference detected"
+            echo "schema_diff_detected=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Attach Schema Diff to PR
+        uses: actions/github-script@v6
+        if: steps.generate_commit_message.outputs.schema_diff_detected == 'true'
+        with:
+          script: |
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: process.env.DIFF
+            })
diff --git a/.prettierignore b/.prettierignore
@@ -15,6 +15,7 @@
 /docs/api-reference/full-text/joins.mdx
 /docs/api-reference/indexing/create_index.mdx
 /docs/api-reference/guides/autocomplete.mdx
+/docs/api-reference/guides/hybrid.mdx
 /docs/api-reference/optimization/joins.mdx
 /docs/ingest/quickstart.mdx
 /pg_search/benchmarks/out/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,7 +1,6 @@
 # **Contributing to ParadeDB**
 
-Welcome! We're excited that you're interested in contributing to ParadeDB and want
-to make the process as smooth as possible.
+Welcome! We're excited that you're interested in contributing to ParadeDB and want to make the process as smooth as possible.
 
 ## Technical Info
 
@@ -10,6 +9,21 @@ conventions to follow when submitting changes. If you have any questions not cov
 in this document, please reach out to us in the [ParadeDB Community Slack](https://join.slack.com/t/paradedbcommunity/shared_invite/zt-2lkzdsetw-OiIgbyFeiibd1DG~6wFgTQ)
 or via [email](mailto:support@paradedb.com).
 
+### Claiming GitHub Issues
+
+This repository has a workflow to automatically assign issues to new contributors. This ensures that you don't need approval
+from a maintainer to pick an issue.
+
+1. Before claiming an issue, ensure that:
+
+- It's not already assigned to someone else
+- There are no comments indicating ongoing work
+
+2. To claim an unassigned issue, simply comment `/take` on the issue. This will automatically assign the issue to you.
+
+If you find yourself unable to make progress, don't hesitate to seek help in the issue comments or in the [ParadeDB Community Slack](https://join.slack.com/t/paradedbcommunity/shared_invite/zt-2lkzdsetw-OiIgbyFeiibd1DG~6wFgTQ). If you no longer wish to
+work on the issue(s) you self-assigned, please use the `unassign me` link at the top of each issue page to release them.
+
 ### Development Workflow
 
 ParadeDB is structured as a monorepo containing our Postgres extensions, our Docker setup, and our development tools for benchmarking and testing.

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -271,14 +271,8 @@ RUN apt-get update && \
     apt-get autoremove -y && \
     rm -rf /var/lib/apt/lists/*
 
-# Change the uid of the postgres user to 26, for CloudNativePG compatibility
-RUN usermod -u 26 postgres \
-    && chown -R 26:999 /var/lib/postgresql \
-    && chown -R 26:999 /var/run/postgresql \
-    && chmod -R 700 /var/lib/postgresql
-
 # Switch back to the postgres user
-USER 26
+USER postgres
 
 # Copy ParadeDB scripts to install extensions, configure postgresql.conf, update extensions, etc.
 COPY ./docker/bootstrap.sh /docker-entrypoint-initdb.d/10_bootstrap_paradedb.sh

diff --git a/docs/api-reference/faceting/metric.mdx b/docs/api-reference/faceting/metric.mdx
@@ -152,3 +152,27 @@ SELECT search_idx.aggregate('{
   The value to use for documents missing the field. By default, missing values
   are ignored.
 </ParamField>
+
+## Cardinality
+
+A cardinality aggregation estimates the number of unique values in the specified field using the HyperLogLog++ algorithm.
+This is useful for understanding the uniqueness of values in a large dataset.
+
+The cardinality aggregation provides an approximate count, which is accurate within a small error range.
+This trade-off allows for efficient computation even on very large datasets.
+
+```sql
+SELECT search_idx.aggregate('{
+  "unique_users": {
+    "cardinality": {"field": "user_id", "missing": "unknown"}
+  }
+}');
+```
+
+<ParamField body="field" required>
+  The field name to compute the cardinality on.
+</ParamField>
+<ParamField body="missing">
+  The value to use for documents missing the field. By default, missing values
+  are ignored.
+</ParamField>
diff --git a/docs/api-reference/full-text/scoring.mdx b/docs/api-reference/full-text/scoring.mdx
@@ -14,27 +14,9 @@ The `score_bm25` function takes a query and returns a table with two columns: th
 parameters as [`search`](/api-reference/full-text/overview).
 
 ```sql
-SELECT * FROM <index_name>.score_bm25('<query>');
+SELECT * FROM search_idx.score_bm25('description:keyboard');
 ```
 
-<Accordion title="Example Usage">
-
-```sql
-SELECT * FROM search_idx.score_bm25(
-	'description:keyboard',
-	limit_rows => 10
-);
-```
-
-</Accordion>
-
-<ParamField body="index_name" required>
-  The name of the index.
-</ParamField>
-<ParamField body="query" required>
-  The query string.
-</ParamField>
-
 ## Joining BM25 Scores
 
 Because `score_bm25` generates a new `score_bm25` column, the function does not return all the columns of the original table.

diff --git a/docs/api-reference/full-text/sorting.mdx b/docs/api-reference/full-text/sorting.mdx
@@ -16,10 +16,10 @@ If `false`, scores are not generated and instead results are returned in an un-d
 of this is that the results are returned as quickly as possible. This is useful for queries that are known to return
 many thousands or millions of rows.
 
-`stable_sort` defaults to `false` and can be passed into `search`, `score_bm25`, and `snippet`.
+`stable_sort` defaults to `true` and can be passed into `search`, `score_bm25`, and `snippet`.
 
 ```sql
-SELECT * FROM search_idx.search('description:keyboard', stable_sort => true);
+SELECT * FROM search_idx.search('description:keyboard', stable_sort => false);
 ```
 
 ## Custom Ordering