docs: Major refactor (paradedb#1649)

Weijun-H · Sep 14, 2024 · cef61c7 · cef61c7
1 parent 2a64152
commit cef61c7
Show file tree

Hide file tree

Showing 64 changed files with 2,416 additions and 1,859 deletions.
diff --git a/.prettierignore b/.prettierignore
@@ -8,12 +8,13 @@
 **/*.txt
 **/*.zip
 /docker/.docker_cache_dev/
-/docs/api-reference/quickstart.mdx
-/docs/api-reference/full-text/autocomplete.mdx
+/docs/welcome/quickstart.mdx
 /docs/api-reference/full-text/fuzzy.mdx
 /docs/api-reference/full-text/scoring.mdx
 /docs/api-reference/full-text/highlighting.mdx
 /docs/api-reference/full-text/joins.mdx
+/docs/api-reference/indexing/create_index.mdx
 /docs/api-reference/aggregations.mdx
+/docs/api-reference/autocomplete.mdx
 /docs/ingest/quickstart.mdx
 /pg_search/benchmarks/out/
diff --git a/docs/api-reference/advanced/all.mdx b/docs/api-reference/advanced/all.mdx
@@ -0,0 +1,13 @@
+---
+title: All
+---
+
+## Basic Usage
+
+`all` indiscriminately matches every document in the index, assigning a uniform score of `1.0` to each.
+
+```sql
+SELECT * FROM search_idx.search(
+    query => paradedb.all()
+);
+```
diff --git a/docs/api-reference/advanced/boolean.mdx b/docs/api-reference/advanced/boolean.mdx
@@ -0,0 +1,52 @@
+---
+title: Boolean
+---
+
+## Basic Usage
+
+Boolean queries filter documents based on the logical relationships defined by their subqueries, considering:
+
+- Documents that satisfy all `must` conditions.
+- Documents that satisfy none of the `must_not` conditions.
+- Documents that satisfy at least one condition from either `must` or `should`.
+
+Boolean queries are a powerful way to combine the results of several different queries.
+
+```sql
+SELECT * FROM search_idx.search(
+    query => paradedb.boolean(
+	    should => ARRAY[
+		    paradedb.parse('description:shoes'),
+		    paradedb.phrase_prefix(field => 'description', phrases => ARRAY['book']),
+		    paradedb.term(field => 'description', value => 'speaker'),
+		    paradedb.fuzzy_term(field => 'description', value => 'wolo')
+	    ]
+    )
+);
+```
+
+<ParamField body="must">
+  A query object or an `ARRAY` of query objects as conditions which must be
+  matched.
+</ParamField>
+<ParamField body="must_not">
+  A query object or an `ARRAY` of query objects as conditions which must not be
+  matched.
+</ParamField>
+<ParamField body="should">
+  A query object or an `ARRAY` of query objects as conditions of which at least
+  one must be matched.
+</ParamField>
+
+In order for a boolean query to return a result, one of `must` or `should` must be provided.
+`must_not` acts as a mask and does not produce a result set. For instance, in order to find
+all rows from `mock_items` where `description` does not contain `shoes`, `paradedb.all()` should be used:
+
+```sql
+SELECT * FROM search_idx.search(
+  query => paradedb.boolean(
+    should => paradedb.all(),
+    must_not => paradedb.parse('description:shoes')
+  )
+);
+```
diff --git a/docs/api-reference/advanced/boost.mdx b/docs/api-reference/advanced/boost.mdx
@@ -0,0 +1,20 @@
+---
+title: Boost
+---
+
+## Basic Usage
+
+A boost query wraps around another query to amplify its scoring impact, without altering the set of matched documents.
+
+```sql
+SELECT * FROM search_idx.search(
+	query => paradedb.boost(query => paradedb.all(), boost => 1.5)
+);
+```
+
+<ParamField body="boost" required>
+  The factor by which to multiply the score of each result.
+</ParamField>
+<ParamField body="query" required>
+  The query to perform.
+</ParamField>
diff --git a/docs/api-reference/advanced/const.mdx b/docs/api-reference/advanced/const.mdx
@@ -0,0 +1,20 @@
+---
+title: Const Score
+---
+
+## Basic Usage
+
+Applies a constant score across all documents matched by the underlying query. It can avoid unnecessary score computation on the wrapped query.
+
+```sql
+SELECT * FROM search_idx.search(
+    query => paradedb.const_score(query => paradedb.all(), score => 3.9)
+);
+```
+
+<ParamField body="score" required>
+  The constant score to use for each result.
+</ParamField>
+<ParamField body="query" required>
+  The query to perform.
+</ParamField>
diff --git a/docs/api-reference/advanced/disjunction_max.mdx b/docs/api-reference/advanced/disjunction_max.mdx
@@ -0,0 +1,23 @@
+---
+title: Disjunction Max
+---
+
+## Basic Usage
+
+Returns documents that match one or more of the specified subqueries. If a document matches multiple criteria, it receives the highest score from those matches, with a slight increase for any additional matches.
+
+```sql
+SELECT * FROM search_idx.search(
+	query => paradedb.disjunction_max(
+		disjuncts => ARRAY[paradedb.parse('description:shoes')]
+	)
+);
+```
+
+<ParamField body="disjuncts" required>
+  Query objects to match against.
+</ParamField>
+<ParamField body="tie_breaker">
+  A tie breaking increment for matching subqueries. Should be a float, e.g.
+  `0.7`.
+</ParamField>
diff --git a/docs/api-reference/advanced/empty.mdx b/docs/api-reference/advanced/empty.mdx
@@ -0,0 +1,13 @@
+---
+title: Empty
+---
+
+## Basic Usage
+
+Serves as a placeholder, matching no documents. It's useful for testing scenarios or specific edge cases.
+
+```sql
+SELECT * FROM search_idx.search(
+	query => paradedb.empty()
+);
+```
diff --git a/docs/api-reference/advanced/exists.mdx b/docs/api-reference/advanced/exists.mdx
@@ -0,0 +1,36 @@
+---
+title: Exists
+---
+
+## Basic Usage
+
+Matches all documents with a non-null value in the specified field. All matched documents get a BM25 score of `1.0`.
+
+<Note>
+  Will error if the field has not been indexed as a [fast
+  field](/api-reference/indexing/fast_fields).
+</Note>
+
+```sql
+SELECT * FROM search_idx.search(
+	query => paradedb.exists(field => 'rating')
+);
+```
+
+<ParamField body="field" required>
+  Specifies the field within the document to check for a non-null value.
+</ParamField>
+
+This query is useful for filtering on `NULL` values inside a boolean query. For instance, the following code block
+finds all rows with `description` matching `shoes` that have a non-null `rating`.
+
+```sql
+SELECT * FROM search_idx.search(
+  query => paradedb.boolean(
+    must => ARRAY[
+      paradedb.parse('description:shoes'),
+      paradedb.exists('rating')
+    ]
+  )
+);
+```
diff --git a/docs/api-reference/advanced/fuzzy_phrase.mdx b/docs/api-reference/advanced/fuzzy_phrase.mdx
@@ -0,0 +1,51 @@
+---
+title: Fuzzy Phrase
+---
+
+<Note>Highlighting is not currently supported for `fuzzy_phrase`.</Note>
+
+## Basic Usage
+
+`fuzzy_phrase` is like `fuzzy_term` but for query phrases that are comprised of multiple tokens.
+`fuzzy_phrase` finds documents that match against **any one** of the query tokens.
+
+<Note>
+  Setting `match_all_terms` to `true` makes it so that **all** query tokens must
+  match in order for a document to be considered a match.
+</Note>
+
+```sql
+SELECT * FROM search_idx.search(
+    query => paradedb.fuzzy_phrase(
+        field => 'description',
+        value => 'ruining shoez'
+    )
+);
+```
+
+<ParamField body="field" required>
+  Specifies the field within the document to search for the term.
+</ParamField>
+<ParamField body="value" required>
+  Defines the phrase you are searching for within the specified field. This
+  phrase is automatically tokenized in the same way as `field`.
+</ParamField>
+<ParamField body="distance" default={2}>
+  The maximum Levenshtein distance (i.e. single character edits) allowed to
+  consider a term in the index as a match for the query term. Maximum value is
+  `2`.
+</ParamField>
+<ParamField body="transpose_cost_one" default={true}>
+  When set to `true`, transpositions (swapping two adjacent characters) count
+  as a single edit in the Levenshtein distance calculation, while `false`
+  treats each as two separate edits (a deletion and an insertion).
+</ParamField>
+<ParamField body="prefix" default={false}>
+  When set to `true`, the initial substring (prefix) of the query term is
+  exempted from the fuzzy edit distance calculation, while `false` includes the
+  entire string in the calculation.
+</ParamField>
+<ParamField body="match_all_terms" default={false}>
+  When set to `true`, **all** tokens of the query have to match in order for a
+  document to be considered a match.
+</ParamField>
diff --git a/docs/api-reference/advanced/fuzzy_term.mdx b/docs/api-reference/advanced/fuzzy_term.mdx
@@ -0,0 +1,44 @@
+---
+title: Fuzzy Term
+---
+
+<Note>Highlighting is not currently supported for `fuzzy_term`.</Note>
+
+## Basic Usage
+
+`fuzzy_term` finds results that approximately match the query term,
+allowing for minor typos in the input.
+
+<Note>
+  `value` is treated as a token. Use `fuzzy_phrase` to perform fuzzy search over
+  a phrase containing multiple tokens.
+</Note>
+
+```sql
+SELECT * FROM search_idx.search(
+	query => paradedb.fuzzy_term(field => 'description', value => 'wolo')
+);
+```
+
+<ParamField body="field" required>
+  Specifies the field within the document to search for the term.
+</ParamField>
+<ParamField body="value" required>
+  Defines the term you are searching for within the specified field, using fuzzy
+  logic based on Levenshtein distance to find similar terms.
+</ParamField>
+<ParamField body="distance" default={2}>
+  The maximum Levenshtein distance (i.e. single character edits) allowed to
+  consider a term in the index as a match for the query term. Maximum value is
+  `2`.
+</ParamField>
+<ParamField body="transpose_cost_one" default={true}>
+  When set to `true`, transpositions (swapping two adjacent characters) count
+  as a single edit in the Levenshtein distance calculation, while `false`
+  treats each as two separate edits (a deletion and an insertion).
+</ParamField>
+<ParamField body="prefix" default={false}>
+  When set to `true`, the initial substring (prefix) of the query term is
+  exempted from the fuzzy edit distance calculation, while `false` includes the
+  entire string in the calculation.
+</ParamField>
diff --git a/docs/api-reference/advanced/more_like_this.mdx b/docs/api-reference/advanced/more_like_this.mdx
@@ -0,0 +1,68 @@
+---
+title: More Like This
+---
+
+## Basic Usage
+
+Finds documents similar to a given document or set of field values. This is useful for recommendation engines or finding related content based on textual similarities.
+
+You must pass either:
+
+- `with_document_id`, which takes a "key field" value to match against the corresponding document.
+- `with_document_fields`, which takes a JSON object string to match against.
+
+All other parameters are compatible with both `with_document_id` and `with_document_fields`.
+
+```sql
+-- with_document_id
+SELECT * FROM search_idx.search(
+    query => paradedb.more_like_this(
+        with_document_id => 2,
+        with_min_word_length => 2,
+        with_max_word_length => 5,
+        with_boost_factor => 1.0,
+        with_stop_words => ARRAY['and', 'the', 'for']
+    )
+);
+
+-- with_document_fields
+SELECT * FROM search_idx.search(
+    query => paradedb.more_like_this(
+        with_document_fields => '{"flavour": "banana"}',
+        with_min_doc_frequency => 0,
+        with_max_doc_frequency => 100,
+        with_min_term_frequency => 1
+    )
+);
+```
+
+<ParamField body="with_document_id">
+  The ID of the document to find similar documents to.
+</ParamField>
+<ParamField body="with_document_fields">
+  A JSON object representing the field values to use for similarity matching.
+</ParamField>
+<ParamField body="with_min_doc_frequency">
+  Minimum document frequency of terms to be considered.
+</ParamField>
+<ParamField body="with_max_doc_frequency">
+  Maximum document frequency of terms to be considered.
+</ParamField>
+<ParamField body="with_min_term_frequency">
+  Minimum term frequency of terms to be considered.
+</ParamField>
+<ParamField body="with_max_query_terms">
+  Maximum number of query terms to be used.
+</ParamField>
+<ParamField body="with_min_word_length">
+  Minimum word length of terms to be considered.
+</ParamField>
+<ParamField body="with_max_word_length">
+  Maximum word length of terms to be considered.
+</ParamField>
+<ParamField body="with_boost_factor">
+  Boost factor to amplify the impact of matching terms.
+</ParamField>
+<ParamField body="with_stop_words">
+  An `ARRAY` of stop words to be ignored in the query.
+</ParamField>