Merge branch 'main' into fix_restart

infiniflow · Dec 12, 2024 · 48fa60b · 48fa60b
2 parents c6994ad + 1e14308
commit 48fa60b
Show file tree

Hide file tree

Showing 33 changed files with 700 additions and 94 deletions.
diff --git a/docs/references/configurations.mdx b/docs/references/configurations.mdx
@@ -153,6 +153,18 @@ mem_index_capacity       = 1048576
 # Range: {"local"|"minio"}  
 storage_type             = "local"
 
+# The number of dense vector index building worker threads. Defaults to half the number of CPU cores.
+# Range: [1, number of CPU cores]
+dense_index_building_worker = 2
+
+# The number of sparse vector index building worker threads. Defaults to half the number of CPU cores.
+# Range: [1, number of CPU cores]
+sparse_index_building_worker = 2
+
+# The number of fulltext index building worker threads. Defaults to half the number of CPU cores.
+# Range: [1, number of CPU cores]
+fulltext_index_building_worker = 2
+
 # Object storage configuration
 [storage.object_storage]
 # URL of the object storage server

diff --git a/docs/references/http_api_reference.mdx b/docs/references/http_api_reference.mdx
@@ -1775,6 +1775,10 @@ Searches for data in a specified table. The search can range from a simple vecto
   - `"highlight"`: `string[]`
   - `"filter"`: `string`
   - `"fusion"`: `object`
+  - `"sort"` : `object[]`
+  - `"limit"` : `string`
+  - `"offset"` : `string`
+  - `"option"` : `object`
 
 ##### Request example
 
@@ -2006,7 +2010,17 @@ curl --request GET \
       - `"query_tensor"`: The tensor data to compare against. This should be provided as a list of lists of numerical values.
       - `"element_type"`: The element data type of the query tensor. Usually `"float"`.
 
+- `"sort"` : `object[]`
+  Defines how to sort the results.
 
+- `"limit"` : `string`
+  Specifies the maximum number of rows to return.
+
+- `"offset"` : `string`
+  Indicates the offset position of the limit expression. You must use this parameter together with `limit`.
+
+- `"option"` : `object`
+  Specifies additional search options. This parameter must be used in conjunction with `limit`.
 
 #### Response
 
@@ -2029,11 +2043,14 @@ The response includes a JSON object like the following:
             "age": 16
         }
     ],
+    "total_hits_count": 3
 }
 ```
 
 - `"error_code"`: `integer`  
   `0`: The operation succeeds.
+- `"total_hits_count"`: `integer`, Optional
+  Available if you set a search option with `"total_hits_count": "true"`.
 
 </TabItem>
   <TabItem value="s500">

diff --git a/docs/references/pysdk_api_reference.md b/docs/references/pysdk_api_reference.md
@@ -1788,13 +1788,134 @@ table_object.output(["*"]).filter("filter_fulltext('doc', 'first second', 'minim
 
 ---
 
+### sort
+
+```python
+table_object.sort(sort_expression_list)
+```
+
+Creates a sort expression using `sort_expression_list`.
+
+#### Parameters
+
+##### sort_expression_list: `list`, *Required*
+
+An expression list defining how to sort the results.
+
+#### Returns
+
+- Success: An `infinity.local_infinity.table.LocalTable` object in embedded mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode.
+- Failure: `InfinityException`
+  - `error_code`: `int` A non-zero value indicating a specific error condition.
+  - `error_msg`: `str` A message providing additional details about the error.
+
+#### Examples
+
+```python
+# Output results sorted by the `c2` expression in ascending order and the `c1` expression in descending order
+table_obj.output(["c1", "c2"]).sort([["c2", SortType.Asc], ["c1", SortType.Desc]]).to_df()
+```
+
+---
+
+### limit
+
+```python
+table_object.limit(limit_num)
+```
+
+Creates an expression to limit the number of output rows to a maximum of `limit_num`.
+
+#### Parameters
+
+##### limit_num: `int`, *Required*
+
+An integer specifying the maximum number of output rows.
+
+#### Returns
+
+- Success: An `infinity.local_infinity.table.LocalTable` object in embedded mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode.
+- Failure: `InfinityException`
+  - `error_code`: `int` A non-zero value indicating a specific error condition.
+  - `error_msg`: `str` A message providing additional details about the error.
+
+#### Examples
+
+```python
+# Limit the output row count to a maximum of two
+table_instance.output(["num", "vec"]).limit(2).to_pl()
+```
+
+---
+
+### offset
+
+```python
+table_object.limit(limit_num).offset(offset_value)
+```
+
+Creates a limit expression with an offset value, setting the output to start from `offset_value` and limiting the row count to a maximum of `limit_num`. This method must be used in conjunction with `limit()`.
+
+#### Parameters
+
+##### offset_value: `int`, *Required*
+
+An integer specifying the offset position of the limit expression.
+
+#### Returns
+
+- Success: An `infinity.local_infinity.table.LocalTable` object in embedded mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode.
+- Failure: `InfinityException`
+  - `error_code`: `int` A non-zero value indicating a specific error condition.
+  - `error_msg`: `str` A message providing additional details about the error.
+
+#### Examples
+
+```python
+# Limit the output to at most two rows, starting from position 1
+table_instance.output(["num", "vec"]).offset(1).limit(2).to_pl()
+```
+
+### option
+
+```python
+table_object.option(option_dict)
+```
+
+Specifies search options for the query.
+
+#### Parameters
+
+##### option_dict: `dict`, *Required*
+
+A dictionary specifying the following search options:
+
+- **"total_hits_count"**: `bool`, *Optional*
+  - Must be combined with a limit expression. If `"total_hits_count"` is `True`, the query outputs an extra result containing the total hits row count of the query.
+
+#### Returns
+
+- Success: An `infinity.local_infinity.table.LocalTable` object in embedded mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode.
+- Failure: `InfinityException`
+  - `error_code`: `int` A non-zero value indicating a specific error condition.
+  - `error_msg`: `str` A message providing additional details about the error.
+
+#### Examples
+
+```python
+# Limit the output to at most two rows, starting from position 1, and output an extra result indicating the total hits row count
+table_instance.output(["num", "vec"]).limit(2).offset(1).option({"total_hits_count": True}).to_pl()
+```
+
+---
+
 ### match_dense
 
 ```python
 table_object.match_dense(vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params = None)
 ```
 
-Creates a dense vector search expression to identify the top n closest rows to the given dense vector. Suitable for working with dense vectors (dense embeddings) or multi-vectors (multiple dense embeddings in one row).
+Creates a dense vector search expression to identify the closest top n rows to the given dense vector. Suitable for working with dense vectors (dense embeddings) or multi-vectors (multiple dense embeddings in one row).
 
 :::tip NOTE
 To display your query results, you must chain this method with `output(columns)`, which specifies the columns to output, and a method such as `to_pl()`, `to_df()`, or `to_arrow()` to format the query results.
@@ -2285,29 +2406,29 @@ We recommend calling `to_df()`, `to_pl()`, or `to_arrow()` to format your result
 
 #### Returns 
 
-`tuple[dict[str, list[Any]], dict[str, Any]]`
+A `tuple[dict[str, list[Any]], dict[str, Any], {}]` object.
 
 ### to_df
 
 ```python
 table_object.to_df()
 ```
 
-Returns the query result in pandas DataFrame format.
+Returns the query result as a tuple consisting of a pandas DataFrame and a dict.
 
 :::tip NOTE
 Call `to_df()` in a chain after (not necessarily "immediately after") `output(columns)` on the same table object.
 :::
 
 #### Returns
 
-A `pandas.DataFrame` object.
+A `tuple[pandas.DataFrame, {}]` object.
 
 #### Examples
 
 ```python
 # Format columns "c1" and "c2" of the current table into a pandas DataFrame
-res = table_object.output(["c1", "c2"]).to_df()
+res, extra_res = table_object.output(["c1", "c2"]).to_df()
 ```
 
 ### to_pl
@@ -2316,21 +2437,21 @@ res = table_object.output(["c1", "c2"]).to_df()
 table_object.to_pl()
 ```
 
-Returns the query result in Polas DataFrame format.
+Returns the query result as a tuple consisting of a Polars DataFrame and a dict.
 
 :::tip NOTE
 Call `to_pl()` in a chain after (not necessarily "immediately after") `output(columns)` on the same table object.
 :::
 
 #### Returns
 
-A `polas.DataFrame` object.
+A `tuple[polars.DataFrame, {}]` object.
 
 #### Examples
 
 ```python
-# Format a vector search result into a Polas DataFrame. 
-res = table_object.output(["*"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 10).to_pl()
+# Format a vector search result into a Polars DataFrame. 
+res, extra_res = table_object.output(["*"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 10).to_pl()
 ```
 
 ### to_arrow
@@ -2339,21 +2460,21 @@ res = table_object.output(["*"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float
 table_object.to_arrow()
 ```
 
-Returns the query result in Apache Arrow Table format.
+Returns the query result as a tuple consisting of an Apache Arrow Table and a dict.
 
 :::tip NOTE
 Call `to_arrow()` in a chain after (not necessarily "immediately after") `output(columns)` on the same table object.
 :::
 
 #### Returns
 
-A `pyarrow.Table` object.
+A `tuple[pyarrow.Table, {}]` object.
 
 #### Examples
 
 ```python
 # Format the current table object into an Apache Arrow Table. 
-res = table_object.output(["*"]).filter("score >= 90").to_arrow()
+res, extra_result = table_object.output(["*"]).filter("score >= 90").to_arrow()
 ```
 
 ---
diff --git a/example/delete_update_data.py b/example/delete_update_data.py
@@ -87,8 +87,10 @@
 
     print('about to update data')
     table_instance.update("num = 2", {"body": "unnecessary and harmful", "vec": [14.0, 7.2, 0.8, 10.9]})
-    result = table_instance.output(["*"]).to_pl()
+    result, extra_result = table_instance.output(["*"]).to_pl()
     print(result)
+    if extra_result is not None:
+        print(extra_result)
 
     infinity_instance.disconnect()
     print('test done')

diff --git a/example/export_data.py b/example/export_data.py
@@ -86,7 +86,7 @@
             },
             {
                 "num": 7,
-                "body": "Chris",
+                "name": "Chris",
                 "age": 21,
                 "score": 88.0,
             },

diff --git a/example/filter_data.py b/example/filter_data.py
@@ -72,7 +72,7 @@
             },
             {
                 "num": 7,
-                "body": "Chris",
+                "name": "Chris",
                 "score": 88.0,
             },
             {
@@ -99,8 +99,10 @@
     # result = table_instance.output(["num", "name", "score"]).filter("not (score > 80.0)").to_pl()
     # print(result)
 
-    result = table_instance.output(["num", "name", "score"]).filter("num <> 9").to_pl()
+    result, extra_result = table_instance.output(["num", "name", "score"]).filter("num <> 9").to_pl()
     print(result)
+    if extra_result is not None:
+        print(extra_result)
     infinity_instance.disconnect()
 
     print('test done')

diff --git a/example/filter_fulltext_keyword.py b/example/filter_fulltext_keyword.py
@@ -101,16 +101,22 @@
     )
 
     # output 7, 8, 9, 10
-    result = table_instance.output(["*"]).filter("(score > 80.0) and (score <= 90.0)").to_pl()
+    result, extra_result = table_instance.output(["*"]).filter("(score > 80.0) and (score <= 90.0)").to_pl()
     print(result)
+    if extra_result is not None:
+        print(extra_result)
 
     # output 6, 8
-    result = table_instance.output(["*"]).filter("filter_fulltext('uuid', 'UUID-2-1 UUID-2-3')").to_pl()
+    result, extra_result = table_instance.output(["*"]).filter("filter_fulltext('uuid', 'UUID-2-1 UUID-2-3')").to_pl()
     print(result)
+    if extra_result is not None:
+        print(extra_result)
 
     # output 8
-    result = table_instance.output(["*"]).filter("(score > 80.0) and (score <= 90.0) and filter_fulltext('uuid', 'UUID-2-1 UUID-2-3')").to_pl()
+    result, extra_result = table_instance.output(["*"]).filter("(score > 80.0) and (score <= 90.0) and filter_fulltext('uuid', 'UUID-2-1 UUID-2-3')").to_pl()
     print(result)
+    if extra_result is not None:
+        print(extra_result)
 
     # drop table
     db_instance.drop_table("my_table")

diff --git a/example/fulltext_search.py b/example/fulltext_search.py
@@ -86,13 +86,15 @@
         r'"harmful chemical"~10',  # sloppy phrase, refers to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query-phrase.html
     ]
     for question in questions:
-        qb_result = (
+        qb_result, extra_result = (
             table_instance.output(["num", "body", "_score"]).highlight(["body"])
             .match_text("body", question, 10)
             .to_pl()
         )
         print(f"question: {question}")
         print(qb_result)
+        if extra_result is not None:
+            print(extra_result)
 
     infinity_instance.disconnect()
 

diff --git a/example/fulltext_search_zh.py b/example/fulltext_search_zh.py
@@ -112,9 +112,11 @@
         r'"Bloom filter"',  # phrase: adjacent multiple terms
     ]
     for question in questions:
-        qb_result = table_instance.output(["num", "body", "_score"]).highlight(["body"]).match_text("body", question, 10).to_pl()
+        qb_result, extra_result = table_instance.output(["num", "body", "_score"]).highlight(["body"]).match_text("body", question, 10).to_pl()
         print(f"question: {question}")
         print(qb_result)
+        if extra_result is not None:
+            print(extra_result)
 
     infinity_instance.disconnect()