diff --git a/.github/workflows/slow_test.yml b/.github/workflows/slow_test.yml index 6439887a4b..de45fa258d 100644 --- a/.github/workflows/slow_test.yml +++ b/.github/workflows/slow_test.yml @@ -126,7 +126,7 @@ jobs: # Fix sanitizer: https://github.com/ClickHouse/ClickHouse/issues/64086 old_value=$(sudo sysctl -n vm.mmap_rnd_bits) sudo sysctl -w vm.mmap_rnd_bits=28 - sudo docker exec ${TESTER_CONTAINER} bash -c "cd /infinity/ && rm -fr /var/infinity && export INF_DIRECTORY=`cat .tester_env` && echo INF_DIRECTORY=${INF_DIRECTORY} && python3 tools/run_cluster_test.py --infinity_path=cmake-build-debug/src/infinity --infinity_dir=${INF_DIRECTORY}" + sudo docker exec ${TESTER_CONTAINER} bash -c "cd /infinity/ && rm -fr /var/infinity && export INF_DIRECTORY=`cat .tester_env` && echo INF_DIRECTORY=${INF_DIRECTORY} && python3 tools/run_cluster_test.py --infinity_path=cmake-build-debug/src/infinity --infinity_dir=${INF_DIRECTORY} --minio_port=9005 --minio_console_port=9006" sudo sysctl -w vm.mmap_rnd_bits=$old_value - name: Collect thread sanitizer output in cluster test diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5f839f347a..ebdade27b6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -84,7 +84,7 @@ jobs: id: run_cluster_test run: | sudo docker exec ${TESTER_CONTAINER} bash -c "rm -rf /root/.config/pip/pip.conf && cd /infinity/ && pip3 uninstall -y infinity-sdk infinity-embedded-sdk && cd python/infinity_sdk/ && pip3 install . -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host tuna.tsinghua.edu.cn && cd ../.." - sudo docker exec ${TESTER_CONTAINER} bash -c "cd /infinity/ && rm -fr /var/infinity && export INF_DIRECTORY=`cat .tester_env` && echo INF_DIRECTORY=${INF_DIRECTORY} && python3 tools/run_cluster_test.py --infinity_path=cmake-build-debug/src/infinity --infinity_dir=${INF_DIRECTORY}" + sudo docker exec ${TESTER_CONTAINER} bash -c "cd /infinity/ && rm -fr /var/infinity && export INF_DIRECTORY=`cat .tester_env` && echo INF_DIRECTORY=${INF_DIRECTORY} && python3 tools/run_cluster_test.py --infinity_path=cmake-build-debug/src/infinity --infinity_dir=${INF_DIRECTORY} --minio_port=9005 --minio_console_port=9006" - name: Collect cluster test output if: ${{ !cancelled() }} @@ -123,7 +123,7 @@ jobs: MINIO_DIR=~/minio_data_$(od -An -N4 -tx4 /dev/urandom | tr -d ' ') echo "MINIO_CONTAINER=${MINIO_CONTAINER}" >> $GITHUB_ENV echo "MINIO_DIR=${MINIO_DIR}" >> $GITHUB_ENV - sudo docker rm -f -v ${MINIO_CONTAINER} && sudo rm -fr ${MINIO_DIR} && sudo mkdir ${MINIO_DIR} && sudo docker run -d --net=container:${BUILDER_CONTAINER} --name ${MINIO_CONTAINER} -e "MINIO_ROOT_PASSWORD=minioadmin" -e "MINIO_ROOT_USER=minioadmin" -v ${MINIO_DIR}:/data quay.io/minio/minio server /data --console-address ":9001" && sleep 5s + sudo docker rm -f -v ${MINIO_CONTAINER} && sudo rm -fr ${MINIO_DIR} && sudo mkdir ${MINIO_DIR} && sudo docker run -d --net=container:${BUILDER_CONTAINER} --name ${MINIO_CONTAINER} -e "MINIO_ROOT_PASSWORD=minioadmin" -e "MINIO_ROOT_USER=minioadmin" -v ${MINIO_DIR}:/data quay.io/minio/minio server /data --console-address ":9006" --address ":9005" && sleep 5s - name: Start infinity debug version with minio if: ${{ !cancelled() && !failure() }} @@ -290,7 +290,7 @@ jobs: MINIO_DIR=~/minio_data_$(od -An -N4 -tx4 /dev/urandom | tr -d ' ') echo "MINIO_CONTAINER=${MINIO_CONTAINER}" >> $GITHUB_ENV echo "MINIO_DIR=${MINIO_DIR}" >> $GITHUB_ENV - sudo docker rm -f -v ${MINIO_CONTAINER} && sudo rm -fr ${MINIO_DIR} && sudo mkdir ${MINIO_DIR} && sudo 
docker run -d --net=container:${BUILDER_CONTAINER} --name ${MINIO_CONTAINER} -e "MINIO_ROOT_PASSWORD=minioadmin" -e "MINIO_ROOT_USER=minioadmin" -v ${MINIO_DIR}:/data quay.io/minio/minio server /data --console-address ":9001" && sleep 5s + sudo docker rm -f -v ${MINIO_CONTAINER} && sudo rm -fr ${MINIO_DIR} && sudo mkdir ${MINIO_DIR} && sudo docker run -d --net=container:${BUILDER_CONTAINER} --name ${MINIO_CONTAINER} -e "MINIO_ROOT_PASSWORD=minioadmin" -e "MINIO_ROOT_USER=minioadmin" -v ${MINIO_DIR}:/data quay.io/minio/minio server /data --console-address ":9006" --address ":9005" && sleep 5s - name: Start infinity release version with minio if: ${{ !cancelled() && !failure() }} diff --git a/README.md b/README.md index 9c85b75a2e..8ad93c72ea 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- + Infinity logo
@@ -26,7 +26,7 @@ Infinity is a cutting-edge AI-native database that provides a wide range of sear ## ⚡️ Performance
- + Infinity performance comparison
## 🌟 Key Features @@ -60,9 +60,9 @@ Supports a wide range of data types including strings, numerics, vectors, and mo Infinity supports two working modes, embedded mode and client-server mode. Infinity's embedded mode enables you to quickly embed Infinity into your Python applications, without the need to connect to a separate backend server. The following shows how to operate in embedded mode: ```bash - pip install infinity-embedded-sdk==0.5.0.dev5 + pip install infinity-embedded-sdk==0.5.0.dev6 ``` -1. Use Infinity to conduct a dense vector search: + Use Infinity to conduct a dense vector search: ```python import infinity_embedded diff --git a/benchmark/local_infinity/fulltext/fulltext_benchmark.cpp b/benchmark/local_infinity/fulltext/fulltext_benchmark.cpp index 00a5a6b49d..a2ed13d291 100644 --- a/benchmark/local_infinity/fulltext/fulltext_benchmark.cpp +++ b/benchmark/local_infinity/fulltext/fulltext_benchmark.cpp @@ -254,7 +254,7 @@ void BenchmarkQuery(SharedPtr infinity, const String &db_name, const S output_columns->emplace_back(select_rowid_expr); output_columns->emplace_back(select_score_expr); } - infinity->Search(db_name, table_name, search_expr, nullptr, nullptr, nullptr, output_columns, nullptr, nullptr, nullptr); + infinity->Search(db_name, table_name, search_expr, nullptr, nullptr, nullptr, output_columns, nullptr, nullptr, nullptr, false); /* auto result = infinity->Search(db_name, table_name, search_expr, nullptr, output_columns); { diff --git a/benchmark/local_infinity/infinity_benchmark.cpp b/benchmark/local_infinity/infinity_benchmark.cpp index 528ca8387f..4b1a9e4831 100644 --- a/benchmark/local_infinity/infinity_benchmark.cpp +++ b/benchmark/local_infinity/infinity_benchmark.cpp @@ -226,7 +226,8 @@ int main() { output_columns, nullptr, nullptr, - nullptr); + nullptr, + false); }); results.push_back(fmt::format("-> Select QPS: {}", total_times / tims_costing_second)); } diff --git a/benchmark/local_infinity/knn/knn_query_benchmark.cpp b/benchmark/local_infinity/knn/knn_query_benchmark.cpp index a17d516b9e..38e2099de0 100644 --- a/benchmark/local_infinity/knn/knn_query_benchmark.cpp +++ b/benchmark/local_infinity/knn/knn_query_benchmark.cpp @@ -220,7 +220,7 @@ int main(int argc, char *argv[]) { auto select_rowid_expr = new FunctionExpr(); select_rowid_expr->func_name_ = "row_id"; output_columns->emplace_back(select_rowid_expr); - auto result = infinity->Search(db_name, table_name, search_expr, nullptr, nullptr, nullptr, output_columns, nullptr, nullptr, nullptr); + auto result = infinity->Search(db_name, table_name, search_expr, nullptr, nullptr, nullptr, output_columns, nullptr, nullptr, nullptr, false); { auto &cv = result.result_table_->GetDataBlockById(0)->column_vectors; auto &column = *cv[0]; diff --git a/benchmark/remote_infinity/remote_query_benchmark.cpp b/benchmark/remote_infinity/remote_query_benchmark.cpp index e07c4bdc22..14ad9fcf44 100644 --- a/benchmark/remote_infinity/remote_query_benchmark.cpp +++ b/benchmark/remote_infinity/remote_query_benchmark.cpp @@ -51,7 +51,7 @@ struct InfinityClient { transport->open(); CommonResponse response; ConnectRequest request; - request.__set_client_version(26); // 0.5.0.dev5 + request.__set_client_version(27); // 0.5.0.dev6 client->Connect(response, request); session_id = response.session_id; } diff --git a/client/cpp/infinity_client.cpp b/client/cpp/infinity_client.cpp index d229b2af82..f5e5fdefd7 100644 --- a/client/cpp/infinity_client.cpp +++ b/client/cpp/infinity_client.cpp @@ -25,7 +25,7 @@ Client 
Client::Connect(const std::string &ip_address, uint16_t port) { transport->open(); CommonResponse response; ConnectRequest request; - request.__set_client_version(26); // 0.5.0.dev5 + request.__set_client_version(27); // 0.5.0.dev6 client->Connect(response, request); return {socket, transport, protocol, std::move(client), response.session_id}; } diff --git a/conf/follower.toml b/conf/follower.toml index 3bbc4ac63f..517a745dcf 100644 --- a/conf/follower.toml +++ b/conf/follower.toml @@ -1,7 +1,7 @@ [general] version = "0.5.0" time_zone = "utc-8" -server_mode = "cluster" +server_mode = "admin" # "standalone" [network] server_address = "0.0.0.0" @@ -46,7 +46,7 @@ mem_index_capacity = 1048576 storage_type = "minio" [storage.object_storage] -url = "127.0.0.1:9000" +url = "127.0.0.1:9005" bucket_name = "infinity" access_key = "minioadmin" secret_key = "minioadmin" diff --git a/conf/infinity_conf.toml b/conf/infinity_conf.toml index b0739ac24f..a9344e4ad3 100644 --- a/conf/infinity_conf.toml +++ b/conf/infinity_conf.toml @@ -37,7 +37,7 @@ mem_index_capacity = 1048576 # S3 storage config example: # [storage.object_storage] -# url = "127.0.0.1:9000" +# url = "127.0.0.1:9005" # bucket_name = "infinity" # access_key = "minioadmin" # secret_key = "minioadmin" diff --git a/conf/infinity_minio_conf.toml b/conf/infinity_minio_conf.toml new file mode 100644 index 0000000000..c8eca3498c --- /dev/null +++ b/conf/infinity_minio_conf.toml @@ -0,0 +1,72 @@ +[general] +version = "0.5.0" +time_zone = "utc-8" + +[network] +server_address = "0.0.0.0" +postgres_port = 5432 +http_port = 23820 +client_port = 23817 +connection_pool_size = 128 + +[log] +log_filename = "infinity.log" +log_dir = "/var/infinity/log" +log_to_stdout = false +log_file_max_size = "10GB" +log_file_rotate_count = 10 + +# trace/debug/info/warning/error/critical 6 log levels, default: info +log_level = "info" + +[storage] +persistence_dir = "/var/infinity/persistence" +data_dir = "/var/infinity/data" +# periodically activates garbage collection: +# 0 means real-time, +# s means seconds, for example "60s", 60 seconds +# m means minutes, for example "60m", 60 minutes +# h means hours, for example "1h", 1 hour +optimize_interval = "10s" +cleanup_interval = "60s" +compact_interval = "120s" +storage_type = "minio" +# dump memory index entry when it reaches the capacity +mem_index_capacity = 1048576 + +[storage.object_storage] +url = "127.0.0.1:9005" +bucket_name = "infinity" +access_key = "minioadmin" +secret_key = "minioadmin" +enable_https = false + +# S3 storage config example: +# [storage.object_storage] +# url = "127.0.0.1:9005" +# bucket_name = "infinity" +# access_key = "minioadmin" +# secret_key = "minioadmin" +# enable_https = false + +[buffer] +buffer_manager_size = "4GB" +lru_num = 7 +temp_dir = "/var/infinity/tmp" +result_cache = "off" +memindex_memory_quota = "1GB" + +[wal] +wal_dir = "/var/infinity/wal" +full_checkpoint_interval = "86400s" +delta_checkpoint_interval = "60s" +# delta_checkpoint_threshold = 1000000000 +wal_compact_threshold = "1GB" + +# flush_at_once: write and flush log on each commit +# only_write: write log, the OS controls when to flush the log, default +# flush_per_second: logs are written after each commit and flushed to disk per second. 
+wal_flush = "only_write" + +[resource] +resource_dir = "/var/infinity/resource" diff --git a/conf/leader.toml b/conf/leader.toml index 68a9c56b27..42d24eaf0d 100644 --- a/conf/leader.toml +++ b/conf/leader.toml @@ -1,7 +1,7 @@ [general] version = "0.5.0" time_zone = "utc-8" -server_mode = "cluster" +server_mode = "admin" # "standalone" [network] server_address = "0.0.0.0" @@ -46,7 +46,7 @@ mem_index_capacity = 1048576 storage_type = "minio" [storage.object_storage] -url = "127.0.0.1:9000" +url = "127.0.0.1:9005" bucket_name = "infinity" access_key = "minioadmin" secret_key = "minioadmin" diff --git a/conf/learner.toml b/conf/learner.toml index a432bffe90..27152a5485 100644 --- a/conf/learner.toml +++ b/conf/learner.toml @@ -1,7 +1,7 @@ [general] version = "0.5.0" time_zone = "utc-8" -server_mode = "cluster" +server_mode = "admin" # "standalone" [network] server_address = "0.0.0.0" @@ -46,7 +46,7 @@ mem_index_capacity = 1048576 storage_type = "minio" [storage.object_storage] -url = "127.0.0.1:9000" +url = "127.0.0.1:9005" bucket_name = "infinity" access_key = "minioadmin" secret_key = "minioadmin" diff --git a/conf/learner2.toml b/conf/learner2.toml index 79073f90f0..9de53f1d6d 100644 --- a/conf/learner2.toml +++ b/conf/learner2.toml @@ -1,7 +1,7 @@ [general] version = "0.5.0" time_zone = "utc-8" -server_mode = "cluster" +server_mode = "admin" # "standalone" [network] server_address = "0.0.0.0" @@ -46,7 +46,7 @@ mem_index_capacity = 1048576 storage_type = "minio" [storage.object_storage] -url = "127.0.0.1:9000" +url = "127.0.0.1:9005" bucket_name = "infinity" access_key = "minioadmin" secret_key = "minioadmin" diff --git a/conf/pytest_parallel_infinity_conf.toml b/conf/pytest_parallel_infinity_conf.toml index bc5c9822e2..d4d6fd1a23 100644 --- a/conf/pytest_parallel_infinity_conf.toml +++ b/conf/pytest_parallel_infinity_conf.toml @@ -16,6 +16,8 @@ log_level = "trace" [storage] persistence_dir = "/var/infinity/persistence" +compact_interval = "10s" +cleanup_interval = "0s" [buffer] buffer_manager_size = "8GB" diff --git a/conf/pytest_parallel_infinity_follower.toml b/conf/pytest_parallel_infinity_follower.toml index fbfc4745ac..5db2a512ea 100644 --- a/conf/pytest_parallel_infinity_follower.toml +++ b/conf/pytest_parallel_infinity_follower.toml @@ -1,7 +1,7 @@ [general] version = "0.5.0" time_zone = "utc-8" -server_mode = "cluster" +server_mode = "admin" # "standalone" [network] server_address = "0.0.0.0" @@ -40,7 +40,7 @@ mem_index_capacity = 1048576 storage_type = "minio" [storage.object_storage] -url = "127.0.0.1:9000" +url = "127.0.0.1:9005" bucket_name = "infinity" access_key = "pk9s2oJFX1qXLYObwIcz" secret_key = "ho1G9xh2iKup4Xj9Ja3eRgg8bfwMyDv4fvkQGcZl" diff --git a/conf/pytest_parallel_infinity_leader.toml b/conf/pytest_parallel_infinity_leader.toml index 7e975215cf..4aeda8a4cb 100644 --- a/conf/pytest_parallel_infinity_leader.toml +++ b/conf/pytest_parallel_infinity_leader.toml @@ -1,7 +1,7 @@ [general] version = "0.5.0" time_zone = "utc-8" -server_mode = "cluster" +server_mode = "admin" # "standalone" [network] server_address = "0.0.0.0" @@ -40,7 +40,7 @@ mem_index_capacity = 1048576 storage_type = "minio" [storage.object_storage] -url = "127.0.0.1:9000" +url = "127.0.0.1:9005" bucket_name = "infinity" access_key = "minioadmin" secret_key = "minioadmin" diff --git a/conf/pytest_parallel_infinity_minio.toml b/conf/pytest_parallel_infinity_minio.toml index 400fbe3627..2dd0fcc91e 100644 --- a/conf/pytest_parallel_infinity_minio.toml +++ b/conf/pytest_parallel_infinity_minio.toml @@ 
-18,7 +18,7 @@ persistence_dir = "/var/infinity/persistence" storage_type = "minio" [storage.object_storage] -url = "127.0.0.1:9000" +url = "127.0.0.1:9005" bucket_name = "infinity" access_key = "minioadmin" secret_key = "minioadmin" diff --git a/docs/getstarted/build_from_source.mdx b/docs/getstarted/build_from_source.mdx index 93954b3655..3a4df25aa5 100644 --- a/docs/getstarted/build_from_source.mdx +++ b/docs/getstarted/build_from_source.mdx @@ -7,6 +7,10 @@ slug: /build_from_source import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +Build Infinity from source, build and run unit/functional tests. + +--- + This document provides instructions for building Infinity from source, as well as building and running unit and functional tests. :::tip NOTE @@ -260,7 +264,7 @@ cmake --build . -t test_main 2. Install Python sdk of infinity: ```bash - pip install infinity-sdk==0.5.0.dev5 + pip install infinity-sdk==0.5.0.dev6 ``` 3. Run the functional tests: @@ -282,7 +286,7 @@ cmake --build . -t test_main 2. Install Python sdk of infinity: ```bash - pip install infinity-sdk==0.5.0.dev5 + pip install infinity-sdk==0.5.0.dev6 ``` 3. Run the functional tests: @@ -305,7 +309,7 @@ cmake --build . -t test_main 2. Install Python sdk of infinity: ```bash - pip install infinity-sdk==0.5.0.dev5 + pip install infinity-sdk==0.5.0.dev6 ``` 3. Run the functional tests: diff --git a/docs/getstarted/deploy_infinity_server.mdx b/docs/getstarted/deploy_infinity_server.mdx index 28825d478d..9d5ee6962a 100644 --- a/docs/getstarted/deploy_infinity_server.mdx +++ b/docs/getstarted/deploy_infinity_server.mdx @@ -7,6 +7,10 @@ slug: /deploy_infinity_server import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +Three ways to deploy Infinity. + +--- + This document provides guidance on deploying the Infinity database. In general, you can deploy Infinity in the following three ways: - [Import Infinity as a Python module](#import-infinity-as-a-python-module): To run Infinity locally as a Python module. @@ -30,7 +34,7 @@ This approach allows you to call Infinity as a Python module. To deploy Infinity ### Install Infinity as a module ``` -pip install infinity-embedded-sdk==0.5.0.dev5 +pip install infinity-embedded-sdk==0.5.0.dev6 ``` ### Create an Infinity object @@ -97,10 +101,10 @@ If you are on Windows 10+, you must enable WSL or WSL2 to deploy Infinity using ### Install Infinity client ``` -pip install infinity-sdk==0.5.0.dev5 +pip install infinity-sdk==0.5.0.dev6 ``` -### Connect to Infinity Server and run a dense vector search +### Run a vector search ```python import infinity @@ -147,7 +151,7 @@ This section provides instructions on deploying Infinity using binary package on Fedora/RHEL/CentOS/OpenSUSE ```bash -sudo rpm -i infinity-0.5.0.dev5-x86_64.rpm +sudo rpm -i infinity-0.5.0.dev6-x86_64.rpm ``` ```bash @@ -158,7 +162,7 @@ sudo systemctl start infinity ```bash -sudo dpkg -i infinity-0.5.0.dev5-x86_64.deb +sudo dpkg -i infinity-0.5.0.dev6-x86_64.deb ``` ```bash @@ -171,10 +175,10 @@ sudo systemctl start infinity ### Install Infinity client ``` -pip install infinity-sdk==0.5.0.dev5 +pip install infinity-sdk==0.5.0.dev6 ``` -### Connect to Infinity Server and run a dense vector search +### Run a vector search ```python import infinity diff --git a/docs/getstarted/quickstart.md b/docs/getstarted/quickstart.md index 2a271cc211..61a7235f99 100644 --- a/docs/getstarted/quickstart.md +++ b/docs/getstarted/quickstart.md @@ -5,6 +5,8 @@ slug: / # Quickstart +A quickstart guide. 
+ ## Prerequisites - CPU: x86_64 with AVX2 support. - OS: Linux with glibc 2.17+. - Python: Python 3.10+. @@ -19,7 +21,7 @@ If you wish to embed Infinity into your Python application without the need for 1. Install the Infinity-embedded SDK: ```bash - pip install infinity-embedded-sdk==0.5.0.dev5 + pip install infinity-embedded-sdk==0.5.0.dev6 ``` 2. Use Infinity to conduct a dense vector search: ```python diff --git a/docs/guides/search_guide.md b/docs/guides/search_guide.md index 44e6b9912f..1e342d8445 100644 --- a/docs/guides/search_guide.md +++ b/docs/guides/search_guide.md @@ -2,57 +2,146 @@ sidebar_position: 1 slug: /search_guide --- -# Search usage guide +# Conduct a search + +Full-text, vector, sparse vector, tensor, hybrid search. + +--- ## Overview -Infinity offers powerful search capabilities. This page covers the search usage. +This document offers guidance on conducting a search within Infinity. ## Full-text search -Full text search will work if full text index is created. There are two kinds of work modes for full text indexing: +### Work modes for full-text index -- Real-time mode - If the full text index is created immediately after the table is created, then the full-text index will work in real time mode if data is ingested at this time. Real time index will accumulate posting data within memory and flush to disk if it reaches up the quota. -- Offline mode - If the full-text index is created after the data is inserted, then it will work in offline mode, where the full-text index is constructed through external sorting. +A full-text index must be built to perform a full-text search, and this index operates in two modes: + +- **Real-time mode** - If created immediately after a table is created, a full-text index will be built on ingested data in real time, accumulating posting data within memory and flushing it to disk when a specified quota is reached. +- **Offline mode** - For data inserted before the creation of a full-text index, the index will be built in offline mode using external sorting. ### Tokenizer -There are several built-in tokenizers within Infinity. With the exception of the default standard analyzer and ngram analyzer, everything else requires the resource file to be in the right place. Make sure to download [resource package](https://github.com/infiniflow/resource) and put it to correct directory according to `[resource]` configuration: +When creating a full-text index, you are required to specify a tokenizer/analyzer, which will be used for future full-text searches on the same column(s). Infinity has many built-in tokenizers. Except for the Ngram analyzer and the default standard analyzer, all other analyzers require dedicated resource files. Please download the appropriate files for your chosen analyzer from [this link](https://github.com/infiniflow/resource) and save them to the directory specified by `resource_dir` in the configuration file. ```yaml [resource] -# Directory for Infinity's resource files, including the dictionary files used by the analyzer +# Directory for Infinity's resource files, including dictionaries to be used by analyzers resource_dir = "/var/infinity/resource" ``` -You must specify a tokenizer when creating a full text index, but you don't need to specify one when querying, because the query will select the same tokenizer in the same columns. +The following are Infinity's built-in analyzers/tokenizers. + +#### Standard analyzer + +The standard analyzer is the default tokenizer and works best with Latin characters. 
It applies a stemmer before outputting tokens segmented by whitespace; `English` is the default stemmer. To specify a stemmer for a different language, use `"standard-xxx"`, where `xxx` is the language to use. + +Supported language stemmers include: `Danish`, `Dutch`, `English`, `Finnish`, `French`, `German`, `Hungarian`, `Italian`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, and `Turkish`. + +#### Ngram analyzer + +A definition of N-gram can be found on [wikipedia](https://en.wikipedia.org/wiki/N-gram). Use `"ngram-x"` to select the Ngram analyzer, where `x` represents the value of `N`. For example, a common choice for full-text searches in code is `"ngram-3"`. + +#### Simplified Chinese analyzer + +Use `"chinese"` to select the simplified Chinese analyzer, which is a wrapper of the [Jieba](https://github.com/yanyiwu/cppjieba) analyzer. Use `"chinese-fine"` to output fine-grained analyzer results. + +#### Traditional Chinese analyzer + +Use `"traditional"` to select the traditional Chinese analyzer, which essentially converts simplified Chinese into traditional Chinese based on the outputs of the [Jieba](https://github.com/yanyiwu/cppjieba) analyzer. + +#### Japanese analyzer + +Use `"japanese"` to select the Japanese analyzer, which is a wrapper of [mecab](http://taku910.github.io/mecab/). + +#### Korean analyzer + +Use `"korean"` to select the Korean analyzer, which is a wrapper of [mecab](http://taku910.github.io/mecab/) but uses a different Korean dictionary. + +#### RAG analyzer + +The RAG analyzer is a bilingual tokenizer that supports Chinese (simplified and traditional) and English. It is a C++ adaptation of [RAGFlow's tokenizer](https://github.com/infiniflow/ragflow/blob/main/rag/nlp/rag_tokenizer.py), and its tokenization of Latin characters derives from [NLTK](https://www.nltk.org/api/nltk.tokenize.punkt.html). + +This analyzer offers better recall for Chinese than the [Jieba](https://github.com/yanyiwu/cppjieba) analyzer, but lower tokenization throughput due to higher computational costs. Its English language processing involves an additional lemmatization step before stemming, different from that of the standard analyzer. + +Use `"rag"` to select the RAG analyzer or `"rag-fine"` for fine-grained mode, which outputs tokenization results with the second highest score. + +:::note +Both RAG tokenization and fine-grained RAG tokenization are used in RAGFlow to ensure high recall. +::: + +#### IK analyzer + +The IK analyzer is a bilingual tokenizer that supports Chinese (simplified and traditional) and English. It is a C++ adaptation of the [IK Analyzer](https://github.com/infinilabs/analysis-ik), which is widely used as a tokenizer by Chinese Elasticsearch users. + +Use `"ik"` to select this analyzer, which is equivalent to the `ik_smart` option in the [IK Analyzer](https://github.com/infinilabs/analysis-ik), or `"ik-fine"` for fine-grained mode, which is equivalent to the `ik_max_word` option in the [IK Analyzer](https://github.com/infinilabs/analysis-ik). + +#### Keyword analyzer + +The keyword analyzer is a "noop" analyzer used for columns containing keywords only, where traditional scoring methods like `BM25` do not apply. It scores `0` or `1`, depending on whether any keywords are matched. + +Use `"keyword"` to select this analyzer. 
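+#### Example: selecting an analyzer at index creation
+
+As a concrete illustration of the above, the following minimal sketch creates a full-text index through the Python SDK's `IndexInfo` parameters (documented in the Python API Reference); the server address, database, table, and column names here are illustrative assumptions, not fixed values:
+
+```python
+import infinity
+from infinity.index import IndexInfo, IndexType
+
+# Connect to a local Infinity server (address assumed for this sketch).
+infinity_object = infinity.connect(infinity.NetworkAddress("127.0.0.1", 23817))
+table_object = infinity_object.get_database("default_db").get_table("my_table")
+
+# Build a full-text index on the assumed "body" column using the RAG analyzer.
+# Swap "rag" for "standard", "ngram-3", "chinese", "ik", "keyword", etc.
+table_object.create_index(
+    "body_ft_index",
+    IndexInfo("body", IndexType.FullText, {"ANALYZER": "rag"}),
+)
+```
+
+Because the analyzer is persisted with the index, later full-text searches on `body` reuse it automatically and need not specify one at query time.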
+ +### Search and ranking syntax + +Infinity supports the following syntax for full-text search expressions: + +- Single term +- AND multiple terms +- OR multiple terms +- Phrase search +- CARAT operator +- Sloppy phrase search +- Field-specific search +- Escape character + +#### Single term + +Example: `"blooms"` + +#### AND multiple terms + +- `"space AND efficient"` + +#### OR multiple terms + +- `"Bloom OR filter"` +- `"Bloom filter"` + +:::tip NOTE +`OR` is the default semantic in a multi-term full-text search unless explicitly specified otherwise. +::: + +#### Phrase search + +- `"Bloom filter"` +- `'Bloom filter'` + +#### CARAT operator + +Use `^` to boost the importance of a specific term. For example: `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`. + +#### Sloppy phrase search + +Example: `'"harmful chemical"~10'` + +#### Field-specific search -
-- Keyword analyzer: It's a noop analyzer. This is used if you have columns containing keywords only, and you don't want such traditional scoring approaches as `BM25`to take into effects, the score will return 0 or 1 according to whether any keywords are hit. +Example: `"title:(quick OR brown) AND body:foobar"` -### Search and ranking +#### Escape character -Infinity offers following syntax for full text search: +Use `\` to escape reserved characters like ` ` `(` `)` `^` `"` `'` `~` `*` `?` `:` `\`. For example: `"space\:efficient"`. -- Single term: `"blooms"` -- AND multiple terms: `"space AND efficient"`, `"space && efficient"` or `"space + efficient"` -- OR multiple terms: `"Bloom OR filter"`, `"Bloom || filter"` or just `"Bloom filter"` . -- Phrase search: `"Bloom filter" or 'Bloom filter'` -- CARAT operator: `^`: Used to boost the importance of a term, e.g., `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`. -- Sloppy phrase search: `'"harmful chemical"~10'` -- Field-specific search: `"title:(quick OR brown) AND body:foobar"` -- Escaping reserved characters: `"space\-efficient"` . `:` `~` `()` `""` `+` `-` `=` `&` `|` `[]` `{}` `*` `?` `\` `/` are reserved characters for search syntax. +### Scoring -`OR` is the default semantic among multiple terms if user does not specify in search syntax. Infinity offers `BM25` scoring and block-max `WAND` for dynamic pruning to accelerate the multiple terms search processing. There are two approaches to bypass `BM25` scoring: +Infinity offers `BM25` scoring and block-max `WAND` for dynamic pruning to accelerate multi-term searches. To *not* use `BM25` scoring, do either of the following: -* Using `keyword` analyzer when creating index, then `BM25` will not be used and it will return the score based on whether keywords are hit. -* Specifying `similarity=boolean` during searching. Then the scoring is decided by the number of keywords hits. +- Set `"analyzer"` to `"keyword"` when creating index (to select the keyword analyzer). + *The returned score will then be based on whether keywords are matched.* +- Add `{"similarity": "boolean"}` as a search option. + *The scoring will then depend on the number of matched keywords.* ## Dense vector search @@ -105,7 +194,7 @@ Infinity offers three types of rerankers for fusion: ## Conditional filters -Conditional filters in Infinity must work through an index to facilitate search. There are two types of indexes in Infinity that support conditional filters: +Conditional filters in Infinity must work through an index to facilitate search. The following two types of indexes in Infinity support conditional filters: - **Secondary index**: Built on numeric or string columns. This index does not apply any tokenization to a string column when using conditional filters. - **Full-text index**: Built on full-text columns. This index applies tokenization to the full-text column but does not trigger any relevance scoring procedure. diff --git a/docs/guides/set_up_cluster.md b/docs/guides/set_up_cluster.md new file mode 100644 index 0000000000..abfeb8a1a9 --- /dev/null +++ b/docs/guides/set_up_cluster.md @@ -0,0 +1,192 @@ +--- +sidebar_position: 2 +slug: /set_up_cluster +--- +# Set up an Infinity cluster + +Architecture overview and user guide for Infinity cluster. + +--- + +## Overview + +An Infinity cluster consists of one leader node, up to four follower nodes, and several learner nodes: + +- **Leader node**: The read node and the only write node. 
+- **Follower node**: Read node. +- **Learner node**: Read node. + +As of v0.5.0, the supported shared storage is MinIO. + +![infinity_cluster](https://github.com/user-attachments/assets/3e9abeed-1698-4741-8bdb-ba3b05c1d7a3) + +### Architecture + +Infinity employs a distributed architecture comprising one leader node, *N* follower nodes (0 ≤ *N* ≤ 4), and a number of learner nodes. As illustrated in the diagram above, all nodes in the cluster use MinIO for persistent storage. + +- **Leader node**: The node responsible for processing transactions and managing connection status of other nodes in the cluster. When a transaction occurs, the leader node transmits the logs to both follower and learner nodes. The leader node confirms the completion of the transaction only upon receiving messages confirming completion of log persistence from *all* follower nodes. +- **Follower node**: Receives log/WAL from the leader synchronously. It acts as a backup for the leader node, maintaining strong consistency with the leader's data state. +- **Learner node**: Receives log/WAL from the leader *asynchronously*. A learner also serves as a backup for the leader node. However, its state may be behind that of the leader, because it is not required to maintain strong consistency with the leader, and neither does the leader need to confirm whether all learner nodes have completed log persistence. + +From the user's perspective, the leader is the only write node, and all write operations must go through the leader node; all nodes in the cluster serve as read nodes, allowing you to send read operations to any of the leader, follower, or learner nodes, thereby alleviating the write burden on the leader. + +### Startup and communication processes + +When started up as a cluster node (see [Customize configuration files for cluster](#customize-configuration-files-for-cluster)), a node enters `ADMIN` mode, but is not automatically assigned a role like leader, follower, or learner. You must call `ADMIN SET NODE ROLE` to assign it a role. Once a leader node starts, it reads logs from the local disk to determine the metadata and data to read from shared storage. + +Once you set a node to follower or learner using `ADMIN SET NODE ROLE`, it registers with the leader node. Upon receiving the registration request, the leader node sends back its current log for the followers and learners to construct their data state from shared storage. + +### Keep-alive mechanism + +Once successfully registered with the leader node, a follower or learner starts sending periodic heartbeats to it. The leader node relies on these heartbeats to manage the connection status of each node. For example, if it does not receive heartbeats from a particular node for a specified time period, it sets that node's connection status to `timeout`. + +### Log synchronization + +When a transaction occurs, the leader node sends its log to both follower and learner nodes. The leader confirms the transaction's completion only after receiving confirmation that all its follower nodes have successfully persisted the log. While the leader also sends logs to learner nodes, it does not require confirmation from them. + +### Mode and role transition + +![mode_transition](https://github.com/user-attachments/assets/932072a3-9ffb-4aad-89f1-7eef0fff931c) + +## Set up an Infinity cluster + +### Customize configuration files for cluster + +For *each* cluster node, you are required to prepare a customized configuration file to start it. 
Ensure that you properly set `server_mode`, `peer_ip`, `peer_port`, `storage_type`, and other related parameters. + +1. Set `server_mode` to `"admin"`. +2. Set `storage_type` to `"minio"`. +3. Set `peer_ip` and `peer_port`. +4. Update object storage-specific settings. +5. Save your changes and start up Infinity using the customized configuration file. + *When a cluster node starts, it automatically operates in `ADMIN` mode.* + +For further instructions on specifying a configuration file or setting parameters, see [Configurations](https://infiniflow.org/docs/dev/configurations). + +### Set the leader node + + A cluster can have only one leader node. If the cluster you start does not have a leader node, call `ADMIN SET NODE ROLE` to promote the node you just started, which currently operates in `ADMIN` mode, to leader. Below is an example: + +```shell +curl --request POST \ + --url http://localhost:23821/admin/node/current \ + --header 'accept: application/json' \ + --header 'content-type: application/json' \ + --data ' { + "role" : "leader", + "name" : "Harry" + } ' +``` + +*When the method call succeeds, the node switches to leader and operates in `CLUSTER` mode.* + +:::tip NOTE + +A node in `ADMIN` mode with `storage_type = "minio"` or in `CLUSTER` mode (as a follower or learner node) can be promoted to leader. + +::: + +You can also use `ADMIN SHOW CURRENT NODE` to verify the node's role and connection status: + +```shell +curl --request GET \ + --url http://localhost:23821/admin/node/current \ + --header 'accept: application/json' +``` + +### Set a follower node + +If the current node operates in `ADMIN` mode and the number of follower nodes in your cluster is less than four, call `ADMIN SET NODE ROLE` to promote this node to follower node: + +```shell +curl --request POST \ + --url http://localhost:23822/admin/node/current \ + --header 'accept: application/json' \ + --header 'content-type: application/json' \ + --data ' { + "role" : "follower", + "name" : "Hermione", + "address" : "0.0.0.0:23851" + } ' +``` + +*When the method call succeeds, the node is promoted to follower and registered with the leader node, which listens on `0.0.0.0:23851`.* + +:::tip NOTE + +A node in `ADMIN` mode with `storage_type = "minio"` can be promoted to follower node. + +::: + +### Set a learner node + +If the current node operates in `ADMIN` mode, call `ADMIN SET NODE ROLE` to promote this new node to learner node. + +```shell +curl --request POST \ + --url http://localhost:23823/admin/node/current \ + --header 'accept: application/json' \ + --header 'content-type: application/json' \ + --data ' { + "role" : "learner", + "name" : "Ron", + "address" : "0.0.0.0:23851" + } ' +``` + +*When the method call succeeds, the node is promoted to learner and registered with the leader node, which listens on `0.0.0.0:23851`.* + +:::tip NOTE + +Only a node in `ADMIN` mode with `storage_type = "minio"` can be promoted to learner node. + +::: + +### Check cluster health status + +You can send an HTTP request `ADMIN LIST NODES` to any node in the cluster to display the health status of all nodes. 
In the following code example, a follower node is called: + +```shell +curl --request GET \ + --url http://localhost:23822/admin/nodes \ + --header 'accept: application/json' +``` + +*When the method call succeeds, you get the following information about each node:* + +- *The HTTP address of the node.* +- *The number of heartbeats received from the leader node.* +- *The name of the node.* +- *The role of the node: leader, follower, or learner.* +- *The connection status of the node.* +- *The last time that the node was updated.* + +:::tip NOTE + +See `ADMIN LIST NODES` for further details. + +::: + +### Remove a node from the cluster + +Call `ADMIN REMOVE NODE` to remove a node from the cluster. Note that you must send your HTTP request to the leader node for this action. In the following code example, learner Ron will be removed: + +```shell +curl --request DELETE \ + --url http://localhost:23821/admin/node/ron \ + --header 'accept: application/json' \ + --header 'content-type: application/json' +``` + +*When the method call succeeds, the node operates in `ADMIN` mode and is unregistered.* + +## Distributed APIs + +- [ADMIN SET NODE ROLE](https://infiniflow.org/docs/dev/http_api_reference#admin-set-node-role) +- [ADMIN SHOW NODE VARIABLES](https://infiniflow.org/docs/dev/http_api_reference#admin-show-node-variables) +- [ADMIN SHOW NODE CONFIGS](https://infiniflow.org/docs/dev/http_api_reference#admin-show-node-configs) +- [ADMIN SHOW NODE VARIABLE](https://infiniflow.org/docs/dev/http_api_reference#admin-show-node-variable) +- [ADMIN SHOW CURRENT NODE](https://infiniflow.org/docs/dev/http_api_reference#admin-show-current-node) +- [ADMIN SHOW NODE](https://infiniflow.org/docs/dev/http_api_reference#admin-show-node) +- [ADMIN LIST NODES](https://infiniflow.org/docs/dev/http_api_reference#admin-list-nodes) +- [ADMIN REMOVE NODE](https://infiniflow.org/docs/dev/http_api_reference#admin-remove-node) \ No newline at end of file diff --git a/docs/references/benchmark.md b/docs/references/benchmark.md index f056a974b7..b66c36bd76 100644 --- a/docs/references/benchmark.md +++ b/docs/references/benchmark.md @@ -1,8 +1,9 @@ --- -sidebar_position: 1 +sidebar_position: 3 slug: /benchmark --- # Benchmark + This document compares the following key specifications of Elasticsearch, Qdrant, Quickwit and Infinity: - Time to insert & build index diff --git a/docs/references/configurations.mdx b/docs/references/configurations.mdx index ad83630f99..38be8dfc3c 100644 --- a/docs/references/configurations.mdx +++ b/docs/references/configurations.mdx @@ -1,5 +1,5 @@ --- -sidebar_position: 5 +sidebar_position: 0 slug: /configurations --- @@ -7,6 +7,10 @@ slug: /configurations import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +How to set and load a configuration file when starting Infinity. + +--- + This document provides instructions for loading configuration file for Infinity and descriptions of each configuration entry. @@ -59,6 +63,12 @@ time_zone = "utc-8" # The number of worker threads. Defaults to the number of CPU cores. # Range: [1, 16384] cpu_limit = 8 +# The mode in which the server starts. Available options: +# - `"standalone"`: Start Infinity as a standalone server. 
+# - `"admin"`: +# - Start Infinity either as a standalone server in `ADMIN` mode (when `storage_type` is set to `"local"`) +# - Start Infinity as a cluster node in `ADMIN` mode (when `storage_type` is set to `"minio"`) +server_mode = "standalone" # Network configuration [network] @@ -74,9 +84,9 @@ client_port = 23817 # The maximum number of connections. Defaults to 256. # Range: [1, 65536] connection_pool_size = 128 -# The IP address of the Infinity peer server to be accessed by a peer node +# The IP address on which the current node listens. Used for registration and inter-node communication peer_ip = "0.0.0.0" -# The port of the Infinity peer server to be accessed by a peer node +# The port number on which the current node listens. Used for registration and inter-node communication peer_port = 23850 # The delay time for reconnecting to the Infinity peer server after a failed connection @@ -137,10 +147,24 @@ compact_interval = "120s" # the system performs a flush operation on that index. # Range: [8192, 8388608] mem_index_capacity = 1048576 -# Storage type. Defaults to "local". +# The type of storage to use. Available options: +# - `"local"`: (default) +# - `"minio"`: If you set `server_mode` to `"admin"` and `storage_type` to `"minio"`, the node will start as a cluster node in `ADMIN` mode. # Range: {"local"|"minio"} storage_type = "local" +# The number of dense vector index building worker threads. Defaults to the half number of CPU cores. +# Range: [1, number of CPU cores] +dense_index_building_worker = 2 + +# The number of sparse vector index building worker threads. Defaults to the half number of CPU cores. +# Range: [1, number of CPU cores] +sparse_index_building_worker = 2 + +# The number of fulltext index building worker threads. Defaults to the half number of CPU cores. +# Range: [1, number of CPU cores] +fulltext_index_building_worker = 2 + # Object storage configuration [storage.object_storage] # URL of the object storage server diff --git a/docs/references/faq.md b/docs/references/faq.md index 72febbd874..4fff1765a7 100644 --- a/docs/references/faq.md +++ b/docs/references/faq.md @@ -1,10 +1,12 @@ --- -sidebar_position: 2 +sidebar_position: 4 slug: /FAQ --- # Frequently asked questions +FAQs to be developed. + ## What is Retrieval-Augmented Generation? Retrieval Augmented Generation (RAG) is a technique used to improve the accuracy and reliability of responses from foundation models, specifically Large Language Models (LLMs). It works by supplementing the existing LLMs with external sources of knowledge. @@ -21,4 +23,4 @@ In addition to basic vector search, an AI-vector database also offers advanced c ## Where can I find a benchmark report of your database? -You can find a benchmark report on Infinity, the AI-native database, [here](../references/benchmark.md). \ No newline at end of file +You can find a benchmark report on Infinity, the AI-native database, [here](../references/benchmark.md). diff --git a/docs/references/http_api_reference.mdx b/docs/references/http_api_reference.mdx index 1259237108..86273816dc 100644 --- a/docs/references/http_api_reference.mdx +++ b/docs/references/http_api_reference.mdx @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 1 slug: /http_api_reference --- @@ -976,10 +976,12 @@ curl --request POST \ - `"standard"`: (Default) Standard analyzer, segmented by tokens, lowercase processing, provides stemming outputs. 
Use `-` to specify stemmer for languages, `English` is the default stemmer: `"standard-english"` and `"standard"` have the same stemmer setting. Supported language stemmer includes: `Danish`, `Dutch`, `English`, `Finnish`, `French`, `German`, `Hungarian`, `Italian`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`,`Russian`,`Spanish`,`Swedish`,`Turkish`. - `"rag"`: Multilingual RAG analyzer imported from [RAGFlow](https://github.com/infiniflow/ragflow/blob/main/rag/nlp/rag_tokenizer.py), supporting `Chinese` and `English`. Use `-fine` to output the fine-grained analyzer results. - `"chinese"`: Simplified Chinese. Use `-fine` to output the fine-grained analyzer results. - - `"traditional"`: Traditional Chinese - - `"japanese"`: Japanese - - `"korean"`: Korean - - `"ngram"`: [N-gram](https://en.wikipedia.org/wiki/N-gram) + - `"ik"`: Bilingual analyzer imported from [ik-analyzer](https://github.com/infinilabs/analysis-ik), supporting `Chinese` and `English`. Use `-fine` to output the fine-grained analyzer results. + - `"traditional"`: Traditional Chinese. + - `"japanese"`: Japanese. + - `"korean"`: Korean. + - `"ngram"`: [N-gram](https://en.wikipedia.org/wiki/N-gram). + - `"keyword"`: "noop" analyzer used for columns containing keywords only. - Parameter settings for a secondary index: - `"type"`: `"secondary"` - Parameter settings for a BMP index: @@ -1773,6 +1775,10 @@ Searches for data in a specified table. The search can range from a simple vecto - `"highlight"`: `string[]` - `"filter"`: `string` - `"fusion"`: `object` + - `"sort"` : `object[]` + - `"limit"` : `string` + - `"offset"` : `string` + - `"option"` : `object` ##### Request example @@ -1910,10 +1916,10 @@ curl --request GET \ A non-empty text string to search for. Used *only* when `"match_method"` is set to `"text"`. You can use various search options within the matching text, including: - Single terms: `"blooms"` - - OR multiple terms: `"Bloom OR filter"`, `"Bloom || filter"` or just `"Bloom filter"` + - OR multiple terms: `"Bloom OR filter"` or just `"Bloom filter"` - Phrase search: `'"Bloom filter"'` - - AND multiple terms: `"space AND efficient"`, `"space && efficient"` or `"space + efficient"` - - Escaping reserved characters: `"space\-efficient"` + - AND multiple terms: `"space AND efficient"` + - Escaping reserved characters: `"space\:efficient"` - Sloppy phrase search: `'"harmful chemical"~10'` - Field-specific search: `"title:(quick OR brown) AND body:foobar"` - `element_type`: `str`, *Required* @@ -1977,17 +1983,17 @@ curl --request GET \ - If `"fields"` is an empty string, this parameter specifies the default field to search on. - **"operator"**: `str`, *Optional* - If not specified, the search follows Infinity's full-text search syntax, meaning that logical and arithmetic operators, quotation marks and escape characters will function as full-text search operators, such as: - - AND operator: `AND`, `&&`, `+` - - OR operator: `OR`, `||` - - NOT operator: `NOT`, `!`, `-` + - AND operator: `AND` + - OR operator: `OR` + - NOT operator: `NOT` - PAREN operator: `(`, `)`, need to appear in pairs, and can be nested. - COLON operator: `:`: Used to specify field-specific search, e.g., `body:foobar` searches for `foobar` in the `body` field. - CARAT operator: `^`: Used to boost the importance of a term, e.g., `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`. 
- TILDE operator: `~`: Used for sloppy phrase search, e.g., `"harmful chemical"~10` searches for the phrase `"harmful chemical"` within a tolerable distance of 10 words. - SINGLE_QUOTED_STRING: Used to search for a phrase, e.g., `'Bloom filter'`. - DOUBLE_QUOTED_STRING: Used to search for a phrase, e.g., `"Bloom filter"`. - - Escape characters: Used to escape reserved characters, e.g., `space\-efficient`. Starting with a backslash `\` will escape the following characters: - `' '`, `'+'`, `'-'`, `'='`, `'&'`, `'|'`, `'!'`, `'('`, `')'`, `'{'`, `'}'`, `'['`, `']'`, `'^'`, `'"'`, `'~'`, `'*'`, `'?'`, `':'`, `'\'`, `'/'` + - Escape characters: Used to escape reserved characters, e.g., `space\:efficient`. Starting with a backslash `\` will escape the following characters: + `' '`, `'('`, `')'`, `'^'`, `'"'`, `'\''`, `'~'`, `'*'`, `'?'`, `':'`, `'\\'` - If specified, Infinity's full-text search syntax will not take effect, and the specified operator will be interpolated into `matching_text`. Useful for searching text including code numbers like `"A01-233:BC"`. - `{"operator": "or"}`: Interpolates the `OR` operator between words in `matching_text` to create a new search text. @@ -2004,7 +2010,17 @@ curl --request GET \ - `"query_tensor"`: The tensor data to compare against. This should be provided as a list of lists of numerical values. - `"element_type"`: The element data type of the query tensor. Usually `"float"`. +- `"sort"` : `object[]` + Defines how to sort the results. +- `"limit"` : `string` + Indicates the maximum number of rows to return. + +- `"offset"` : `string` + Indicates the offset position of the limit expression. You must use this parameter together with `limit`. + +- `"option"` : `object` + Specifies search options. This parameter must be used in conjunction with `limit`. #### Response @@ -2027,11 +2043,14 @@ The response includes a JSON object like the following: "age": 16 } ], + "total_hits_count": 3 } ``` - `"error_code"`: `integer` `0`: The operation succeeds. +- `"total_hits_count"`: `integer`, Optional + Available if you set a search option with `"total_hits_count": "true"` @@ -3485,6 +3504,73 @@ A `500` HTTP status code indicates an error condition. The response includes a J --- + +### Admin remove node + +**DELETE** `/admin/node/{node_name}` + +Removes a node from the cluster. This command can only be executed by the leader node. + +#### Request + +- Method: DELETE +- URL: `/admin/node/{node_name}` +- Headers: + - `accept: application/json` + - `content-type: application/json` + +##### Request example + +```shell +curl --request DELETE \ + --url http://localhost:23821/admin/node/follower1 \ + --header 'accept: application/json' \ + --header 'content-type: application/json' +``` + +#### Response + + + + +The response includes a JSON object like the following: + +```shell +{ + "error_code": 0 +} +``` + +- `"error_code"`: `integer` + `0`: The operation succeeds. + + + + +A `500` HTTP status code indicates an error condition. The response includes a JSON object like the following: + +```shell +{ + "error_code": 7020, + "error_message" : "Duplicate node: following" +} +``` + +- `"error_code"`: `integer` + A non-zero value indicates a specific error condition. +- `"error_message"`: `string` + When `error_code` is non-zero, `"error_message"` provides additional details about the error. + + + + +--- + ### Admin show node variables **GET** `/admin/variables` @@ -3673,7 +3759,7 @@ Gets information about the currently connected node. 
#### Request - Method: GET -- URL: `/admin/node/{node_name}` +- URL: `/admin/node/current` - Headers: `accept: application/json` ##### Request example @@ -3781,7 +3867,7 @@ A `500` HTTP status code indicates an error condition. The response includes a J **GET** `/admin/nodes` -Lists all nodes in the cluster. +Lists all nodes in the cluster. #### Request - Method: GET - URL: `/admin/nodes` - Headers: `accept: application/json` ##### Request example @@ -3809,19 +3895,37 @@ The response includes a JSON object like the following: ```shell { - "error_code":0, - "nodes":[ - ["following","follower"], - ["boss","leader"], - ["learning","learner"] - ] + "error_code": 0, + "nodes": [ + { + "address": "0.0.0.0:23852", + "heartbeat": "91", + "last_update": "Mon Dec 2 14:48:34 2024\n", + "name": "follower", + "role": "follower", + "status": "alive" + }, + { + "address": "0.0.0.0:23851", + "heartbeat": "0", + "last_update": "Mon Dec 2 14:48:34 2024\n", + "name": "boss", + "role": "leader", + "status": "alive" + } + ] } ``` -- `"error_code"`: `integer` +- `"error_code"`: `integer` `0`: The operation succeeds. -- `"nodes" : array` : - Each element is in `[nodename, noderole]` format. +- `"nodes" : array` : + Each element is an object with the following information: + - `"address"` : the address of the node. + - `"heartbeat"` : only valid for non-leader roles; the number of heartbeat messages received from the leader. + - `"name"` : the node's name. + - `"role"` : the node's role. + - `"status"` : the current status of the node. --- diff --git a/docs/references/pysdk_api_reference.md b/docs/references/pysdk_api_reference.md index 87cd1306fe..5c09eabefb 100644 --- a/docs/references/pysdk_api_reference.md +++ b/docs/references/pysdk_api_reference.md @@ -1,5 +1,5 @@ --- -sidebar_position: 4 +sidebar_position: 2 slug: /pysdk_api_reference --- # Python API Reference @@ -901,11 +901,13 @@ An `IndexInfo` structure contains three fields,`column_name`, `index_type`, and - `"ANALYZER"`: *Optional* - `"standard"`: (Default) The standard analyzer, segmented by token, lowercase processing, and provides stemming output. Use `-` to specify the languages stemmer. `English` is the default stemmer: `"standard-english"` and `"standard"` are the same stemmer setting. Supported language stemmers include: `Danish`, `Dutch`, `English`, `Finnish`, `French`, `German`, `Hungarian`, `Italian`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, and `Turkish`. - `"rag"`: Multilingual RAG analyzer imported from [RAGFlow](https://github.com/infiniflow/ragflow/blob/main/rag/nlp/rag_tokenizer.py), supporting `Chinese` and `English`. Use `-fine` to output the fine-grained analyzer results. + - `"ik"`: Bilingual analyzer imported from [ik-analyzer](https://github.com/infinilabs/analysis-ik), supporting `Chinese` and `English`. Use `-fine` to output the fine-grained analyzer results. - `"chinese"`: Simplified Chinese. Use `-fine` to output the fine-grained analyzer results. - - `"traditional"`: Traditional Chinese - - `"japanese"`: Japanese - - `"korean"`: Korean - - `"ngram"`: [N-gram](https://en.wikipedia.org/wiki/N-gram) + - `"traditional"`: Traditional Chinese. + - `"japanese"`: Japanese. + - `"korean"`: Korean. + - `"ngram"`: [N-gram](https://en.wikipedia.org/wiki/N-gram). + - `"keyword"`: "noop" analyzer used for columns containing keywords only. - Parameter settings for a secondary index: No parameters are required. For now, use an empty list `[]`. 
- Parameter settings for a BMP index: @@ -1569,7 +1571,7 @@ table_object.delete("c1 >= 70 and c1 <= 90") --- -### update data +### update ```python table_object.update(cond, data) ``` @@ -1786,13 +1788,134 @@ table_object.output(["*"]).filter("filter_fulltext('doc', 'first second', 'minim --- +### sort + +```python +table_object.sort(sort_expression_list) +``` + +Creates a sort expression using `sort_expression_list`. + +#### Parameters + +##### sort_expression_list: `list`, *Required* + +An expression list defining how to sort the results. + +#### Returns + +- Success: An `infinity.local_infinity.table.LocalTable` object in embedded mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode. +- Failure: `InfinityException` + - `error_code`: `int` A non-zero value indicating a specific error condition. + - `error_msg`: `str` A message providing additional details about the error. + +#### Examples + +```python +# Output results sorted by the `c2` expression in ascending order and the `c1` expression in descending order +table_obj.output(["c1", "c2"]).sort([["c2", SortType.Asc], ["c1", SortType.Desc]]).to_df() +``` + +--- + +### limit + +```python +table_object.limit(limit_num) +``` + +Creates an expression to limit the number of output rows to a maximum of `limit_num`. + +#### Parameters + +##### limit_num: `int`, *Required* + +An integer specifying the maximum number of output rows. + +#### Returns + +- Success: An `infinity.local_infinity.table.LocalTable` object in embedded mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode. +- Failure: `InfinityException` + - `error_code`: `int` A non-zero value indicating a specific error condition. + - `error_msg`: `str` A message providing additional details about the error. + +#### Examples + +```python +# Limit the output row count to a maximum of two +table_instance.output(["num", "vec"]).limit(2).to_pl() +``` + +--- + +### offset + +```python +table_object.limit(limit_num).offset(offset_value) +``` + +Creates a limit expression with an offset value, setting the output to start from `offset_value` and limiting the row count to a maximum of `limit_num`. This method must be used in conjunction with `limit()`. + +#### Parameters + +##### offset_value: `int`, *Required* + +An integer specifying the offset position of the limit expression. + +#### Returns + +- Success: An `infinity.local_infinity.table.LocalTable` object in embedded mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode. +- Failure: `InfinityException` + - `error_code`: `int` A non-zero value indicating a specific error condition. + - `error_msg`: `str` A message providing additional details about the error. + +#### Examples + +```python +# Limit the output row count to no more than two, starting from position 1 +table_instance.output(["num", "vec"]).offset(1).limit(2).to_pl() +``` + +--- + +### option + +```python +table_object.option(option_dict) +``` + +Specifies search options for the query. + +#### Parameters + +##### option_dict: `dict`, *Required* + +A dictionary specifying the following search options: + +- **"total_hits_count"**: `bool`, *Optional* + - Must be used together with a limit expression. If `"total_hits_count"` is `True`, the query outputs an extra result that includes the total number of hits. + +#### Returns + +- Success: An `infinity.local_infinity.table.LocalTable` object in embedded mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode. 
+- Failure: `InfinityException` + - `error_code`: `int` A non-zero value indicating a specific error condition. + - `error_msg`: `str` A message providing additional details about the error. + +#### Examples + +```python +# Limit the output row count not more than 2, start from position 1, output an extra result to indicate total hits row count +table_instance.output(["num", "vec"]).limit(2).offset(1).option({"total_hits_count": True}).to_pl() +``` + +--- + ### match_dense ```python table_object.match_dense(vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params = None) ``` -Creates a dense vector search expression to identify the top n closest rows to the given dense vector. Suitable for working with dense vectors (dense embeddings) or multi-vectors (multiple dense embeddings in one row). +Creates a dense vector search expression to identify the closest top n rows to the given dense vector. Suitable for working with dense vectors (dense embeddings) or multi-vectors (multiple dense embeddings in one row). :::tip NOTE To display your query results, you must chain this method with `output(columns)`, which specifies the columns to output, and a method such as `to_pl()`, `to_df()`, or `to_arrow()` to format the query results. @@ -2031,10 +2154,10 @@ To display your query results, you must chain this method with `output(columns)` A non-empty text string to search for. You can use various search options within the matching text, including: - Single terms: `"blooms"` -- OR multiple terms: `"Bloom OR filter"`, `"Bloom || filter"` or just `"Bloom filter"` +- OR multiple terms: `"Bloom OR filter"` or just `"Bloom filter"` - Phrase search: `'"Bloom filter"'` -- AND multiple terms: `"space AND efficient"`, `"space && efficient"` or `"space + efficient"` -- Escaping reserved characters: `"space\-efficient"` +- AND multiple terms: `"space AND efficient"` +- Escaping reserved characters: `"space\:efficient"` - Sloppy phrase search: `'"harmful chemical"~10'` - Field-specific search: `"title:(quick OR brown) AND body:foobar"` @@ -2050,17 +2173,17 @@ An optional dictionary specifying the following search options: - If `"fields"` is an empty string, this parameter specifies the default field to search on. - **"operator"**: `str`, *Optional* - If not specified, the search follows Infinity's full-text search syntax, meaning that logical and arithmetic operators, quotation marks and escape characters will function as full-text search operators, such as: - - AND operator: `AND`, `&&`, `+` - - OR operator: `OR`, `||` - - NOT operator: `NOT`, `!`, `-` + - AND operator: `AND` + - OR operator: `OR` + - NOT operator: `NOT` - PAREN operator: `(`, `)`, need to appear in pairs, and can be nested. - COLON operator: `:`: Used to specify field-specific search, e.g., `body:foobar` searches for `foobar` in the `body` field. - CARAT operator: `^`: Used to boost the importance of a term, e.g., `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`. - TILDE operator: `~`: Used for sloppy phrase search, e.g., `"harmful chemical"~10` searches for the phrase `"harmful chemical"` within a tolerable distance of 10 words. - SINGLE_QUOTED_STRING: Used to search for a phrase, e.g., `'Bloom filter'`. - DOUBLE_QUOTED_STRING: Used to search for a phrase, e.g., `"Bloom filter"`. - - Escape characters: Used to escape reserved characters, e.g., `space\-efficient`. 
Starting with a backslash `\` will escape the following characters:
-      `' '`, `'+'`, `'-'`, `'='`, `'&'`, `'|'`, `'!'`, `'('`, `')'`, `'{'`, `'}'`, `'['`, `']'`, `'^'`, `'"'`, `'~'`, `'*'`, `'?'`, `':'`, `'\'`, `'/'`
+    - Escape characters: Used to escape reserved characters, e.g., `space\:efficient`. Starting with a backslash `\` will escape the following characters:
+      `' '`, `'('`, `')'`, `'^'`, `'"'`, `'\''`, `'~'`, `'*'`, `'?'`, `':'`, `'\\'`
  - If specified, Infinity's full-text search syntax will not take effect, and the specified operator will be interpolated into `matching_text`. Useful for searching text including code numbers like `"A01-233:BC"`.
    - `{"operator": "or"}`: Interpolates the `OR` operator between words in `matching_text` to create a new search text.
@@ -2283,7 +2406,7 @@ We recommend calling `to_df()`, `to_pl()`, or `to_arrow()` to format your result

#### Returns

-`tuple[dict[str, list[Any]], dict[str, Any]]`
+A `tuple[dict[str, list[Any]], dict[str, Any], {}]` object.

### to_df

```python
table_object.to_df()
```

-Returns the query result in pandas DataFrame format.
+Returns the query result as a tuple consisting of a pandas DataFrame and a dict.

:::tip NOTE
Call `to_df()` in a chain after (not necessarily "immediately after") `output(columns)` on the same table object.
@@ -2299,13 +2422,13 @@ Call `to_df()` in a chain after (not necessarily "immediately after") `output(co

#### Returns

-A `pandas.DataFrame` object.
+A `tuple[pandas.DataFrame, {}]` object.

#### Examples

```python
# Format columns "c1" and "c2" of the current table into a pandas DataFrame
-res = table_object.output(["c1", "c2"]).to_df()
+res, extra_res = table_object.output(["c1", "c2"]).to_df()
```

### to_pl

```python
table_object.to_pl()
```

-Returns the query result in Polars DataFrame format.
+Returns the query result as a tuple consisting of a Polars DataFrame and a dict.

:::tip NOTE
Call `to_pl()` in a chain after (not necessarily "immediately after") `output(columns)` on the same table object.
@@ -2322,13 +2445,13 @@ Call `to_pl()` in a chain after (not necessarily "immediately after") `output(co

#### Returns

-A `polars.DataFrame` object.
+A `tuple[polars.DataFrame, {}]` object.

#### Examples

```python
-# Format a vector search result into a Polars DataFrame.
-res = table_object.output(["*"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 10).to_pl()
+# Format a vector search result into a Polars DataFrame.
+res, extra_res = table_object.output(["*"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 10).to_pl()
```

### to_arrow

```python
table_object.to_arrow()
```

-Returns the query result in Apache Arrow Table format.
+Returns the query result as a tuple consisting of an Apache Arrow Table and a dict.

:::tip NOTE
Call `to_arrow()` in a chain after (not necessarily "immediately after") `output(columns)` on the same table object.
@@ -2345,13 +2468,13 @@ Call `to_arrow()` in a chain after (not necessarily "immediately after") `output

#### Returns

-A `pyarrow.Table` object.
+A `tuple[pyarrow.Table, {}]` object.

#### Examples

```python
# Format the current table object into an Apache Arrow Table.
-res = table_object.output(["*"]).filter("score >= 90").to_arrow() +res, extra_result = table_object.output(["*"]).filter("score >= 90").to_arrow() ``` --- diff --git a/example/delete_update_data.py b/example/delete_update_data.py index 2f1ac7ac9f..398c6edfe3 100644 --- a/example/delete_update_data.py +++ b/example/delete_update_data.py @@ -87,8 +87,10 @@ print('about to update data') table_instance.update("num = 2", {"body": "unnecessary and harmful", "vec": [14.0, 7.2, 0.8, 10.9]}) - result = table_instance.output(["*"]).to_pl() + result, extra_result = table_instance.output(["*"]).to_pl() print(result) + if extra_result is not None: + print(extra_result) infinity_instance.disconnect() print('test done') diff --git a/example/export_data.py b/example/export_data.py index d39ce8a5b8..d3d655487e 100644 --- a/example/export_data.py +++ b/example/export_data.py @@ -86,7 +86,7 @@ }, { "num": 7, - "body": "Chris", + "name": "Chris", "age": 21, "score": 88.0, }, diff --git a/example/filter_data.py b/example/filter_data.py index abb067511d..a13eecfeab 100644 --- a/example/filter_data.py +++ b/example/filter_data.py @@ -72,7 +72,7 @@ }, { "num": 7, - "body": "Chris", + "name": "Chris", "score": 88.0, }, { @@ -99,8 +99,10 @@ # result = table_instance.output(["num", "name", "score"]).filter("not (score > 80.0)").to_pl() # print(result) - result = table_instance.output(["num", "name", "score"]).filter("num <> 9").to_pl() + result, extra_result = table_instance.output(["num", "name", "score"]).filter("num <> 9").to_pl() print(result) + if extra_result is not None: + print(extra_result) infinity_instance.disconnect() print('test done') diff --git a/example/filter_fulltext_keyword.py b/example/filter_fulltext_keyword.py index 5809da1333..daf6e691a9 100644 --- a/example/filter_fulltext_keyword.py +++ b/example/filter_fulltext_keyword.py @@ -101,16 +101,22 @@ ) # output 7, 8, 9, 10 - result = table_instance.output(["*"]).filter("(score > 80.0) and (score <= 90.0)").to_pl() + result, extra_result = table_instance.output(["*"]).filter("(score > 80.0) and (score <= 90.0)").to_pl() print(result) + if extra_result is not None: + print(extra_result) # output 6, 8 - result = table_instance.output(["*"]).filter("filter_fulltext('uuid', 'UUID-2-1 UUID-2-3')").to_pl() + result, extra_result = table_instance.output(["*"]).filter("filter_fulltext('uuid', 'UUID-2-1 UUID-2-3')").to_pl() print(result) + if extra_result is not None: + print(extra_result) # output 8 - result = table_instance.output(["*"]).filter("(score > 80.0) and (score <= 90.0) and filter_fulltext('uuid', 'UUID-2-1 UUID-2-3')").to_pl() + result, extra_result = table_instance.output(["*"]).filter("(score > 80.0) and (score <= 90.0) and filter_fulltext('uuid', 'UUID-2-1 UUID-2-3')").to_pl() print(result) + if extra_result is not None: + print(extra_result) # drop table db_instance.drop_table("my_table") diff --git a/example/fulltext_search.py b/example/fulltext_search.py index f3e6102187..75ad5c962d 100644 --- a/example/fulltext_search.py +++ b/example/fulltext_search.py @@ -86,13 +86,15 @@ r'"harmful chemical"~10', # sloppy phrase, refers to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query-phrase.html ] for question in questions: - qb_result = ( + qb_result, extra_result = ( table_instance.output(["num", "body", "_score"]).highlight(["body"]) .match_text("body", question, 10) .to_pl() ) print(f"question: {question}") print(qb_result) + if extra_result is not None: + print(extra_result) infinity_instance.disconnect() 
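The example-script changes above all follow the same migration pattern: `to_pl()`, `to_df()`, and `to_arrow()` now return a `(result, extra_result)` tuple, where `extra_result` is a dict (for instance, carrying `total_hits_count` when requested via `option()`) or `None`. A minimal sketch of the new calling convention, assuming a locally running Infinity server and an existing table named `my_table` (connection address and column names are illustrative only):

```python
import infinity

# Connect to a local Infinity server (address is an assumption; adjust as needed).
infinity_instance = infinity.connect(infinity.common.NetworkAddress("127.0.0.1", 23817))
db_instance = infinity_instance.get_database("default_db")
table_instance = db_instance.get_table("my_table")

# to_pl() now returns a (polars.DataFrame, dict | None) tuple. The second
# element is populated when extra results are requested, e.g. via
# option({"total_hits_count": True}) combined with limit().
result, extra_result = (
    table_instance.output(["num", "body"])
    .limit(2)
    .offset(1)
    .option({"total_hits_count": True})
    .to_pl()
)
print(result)
if extra_result is not None:
    print(extra_result)  # e.g. {"total_hits_count": 10}

infinity_instance.disconnect()
```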
diff --git a/example/fulltext_search_zh.py b/example/fulltext_search_zh.py index 58f7deaf22..d658058c77 100644 --- a/example/fulltext_search_zh.py +++ b/example/fulltext_search_zh.py @@ -112,9 +112,11 @@ r'"Bloom filter"', # phrase: adjacent multiple terms ] for question in questions: - qb_result = table_instance.output(["num", "body", "_score"]).highlight(["body"]).match_text("body", question, 10).to_pl() + qb_result, extra_result = table_instance.output(["num", "body", "_score"]).highlight(["body"]).match_text("body", question, 10).to_pl() print(f"question: {question}") print(qb_result) + if extra_result is not None: + print(extra_result) infinity_instance.disconnect() diff --git a/example/functions.py b/example/functions.py index 440216155b..a7821643b2 100644 --- a/example/functions.py +++ b/example/functions.py @@ -26,55 +26,85 @@ # varchar functions #function char_length -res = table_obj.output(["*", "char_length(c1)"]).filter("char_length(c1) = 1").to_df() +res, extra_result = table_obj.output(["*", "char_length(c1)"]).filter("char_length(c1) = 1").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*", "char_length(c1)"]).filter("char_length(c1) = 3").to_df() +res, extra_result = table_obj.output(["*", "char_length(c1)"]).filter("char_length(c1) = 3").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*", "char_length(c1)"]).filter("char_length(c1) = 4").to_df() +res, extra_result = table_obj.output(["*", "char_length(c1)"]).filter("char_length(c1) = 4").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*", "char_length(c1)"]).filter("char_length(c1) = char_length(c2)").to_df() +res, extra_result = table_obj.output(["*", "char_length(c1)"]).filter("char_length(c1) = char_length(c2)").to_df() print(res) +if extra_result is not None: + print(extra_result) #function regex -res = table_obj.output(["*", "regex(c1, 'bc')"]).filter("regex(c1, 'bc')").to_df() +res, extra_result = table_obj.output(["*", "regex(c1, 'bc')"]).filter("regex(c1, 'bc')").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*"]).filter("regex(c1, '(\w+([-+.]\w+)*)@(\w+([-.]\w+)*)\.(\w+([-.]\w+)*)')").to_df() +res, extra_result = table_obj.output(["*"]).filter("regex(c1, '(\w+([-+.]\w+)*)@(\w+([-.]\w+)*)\.(\w+([-.]\w+)*)')").to_df() print(res) +if extra_result is not None: + print(extra_result) #function substring -res = table_obj.output(["*", "substring(c1, 0, 2)"]).filter("substring(c1, 0, 2) = 'ab'").to_df() +res, extra_result = table_obj.output(["*", "substring(c1, 0, 2)"]).filter("substring(c1, 0, 2) = 'ab'").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*", "substring(c1, 0, 4)"]).filter("substring(c1, 0, 4) = 'test'").to_df() +res, extra_result = table_obj.output(["*", "substring(c1, 0, 4)"]).filter("substring(c1, 0, 4) = 'test'").to_df() print(res) +if extra_result is not None: + print(extra_result) #function upper and lower -res = table_obj.output(["*", "upper(c1)"]).filter("upper(c1) = 'TEST@GMAIL.COM'").to_df() +res, extra_result = table_obj.output(["*", "upper(c1)"]).filter("upper(c1) = 'TEST@GMAIL.COM'").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*"]).filter("lower('ABC') = c1").to_df() +res, extra_result = table_obj.output(["*"]).filter("lower('ABC') = c1").to_df() print(res) +if extra_result is not None: + 
print(extra_result) #function ltrim, rtrim, trim -res = table_obj.output(["*", "ltrim(c1)"]).filter("ltrim(c1) = 'abc'").to_df() +res, extra_result = table_obj.output(["*", "ltrim(c1)"]).filter("ltrim(c1) = 'abc'").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*", "rtrim(c1)"]).filter("rtrim(c1) = 'abc'").to_df() +res, extra_result = table_obj.output(["*", "rtrim(c1)"]).filter("rtrim(c1) = 'abc'").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*", "trim(c1)"]).filter("trim(c1) = 'abc'").to_df() +res, extra_result = table_obj.output(["*", "trim(c1)"]).filter("trim(c1) = 'abc'").to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*"]).filter("trim(' abc ') = rtrim(ltrim(' abc '))").to_df() +res, extra_result = table_obj.output(["*"]).filter("trim(' abc ') = rtrim(ltrim(' abc '))").to_df() print(res) +if extra_result is not None: + print(extra_result) #function char_position -res = table_obj.output(["*", "char_position(c1, 'bc')"]).filter("char_position(c1, c1) <> 0").to_df() +res, extra_result = table_obj.output(["*", "char_position(c1, 'bc')"]).filter("char_position(c1, c1) <> 0").to_df() print(res) +if extra_result is not None: + print(extra_result) # math functions db_obj.drop_table("function_example", ConflictType.Ignore) @@ -87,27 +117,39 @@ {"c1": 9, "c2": 10}, {"c1": 11, "c2": 12}, {"c1": 13, "c2": 14}, {"c1": 15, "c2": 16},]) #function sqrt -res = table_obj.output(["*", "sqrt(c1)", "sqrt(c2)"]).to_df() +res, extra_result = table_obj.output(["*", "sqrt(c1)", "sqrt(c2)"]).to_df() print(res) +if extra_result is not None: + print(extra_result) -res = table_obj.output(["*", "sqrt(c1)", "sqrt(c2)"]).filter("sqrt(c1) = 3").to_df() +res, extra_result = table_obj.output(["*", "sqrt(c1)", "sqrt(c2)"]).filter("sqrt(c1) = 3").to_df() print(res) +if extra_result is not None: + print(extra_result) #function round -res = table_obj.output(["*", "round(c1)", "round(c2)"]).to_df() +res, extra_result = table_obj.output(["*", "round(c1)", "round(c2)"]).to_df() print(res) +if extra_result is not None: + print(extra_result) #function ceiling -res = table_obj.output(["*", "ceil(c1)", "ceil(c2)"]).to_df() +res, extra_result = table_obj.output(["*", "ceil(c1)", "ceil(c2)"]).to_df() print(res) +if extra_result is not None: + print(extra_result) #function floor -res = table_obj.output(["*", "floor(c1)", "floor(c2)"]).to_df() +res, extra_result = table_obj.output(["*", "floor(c1)", "floor(c2)"]).to_df() print(res) +if extra_result is not None: + print(extra_result) #function ln -res = table_obj.output(["*", "ln(c1)", "ln(c2)"]).to_df() +res, extra_result = table_obj.output(["*", "ln(c1)", "ln(c2)"]).to_df() print(res) +if extra_result is not None: + print(extra_result) res = db_obj.drop_table("function_example") diff --git a/example/http/insert_search_data.sh b/example/http/insert_search_data.sh index ffd62a01b9..0a5d4af466 100755 --- a/example/http/insert_search_data.sh +++ b/example/http/insert_search_data.sh @@ -144,7 +144,11 @@ curl --request GET \ ], "filter": "num > 1 and year < 2024", "offset": "1", - "limit": "1" + "limit": "1", + "option": + { + "total_hits_count": "true" + } } ' echo -e '\n\n-- search with dense vector' diff --git a/example/hybrid_search.py b/example/hybrid_search.py index 762d6f47f6..6692d84cc6 100644 --- a/example/hybrid_search.py +++ b/example/hybrid_search.py @@ -90,9 +90,9 @@ infinity.common.ConflictType.Error, ) - result = ( + result, 
extra_result = ( table_instance.output( - ["num", "body", "vec", "sparse", "year", "tensor", "_score"] + ["num", "body", "vec", "sparse", "year", "tensor", "score()"] ) .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) .match_sparse( @@ -108,6 +108,8 @@ .to_pl() # .explain(explain_type=infinity.table.ExplainType.UnOpt) ) + if extra_result is not None: + print(extra_result) print(result) infinity_instance.disconnect() diff --git a/example/import_data.py b/example/import_data.py index 38027df883..a355fdba3e 100644 --- a/example/import_data.py +++ b/example/import_data.py @@ -48,8 +48,10 @@ table_instance.import_data(project_directory + "/../test/data/csv/fulltext_delete.csv", {"header": True, "file_type": "csv", "delimiter": "\t"}) - result = table_instance.output(["num", "doc"]).to_pl() + result, extra_result = table_instance.output(["num", "doc"]).to_pl() print(result) + if extra_result is not None: + print(extra_result) infinity_instance.disconnect() diff --git a/example/search_with_limit_offset.py b/example/search_with_limit_offset.py index ede57f24f1..7495275f8a 100644 --- a/example/search_with_limit_offset.py +++ b/example/search_with_limit_offset.py @@ -61,9 +61,11 @@ ] ) - result = table_instance.output(["num", "vec", "_similarity"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", - "cosine", 3).limit(2).offset(1).to_pl() + result, extra_result = table_instance.output(["num", "vec", "_similarity"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3).limit(2).offset(1).option({"total_hits_count": True}).to_pl() print(result) + if extra_result is not None: + print(extra_result) + infinity_instance.disconnect() print('test done') diff --git a/example/secondary_index.py b/example/secondary_index.py index 48d1eefbaa..3c4a7aac5a 100644 --- a/example/secondary_index.py +++ b/example/secondary_index.py @@ -55,8 +55,10 @@ ) table_instance.create_index("index1", infinity.index.IndexInfo("id", infinity.index.IndexType.Secondary)) - res = table_instance.filter("id='ID_1'").output(["*"]).to_pl() + res, extra_result = table_instance.filter("id='ID_1'").output(["*"]).to_pl() print(res) + if extra_result is not None: + print(extra_result) infinity_instance.disconnect() diff --git a/example/simple_example.py b/example/simple_example.py index d4117f200f..73799d4c5c 100644 --- a/example/simple_example.py +++ b/example/simple_example.py @@ -61,8 +61,10 @@ ] ) - res = table_instance.output(["num", "body", "vec"]).to_pl() + res, extra_result = table_instance.output(["num", "body", "vec"]).to_pl() print(res) + if extra_result is not None: + print(extra_result) infinity_instance.disconnect() diff --git a/example/sparse_vector_search.py b/example/sparse_vector_search.py index 0a66bdc1dc..979a841502 100644 --- a/example/sparse_vector_search.py +++ b/example/sparse_vector_search.py @@ -61,8 +61,11 @@ ] ) - result = table_instance.output(["num", "vec", "_similarity"]).match_sparse("vec", infinity.common.SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3).to_pl() + result, extra_result = table_instance.output(["num", "vec", "_similarity"]).match_sparse("vec", infinity.common.SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3).to_pl() print(result) + if extra_result is not None: + print(extra_result) + infinity_instance.disconnect() print('test done') diff --git a/example/tensor_search.py b/example/tensor_search.py index f9822adcc6..59072f41fc 100644 --- a/example/tensor_search.py +++ b/example/tensor_search.py @@ -62,10 +62,14 @@ }, ] ) - result = table_instance.output(["num", "vec", 
"_score"]).match_tensor("vec", - [[0.9, 0.0, 0.0, 0.0], [1.1, 0.0, 0.0, 0.0]], - 'float', 2).to_pl() + result, extra_result = table_instance.output(["num", "vec", "_score"]).match_tensor("vec", + [[0.9, 0.0, 0.0, 0.0], + [1.1, 0.0, 0.0, 0.0]], + 'float', 2).to_pl() print(result) + if extra_result is not None: + print(extra_result) + infinity_instance.disconnect() print('test done') diff --git a/example/vector_search.py b/example/vector_search.py index 443d4d7e6b..ab377359f5 100644 --- a/example/vector_search.py +++ b/example/vector_search.py @@ -61,9 +61,16 @@ ] ) - result = table_instance.output(["num", "vec", "_similarity"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", - "cosine", 3).to_pl() + result, extra_result = (table_instance. + output(["num", "vec", "_similarity"]). + match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3). + option({"total_hits_count": False}). + to_pl()) + print(result) + if extra_result is not None: + print(extra_result) + infinity_instance.disconnect() print('test done') diff --git a/pyproject.toml b/pyproject.toml index 723c9970dc..d75de1c4a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ build-backend = "scikit_build_core.build" [project] name = "infinity_embedded_sdk" -version = "0.5.0.dev5" +version = "0.5.0.dev6" requires-python = ">=3.10" dependencies = [ "sqlglot~=11.7.0", diff --git a/python/README.md b/python/README.md index 9e201caa2d..1ea77fa2ff 100644 --- a/python/README.md +++ b/python/README.md @@ -51,11 +51,11 @@ LD_PRELOAD=$(ldconfig -p | grep 'libjemalloc.so ' | awk '{print $4}') python3 ex ``` Note: If you run with the debug version, you must set the **libasan** environment variable, for example ```shell -LD_PRELOAD=$(find $(clang-18 -print-resource-dir) -name "libclang_rt.asan.so") python3 example/simple_example.py +LD_PRELOAD=$(find $(clang-18 -print-resource-dir) -name "libclang_rt.asan-x86_64.so") python3 example/simple_example.py ``` -Note: When running with the debug version infinity-sdk, you may find some memory leaks caused by arrow. You can use `ASAN_OPTIONS=detect_leaks=0` to disable memory leak detection, for example +Note: When running with the debug version infinity_embedded-sdk, you may find some memory leaks caused by arrow. You can use `ASAN_OPTIONS=detect_leaks=0` to disable memory leak detection, for example ```shell -LD_PRELOAD=$(find $(clang-18 -print-resource-dir) -name "libclang_rt.asan.so") ASAN_OPTIONS=detect_leaks=0 python3 example/simple_example.py +LD_PRELOAD=$(find $(clang-18 -print-resource-dir) -name "libclang_rt.asan-x86_64.so") ASAN_OPTIONS=detect_leaks=0 python3 example/simple_example.py ``` # run pysdk test diff --git a/python/infinity_embedded/__init__.py b/python/infinity_embedded/__init__.py index 4b039c4926..764f6a567f 100644 --- a/python/infinity_embedded/__init__.py +++ b/python/infinity_embedded/__init__.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,12 +20,14 @@ # import pkg_resources # __version__ = pkg_resources.get_distribution("infinity_sdk").version -from infinity_embedded.common import URI, NetworkAddress, LOCAL_HOST, LOCAL_INFINITY_PATH, InfinityException, LOCAL_INFINITY_CONFIG_PATH +from infinity_embedded.common import URI, NetworkAddress, LOCAL_HOST, LOCAL_INFINITY_PATH, InfinityException, \ + LOCAL_INFINITY_CONFIG_PATH from infinity_embedded.infinity import InfinityConnection from infinity_embedded.local_infinity.infinity import LocalInfinityConnection from infinity_embedded.errors import ErrorCode -def connect(uri = LOCAL_INFINITY_PATH, config_path = LOCAL_INFINITY_CONFIG_PATH) -> InfinityConnection: + +def connect(uri=LOCAL_INFINITY_PATH, config_path=LOCAL_INFINITY_CONFIG_PATH) -> InfinityConnection: if isinstance(uri, str) and len(uri) != 0: return LocalInfinityConnection(uri, config_path) else: diff --git a/python/infinity_embedded/common.py b/python/infinity_embedded/common.py index d956f5c28d..a0d2e5fb87 100644 --- a/python/infinity_embedded/common.py +++ b/python/infinity_embedded/common.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from pathlib import Path from typing import Union from dataclasses import dataclass @@ -75,10 +76,12 @@ class ConflictType(object): Error = 1 Replace = 2 + class SortType(object): Asc = 0 Desc = 1 + class InfinityException(Exception): def __init__(self, error_code=0, error_message=None): self.error_code = error_code diff --git a/python/infinity_embedded/db.py b/python/infinity_embedded/db.py index 1e1693c890..930924397e 100644 --- a/python/infinity_embedded/db.py +++ b/python/infinity_embedded/db.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ from abc import ABC, abstractmethod + class Database(ABC): @abstractmethod diff --git a/python/infinity_embedded/errors.py b/python/infinity_embedded/errors.py index 1e482d3eb8..a40d15f381 100644 --- a/python/infinity_embedded/errors.py +++ b/python/infinity_embedded/errors.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -117,6 +117,15 @@ class ErrorCode(IntEnum): INVALID_EXPLAIN_TYPE = 3081, CHUNK_NOT_EXIST = 3082, NAME_MISMATCHED = 3083, + TRANSACTION_NOT_FOUND = 3084, + INVALID_DATABASE_INDEX = 3085, + INVALID_TABLE_INDEX = 3086, + FUNCTION_IS_DISABLE = 3087, + NOT_FOUND = 3088, + ERROR_INIT = 3089, + FILE_IS_OPEN = 3090, + UNKNOWN = 3091, + INVALID_QUERY_OPTION = 3092, TXN_ROLLBACK = 4001, TXN_CONFLICT = 4002, @@ -126,6 +135,7 @@ class ErrorCode(IntEnum): TOO_MANY_CONNECTIONS = 5003, CONFIGURATION_LIMIT_EXCEED = 5004, QUERY_IS_TOO_COMPLEX = 5005, + FAIL_TO_GET_SYS_INFO = 5006, QUERY_CANCELLED = 6001, QUERY_NOT_SUPPORTED = 6002, @@ -147,7 +157,26 @@ class ErrorCode(IntEnum): MUNMAP_FILE_ERROR = 7014, INVALID_FILE_FLAG = 7015, INVALID_SERVER_ADDRESS = 7016, + FAIL_TO_FUN_PYTHON = 7017, + CANT_CONNECT_SERVER = 7018, + NOT_EXIST_NODE = 7019, + DUPLICATE_NODE = 7020, + CANT_CONNECT_LEADER = 7021, + MINIO_INVALID_ACCESS_KEY = 7022, + MINIO_BUCKET_NOT_EXISTS = 7023, + INVALID_STORAGE_TYPE = 7024, + NOT_REGISTERED = 7025, + CANT_SWITCH_ROLE = 7026, + TOO_MANY_FOLLOWER = 7027, + TOO_MANY_LEARNER = 7028, INVALID_ENTRY = 8001, - NOT_FOUND_ENTRY = 8002, - EMPTY_ENTRY_LIST = 8003, + DUPLICATE_ENTRY = 8002 + NOT_FOUND_ENTRY = 8003, + EMPTY_ENTRY_LIST = 8004, + NO_WAL_ENTRY_FOUND = 8005, + WRONG_CHECKPOINT_TYPE = 8006, + INVALID_NODE_ROLE = 8007, + INVALID_NODE_STATUS = 8008, + NODE_INFO_UPDATED = 8009, + NODE_NAME_MISMATCH = 8010 diff --git a/python/infinity_embedded/index.py b/python/infinity_embedded/index.py index 1620ceb644..392185c275 100644 --- a/python/infinity_embedded/index.py +++ b/python/infinity_embedded/index.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,6 @@ from infinity_embedded.embedded_infinity_ext import IndexType as LocalIndexType, WrapIndexInfo from infinity_embedded.embedded_infinity_ext import InitParameter as LocalInitParameter -from infinity_embedded.embedded_infinity_ext import WrapIndexInfo as LocalIndexInfo from infinity_embedded.errors import ErrorCode diff --git a/python/infinity_embedded/infinity.py b/python/infinity_embedded/infinity.py index 1251925629..8b40e08186 100644 --- a/python/infinity_embedded/infinity.py +++ b/python/infinity_embedded/infinity.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from abc import ABC, abstractmethod + # abstract class class InfinityConnection(ABC): def __init__(self, uri): diff --git a/python/infinity_embedded/local_infinity/client.py b/python/infinity_embedded/local_infinity/client.py index 501867ad57..82f322ab27 100644 --- a/python/infinity_embedded/local_infinity/client.py +++ b/python/infinity_embedded/local_infinity/client.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -22,7 +22,8 @@ class LocalQueryResult: def __init__(self, error_code: PyErrorCode, error_msg: str, db_names=None, table_names=None, index_names=None, column_defs=None, column_fields=None, database_name=None, store_dir=None, table_count=None, comment=None, - table_name=None, index_name=None, index_type=None, index_comment=None, deleted_rows=0): + table_name=None, index_name=None, index_type=None, index_comment=None, deleted_rows=0, + extra_result=None): self.error_code = error_code self.error_msg = error_msg self.db_names = db_names @@ -40,10 +41,11 @@ def __init__(self, error_code: PyErrorCode, error_msg: str, db_names=None, table self.index_type = index_type self.index_comment = index_comment self.deleted_rows = deleted_rows + self.extra_result = extra_result class LocalInfinityClient: - def __init__(self, path: str = LOCAL_INFINITY_PATH, config_path = LOCAL_INFINITY_CONFIG_PATH): + def __init__(self, path: str = LOCAL_INFINITY_PATH, config_path=LOCAL_INFINITY_CONFIG_PATH): self.path = path Infinity.LocalInit(path, config_path) self.client = Infinity.LocalConnect() @@ -69,7 +71,7 @@ def convert_res(self, res, has_db_names=False, has_table_names=False, has_result return LocalQueryResult(PyErrorCode(res.error_code.value), res.error_msg, table_names=res.names) if has_result_data: return LocalQueryResult(PyErrorCode(res.error_code.value), res.error_msg, column_defs=res.column_defs, - column_fields=res.column_fields) + column_fields=res.column_fields, extra_result=res.extra_result) if has_db_name: return LocalQueryResult(PyErrorCode(res.error_code.value), res.error_msg, database_name=res.database_name, store_dir=res.store_dir, table_count=res.table_count, comment=res.comment) @@ -205,6 +207,7 @@ def search(self, highlight_list: list[WrapParsedExpr] = [], order_by_list: list[WrapOrderByExpr] = [], group_by_list: list[WrapParsedExpr] = [], + total_hits_count_flag: bool = False, search_expr: WrapSearchExpr = None, where_expr: WrapParsedExpr = None, limit_expr: WrapParsedExpr = None, @@ -217,6 +220,7 @@ def search(self, highlight_list, order_by_list, group_by_list, + total_hits_count_flag, search_expr, where_expr, limit_expr, diff --git a/python/infinity_embedded/local_infinity/db.py b/python/infinity_embedded/local_infinity/db.py index d2835e3b11..81e0308120 100644 --- a/python/infinity_embedded/local_infinity/db.py +++ b/python/infinity_embedded/local_infinity/db.py @@ -1,3 +1,17 @@ +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from abc import ABC from infinity_embedded.db import Database diff --git a/python/infinity_embedded/local_infinity/infinity.py b/python/infinity_embedded/local_infinity/infinity.py index 669a007579..9201def38f 100644 --- a/python/infinity_embedded/local_infinity/infinity.py +++ b/python/infinity_embedded/local_infinity/infinity.py @@ -1,3 +1,17 @@ +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from infinity_embedded import InfinityConnection from abc import ABC @@ -90,7 +104,6 @@ def show_current_node(self): else: raise InfinityException(res.error_code, res.error_msg) - def search(self, db_name, table_name): self.check_connect() res = self._client.search(db_name, table_name, []) diff --git a/python/infinity_embedded/local_infinity/query_builder.py b/python/infinity_embedded/local_infinity/query_builder.py index a11935d869..3e1967e232 100644 --- a/python/infinity_embedded/local_infinity/query_builder.py +++ b/python/infinity_embedded/local_infinity/query_builder.py @@ -1,3 +1,17 @@ +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from __future__ import annotations from abc import ABC @@ -18,17 +32,19 @@ from infinity_embedded.table import ExplainType as BaseExplainType from infinity_embedded.errors import ErrorCode + class Query(ABC): def __init__( - self, - columns: Optional[List[WrapParsedExpr]], - highlight: Optional[List[WrapParsedExpr]], - search: Optional[WrapSearchExpr], - filter: Optional[WrapParsedExpr], - group_by: Optional[List[WrapParsedExpr]], - limit: Optional[WrapParsedExpr], - offset: Optional[WrapParsedExpr], - sort: Optional[List[WrapOrderByExpr]] + self, + columns: Optional[List[WrapParsedExpr]], + highlight: Optional[List[WrapParsedExpr]], + search: Optional[WrapSearchExpr], + filter: Optional[WrapParsedExpr], + group_by: Optional[List[WrapParsedExpr]], + limit: Optional[WrapParsedExpr], + offset: Optional[WrapParsedExpr], + sort: Optional[List[WrapOrderByExpr]], + total_hits_count: Optional[bool] ): self.columns = columns self.highlight = highlight @@ -38,22 +54,23 @@ def __init__( self.limit = limit self.offset = offset self.sort = sort + self.total_hits_count = total_hits_count class ExplainQuery(Query): def __init__( - self, - columns: Optional[List[WrapParsedExpr]], - highlight: Optional[List[WrapParsedExpr]], - search: Optional[WrapSearchExpr], - filter: Optional[WrapParsedExpr], - group_by: Optional[List[WrapParsedExpr]], - limit: Optional[WrapParsedExpr], - offset: Optional[WrapParsedExpr], - sort: Optional[List[WrapOrderByExpr]], - explain_type: Optional[BaseExplainType], + self, + columns: Optional[List[WrapParsedExpr]], + highlight: Optional[List[WrapParsedExpr]], + search: Optional[WrapSearchExpr], + filter: Optional[WrapParsedExpr], + group_by: Optional[List[WrapParsedExpr]], + limit: Optional[WrapParsedExpr], + offset: Optional[WrapParsedExpr], + sort: Optional[List[WrapOrderByExpr]], + explain_type: Optional[BaseExplainType], ): - super().__init__(columns, highlight, search, filter, group_by, limit, offset, sort) + super().__init__(columns, highlight, search, filter, group_by, limit, offset, sort, None) self.explain_type = explain_type @@ -68,6 +85,7 @@ def __init__(self, table): self._limit = None self._offset = None self._sort = None + self._total_hits_count = None def reset(self): self._columns = None @@ -78,15 +96,16 @@ def reset(self): self._limit = None self._offset = None self._sort = None + self._total_hits_count = None def match_dense( - self, - vector_column_name: str, - embedding_data: VEC, - embedding_data_type: str, - distance_type: str, - topn: int, - knn_params: {} = None, + self, + vector_column_name: str, + embedding_data: VEC, + embedding_data_type: str, + distance_type: str, + topn: int, + knn_params: {} = None, ) -> InfinityLocalQueryBuilder: if self._search is None: self._search = WrapSearchExpr() @@ -104,7 +123,8 @@ def match_dense( if embedding_data_type == "bit": if len(embedding_data) % 8 != 0: raise InfinityException( - ErrorCode.INVALID_EMBEDDING_DATA_TYPE, f"Embeddings with data bit must have dimension of times of 8!" + ErrorCode.INVALID_EMBEDDING_DATA_TYPE, + f"Embeddings with data bit must have dimension of times of 8!" 
) else: new_embedding_data = [] @@ -170,7 +190,8 @@ def match_dense( elem_type = EmbeddingDataType.kElemBFloat16 data.bf16_array_value = embedding_data else: - raise InfinityException(ErrorCode.INVALID_EMBEDDING_DATA_TYPE, f"Invalid embedding {embedding_data[0]} type") + raise InfinityException(ErrorCode.INVALID_EMBEDDING_DATA_TYPE, + f"Invalid embedding {embedding_data[0]} type") dist_type = KnnDistanceType.kInvalid if distance_type == "l2": @@ -214,12 +235,12 @@ def match_dense( return self def match_sparse( - self, - vector_column_name: str, - sparse_data: SparseVector | dict, - metric_type: str, - topn: int, - opt_params: {} = None, + self, + vector_column_name: str, + sparse_data: SparseVector | dict, + metric_type: str, + topn: int, + opt_params: {} = None, ) -> InfinityLocalQueryBuilder: if self._search is None: self._search = WrapSearchExpr() @@ -294,7 +315,7 @@ def match_sparse( return self def match_text( - self, fields: str, matching_text: str, topn: int, extra_options: Optional[dict] + self, fields: str, matching_text: str, topn: int, extra_options: Optional[dict] ) -> InfinityLocalQueryBuilder: if self._search is None: self._search = WrapSearchExpr() @@ -320,12 +341,12 @@ def match_text( return self def match_tensor( - self, - column_name: str, - query_data: VEC, - query_data_type: str, - topn: int, - extra_option: Optional[dict] = None, + self, + column_name: str, + query_data: VEC, + query_data_type: str, + topn: int, + extra_option: Optional[dict] = None, ) -> InfinityLocalQueryBuilder: if self._search is None: self._search = WrapSearchExpr() @@ -425,6 +446,26 @@ def output(self, columns: Optional[list]) -> InfinityLocalQueryBuilder: parsed_expr = WrapParsedExpr(expr_type) parsed_expr.function_expr = func_expr + select_list.append(parsed_expr) + case "_create_timestamp": + func_expr = WrapFunctionExpr() + func_expr.func_name = "create_timestamp" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + + select_list.append(parsed_expr) + case "_delete_timestamp": + func_expr = WrapFunctionExpr() + func_expr.func_name = "delete_timestamp" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + select_list.append(parsed_expr) case "_score": func_expr = WrapFunctionExpr() @@ -456,6 +497,37 @@ def output(self, columns: Optional[list]) -> InfinityLocalQueryBuilder: parsed_expr = WrapParsedExpr(expr_type) parsed_expr.function_expr = func_expr + select_list.append(parsed_expr) + case "_score_factors": + func_expr = WrapFunctionExpr() + func_expr.func_name = "score_factors" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + + select_list.append(parsed_expr) + + case "_similarity_factors": + func_expr = WrapFunctionExpr() + func_expr.func_name = "similarity_factors" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + + select_list.append(parsed_expr) + case "_distance_factors": + func_expr = WrapFunctionExpr() + func_expr.func_name = "distance_factors" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + 
select_list.append(parsed_expr) case _: @@ -477,6 +549,12 @@ def highlight(self, columns: Optional[list]) -> InfinityLocalQueryBuilder: self._highlight = highlight_list return self + def option(self, option_kv: {}): + if 'total_hits_count' in option_kv: + if isinstance(option_kv['total_hits_count'], bool): + self._total_hits_count = option_kv['total_hits_count'] + return self + def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]) -> InfinityLocalQueryBuilder: sort_list: List[WrapOrderByExpr] = [] @@ -504,6 +582,28 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]) -> Infin parsed_expr = WrapParsedExpr(expr_type) parsed_expr.function_expr = func_expr + order_by_expr = WrapOrderByExpr(parsed_expr, order_by_expr[1] == SortType.Asc) + sort_list.append(order_by_expr) + case "_create_timestamp": + func_expr = WrapFunctionExpr() + func_expr.func_name = "create_timestamp" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + + order_by_expr = WrapOrderByExpr(parsed_expr, order_by_expr[1] == SortType.Asc) + sort_list.append(order_by_expr) + case "_delete_timestamp": + func_expr = WrapFunctionExpr() + func_expr.func_name = "delete_timestamp" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + order_by_expr = WrapOrderByExpr(parsed_expr, order_by_expr[1] == SortType.Asc) sort_list.append(order_by_expr) case "_score": @@ -539,6 +639,40 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]) -> Infin order_by_expr = WrapOrderByExpr(parsed_expr, order_by_expr[1] == SortType.Asc) sort_list.append(order_by_expr) + case "_score_factors": + func_expr = WrapFunctionExpr() + func_expr.func_name = "score_factors" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + + order_by_expr = WrapOrderByExpr(parsed_expr, order_by_expr[1] == SortType.Asc) + sort_list.append(order_by_expr) + case "_similarity_factors": + func_expr = WrapFunctionExpr() + func_expr.func_name = "similarity_factors" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + + order_by_expr = WrapOrderByExpr(parsed_expr, order_by_expr[1] == SortType.Asc) + sort_list.append(order_by_expr) + case "_distance_factors": + func_expr = WrapFunctionExpr() + func_expr.func_name = "distance_factors" + func_expr.arguments = [] + + expr_type = ParsedExprType(ParsedExprType.kFunction) + parsed_expr = WrapParsedExpr(expr_type) + parsed_expr.function_expr = func_expr + + order_by_expr = WrapOrderByExpr(parsed_expr, order_by_expr[1] == SortType.Asc) + sort_list.append(order_by_expr) + case _: parsed_expr = parse_expr(maybe_parse(order_by_expr_str)) order_by_expr = WrapOrderByExpr(parsed_expr, order_by_expr[1] == SortType.Asc) @@ -547,7 +681,7 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]) -> Infin self._sort = sort_list return self - def to_result(self): + def to_result(self) -> tuple[dict[str, list[Any]], dict[str, Any], {}]: query = Query( columns=self._columns, highlight=self._highlight, @@ -557,23 +691,26 @@ def to_result(self): limit=self._limit, offset=self._offset, sort=self._sort, + 
total_hits_count=self._total_hits_count, ) self.reset() return self._table._execute_query(query) - def to_df(self) -> pd.DataFrame: + def to_df(self) -> (pd.DataFrame, {}): df_dict = {} - data_dict, data_type_dict = self.to_result() + data_dict, data_type_dict, extra_result = self.to_result() for k, v in data_dict.items(): data_series = pd.Series(v, dtype=logic_type_to_dtype(data_type_dict[k])) df_dict[k] = data_series - return pd.DataFrame(df_dict) + return pd.DataFrame(df_dict), extra_result - def to_pl(self) -> pl.DataFrame: - return pl.from_pandas(self.to_df()) + def to_pl(self) -> (pl.DataFrame, {}): + dataframe, extra_result = self.to_df() + return pl.from_pandas(dataframe), extra_result - def to_arrow(self) -> Table: - return pa.Table.from_pandas(self.to_df()) + def to_arrow(self) -> (Table, {}): + dataframe, extra_result = self.to_df() + return pa.Table.from_pandas(dataframe), extra_result def explain(self, explain_type=ExplainType.kPhysical) -> Any: query = ExplainQuery( diff --git a/python/infinity_embedded/local_infinity/table.py b/python/infinity_embedded/local_infinity/table.py index b6a1ac35ea..7b0137de3e 100644 --- a/python/infinity_embedded/local_infinity/table.py +++ b/python/infinity_embedded/local_infinity/table.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import functools import inspect from typing import Optional, Union, List, Any @@ -59,7 +60,7 @@ def wrapper(*args, **kwargs): @name_validity_check("index_name", "Index") def create_index(self, index_name: str, index_info: IndexInfo, - conflict_type: ConflictType = ConflictType.Error, index_comment : str = ""): + conflict_type: ConflictType = ConflictType.Error, index_comment: str = ""): index_name = index_name.strip() create_index_conflict: LocalConflictType @@ -166,8 +167,8 @@ def insert(self, data: Union[INSERT_DATA, list[INSERT_DATA]]): constant_expression = get_local_constant_expr_from_python_value(value) parse_exprs.append(constant_expression) insert_row = WrapInsertRowExpr() - insert_row.columns=column_names - insert_row.values=parse_exprs + insert_row.columns = column_names + insert_row.values = parse_exprs fields.append(insert_row) res = self._conn.insert(db_name=db_name, table_name=table_name, fields=fields) @@ -384,10 +385,15 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]): raise InfinityException(ErrorCode.INVALID_PARAMETER_VALUE, "order_by_expr_list must be a list of [column_name, sort_type]") if order_by_expr[1] not in [SortType.Asc, SortType.Desc]: - raise InfinityException(ErrorCode.INVALID_PARAMETER_VALUE, "sort_type must be SortType.Asc or SortType.Desc") + raise InfinityException(ErrorCode.INVALID_PARAMETER_VALUE, + "sort_type must be SortType.Asc or SortType.Desc") self.query_builder.sort(order_by_expr_list) return self + def option(self, option_kv: {}): + self.query_builder.option(option_kv) + return self + def to_df(self): return self.query_builder.to_df() @@ -432,12 +438,16 @@ def _execute_query(self, query: Query): if query.group_by is not None: group_by_list = query.group_by + total_hits_count_flag = False + if query.total_hits_count: + 
total_hits_count_flag = True res = self._conn.search(db_name=self._db_name, table_name=self._table_name, select_list=query.columns, highlight_list=highlight, order_by_list=order_by_list, group_by_list=group_by_list, + total_hits_count_flag=total_hits_count_flag, search_expr=query.search, where_expr=query.filter, limit_expr=query.limit, diff --git a/python/infinity_embedded/local_infinity/types.py b/python/infinity_embedded/local_infinity/types.py index 85f31d601b..80f3a8f22a 100644 --- a/python/infinity_embedded/local_infinity/types.py +++ b/python/infinity_embedded/local_infinity/types.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,12 +13,12 @@ # limitations under the License. import struct +import json from collections import defaultdict -from typing import Any, Tuple, Dict, List -import polars as pl +from typing import Any import numpy as np from numpy import dtype -from infinity_embedded.common import VEC, SparseVector, InfinityException, DEFAULT_MATCH_VECTOR_TOPN +from infinity_embedded.common import VEC, SparseVector, InfinityException from infinity_embedded.embedded_infinity_ext import * from infinity_embedded.errors import ErrorCode from datetime import date, time, datetime, timedelta @@ -407,7 +407,7 @@ def make_match_tensor_expr(vector_column_name: str, embedding_data: VEC, embeddi match_tensor_expr.embedding_data = data return match_tensor_expr -def build_result(res: WrapQueryResult) -> tuple[dict[str | Any, list[Any, Any]], dict[str | Any, Any]]: +def build_result(res: WrapQueryResult) -> tuple[dict[str | Any, list[Any, Any]], dict[str | Any, Any], Any]: data_dict = {} data_type_dict = {} column_counter = defaultdict(int) @@ -426,4 +426,11 @@ def build_result(res: WrapQueryResult) -> tuple[dict[str | Any, list[Any, Any]], data_dict[column_name] = data_list data_type_dict[column_name] = column_data_type - return data_dict, data_type_dict + extra_result = None + if res.extra_result is not None: + try: + extra_result = json.loads(res.extra_result) + except json.JSONDecodeError: + pass + + return data_dict, data_type_dict, extra_result diff --git a/python/infinity_embedded/local_infinity/utils.py b/python/infinity_embedded/local_infinity/utils.py index 9dd1ac8175..c5a55bf4b5 100644 --- a/python/infinity_embedded/local_infinity/utils.py +++ b/python/infinity_embedded/local_infinity/utils.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ import re import functools import inspect +from typing import Any import pandas as pd import polars as pl from sqlglot import condition @@ -24,11 +25,10 @@ from infinity_embedded.common import InfinityException, SparseVector from infinity_embedded.local_infinity.types import build_result, logic_type_to_dtype from infinity_embedded.utils import binary_exp_to_paser_exp -from infinity_embedded.embedded_infinity_ext import WrapInExpr, WrapParsedExpr, WrapOrderByExpr, WrapFunctionExpr, \ +from infinity_embedded.embedded_infinity_ext import WrapInExpr, WrapParsedExpr, WrapFunctionExpr, \ WrapColumnExpr, WrapConstantExpr, ParsedExprType, LiteralType from infinity_embedded.embedded_infinity_ext import WrapEmbeddingType, WrapColumnDef, WrapDataType, LogicalType, \ EmbeddingDataType, WrapSparseType, ConstraintType -from datetime import date, time, datetime, timedelta def traverse_conditions(cons, fn=None): @@ -365,9 +365,9 @@ def wrapper(*args, **kwargs): return decorator -def select_res_to_polars(res) -> pl.DataFrame: +def select_res_to_polars(res) -> (pl.DataFrame, Any): df_dict = {} - data_dict, data_type_dict = build_result(res) + data_dict, data_type_dict, extra_result = build_result(res) for k, v in data_dict.items(): data_series = pd.Series(v, dtype=logic_type_to_dtype(data_type_dict[k])) df_dict[k] = data_series diff --git a/python/infinity_embedded/table.py b/python/infinity_embedded/table.py index 164d9b72da..2f7b78bc54 100644 --- a/python/infinity_embedded/table.py +++ b/python/infinity_embedded/table.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import ABC, abstractmethod from enum import Enum -from typing import Optional, Union, Any -from infinity_embedded.index import IndexInfo from infinity_embedded.common import InfinityException, INSERT_DATA from infinity_embedded.embedded_infinity_ext import ExplainType as LocalExplainType from infinity_embedded.errors import ErrorCode + class ExplainType(Enum): Analyze = 1 Ast = 2 @@ -30,7 +28,6 @@ class ExplainType(Enum): Pipeline = 6 Fragment = 7 - def to_local_ttype(self): if self is ExplainType.Ast: return LocalExplainType.kAst diff --git a/python/infinity_embedded/utils.py b/python/infinity_embedded/utils.py index 8f3c2d7bb5..6857fded8e 100644 --- a/python/infinity_embedded/utils.py +++ b/python/infinity_embedded/utils.py @@ -1,4 +1,4 @@ -# Copyright(C) 2023 InfiniFlow, Inc. All rights reserved. +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -47,5 +47,6 @@ def binary_exp_to_paser_exp(binary_expr_key) -> str: else: raise InfinityException(ErrorCode.INVALID_EXPRESSION, f"unknown binary expression: {binary_expr_key}") + def deprecated_api(message): warnings.warn(message, DeprecationWarning, stacklevel=2) diff --git a/python/infinity_http.py b/python/infinity_http.py index d1bb4cba3a..e2f7be7d9a 100644 --- a/python/infinity_http.py +++ b/python/infinity_http.py @@ -3,6 +3,7 @@ import requests import logging +import json from test_pysdk.common.common_data import * from infinity.common import ConflictType, InfinityException, SparseVector, SortType from typing import Optional, Any @@ -711,6 +712,9 @@ def __init__(self, output: list, table_http: table_http): self._match_sparse = [] self._search_exprs = [] self._sort = [] + self._limit = None + self._offset = None + self._option = None def select(self): url = f"databases/{self.table_http.database_name}/tables/{self.table_http.table_name}/docs" @@ -726,15 +730,28 @@ def select(self): tmp["highlight"] = self._highlight if len(self._sort): tmp["sort"] = self._sort + if self._limit is not None: + tmp["limit"] = str(self._limit) + if self._offset is not None: + tmp["offset"] = str(self._offset) + if self._option is not None: + tmp["option"] = self._option # print(tmp) d = self.table_http.net.set_up_data([], tmp) r = self.table_http.net.request(url, "get", h, d) self.table_http.net.raise_exception(r) # print(r.json()) - if "output" in r.json(): - self.output_res = r.json()["output"] + result_json = r.json() + if "output" in result_json: + self.output_res = result_json["output"] else: self.output_res = [] + + if "total_hits_count" in result_json: + self.total_hits_count = result_json["total_hits_count"] + else: + self.total_hits_count = None + return self def explain(self, ExplainType=ExplainType.Physical): @@ -757,6 +774,13 @@ def explain(self, ExplainType=ExplainType.Physical): tmp["output"] = self._output if len(self._highlight): tmp["highlight"] = self._highlight + if self._limit is not None: + tmp["limit"] = self._limit + if self._offset is not None: + tmp["offset"] = self._offset + if self._option is not None: + tmp["option"] = self._option + tmp["explain_type"] = ExplainType_transfrom(ExplainType) # print(tmp) d = self.table_http.net.set_up_data([], tmp) @@ -796,6 +820,23 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]): self._sort.append(tmp) return self + def limit(self, limit_num): + self._limit = limit_num + return self + + def offset(self, offset): + self._offset = offset + return self + + def option(self, option: {}): + # option_str = json.dumps(option) + # option_str = str(option) + # option_str.replace("'\"'", "") + # eval(option_str) + # option_str.replace("'", "") + self._option = option + return self + def match_text(self, fields: str, query: str, topn: int, opt_params: Optional[dict] = None): tmp_match_expr = {"match_method": "text", "fields": fields, "matching_text": query, "topn": topn} if opt_params is not None: @@ -902,6 +943,10 @@ def to_result(self): df_dict[k] = new_tup # print(self.output_res) # print(df_dict) + extra_result = None + if self.total_hits_count is not None: + extra_result = {} + extra_result["total_hits_count"] = self.total_hits_count df_type = {} for k in df_dict: @@ -936,17 +981,19 @@ def to_result(self): if (function_name in bool_functions): df_type[k] = dtype('bool') break - return df_dict, df_type + return df_dict, df_type, extra_result def to_pl(self): - return pl.from_pandas(self.to_df()) + dataframe, extra_result = 
self.to_df() + return pl.from_pandas(dataframe), extra_result def to_df(self): - df_dict, df_type = self.to_result() - return pd.DataFrame(df_dict).astype(df_type) + df_dict, df_type, extra_result = self.to_result() + return pd.DataFrame(df_dict).astype(df_type), extra_result def to_arrow(self): - return pa.Table.from_pandas(self.to_df()) + dataframe, extra_result = self.to_df() + return pa.Table.from_pandas(dataframe), extra_result @dataclass class database_result(): diff --git a/python/infinity_sdk/infinity/__init__.py b/python/infinity_sdk/infinity/__init__.py index 95bfba7981..4dff794bcc 100644 --- a/python/infinity_sdk/infinity/__init__.py +++ b/python/infinity_sdk/infinity/__init__.py @@ -26,7 +26,8 @@ from infinity.remote_thrift.infinity import RemoteThriftInfinityConnection from infinity.errors import ErrorCode -def connect(uri = LOCAL_HOST, logger: logging.Logger = None) -> InfinityConnection: + +def connect(uri=LOCAL_HOST, logger: logging.Logger = None) -> InfinityConnection: if isinstance(uri, NetworkAddress): return RemoteThriftInfinityConnection(uri, logger) else: diff --git a/python/infinity_sdk/infinity/common.py b/python/infinity_sdk/infinity/common.py index 2300818b9c..09b4fdeaa0 100644 --- a/python/infinity_sdk/infinity/common.py +++ b/python/infinity_sdk/infinity/common.py @@ -74,10 +74,12 @@ class ConflictType(object): Error = 1 Replace = 2 + class SortType(object): Asc = 0 Desc = 1 + class InfinityException(Exception): def __init__(self, error_code=0, error_message=None): self.error_code = error_code diff --git a/python/infinity_sdk/infinity/connection_pool.py b/python/infinity_sdk/infinity/connection_pool.py index 6f4893e07e..e2f74490ad 100644 --- a/python/infinity_sdk/infinity/connection_pool.py +++ b/python/infinity_sdk/infinity/connection_pool.py @@ -1,3 +1,17 @@ +# Copyright(C) 2024 InfiniFlow, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from threading import Lock import infinity from infinity.common import NetworkAddress @@ -5,7 +19,7 @@ class ConnectionPool(object): - def __init__(self, uri = NetworkAddress("127.0.0.1", 23817), max_size=16): + def __init__(self, uri=NetworkAddress("127.0.0.1", 23817), max_size=16): self.uri_ = uri self.max_size_ = max_size self.free_pool_ = [] @@ -13,7 +27,6 @@ def __init__(self, uri = NetworkAddress("127.0.0.1", 23817), max_size=16): for i in range(max_size): self._create_conn() - def _del__(self): self.destroy() @@ -21,7 +34,6 @@ def _create_conn(self): infinity_coon = infinity.connect(self.uri_) self.free_pool_.append(infinity_coon) - def get_conn(self): with self.lock_: if (len(self.free_pool_) == 0): @@ -30,20 +42,18 @@ def get_conn(self): logging.debug("get_conn") return conn - def release_conn(self, conn): """ Note: User is allowed to release a connection not created by ConnectionPool, or not releasing(due to exception or some other reasons) a connection created by ConnectionPool. 
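
The release_conn docstring above describes the pool's contract; as a concrete sketch of the lifecycle (a server listening on the default Thrift port is assumed, and the pool eagerly opens max_size connections in its constructor):

    from infinity.connection_pool import ConnectionPool
    from infinity.common import NetworkAddress

    pool = ConnectionPool(uri=NetworkAddress("127.0.0.1", 23817), max_size=4)
    conn = pool.get_conn()
    try:
        conn.get_database("default_db")
    finally:
        pool.release_conn(conn)  # releasing the same connection twice raises
    pool.destroy()  # disconnects whatever is still idle in the pool
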
""" with self.lock_: - if(self.free_pool_.count(conn)): + if (self.free_pool_.count(conn)): raise Exception("the connection has been released") if (len(self.free_pool_) < self.max_size_): self.free_pool_.append(conn) logging.debug("release_conn") - def destroy(self): for conn in iter(self.free_pool_): conn.disconnect() - self.free_pool_.clear() \ No newline at end of file + self.free_pool_.clear() diff --git a/python/infinity_sdk/infinity/db.py b/python/infinity_sdk/infinity/db.py index 1e1693c890..635ee74581 100644 --- a/python/infinity_sdk/infinity/db.py +++ b/python/infinity_sdk/infinity/db.py @@ -14,6 +14,7 @@ from abc import ABC, abstractmethod + class Database(ABC): @abstractmethod diff --git a/python/infinity_sdk/infinity/errors.py b/python/infinity_sdk/infinity/errors.py index 1e482d3eb8..df959070f4 100644 --- a/python/infinity_sdk/infinity/errors.py +++ b/python/infinity_sdk/infinity/errors.py @@ -117,6 +117,15 @@ class ErrorCode(IntEnum): INVALID_EXPLAIN_TYPE = 3081, CHUNK_NOT_EXIST = 3082, NAME_MISMATCHED = 3083, + TRANSACTION_NOT_FOUND = 3084, + INVALID_DATABASE_INDEX = 3085, + INVALID_TABLE_INDEX = 3086, + FUNCTION_IS_DISABLE = 3087, + NOT_FOUND = 3088, + ERROR_INIT = 3089, + FILE_IS_OPEN = 3090, + UNKNOWN = 3091, + INVALID_QUERY_OPTION = 3092, TXN_ROLLBACK = 4001, TXN_CONFLICT = 4002, @@ -126,6 +135,7 @@ class ErrorCode(IntEnum): TOO_MANY_CONNECTIONS = 5003, CONFIGURATION_LIMIT_EXCEED = 5004, QUERY_IS_TOO_COMPLEX = 5005, + FAIL_TO_GET_SYS_INFO = 5006, QUERY_CANCELLED = 6001, QUERY_NOT_SUPPORTED = 6002, @@ -147,7 +157,26 @@ class ErrorCode(IntEnum): MUNMAP_FILE_ERROR = 7014, INVALID_FILE_FLAG = 7015, INVALID_SERVER_ADDRESS = 7016, + FAIL_TO_FUN_PYTHON = 7017, + CANT_CONNECT_SERVER = 7018, + NOT_EXIST_NODE = 7019, + DUPLICATE_NODE = 7020, + CANT_CONNECT_LEADER = 7021, + MINIO_INVALID_ACCESS_KEY = 7022, + MINIO_BUCKET_NOT_EXISTS = 7023, + INVALID_STORAGE_TYPE = 7024, + NOT_REGISTERED = 7025, + CANT_SWITCH_ROLE = 7026, + TOO_MANY_FOLLOWER = 7027, + TOO_MANY_LEARNER = 7028, INVALID_ENTRY = 8001, - NOT_FOUND_ENTRY = 8002, - EMPTY_ENTRY_LIST = 8003, + DUPLICATE_ENTRY = 8002 + NOT_FOUND_ENTRY = 8003, + EMPTY_ENTRY_LIST = 8004, + NO_WAL_ENTRY_FOUND = 8005, + WRONG_CHECKPOINT_TYPE = 8006, + INVALID_NODE_ROLE = 8007, + INVALID_NODE_STATUS = 8008, + NODE_INFO_UPDATED = 8009, + NODE_NAME_MISMATCH = 8010 \ No newline at end of file diff --git a/python/infinity_sdk/infinity/remote_thrift/client.py b/python/infinity_sdk/infinity/remote_thrift/client.py index 27e3718447..82800c6c73 100644 --- a/python/infinity_sdk/infinity/remote_thrift/client.py +++ b/python/infinity_sdk/infinity/remote_thrift/client.py @@ -28,6 +28,7 @@ TRY_TIMES = 10 + class ThriftInfinityClient: def __init__(self, uri: URI, *, try_times: int = TRY_TIMES, logger: logging.Logger = None): self.lock = rwlock.RWLockRead() @@ -96,7 +97,8 @@ def _reconnect(self): # version: 0.5.0.dev2, client_version: 24 # version: 0.5.0.dev3, client_version: 25 # version: 0.5.0.dev4 and 0.5.0.dev5, client_version: 26 - res = self.client.Connect(ConnectRequest(client_version=26)) # 0.5.0.dev5 + # version: 0.5.0.dev6, client_version: 27 + res = self.client.Connect(ConnectRequest(client_version=27)) # 0.5.0.dev6 if res.error_code != 0: raise InfinityException(res.error_code, res.error_msg) self.session_id = res.session_id @@ -115,12 +117,14 @@ def wrapper(self, *args, **kwargs): if old_session_i == self.session_i: self._reconnect() self.session_i += 1 - self.logger.debug(f"Tried {i} times, session_id: {self.session_id}, session_i: 
{self.session_i}, exception: {str(e)}") + self.logger.debug( + f"Tried {i} times, session_id: {self.session_id}, session_i: {self.session_i}, exception: {str(e)}") except Exception as e: raise else: return CommonResponse(ErrorCode.TOO_MANY_CONNECTIONS, f"Try {self.try_times} times, but still failed") return ret + return wrapper @retry_wrapper @@ -259,7 +263,7 @@ def export_data(self, db_name: str, table_name: str, file_name: str, export_opti @retry_wrapper def select(self, db_name: str, table_name: str, select_list, highlight_list, search_expr, - where_expr, group_by_list, limit_expr, offset_expr, order_by_list): + where_expr, group_by_list, limit_expr, offset_expr, order_by_list, total_hits_count): return self.client.Select(SelectRequest(session_id=self.session_id, db_name=db_name, table_name=table_name, @@ -270,7 +274,8 @@ def select(self, db_name: str, table_name: str, select_list, highlight_list, sea group_by_list=group_by_list, limit_expr=limit_expr, offset_expr=offset_expr, - order_by_list=order_by_list + order_by_list=order_by_list, + total_hits_count=total_hits_count )) @retry_wrapper @@ -384,3 +389,7 @@ def command(self, command: ttypes.CommandRequest): def flush(self, flush_request: ttypes.FlushRequest): flush_request.session_id = self.session_id return self.client.Flush(flush_request) + + @retry_wrapper + def compact(self, db_name: str, table_name: str): + return self.client.Compact(CompactRequest(session_id=self.session_id, db_name=db_name, table_name=table_name)) diff --git a/python/infinity_sdk/infinity/remote_thrift/db.py b/python/infinity_sdk/infinity/remote_thrift/db.py index 60e2b8553e..e5da17858e 100644 --- a/python/infinity_sdk/infinity/remote_thrift/db.py +++ b/python/infinity_sdk/infinity/remote_thrift/db.py @@ -28,6 +28,7 @@ from infinity.common import ConflictType from infinity.common import InfinityException + class RemoteDatabase(Database, ABC): def __init__(self, conn, name: str): self._conn = conn diff --git a/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/InfinityService.py b/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/InfinityService.py index 7da04b05a6..c05a3817cf 100644 --- a/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/InfinityService.py +++ b/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/InfinityService.py @@ -315,6 +315,14 @@ def Flush(self, request): """ pass + def Compact(self, request): + """ + Parameters: + - request + + """ + pass + class Client(Iface): def __init__(self, iprot, oprot=None): @@ -1507,6 +1515,38 @@ def recv_Flush(self): return result.success raise TApplicationException(TApplicationException.MISSING_RESULT, "Flush failed: unknown result") + def Compact(self, request): + """ + Parameters: + - request + + """ + self.send_Compact(request) + return self.recv_Compact() + + def send_Compact(self, request): + self._oprot.writeMessageBegin('Compact', TMessageType.CALL, self._seqid) + args = Compact_args() + args.request = request + args.write(self._oprot) + self._oprot.writeMessageEnd() + self._oprot.trans.flush() + + def recv_Compact(self): + iprot = self._iprot + (fname, mtype, rseqid) = iprot.readMessageBegin() + if mtype == TMessageType.EXCEPTION: + x = TApplicationException() + x.read(iprot) + iprot.readMessageEnd() + raise x + result = Compact_result() + result.read(iprot) + iprot.readMessageEnd() + if result.success is not None: + return result.success + raise TApplicationException(TApplicationException.MISSING_RESULT, "Compact failed: unknown result") + class 
Processor(Iface, TProcessor): def __init__(self, handler): @@ -1549,6 +1589,7 @@ def __init__(self, handler): self._processMap["Cleanup"] = Processor.process_Cleanup self._processMap["Command"] = Processor.process_Command self._processMap["Flush"] = Processor.process_Flush + self._processMap["Compact"] = Processor.process_Compact self._on_message_begin = None def on_message_begin(self, func): @@ -2422,6 +2463,29 @@ def process_Flush(self, seqid, iprot, oprot): oprot.writeMessageEnd() oprot.trans.flush() + def process_Compact(self, seqid, iprot, oprot): + args = Compact_args() + args.read(iprot) + iprot.readMessageEnd() + result = Compact_result() + try: + result.success = self._handler.Compact(args.request) + msg_type = TMessageType.REPLY + except TTransport.TTransportException: + raise + except TApplicationException as ex: + logging.exception('TApplication exception in handler') + msg_type = TMessageType.EXCEPTION + result = ex + except Exception: + logging.exception('Unexpected exception in handler') + msg_type = TMessageType.EXCEPTION + result = TApplicationException(TApplicationException.INTERNAL_ERROR, 'Internal error') + oprot.writeMessageBegin("Compact", msg_type, seqid) + result.write(oprot) + oprot.writeMessageEnd() + oprot.trans.flush() + # HELPER FUNCTIONS AND STRUCTURES @@ -7048,5 +7112,130 @@ def __ne__(self, other): Flush_result.thrift_spec = ( (0, TType.STRUCT, 'success', [CommonResponse, None], None, ), # 0 ) + + +class Compact_args(object): + """ + Attributes: + - request + + """ + + + def __init__(self, request=None,): + self.request = request + + def read(self, iprot): + if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None: + iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec]) + return + iprot.readStructBegin() + while True: + (fname, ftype, fid) = iprot.readFieldBegin() + if ftype == TType.STOP: + break + if fid == 1: + if ftype == TType.STRUCT: + self.request = CompactRequest() + self.request.read(iprot) + else: + iprot.skip(ftype) + else: + iprot.skip(ftype) + iprot.readFieldEnd() + iprot.readStructEnd() + + def write(self, oprot): + if oprot._fast_encode is not None and self.thrift_spec is not None: + oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec])) + return + oprot.writeStructBegin('Compact_args') + if self.request is not None: + oprot.writeFieldBegin('request', TType.STRUCT, 1) + self.request.write(oprot) + oprot.writeFieldEnd() + oprot.writeFieldStop() + oprot.writeStructEnd() + + def validate(self): + return + + def __repr__(self): + L = ['%s=%r' % (key, value) + for key, value in self.__dict__.items()] + return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) + + def __eq__(self, other): + return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ + + def __ne__(self, other): + return not (self == other) +all_structs.append(Compact_args) +Compact_args.thrift_spec = ( + None, # 0 + (1, TType.STRUCT, 'request', [CompactRequest, None], None, ), # 1 +) + + +class Compact_result(object): + """ + Attributes: + - success + + """ + + + def __init__(self, success=None,): + self.success = success + + def read(self, iprot): + if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None: + iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec]) + return + iprot.readStructBegin() + while True: + (fname, ftype, fid) = iprot.readFieldBegin() + if ftype == 
TType.STOP: + break + if fid == 0: + if ftype == TType.STRUCT: + self.success = CommonResponse() + self.success.read(iprot) + else: + iprot.skip(ftype) + else: + iprot.skip(ftype) + iprot.readFieldEnd() + iprot.readStructEnd() + + def write(self, oprot): + if oprot._fast_encode is not None and self.thrift_spec is not None: + oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec])) + return + oprot.writeStructBegin('Compact_result') + if self.success is not None: + oprot.writeFieldBegin('success', TType.STRUCT, 0) + self.success.write(oprot) + oprot.writeFieldEnd() + oprot.writeFieldStop() + oprot.writeStructEnd() + + def validate(self): + return + + def __repr__(self): + L = ['%s=%r' % (key, value) + for key, value in self.__dict__.items()] + return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) + + def __eq__(self, other): + return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ + + def __ne__(self, other): + return not (self == other) +all_structs.append(Compact_result) +Compact_result.thrift_spec = ( + (0, TType.STRUCT, 'success', [CommonResponse, None], None, ), # 0 +) fix_spec(all_structs) del all_structs diff --git a/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/ttypes.py b/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/ttypes.py index 62bb260d07..506b5f7711 100644 --- a/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/ttypes.py +++ b/python/infinity_sdk/infinity/remote_thrift/infinity_thrift_rpc/ttypes.py @@ -6747,6 +6747,7 @@ class SelectRequest(object): - limit_expr - offset_expr - order_by_list + - total_hits_count """ @@ -6755,7 +6756,7 @@ def __init__(self, session_id=None, db_name=None, table_name=None, select_list=[ ], highlight_list=[ ], search_expr=None, where_expr=None, group_by_list=[ ], having_expr=None, limit_expr=None, offset_expr=None, order_by_list=[ - ],): + ], total_hits_count=None,): self.session_id = session_id self.db_name = db_name self.table_name = table_name @@ -6780,6 +6781,7 @@ def __init__(self, session_id=None, db_name=None, table_name=None, select_list=[ order_by_list = [ ] self.order_by_list = order_by_list + self.total_hits_count = total_hits_count def read(self, iprot): if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None: @@ -6879,6 +6881,11 @@ def read(self, iprot): iprot.readListEnd() else: iprot.skip(ftype) + elif fid == 13: + if ftype == TType.BOOL: + self.total_hits_count = iprot.readBool() + else: + iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() @@ -6949,6 +6956,10 @@ def write(self, oprot): iter384.write(oprot) oprot.writeListEnd() oprot.writeFieldEnd() + if self.total_hits_count is not None: + oprot.writeFieldBegin('total_hits_count', TType.BOOL, 13) + oprot.writeBool(self.total_hits_count) + oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() @@ -6974,13 +6985,14 @@ class SelectResponse(object): - error_msg - column_defs - column_fields + - extra_result """ def __init__(self, error_code=None, error_msg=None, column_defs=[ ], column_fields=[ - ],): + ], extra_result=None,): self.error_code = error_code self.error_msg = error_msg if column_defs is self.thrift_spec[3][4]: @@ -6991,6 +7003,7 @@ def __init__(self, error_code=None, error_msg=None, column_defs=[ column_fields = [ ] self.column_fields = column_fields + self.extra_result = extra_result def read(self, iprot): if iprot._fast_decode is not None and isinstance(iprot.trans, 
TTransport.CReadableTransport) and self.thrift_spec is not None: @@ -7033,6 +7046,11 @@ def read(self, iprot): iprot.readListEnd() else: iprot.skip(ftype) + elif fid == 5: + if ftype == TType.STRING: + self.extra_result = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString() + else: + iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() @@ -7065,6 +7083,10 @@ def write(self, oprot): iter398.write(oprot) oprot.writeListEnd() oprot.writeFieldEnd() + if self.extra_result is not None: + oprot.writeFieldBegin('extra_result', TType.STRING, 5) + oprot.writeString(self.extra_result.encode('utf-8') if sys.version_info[0] == 2 else self.extra_result) + oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() @@ -8840,6 +8862,85 @@ def __eq__(self, other): def __ne__(self, other): return not (self == other) + + +class CompactRequest(object): + """ + Attributes: + - session_id + - db_name + - table_name + + """ + + + def __init__(self, session_id=None, db_name=None, table_name=None,): + self.session_id = session_id + self.db_name = db_name + self.table_name = table_name + + def read(self, iprot): + if iprot._fast_decode is not None and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None: + iprot._fast_decode(self, iprot, [self.__class__, self.thrift_spec]) + return + iprot.readStructBegin() + while True: + (fname, ftype, fid) = iprot.readFieldBegin() + if ftype == TType.STOP: + break + if fid == 1: + if ftype == TType.I64: + self.session_id = iprot.readI64() + else: + iprot.skip(ftype) + elif fid == 2: + if ftype == TType.STRING: + self.db_name = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString() + else: + iprot.skip(ftype) + elif fid == 3: + if ftype == TType.STRING: + self.table_name = iprot.readString().decode('utf-8', errors='replace') if sys.version_info[0] == 2 else iprot.readString() + else: + iprot.skip(ftype) + else: + iprot.skip(ftype) + iprot.readFieldEnd() + iprot.readStructEnd() + + def write(self, oprot): + if oprot._fast_encode is not None and self.thrift_spec is not None: + oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec])) + return + oprot.writeStructBegin('CompactRequest') + if self.session_id is not None: + oprot.writeFieldBegin('session_id', TType.I64, 1) + oprot.writeI64(self.session_id) + oprot.writeFieldEnd() + if self.db_name is not None: + oprot.writeFieldBegin('db_name', TType.STRING, 2) + oprot.writeString(self.db_name.encode('utf-8') if sys.version_info[0] == 2 else self.db_name) + oprot.writeFieldEnd() + if self.table_name is not None: + oprot.writeFieldBegin('table_name', TType.STRING, 3) + oprot.writeString(self.table_name.encode('utf-8') if sys.version_info[0] == 2 else self.table_name) + oprot.writeFieldEnd() + oprot.writeFieldStop() + oprot.writeStructEnd() + + def validate(self): + return + + def __repr__(self): + L = ['%s=%r' % (key, value) + for key, value in self.__dict__.items()] + return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) + + def __eq__(self, other): + return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ + + def __ne__(self, other): + return not (self == other) all_structs.append(Property) Property.thrift_spec = ( None, # 0 @@ -9392,6 +9493,7 @@ def __ne__(self, other): (11, TType.STRUCT, 'offset_expr', [ParsedExpr, None], None, ), # 11 (12, TType.LIST, 'order_by_list', (TType.STRUCT, [OrderByExpr, None], False), [ ], ), # 12 + (13, 
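
With Compact_args/Compact_result and CompactRequest in place, the entire generated surface funnels into the small compact() helper added in client.py. A direct round trip, as a sketch (address and names hypothetical; normally you would go through the table-level compact() shown further down instead):

    from infinity.common import NetworkAddress
    from infinity.remote_thrift.client import ThriftInfinityClient

    client = ThriftInfinityClient(NetworkAddress("127.0.0.1", 23817))
    # compact() is @retry_wrapper-decorated, so a dropped session is
    # transparently reconnected before the CompactRequest is sent.
    res = client.compact(db_name="default_db", table_name="my_table")
    assert res.error_code == 0  # CommonResponse
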
TType.BOOL, 'total_hits_count', None, None, ), # 13 ) all_structs.append(SelectResponse) SelectResponse.thrift_spec = ( @@ -9402,6 +9504,7 @@ def __ne__(self, other): ], ), # 3 (4, TType.LIST, 'column_fields', (TType.STRUCT, [ColumnField, None], False), [ ], ), # 4 + (5, TType.STRING, 'extra_result', 'UTF8', None, ), # 5 ) all_structs.append(DeleteRequest) DeleteRequest.thrift_spec = ( @@ -9559,5 +9662,12 @@ def __ne__(self, other): (1, TType.I64, 'session_id', None, None, ), # 1 (2, TType.STRING, 'flush_type', 'UTF8', None, ), # 2 ) +all_structs.append(CompactRequest) +CompactRequest.thrift_spec = ( + None, # 0 + (1, TType.I64, 'session_id', None, None, ), # 1 + (2, TType.STRING, 'db_name', 'UTF8', None, ), # 2 + (3, TType.STRING, 'table_name', 'UTF8', None, ), # 3 +) fix_spec(all_structs) del all_structs diff --git a/python/infinity_sdk/infinity/remote_thrift/query_builder.py b/python/infinity_sdk/infinity/remote_thrift/query_builder.py index 4beb15e62f..c6ac98289d 100644 --- a/python/infinity_sdk/infinity/remote_thrift/query_builder.py +++ b/python/infinity_sdk/infinity/remote_thrift/query_builder.py @@ -48,6 +48,7 @@ def __init__( limit: Optional[ParsedExpr], offset: Optional[ParsedExpr], sort: Optional[List[OrderByExpr]], + total_hits_count: Optional[bool] ): self.columns = columns self.highlight = highlight @@ -57,6 +58,7 @@ def __init__( self.limit = limit self.offset = offset self.sort = sort + self.total_hits_count = total_hits_count class ExplainQuery(Query): @@ -72,7 +74,7 @@ def __init__( sort: Optional[List[OrderByExpr]], explain_type: Optional[ExplainType], ): - super().__init__(columns, highlight, search, filter, groupby, limit, offset, sort) + super().__init__(columns, highlight, search, filter, groupby, limit, offset, sort, False) self.explain_type = explain_type @@ -87,6 +89,7 @@ def __init__(self, table): self._limit = None self._offset = None self._sort = None + self._total_hits_count = None def reset(self): self._columns = None @@ -97,6 +100,7 @@ def reset(self): self._limit = None self._offset = None self._sort = None + self._total_hits_count = None def match_dense( self, @@ -353,6 +357,16 @@ def output(self, columns: Optional[list]) -> InfinityThriftQueryBuilder: expr_type = ParsedExprType(function_expr=func_expr) parsed_expr = ParsedExpr(type=expr_type) select_list.append(parsed_expr) + case "_create_timestamp": + func_expr = FunctionExpr(function_name="create_timestamp", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + select_list.append(parsed_expr) + case "_delete_timestamp": + func_expr = FunctionExpr(function_name="delete_timestamp", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + select_list.append(parsed_expr) case "_score": func_expr = FunctionExpr(function_name="score", arguments=[]) expr_type = ParsedExprType(function_expr=func_expr) @@ -368,6 +382,21 @@ def output(self, columns: Optional[list]) -> InfinityThriftQueryBuilder: expr_type = ParsedExprType(function_expr=func_expr) parsed_expr = ParsedExpr(type=expr_type) select_list.append(parsed_expr) + case "_score_factors": + func_expr = FunctionExpr(function_name="score_factors", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + select_list.append(parsed_expr) + case "_similarity_factors": + func_expr = FunctionExpr(function_name="similarity_factors", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) 
+ parsed_expr = ParsedExpr(type=expr_type) + select_list.append(parsed_expr) + case "_distance_factors": + func_expr = FunctionExpr(function_name="distance_factors", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + select_list.append(parsed_expr) case _: select_list.append(parse_expr(maybe_parse(column))) @@ -384,6 +413,12 @@ def highlight(self, columns: Optional[list]) -> InfinityThriftQueryBuilder: self._highlight = highlight_list return self + def option(self, option_kv: {}): + if 'total_hits_count' in option_kv: + if isinstance(option_kv['total_hits_count'], bool): + self._total_hits_count = option_kv['total_hits_count'] + return self + def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]) -> InfinityThriftQueryBuilder: sort_list: List[OrderByExpr] = [] for order_by_expr in order_by_expr_list: @@ -407,6 +442,20 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]) -> Infin order_by_flag: bool = order_by_expr[1] == SortType.Asc order_by_expr = OrderByExpr(expr=parsed_expr, asc=order_by_flag) sort_list.append(order_by_expr) + case "_create_timestamp": + func_expr = FunctionExpr(function_name="create_timestamp", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + order_by_flag: bool = order_by_expr[1] == SortType.Asc + order_by_expr = OrderByExpr(expr=parsed_expr, asc=order_by_flag) + sort_list.append(order_by_expr) + case "_delete_timestamp": + func_expr = FunctionExpr(function_name="delete_timestamp", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + order_by_flag: bool = order_by_expr[1] == SortType.Asc + order_by_expr = OrderByExpr(expr=parsed_expr, asc=order_by_flag) + sort_list.append(order_by_expr) case "_score": func_expr = FunctionExpr(function_name="score", arguments=[]) expr_type = ParsedExprType(function_expr=func_expr) @@ -428,6 +477,27 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]) -> Infin order_by_flag: bool = order_by_expr[1] == SortType.Asc order_by_expr = OrderByExpr(expr=parsed_expr, asc=order_by_flag) sort_list.append(order_by_expr) + case "_score_factors": + func_expr = FunctionExpr(function_name="score_factors", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + order_by_flag: bool = order_by_expr[1] == SortType.Asc + order_by_expr = OrderByExpr(expr=parsed_expr, asc=order_by_flag) + sort_list.append(order_by_expr) + case "_similarity_factors": + func_expr = FunctionExpr(function_name="similarity_factors", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + order_by_flag: bool = order_by_expr[1] == SortType.Asc + order_by_expr = OrderByExpr(expr=parsed_expr, asc=order_by_flag) + sort_list.append(order_by_expr) + case "_distance_factors": + func_expr = FunctionExpr(function_name="distance_factors", arguments=[]) + expr_type = ParsedExprType(function_expr=func_expr) + parsed_expr = ParsedExpr(type=expr_type) + order_by_flag: bool = order_by_expr[1] == SortType.Asc + order_by_expr = OrderByExpr(expr=parsed_expr, asc=order_by_flag) + sort_list.append(order_by_expr) case _: parsed_expr = parse_expr(maybe_parse(order_by_expr_str)) order_by_flag: bool = order_by_expr[1] == SortType.Asc @@ -436,7 +506,7 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]) -> Infin self._sort = 
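
output() and sort() now accept the same family of pseudo-columns (_create_timestamp, _delete_timestamp, _score_factors, _similarity_factors, _distance_factors), each lowered to a zero-argument FunctionExpr as above. A sketch against a hypothetical table_obj:

    from infinity.common import SortType

    # Row lifecycle timestamps can be both selected and sorted on.
    data_dict, data_type_dict, extra_result = (
        table_obj.output(["c1", "_create_timestamp"])
                 .sort([["_create_timestamp", SortType.Desc]])
                 .to_result()
    )

    # The *_factors variants are meant to accompany a search expression,
    # e.g. _similarity_factors alongside a match_dense() query.
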
sort_list return self - def to_result(self) -> tuple[dict[str, list[Any]], dict[str, Any]]: + def to_result(self) -> tuple[dict[str, list[Any]], dict[str, Any], {}]: query = Query( columns=self._columns, highlight=self._highlight, @@ -446,23 +516,26 @@ def to_result(self) -> tuple[dict[str, list[Any]], dict[str, Any]]: limit=self._limit, offset=self._offset, sort=self._sort, + total_hits_count=self._total_hits_count, ) self.reset() return self._table._execute_query(query) - def to_df(self) -> pd.DataFrame: + def to_df(self) -> (pd.DataFrame, {}): df_dict = {} - data_dict, data_type_dict = self.to_result() + data_dict, data_type_dict, extra_result = self.to_result() for k, v in data_dict.items(): data_series = pd.Series(v, dtype=logic_type_to_dtype(data_type_dict[k])) df_dict[k] = data_series - return pd.DataFrame(df_dict) + return pd.DataFrame(df_dict), extra_result - def to_pl(self) -> pl.DataFrame: - return pl.from_pandas(self.to_df()) + def to_pl(self) -> (pl.DataFrame, {}): + dataframe, extra_result = self.to_df() + return pl.from_pandas(dataframe), extra_result - def to_arrow(self) -> Table: - return pa.Table.from_pandas(self.to_df()) + def to_arrow(self) -> (Table, {}): + dataframe, extra_result = self.to_df() + return pa.Table.from_pandas(dataframe), extra_result def explain(self, explain_type=ExplainType.Physical) -> Any: query = ExplainQuery( diff --git a/python/infinity_sdk/infinity/remote_thrift/table.py b/python/infinity_sdk/infinity/remote_thrift/table.py index 284341daf6..3ff29cc695 100644 --- a/python/infinity_sdk/infinity/remote_thrift/table.py +++ b/python/infinity_sdk/infinity/remote_thrift/table.py @@ -62,7 +62,7 @@ def wrapper(*args, **kwargs): @name_validity_check("index_name", "Index") def create_index(self, index_name: str, index_info: IndexInfo, - conflict_type: ConflictType = ConflictType.Error, index_comment = ""): + conflict_type: ConflictType = ConflictType.Error, index_comment=""): index_name = index_name.strip() index_info_to_use = index_info.to_ttype() @@ -400,10 +400,15 @@ def sort(self, order_by_expr_list: Optional[List[list[str, SortType]]]): raise InfinityException(ErrorCode.INVALID_PARAMETER_VALUE, "order_by_expr_list must be a list of [column_name, sort_type]") if order_by_expr[1] not in [SortType.Asc, SortType.Desc]: - raise InfinityException(ErrorCode.INVALID_PARAMETER_VALUE, "sort_type must be SortType.Asc or SortType.Desc") + raise InfinityException(ErrorCode.INVALID_PARAMETER_VALUE, + "sort_type must be SortType.Asc or SortType.Desc") self.query_builder.sort(order_by_expr_list) return self + def option(self, option_kv: {}): + self.query_builder.option(option_kv) + return self + def to_result(self): return self.query_builder.to_result() @@ -438,6 +443,9 @@ def drop_columns(self, column_names: list[str] | str): return self._conn.drop_columns(db_name=self._db_name, table_name=self._table_name, column_names=column_names) + def compact(self): + return self._conn.compact(db_name=self._db_name, table_name=self._table_name) + def _execute_query(self, query: Query) -> tuple[dict[str, list[Any]], dict[str, Any]]: # execute the query @@ -450,7 +458,8 @@ def _execute_query(self, query: Query) -> tuple[dict[str, list[Any]], dict[str, group_by_list=None, limit_expr=query.limit, offset_expr=query.offset, - order_by_list=query.sort) + order_by_list=query.sort, + total_hits_count=query.total_hits_count) # process the results if res.error_code == ErrorCode.OK: diff --git a/python/infinity_sdk/infinity/remote_thrift/types.py 
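
table.py above wires everything to the user-facing table object: option() forwards to the query builder, compact() reaches the new RPC, and _execute_query() threads total_hits_count into SelectRequest; on the way back, build_result() in the types.py hunk that follows JSON-decodes SelectResponse.extra_result into the dict that every to_* method now returns alongside the data. An end-to-end sketch, with hypothetical names and a local server assumed:

    import infinity
    from infinity.common import NetworkAddress

    infinity_obj = infinity.connect(NetworkAddress("127.0.0.1", 23817))
    table_obj = infinity_obj.get_database("default_db").get_table("my_table")

    res, extra_result = (
        table_obj.output(["c1"])
                 .option({"total_hits_count": True})
                 .to_df()
    )
    if extra_result is not None:
        print(extra_result.get("total_hits_count"))

    table_obj.compact()  # the new table-level compaction entry point
    infinity_obj.disconnect()
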
b/python/infinity_sdk/infinity/remote_thrift/types.py index b535c943e0..08f7b866a5 100644 --- a/python/infinity_sdk/infinity/remote_thrift/types.py +++ b/python/infinity_sdk/infinity/remote_thrift/types.py @@ -13,11 +13,12 @@ # limitations under the License. import struct +import json import numpy as np from infinity.common import VEC, SparseVector, InfinityException from infinity.remote_thrift.infinity_thrift_rpc.ttypes import * from collections import defaultdict -from typing import Any, Tuple, Dict, List, Optional +from typing import Any, Optional from datetime import date, time, datetime, timedelta import polars as pl @@ -173,14 +174,16 @@ def column_vector_to_list(column_type: ttypes.ColumnType, column_data_type: ttyp case _: raise NotImplementedError(f"Unsupported type {column_type}") + def parse_date_bytes(column_vector): parsed_list = list(struct.unpack('<{}i'.format(len(column_vector) // 4), column_vector)) date_list = [] epoch = date(1970, 1, 1) - for value in parsed_list: - date_list.append((epoch + timedelta(days = value)).strftime('%Y-%m-%d')) + for value in parsed_list: + date_list.append((epoch + timedelta(days=value)).strftime('%Y-%m-%d')) return date_list + def parse_time_bytes(column_vector): parsed_list = list(struct.unpack('<{}i'.format(len(column_vector) // 4), column_vector)) time_list = [] @@ -191,15 +194,18 @@ def parse_time_bytes(column_vector): time_list.append(time(hour=hours, minute=minutes, second=seconds).strftime('%H:%M:%S')) return time_list + def parse_datetime_bytes(column_vector): parsed_list = list(struct.unpack('<{}i'.format(len(column_vector) // 4), column_vector)) datetime_list = [] epoch = datetime(1970, 1, 1) for i in range(0, len(parsed_list), 2): if i + 1 < len(parsed_list): - datetime_list.append((epoch + timedelta(days = parsed_list[i], seconds = parsed_list[i + 1])).strftime('%Y-%m-%d %H:%M:%S')); + datetime_list.append( + (epoch + timedelta(days=parsed_list[i], seconds=parsed_list[i + 1])).strftime('%Y-%m-%d %H:%M:%S')); return datetime_list + def parse_interval_bytes(column_vector): parsed_list = list(struct.unpack('<{}i'.format(len(column_vector) // 4), column_vector)) interval_list = [] @@ -207,6 +213,7 @@ def parse_interval_bytes(column_vector): interval_list.append(str(timedelta(seconds=value).total_seconds()) + 's') return interval_list + def parse_bytes(bytes_data): results = [] offset = 0 @@ -298,6 +305,7 @@ def tensor_to_list(column_data_type: ttypes.DataType, binary_data) -> list[list[ raise NotImplementedError( f"Unsupported type {column_data_type.physical_type.embedding_type.element_type}") + def parse_sparse_bytes(column_data_type: ttypes.DataType, column_vector): dimension = column_data_type.physical_type.sparse_type.dimension element_type = column_data_type.physical_type.sparse_type.element_type @@ -374,7 +382,7 @@ def find_data_type(column_name: str, column_defs: list[ttypes.ColumnDef]) -> tty raise KeyError(f"column name {column_name} not found in column defs") -def build_result(res: ttypes.SelectResponse) -> tuple[dict[str | Any, list[Any, Any]], dict[str | Any, Any]]: +def build_result(res: ttypes.SelectResponse) -> tuple[dict[str | Any, list[Any, Any]], dict[str | Any, Any], {}]: data_dict = {} data_type_dict = {} column_counter = defaultdict(int) @@ -394,7 +402,14 @@ def build_result(res: ttypes.SelectResponse) -> tuple[dict[str | Any, list[Any, data_dict[column_name] = data_list data_type_dict[column_name] = column_data_type - return data_dict, data_type_dict + extra_result = None + if res.extra_result is not None: + try: 
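
The parsers above define the column-vector wire layout: DATE is one little-endian int32 of days since 1970-01-01, TIME one int32 of seconds, and DATETIME an int32 (days, seconds) pair. A self-contained decode sketch with made-up bytes:

    import struct
    from datetime import date, datetime, timedelta

    epoch = date(1970, 1, 1)
    raw = struct.pack('<2i', 0, 19000)  # two DATE values
    days = struct.unpack('<{}i'.format(len(raw) // 4), raw)
    print([(epoch + timedelta(days=d)).strftime('%Y-%m-%d') for d in days])
    # ['1970-01-01', '2022-01-08']

    dt = datetime(1970, 1, 1) + timedelta(days=19000, seconds=3661)  # one DATETIME pair
    print(dt.strftime('%Y-%m-%d %H:%M:%S'))
    # 2022-01-08 01:01:01
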
+ extra_result = json.loads(res.extra_result) + except json.JSONDecodeError: + pass + + return data_dict, data_type_dict, extra_result def make_match_tensor_expr(vector_column_name: str, embedding_data: VEC, embedding_data_type: str, method_type: str, @@ -483,6 +498,7 @@ def make_match_sparse_expr(vector_column_name: str, sparse_data: SparseVector | for k, v in opt_params.items(): match_sparse_options.append(InitParameter(param_name=k, param_value=v)) - match_sparse_expr = MatchSparseExpr(column_expr=column_expr, query_sparse_expr=query_sparse_expr, metric_type=metric_type, + match_sparse_expr = MatchSparseExpr(column_expr=column_expr, query_sparse_expr=query_sparse_expr, + metric_type=metric_type, topn=topn, opt_params=match_sparse_options, filter_expr=filter_expr) return match_sparse_expr diff --git a/python/infinity_sdk/infinity/remote_thrift/utils.py b/python/infinity_sdk/infinity/remote_thrift/utils.py index 11baf3576b..5e7bed11ff 100644 --- a/python/infinity_sdk/infinity/remote_thrift/utils.py +++ b/python/infinity_sdk/infinity/remote_thrift/utils.py @@ -15,6 +15,7 @@ import re import functools import inspect +from typing import Any import pandas as pd import polars as pl from sqlglot import condition @@ -25,7 +26,6 @@ from infinity.utils import binary_exp_to_paser_exp from infinity.common import InfinityException, SparseVector from infinity.errors import ErrorCode -from datetime import date, time, datetime, timedelta def traverse_conditions(cons, fn=None) -> ttypes.ParsedExpr: @@ -74,6 +74,16 @@ def traverse_conditions(cons, fn=None) -> ttypes.ParsedExpr: expr_type = ttypes.ParsedExprType(function_expr=func_expr) parsed_expr = ttypes.ParsedExpr(type=expr_type) return parsed_expr + case "_create_timestamp": + func_expr = ttypes.FunctionExpr(function_name="create_timestamp", arguments=[]) + expr_type = ttypes.ParsedExprType(function_expr=func_expr) + parsed_expr = ttypes.ParsedExpr(type=expr_type) + return parsed_expr + case "_delete_timestamp": + func_expr = ttypes.FunctionExpr(function_name="delete_timestamp", arguments=[]) + expr_type = ttypes.ParsedExprType(function_expr=func_expr) + parsed_expr = ttypes.ParsedExpr(type=expr_type) + return parsed_expr case "_score": func_expr = ttypes.FunctionExpr(function_name="score", arguments=[]) expr_type = ttypes.ParsedExprType(function_expr=func_expr) @@ -89,6 +99,21 @@ def traverse_conditions(cons, fn=None) -> ttypes.ParsedExpr: expr_type = ttypes.ParsedExprType(function_expr=func_expr) parsed_expr = ttypes.ParsedExpr(type=expr_type) return parsed_expr + case "_score_factors": + func_expr = ttypes.FunctionExpr(function_name="score_factors", arguments=[]) + expr_type = ttypes.ParsedExprType(function_expr=func_expr) + parsed_expr = ttypes.ParsedExpr(type=expr_type) + return parsed_expr + case "_similarity_factors": + func_expr = ttypes.FunctionExpr(function_name="similarity_factors", arguments=[]) + expr_type = ttypes.ParsedExprType(function_expr=func_expr) + parsed_expr = ttypes.ParsedExpr(type=expr_type) + return parsed_expr + case "_distance_factors": + func_expr = ttypes.FunctionExpr(function_name="distance_factors", arguments=[]) + expr_type = ttypes.ParsedExprType(function_expr=func_expr) + parsed_expr = ttypes.ParsedExpr(type=expr_type) + return parsed_expr case _: parsed_expr = ttypes.ParsedExpr() column_expr = ttypes.ColumnExpr() @@ -384,9 +409,9 @@ def wrapper(*args, **kwargs): return decorator -def select_res_to_polars(res) -> pl.DataFrame: +def select_res_to_polars(res) -> (pl.DataFrame, Any): df_dict = {} - data_dict, 
data_type_dict = build_result(res) + data_dict, data_type_dict, extra_result = build_result(res) for k, v in data_dict.items(): data_series = pd.Series(v, dtype=logic_type_to_dtype(data_type_dict[k])) df_dict[k] = data_series diff --git a/python/infinity_sdk/infinity/table.py b/python/infinity_sdk/infinity/table.py index 10393991ce..a2205ad629 100644 --- a/python/infinity_sdk/infinity/table.py +++ b/python/infinity_sdk/infinity/table.py @@ -21,6 +21,7 @@ from infinity.common import InfinityException, INSERT_DATA from infinity.errors import ErrorCode + class ExplainType(Enum): Analyze = 1 Ast = 2 @@ -46,4 +47,4 @@ def to_ttype(self): elif self is ExplainType.Fragment: return ttypes.ExplainType.Fragment else: - raise InfinityException(ErrorCode.INVALID_EXPLAIN_TYPE, "Unknown explain type") \ No newline at end of file + raise InfinityException(ErrorCode.INVALID_EXPLAIN_TYPE, "Unknown explain type") diff --git a/python/infinity_sdk/infinity/utils.py b/python/infinity_sdk/infinity/utils.py index 2514dfe445..64d7b82101 100644 --- a/python/infinity_sdk/infinity/utils.py +++ b/python/infinity_sdk/infinity/utils.py @@ -47,5 +47,6 @@ def binary_exp_to_paser_exp(binary_expr_key) -> str: else: raise InfinityException(ErrorCode.INVALID_EXPRESSION, f"unknown binary expression: {binary_expr_key}") + def deprecated_api(message): warnings.warn(message, DeprecationWarning, stacklevel=2) diff --git a/python/infinity_sdk/pyproject.toml b/python/infinity_sdk/pyproject.toml index 92f7574c5e..50386f3aaf 100644 --- a/python/infinity_sdk/pyproject.toml +++ b/python/infinity_sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "infinity-sdk" -version = "0.5.0.dev5" +version = "0.5.0.dev6" requires-python = ">=3.10" dependencies = [ "sqlglot~=11.7.0", diff --git a/python/parallel_test/test_chaos.py b/python/parallel_test/test_chaos.py index e39de5ea5c..44e0715fea 100644 --- a/python/parallel_test/test_chaos.py +++ b/python/parallel_test/test_chaos.py @@ -82,13 +82,13 @@ def read_out_data(): def search_fulltext(table_obj): - res = table_obj.output(["index", "body", "other_vector", "_row_id", "_score"]).match_text( + res, extra_result = table_obj.output(["index", "body", "other_vector", "_row_id", "_score"]).match_text( "body^5", "harmful chemical", 3).to_pl() print(res) def search_vector(table_obj): - res = table_obj.output(["*"]).match_dense("other_vector", [2] * 4, "float", "l2", 3).to_pl() + res, extra_result = table_obj.output(["*"]).match_dense("other_vector", [2] * 4, "float", "l2", 3).to_pl() print(res) diff --git a/python/parallel_test/test_ddl_and_insert_delete.py b/python/parallel_test/test_ddl_and_insert_delete.py index 5c32e3d892..22f6418adf 100644 --- a/python/parallel_test/test_ddl_and_insert_delete.py +++ b/python/parallel_test/test_ddl_and_insert_delete.py @@ -77,7 +77,7 @@ def insert(db_obj: Database): value.append({"tag": random.randint(0, 9), "c1": [random.random(), random.random(), random.random(), random.random()]}) table_obj.insert(value) - res = table_obj.output(['*']).to_df() + res, extra_result = table_obj.output(['*']).to_df() print(res) except Exception as e: return diff --git a/python/parallel_test/test_index_parallel.py b/python/parallel_test/test_index_parallel.py index b9d23e3750..732c867dec 100644 --- a/python/parallel_test/test_index_parallel.py +++ b/python/parallel_test/test_index_parallel.py @@ -52,7 +52,7 @@ def read_worker(connection_pool: ConnectionPool, end_time): table_obj = db_obj.get_table("test_fulltext_index_parallel") while time.time() < end_time: - res = 
table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( + res, extra_result = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( "body^5", "harmful chemical", 3).to_pl() print(res) time.sleep(0.1) @@ -149,7 +149,7 @@ def test_vector_index_single_thread(self, get_infinity_connection_pool, index_ty print("begin import") table_obj.import_data(file_path) print("import complete") - res = table_obj.output(["variant_id"]).match_dense( + res, extra_result = table_obj.output(["variant_id"]).match_dense( knn_column_name, [1] * 4, "float", knn_distance_type, 5).to_pl() print(res) @@ -199,7 +199,7 @@ def read_worker(connection_pool: ConnectionPool, end_time, knn_column_name, knn_ table_obj = db_obj.get_table("test_vector_index_parallel") while time.time() < end_time: - res = table_obj.output(["variant_id"]).match_dense( + res, extra_result = table_obj.output(["variant_id"]).match_dense( knn_column_name, [1] * 4, "float", knn_distance_type, 5).to_pl() print(res) time.sleep(0.1) @@ -313,7 +313,7 @@ def query_worker(connection_pool: ConnectionPool, table_name, end_time, thread_i while time.time() < end_time: try: - res = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( + res, extra_result = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( "body^5", "harmful chemical", 3).to_pl() # print(f"thread {thread_id}: check result:\n{res}") self.logger.info(f"thread {thread_id}: check result:\n{res}") diff --git a/python/parallel_test/test_insert_delete_parallel.py b/python/parallel_test/test_insert_delete_parallel.py index 2e8a39fbd0..64db2968bf 100644 --- a/python/parallel_test/test_insert_delete_parallel.py +++ b/python/parallel_test/test_insert_delete_parallel.py @@ -46,7 +46,7 @@ def test_insert_and_delete_parallel(self, get_infinity_connection_pool): infinity_obj = connection_pool.get_conn() db_obj = infinity_obj.get_database(db_name) table_obj = db_obj.get_table(table_name) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() assert len(res) == 0 res = db_obj.drop_table(table_name, ConflictType.Error) diff --git a/python/parallel_test/test_insert_delete_parallel_simple.py b/python/parallel_test/test_insert_delete_parallel_simple.py index 33ef407fa3..eef92b64de 100644 --- a/python/parallel_test/test_insert_delete_parallel_simple.py +++ b/python/parallel_test/test_insert_delete_parallel_simple.py @@ -40,7 +40,7 @@ def test_insert_and_delete_parallel_simple(self, get_infinity_connection_pool): infinity_obj = connection_pool.get_conn() db_obj = infinity_obj.get_database(db_name) table_obj = db_obj.get_table(table_name) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() print(res) assert len(res) == 0 res = db_obj.drop_table(table_name, ConflictType.Error) diff --git a/python/parallel_test/test_insert_delete_update.py b/python/parallel_test/test_insert_delete_update.py index e606261dbc..da3304c295 100644 --- a/python/parallel_test/test_insert_delete_update.py +++ b/python/parallel_test/test_insert_delete_update.py @@ -35,7 +35,7 @@ def test_insert_delete_update_parallel_vec(self, get_infinity_connection_pool): infinity_obj = connection_pool.get_conn() db_obj = infinity_obj.get_database(db_name) table_obj = db_obj.get_table(table_name) - res = table_obj.output(['*']).to_df() + res, extra_result = table_obj.output(['*']).to_df() print(res) res = db_obj.drop_table(table_name, ConflictType.Error) assert res.error_code == ErrorCode.OK @@ 
-67,7 +67,7 @@ def updata(table_obj): def search(table_obj): - res = table_obj.output(['*']).to_df() + res, extra_result = table_obj.output(['*']).to_df() print(res) diff --git a/python/parallel_test/test_insert_parallel.py b/python/parallel_test/test_insert_parallel.py index c430ebbd9f..5f8b81888f 100644 --- a/python/parallel_test/test_insert_parallel.py +++ b/python/parallel_test/test_insert_parallel.py @@ -47,7 +47,7 @@ def test_insert_parallel(self, get_infinity_connection_pool): db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table("parallel_insert_test") - res = table_obj.output(['*']).to_df() + res, extra_result = table_obj.output(['*']).to_df() print(res) assert len(res) == total_row_count @@ -91,7 +91,7 @@ def test_insert_one_thread(self, get_infinity_connection_pool): table_obj.insert(value) value.clear() print(f"test_insert_one_thread: cost {time.time() - start_ts} s") - res = table_obj.output(['*']).to_df() + res, extra_result = table_obj.output(['*']).to_df() print(res) assert len(res) == total_row_count @@ -141,7 +141,7 @@ def test_insert_and_count_star_parallel(self, get_infinity_connection_pool): db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table("parallel_insert_test") - res = table_obj.output(['*']).to_df() + res, extra_result = table_obj.output(['*']).to_df() print(res) assert len(res) == total_row_count diff --git a/python/restart_test/restart_util.py b/python/restart_test/restart_util.py index d9afd7e585..38cb9ea812 100644 --- a/python/restart_test/restart_util.py +++ b/python/restart_test/restart_util.py @@ -191,6 +191,8 @@ def index(): def import_file() -> str: base_filepath = "test/data/jsonl/test_table.jsonl" filepath = "test/data/jsonl/test_table_gen.jsonl" + if os.path.exists(filepath): + return filepath if not os.path.exists("test/data/jsonl"): os.makedirs("test/data/jsonl") with open(base_filepath, "r") as base_file: diff --git a/python/restart_test/test_alter.py b/python/restart_test/test_alter.py index 9ae5a68722..507456d698 100644 --- a/python/restart_test/test_alter.py +++ b/python/restart_test/test_alter.py @@ -55,7 +55,7 @@ def part2(infinity_obj): db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table(table_name) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame( @@ -136,7 +136,7 @@ def part1(infinity_obj): def part2(infinity_obj): db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table(table_name) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame( @@ -163,7 +163,7 @@ def part2(infinity_obj): res = table_obj.list_indexes() assert len(res.index_names) == 1 - data_dict, _ = ( + data_dict, _, _ = ( table_obj.output(["c1"]) .match_text(fields="c3", matching_text="test", topn=1) .to_result() @@ -229,3 +229,48 @@ def part2(infinity_obj): db_obj.drop_table(table_name) part2() + + def test_restart_after_alter_and_checkpoint(self, infinity_runner: InfinityRunner): + table_name = "test_alter4" + config = "test/data/config/restart_test/test_alter/1.toml" + + infinity_runner.clear() + uri = common_values.TEST_LOCAL_HOST + data_dir = "/var/infinity/data" + + decorator = infinity_runner_decorator_factory(config, uri, infinity_runner) + + @decorator + def part1(infinity_obj): + db_obj = infinity_obj.get_database("default_db") + db_obj.drop_table(table_name, ConflictType.Ignore) + table_obj = 
db_obj.create_table( + table_name, + { + "c1": {"type": "int"}, + "c2": {"type": "int"}, + "c3": {"type": "varchar"}, + }, + ) + table_obj.insert([{"c1": 1, "c2": 2, "c3": "test"}]) + + table_obj.add_columns({"c4": {"type": "varchar", "default": "tttt"}}) + table_obj.drop_columns(["c2"]) + + infinity_obj.flush_data() + + table_obj.drop_columns(["c3"]) + + infinity_obj.flush_delta() + + part1() + + @decorator + def part2(infinity_obj): + dropped_column_dirs = pathlib.Path(data_dir).rglob("1.col") + assert len(list(dropped_column_dirs)) == 0 + + dropped_column_dirs = pathlib.Path(data_dir).rglob("2.col") + assert len(list(dropped_column_dirs)) == 0 + + part2() diff --git a/python/restart_test/test_cleanup.py b/python/restart_test/test_cleanup.py index 218eb3a88e..3b5c8d9659 100644 --- a/python/restart_test/test_cleanup.py +++ b/python/restart_test/test_cleanup.py @@ -161,7 +161,7 @@ def part2(infinity_obj): db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table(table_name) - data_dict, _ = table_obj.output(["count(*)"]).to_result() + data_dict, _, _ = table_obj.output(["count(*)"]).to_result() count_star = data_dict["count(star)"][0] assert count_star == insert_n diff --git a/python/restart_test/test_compact.py b/python/restart_test/test_compact.py new file mode 100644 index 0000000000..186fbae3a8 --- /dev/null +++ b/python/restart_test/test_compact.py @@ -0,0 +1,91 @@ +from infinity_runner import InfinityRunner, infinity_runner_decorator_factory +from common import common_values +from restart_util import * +from infinity.common import ConflictType + + +class TestCompact: + def test_restart_after_compact_and_cleanup(self, infinity_runner: InfinityRunner): + config = "test/data/config/restart_test/test_compact/1.toml" + uri = common_values.TEST_LOCAL_HOST + infinity_runner.clear() + + decorator = infinity_runner_decorator_factory(config, uri, infinity_runner) + + columns = LChYDataGenerato.columns() + indexes = LChYDataGenerato.index() + import_file = LChYDataGenerato.import_file() + + @decorator + def part1(infinity_obj): + db_obj = infinity_obj.get_database("default_db") + db_obj.drop_table("test_compact", ConflictType.Ignore) + table_obj = db_obj.create_table("test_compact", columns) + table_obj.import_data(import_file, {"file_type": "jsonl"}) + table_obj.import_data(import_file, {"file_type": "jsonl"}) + infinity_obj.flush_delta() + table_obj.compact() + for index_info in indexes: + table_obj.create_index(f"idx_{index_info.column_name}", index_info) + + infinity_obj.cleanup() + + part1() + + @decorator + def part2(infinity_obj): + pass + + part2() + + def test_restart_compact_index(self, infinity_runner: InfinityRunner): + config = "test/data/config/restart_test/test_compact/1.toml" + uri = common_values.TEST_LOCAL_HOST + infinity_runner.clear() + + decorator = infinity_runner_decorator_factory(config, uri, infinity_runner) + + table_name = "test_compact1" + dataset_path = "test/data/csv/enwiki_9.csv" + import_options = {"delimiter": "\t", "file_type": "csv"} + + @decorator + def part1(infinity_obj): + db_obj = infinity_obj.get_database("default_db") + db_obj.drop_table(table_name, ConflictType.Ignore) + table_obj = db_obj.create_table( + table_name, + { + "doctitle": {"type": "varchar"}, + "docdate": {"type": "varchar"}, + "body": {"type": "varchar"}, + }, + ) + table_obj.create_index( + "ft_index", index.IndexInfo("body", index.IndexType.FullText) + ) + table_obj.import_data(dataset_path, import_options) + table_obj.import_data(dataset_path, import_options) + 
table_obj.compact() + + infinity_obj.flush_data() + + table_obj.import_data(dataset_path, import_options) + table_obj.compact() + infinity_obj.flush_delta() + + table_obj.import_data(dataset_path, import_options) + table_obj.compact() + + + part1() + import_time = 4 + + @decorator + def part2(infinity_obj): + table_obj = infinity_obj.get_database("default_db").get_table(table_name) + data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result() + count_star = data_dict["count(star)"][0] + assert count_star == 9 * import_time + + part2() diff --git a/python/restart_test/test_fulltext.py b/python/restart_test/test_fulltext.py index 0b8b696836..59908082dc 100644 --- a/python/restart_test/test_fulltext.py +++ b/python/restart_test/test_fulltext.py @@ -145,8 +145,8 @@ def t1(): .to_result() ) - data_dict, _ = res - gt_data_dict, _ = gt_res + data_dict, _, _ = res + gt_data_dict, _, _ = gt_res if data_dict != gt_data_dict: print(f"diff: {data_dict} {gt_data_dict}") else: diff --git a/python/restart_test/test_insert.py b/python/restart_test/test_insert.py index eeb0e89222..a2cb836a0b 100644 --- a/python/restart_test/test_insert.py +++ b/python/restart_test/test_insert.py @@ -83,7 +83,7 @@ def part1(infinity_obj): db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table("test_insert") - data_dict, _ = table_obj.output(["count(*)"]).to_result() + data_dict, _, _ = table_obj.output(["count(*)"]).to_result() count_star = data_dict["count(star)"][0] assert count_star == cur_insert_n print(f"cur_insert_n: {cur_insert_n}") @@ -243,7 +243,7 @@ def part1(infinity_obj, test_i: int): db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table("test_insert_checkpoint") - data_dict, _ = table_obj.output(["count(*)"]).to_result() + data_dict, _, _ = table_obj.output(["count(*)"]).to_result() count_star = data_dict["count(star)"][0] assert count_star == line_num diff --git a/python/restart_test/test_insert_import.py b/python/restart_test/test_insert_import.py index a45d722247..8ddd5a5f01 100644 --- a/python/restart_test/test_insert_import.py +++ b/python/restart_test/test_insert_import.py @@ -111,7 +111,7 @@ def part1(infinity_obj): db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table("test_insert") - data_dict, _ = table_obj.output(["count(*)"]).to_result() + data_dict, _, _ = table_obj.output(["count(*)"]).to_result() count_star = data_dict["count(star)"][0] assert count_star == cur_n logger.debug(f"cur_n: {cur_n}") diff --git a/python/restart_test/test_memidx.py b/python/restart_test/test_memidx.py index facae0546a..6678ca8f5f 100644 --- a/python/restart_test/test_memidx.py +++ b/python/restart_test/test_memidx.py @@ -5,6 +5,7 @@ import time import pathlib from infinity.common import ConflictType, SparseVector +import pytest class TestMemIdx: @@ -58,7 +59,7 @@ def part2(infinity_obj): time.sleep(5) db_obj = infinity_obj.get_database("default_db") table_obj = db_obj.get_table("test_memidx1") - data_dict, data_type_dict = ( + data_dict, data_type_dict, _ = ( table_obj.output(["c1"]) .match_dense("c2", [0.3, 0.3, 0.2, 0.2], "float", "l2", 6) .to_result() @@ -66,7 +67,7 @@ def part2(infinity_obj): # print(data_dict["c1"]) assert data_dict["c1"] == [4, 4, 4, 4, 4, 2] - data_dict, data_type_dict = table_obj.output(["count(*)"]).to_result() + data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result() # print(data_dict) assert data_dict["count(star)"] == [10] @@ -87,14 +88,14 @@ def part3(infinity_obj): table_obj = 
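
One recurring idiom in these restart tests is worth spelling out: a count(*) projection comes back under the key "count(star)", holding a one-element list, in the first member of the to_result() triple:

    data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result()
    count_star = data_dict["count(star)"][0]  # the scalar row count
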
db_obj.get_table("test_memidx1") def check(): - data_dict, data_type_dict = ( + data_dict, data_type_dict, _ = ( table_obj.output(["c1"]) .match_dense("c2", [0.3, 0.3, 0.2, 0.2], "float", "l2", 6) .to_result() ) assert data_dict["c1"] == [8, 6, 6, 4, 4, 4] - data_dict, data_type_dict = table_obj.output(["count(*)"]).to_result() + data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result() assert data_dict["count(star)"] == [13] check() @@ -130,6 +131,322 @@ def check(): # select count(*) from test_memidx1; # # result: 13 + def test_mem_ivf(self, infinity_runner: InfinityRunner): + config1 = "test/data/config/restart_test/test_memidx/1.toml" + config2 = "test/data/config/restart_test/test_memidx/2.toml" + config3 = "test/data/config/restart_test/test_memidx/3.toml" + uri = common_values.TEST_LOCAL_HOST + infinity_runner.clear() + + decorator1 = infinity_runner_decorator_factory(config1, uri, infinity_runner) + + @decorator1 + def part1(infinity_obj): + db_obj = infinity_obj.get_database("default_db") + table_obj = db_obj.create_table( + "test_mem_ivf", + {"c1": {"type": "int"}, "c2": {"type": "vector,4,float"}}, + ) + res = table_obj.create_index( + "idx1", + index.IndexInfo( + "c2", + index.IndexType.IVF, + { + "metric": "l2", + }, + ), + ) + assert res.error_code == infinity.ErrorCode.OK + + table_obj.insert([{"c1": 2, "c2": [0.1, 0.2, 0.3, -0.2]} for i in range(51)]) + # trigger the dump by 52th record + table_obj.insert([{"c1": 4, "c2": [0.2, 0.1, 0.3, 0.4]}]) + # table_obj.insert([{"c1": 2, "c2": [0.1, 0.2, 0.3, -0.2]} for i in range(2)]) + time.sleep(5) + table_obj.insert([{"c1": 4, "c2": [0.2, 0.1, 0.3, 0.4]} for i in range(4)]) + + part1() + + # config1 can hold 51 rows of ivf mem index before dump + # 1. recover by dumpindex wal & memindex recovery + decorator2 = infinity_runner_decorator_factory(config2, uri, infinity_runner) + + @decorator2 + def part2(infinity_obj): + time.sleep(5) + db_obj = infinity_obj.get_database("default_db") + table_obj = db_obj.get_table("test_mem_ivf") + data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result() + # print(data_dict) + assert data_dict["count(star)"] == [56] + + data_dict, data_type_dict, _ = ( + table_obj.output(["c1"]) + .match_dense("c2", [0.3, 0.3, 0.2, 0.2], "float", "l2", 6, {"nprobe" : "100"}) + .to_result() + ) + # print(data_dict["c1"]) + assert data_dict["c1"] == [4, 4, 4, 4, 4, 2] + + data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result() + # print(data_dict) + assert data_dict["count(star)"] == [56] + + table_obj.insert([{"c1": 6, "c2": [0.3, 0.2, 0.1, 0.4]} for i in range(2)]) + # wait for memindex dump & delta checkpoint to dump + time.sleep(5) + table_obj.insert([{"c1": 8, "c2": [0.4, 0.3, 0.2, 0.1]}]) + + part2() + + # 2. 
recover by delta ckp & dumpindex wal & memindex recovery
+        decorator3 = infinity_runner_decorator_factory(config3, uri, infinity_runner)
+
+        @decorator3
+        def part3(infinity_obj):
+            time.sleep(5)
+            db_obj = infinity_obj.get_database("default_db")
+            table_obj = db_obj.get_table("test_mem_ivf")
+
+            def check():
+                data_dict, data_type_dict, _ = (
+                    table_obj.output(["c1"])
+                    .match_dense("c2", [0.3, 0.3, 0.2, 0.2], "float", "l2", 6)
+                    .to_result()
+                )
+                assert data_dict["c1"] == [8, 6, 6, 4, 4, 4]
+
+                data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result()
+                assert data_dict["count(star)"] == [59]
+
+            check()
+            infinity_obj.optimize("default_db", "test_mem_ivf", optimize_opt=None)
+            check()
+
+            db_obj.drop_table("test_mem_ivf")
+
+        part3()
+
+    def test_mem_indexer(self, infinity_runner: InfinityRunner):
+        config1 = "test/data/config/restart_test/test_memidx/1.toml"
+        config2 = "test/data/config/restart_test/test_memidx/2.toml"
+        config3 = "test/data/config/restart_test/test_memidx/3.toml"
+        uri = common_values.TEST_LOCAL_HOST
+        infinity_runner.clear()
+
+        decorator1 = infinity_runner_decorator_factory(config1, uri, infinity_runner)
+
+        @decorator1
+        def part1(infinity_obj):
+            db_obj = infinity_obj.get_database("default_db")
+            table_obj = db_obj.create_table(
+                "test_mem_indexer",
+                {"c1" : {"type" : "int"}, "c2": {"type": "varchar"}},
+            )
+            res = table_obj.create_index(
+                "idx1",
+                index.IndexInfo(
+                    "c2",
+                    index.IndexType.FullText,
+                ),
+            )
+            assert res.error_code == infinity.ErrorCode.OK
+
+            table_obj.insert([
+                {"c1" : 1, "c2" : "this is a test text"},
+                {"c1" : 2, "c2" : "this is not a test text"},
+            ])
+            # trigger the dump with the 3rd record
+            table_obj.insert([
+                {"c1" : 3, "c2" : "this is indeed a test text"},
+            ])
+            table_obj.insert([
+                {"c1" : 4, "c2" : "this is definitely not a test text"},
+                {"c1" : 5, "c2" : "this is nothing but a test text"},
+            ])
+
+        part1()
+
+        # config1 can hold 2 rows of identical fulltext mem index before dump
+        # 1. recover by dumpindex wal & memindex recovery
+        decorator2 = infinity_runner_decorator_factory(config2, uri, infinity_runner)
+
+        @decorator2
+        def part2(infinity_obj):
+            time.sleep(5)
+            db_obj = infinity_obj.get_database("default_db")
+            table_obj = db_obj.get_table("test_mem_indexer")
+            data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result()
+            # print(data_dict)
+            assert data_dict["count(star)"] == [5]
+
+            data_dict, data_type_dict, _ = (
+                table_obj.output(["c1"])
+                .match_text('c2', 'test text', 3)
+                .to_result()
+            )
+            # print(data_dict["c1"])
+            assert data_dict["c1"] == [1, 2, 3]
+
+            data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result()
+            # print(data_dict)
+            assert data_dict["count(star)"] == [5]
+
+            # the 2nd dump
+            table_obj.insert([
+                {"c1" : 6, "c2" : "this is the exact opposite of a test text"},
+            ])
+            time.sleep(5)
+            table_obj.insert([
+                {"c1" : 7, "c2" : "what is this?"},
+                {"c1" : 8, "c2" : "this is what?"},
+                {"c1" : 9, "c2" : "not a test text!"},
+                {"c1" : 10, "c2" : "what a this?"},
+                {"c1" : 11, "c2" : "this is you!"},
+            ])
+
+        part2()
+
+        # 2. 
recover by delta ckp & dumpindex wal & memindex recovery
+        decorator3 = infinity_runner_decorator_factory(config3, uri, infinity_runner)
+
+        @decorator3
+        def part3(infinity_obj):
+            time.sleep(5)
+            db_obj = infinity_obj.get_database("default_db")
+            table_obj = db_obj.get_table("test_mem_indexer")
+
+            def check(rows):
+                data_dict, data_type_dict, _ = (
+                    table_obj.output(["c1"])
+                    .match_text('c2', 'this what', 3)
+                    .to_result()
+                )
+                # print(data_dict["c1"])
+                assert data_dict["c1"] == [7, 8, 10]
+
+                data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result()
+                assert data_dict["count(star)"] == [rows]
+
+            check(11)
+            table_obj.insert([
+                {"c1" : 12, "c2" : "this is a text!"},
+            ])
+            check(12)
+
+            # the 3rd dump
+            db_obj.drop_table("test_mem_indexer")
+
+        part3()
+
+    @pytest.mark.skip(reason="bug")
+    def test_mem_bmp(self, infinity_runner: InfinityRunner):
+        config1 = "test/data/config/restart_test/test_memidx/1.toml"
+        config2 = "test/data/config/restart_test/test_memidx/2.toml"
+        config3 = "test/data/config/restart_test/test_memidx/3.toml"
+        uri = common_values.TEST_LOCAL_HOST
+        infinity_runner.clear()
+
+        test_data = [
+            {"c1" : 1, "c2" : SparseVector(indices=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90], values=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])},
+            {"c1" : 2, "c2" : SparseVector(indices=[0, 20, 40, 60, 80], values=[2.0, 2.0, 2.0, 2.0, 2.0])},
+            {"c1" : 3, "c2" : SparseVector(indices=[0, 30, 60, 90], values=[3.0, 3.0, 3.0, 3.0])},
+            {"c1" : 4, "c2" : SparseVector(indices=[0, 40, 80], values=[4.0, 4.0, 4.0])},
+            {"c1" : 5, "c2" : SparseVector(indices=[0], values=[0.0])},
+        ]
+        query_vector = SparseVector(indices=[0, 20, 80], values=[1.0, 2.0, 3.0])
+
+        decorator1 = infinity_runner_decorator_factory(config1, uri, infinity_runner)
+
+        @decorator1
+        def part1(infinity_obj):
+            db_obj = infinity_obj.get_database("default_db")
+            table_obj = db_obj.create_table(
+                "test_mem_bmp",
+                {"c1": {"type": "int"}, "c2": {"type": "sparse,100,float,int"}},
+            )
+            res = table_obj.create_index(
+                "idx1",
+                index.IndexInfo(
+                    "c2",
+                    index.IndexType.BMP,
+                    {"BLOCK_SIZE": "8", "COMPRESS_TYPE": "compress"},
+                ),
+            )
+            assert res.error_code == infinity.ErrorCode.OK
+
+            # trigger dump
+            for i in range(7):
+                table_obj.insert(test_data)
+
+        part1()
+
+        # config1 can hold only a limited number of bmp mem index rows before dump
+        # 1. recover by dumpindex wal & memindex recovery
+        decorator2 = infinity_runner_decorator_factory(config2, uri, infinity_runner)
+
+        @decorator2
+        def part2(infinity_obj):
+            time.sleep(5)
+            db_obj = infinity_obj.get_database("default_db")
+            table_obj = db_obj.get_table("test_mem_bmp")
+            data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result()
+            # print(data_dict)
+            assert data_dict["count(star)"] == [35]
+
+            data_dict, data_type_dict, _ = (
+                table_obj.output(["c1"])
+                .match_sparse("c2", query_vector, "ip", 8)
+                .to_result()
+            )
+            assert data_dict["c1"] == [4, 4, 4, 4, 4, 4, 4, 2]
+
+            data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result()
+            # print(data_dict)
+            assert data_dict["count(star)"] == [35]
+
+            for i in range(3):
+                table_obj.insert(test_data)
+            time.sleep(5)
+
+            data_dict, data_type_dict, _ = (
+                table_obj.output(["c1"])
+                .match_sparse("c2", query_vector, "ip", 11)
+                .to_result()
+            )
+            assert data_dict["c1"] == [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2]
+
+        part2()
+
+        # 2. 
recover by delta ckp & dumpindex wal & memindex recovery
+        decorator3 = infinity_runner_decorator_factory(config3, uri, infinity_runner)
+
+        @decorator3
+        def part3(infinity_obj):
+            time.sleep(5)
+            db_obj = infinity_obj.get_database("default_db")
+            table_obj = db_obj.get_table("test_mem_bmp")
+
+            def check():
+                data_dict, data_type_dict, _ = (
+                    table_obj.output(["c1"])
+                    .match_sparse("c2", query_vector, "ip", 11)
+                    .to_result()
+                )
+                assert data_dict["c1"] == [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2]
+
+                data_dict, data_type_dict, _ = table_obj.output(["count(*)"]).to_result()
+                assert data_dict["count(star)"] == [50]
+
+            check()
+            infinity_obj.optimize("default_db", "test_mem_bmp", optimize_opt=None)
+            check()
+
+            db_obj.drop_table("test_mem_bmp")
+
+        part3()
+
     def test_optimize_from_different_database(self, infinity_runner: InfinityRunner):
         infinity_runner.clear()
@@ -358,12 +675,12 @@ def part1(infinity_obj):
         def part2(infinity_obj):
             db_obj = infinity_obj.get_database("default_db")
             table_obj = db_obj.get_table(table_name)
-            data_dict, data_type_dict = (
+            data_dict, data_type_dict, _ = (
                 table_obj.output(["c1"]).filter("c2 >= 8192").to_result()
             )
             assert data_dict["c1"] == [8192 + i for i in range(100)]
 
-            data_dict, data_type_dict = (
+            data_dict, data_type_dict, _ = (
                 table_obj.output(["c1"])
                 .match_sparse("c3", SparseVector(indices=[1], values=[1.0]), "ip", 100)
                 .to_result()
@@ -406,7 +723,7 @@ def part1(infinity_obj):
         def part2(infinity_obj):
             db_obj = infinity_obj.get_database("default_db")
             table_obj = db_obj.get_table("test_memidx5")
-            data_dict, data_type_dict = (
+            data_dict, data_type_dict, _ = (
                 table_obj.output(["c1"]).match_text("c2", "hello", 2).to_result()
             )
             assert data_dict["c1"] == [1, 2]
diff --git a/python/test_cluster/conftest.py b/python/test_cluster/conftest.py
index 10b601a83e..e2dd953503 100644
--- a/python/test_cluster/conftest.py
+++ b/python/test_cluster/conftest.py
@@ -18,6 +18,11 @@ def pytest_addoption(parser):
     parser.addoption(
         "--minio_port",
         action="store",
+        default=9000,
+    )
+    parser.addoption(
+        "--minio_console_port",
+        action="store",
         default=9001,
     )
     parser.addoption(
@@ -55,7 +60,8 @@ def pytest_generate_tests(metafunc):
     infinity_path = metafunc.config.getoption("infinity_path")
     minio_dir = metafunc.config.getoption("minio_dir")
     minio_port = metafunc.config.getoption("minio_port")
-    minio_params = MinioParams(minio_dir, minio_port)
+    minio_console_port = metafunc.config.getoption("minio_console_port")
+    minio_params = MinioParams(minio_dir, minio_port, minio_console_port)
     infinity_dir = metafunc.config.getoption("infinity_dir")
     use_sudo = metafunc.config.getoption("use_sudo")
diff --git a/python/test_cluster/database_operations.py b/python/test_cluster/database_operations.py
index 4dcea24df1..80b98b284c 100644
--- a/python/test_cluster/database_operations.py
+++ b/python/test_cluster/database_operations.py
@@ -8,11 +8,12 @@
 from dataclasses import dataclass
 from typing import Dict, Set, Tuple
 
-class instance_state:
-    def __init__(self, client : infinity_http.infinity_http = None):
-        self.db2tables : Dict[str, Set[str]] = {"default_db" : set()}
-        self.dbtable2index : Dict[Tuple[str, str], Set[str]] = {}
-        self.dbtable2df : Dict[Tuple[str, str], pd.DataFrame] = {}
+
+class instance_state:
+    def __init__(self, client: infinity_http.infinity_http = None):
+        self.db2tables: Dict[str, Set[str]] = {"default_db": set()}
+        self.dbtable2index: Dict[Tuple[str, str], Set[str]] = {}
+        self.dbtable2df: Dict[Tuple[str, str], pd.DataFrame] = {}
         if client is not None:
             databases = 
client.list_databases().db_names @@ -22,8 +23,8 @@ def __init__(self, client : infinity_http.infinity_http = None): tables = db_object.get_all_tables() for table_name in tables: table_object = db_object.get_table(table_name) - df = table_object.output(["*"]).to_df() - res = table_object.output(["*"]).to_result() + df, extra_result = table_object.output(["*"]).to_df() + res, _, _ = table_object.output(["*"]).to_result() # print(f"instance_state initializing, table {db_name}.{table_name}") # print(res) self.add_table(db_name, table_name, ConflictType.Ignore) @@ -32,38 +33,40 @@ def __init__(self, client : infinity_http.infinity_http = None): for index in indexes: self.add_index(db_name, table_name, index["index_name"], ConflictType.Ignore) - def check_db_exist(self, db_name : str): + def check_db_exist(self, db_name: str): if db_name not in self.db2tables: raise InfinityException(ErrorCode.DB_NOT_EXIST, f"database {db_name} does not exist!") - def check_db_not_exist(self, db_name : str): + def check_db_not_exist(self, db_name: str): if db_name in self.db2tables: raise InfinityException(ErrorCode.DUPLICATE_DATABASE_NAME, f"database {db_name} already exists!") - def check_table_exist(self, db_name : str, table_name : str): + def check_table_exist(self, db_name: str, table_name: str): self.check_db_exist(db_name) if table_name not in self.db2tables[db_name]: raise InfinityException(ErrorCode.TABLE_NOT_EXIST, f"table {db_name}.{table_name} does not exist!") - def check_table_not_exist(self, db_name : str, table_name : str): + def check_table_not_exist(self, db_name: str, table_name: str): self.check_db_exist(db_name) if table_name in self.db2tables[db_name]: raise InfinityException(ErrorCode.DUPLICATE_TABLE_NAME, f"table {db_name}.{table_name} already exists!") - def check_index_exist(self, db_name : str, table_name : str, index_name : str): + def check_index_exist(self, db_name: str, table_name: str, index_name: str): self.check_table_exist(db_name, table_name) if index_name not in self.dbtable2index: - raise InfinityException(ErrorCode.INDEX_NOT_EXIST, f"table {db_name}.{table_name}.{index_name} does not exist!") + raise InfinityException(ErrorCode.INDEX_NOT_EXIST, + f"table {db_name}.{table_name}.{index_name} does not exist!") - def check_index_not_exist(self, db_name : str, table_name : str, index_name : str): + def check_index_not_exist(self, db_name: str, table_name: str, index_name: str): self.check_table_exist(db_name, table_name) if index_name in self.dbtable2index: - raise InfinityException(ErrorCode.DUPLICATE_INDEX_NAME, f"table {db_name}.{table_name}.{index_name} already exists!") + raise InfinityException(ErrorCode.DUPLICATE_INDEX_NAME, + f"table {db_name}.{table_name}.{index_name} already exists!") # operations to a instance_state() # add drop : database, table, index - def add_database(self, db_name : str, conflict_type : ConflictType): + def add_database(self, db_name: str, conflict_type: ConflictType): if conflict_type == ConflictType.Ignore: if db_name in self.db2tables: return @@ -72,7 +75,7 @@ def add_database(self, db_name : str, conflict_type : ConflictType): self.db2tables[db_name] = set() - def drop_database(self, db_name : str, conflict_type : ConflictType): + def drop_database(self, db_name: str, conflict_type: ConflictType): if conflict_type == ConflictType.Ignore: if db_name not in self.db2tables: return @@ -84,7 +87,7 @@ def drop_database(self, db_name : str, conflict_type : ConflictType): self.dbtable2index.pop((db_name, table_name)) self.db2tables.pop(db_name) - 
def add_table(self, db_name : str, table_name : str, conflict_type : ConflictType): + def add_table(self, db_name: str, table_name: str, conflict_type: ConflictType): if conflict_type == ConflictType.Ignore: self.check_db_exist(db_name) if table_name in self.db2tables[db_name]: @@ -95,7 +98,7 @@ def add_table(self, db_name : str, table_name : str, conflict_type : ConflictTyp self.db2tables[db_name].add(table_name) self.dbtable2index[(db_name, table_name)] = set() - def drop_table(self, db_name : str, table_name : str, conflit_type : ConflictType): + def drop_table(self, db_name: str, table_name: str, conflit_type: ConflictType): if conflit_type == ConflictType.Ignore: self.check_database_exist(db_name) if table_name not in self.db2tables: @@ -107,7 +110,7 @@ def drop_table(self, db_name : str, table_name : str, conflit_type : ConflictTyp self.dbtable2index.pop((db_name, table_name)) self.dbtable2df.pop((db_name, table_name)) - def add_index(self, db_name : str, table_name : str, index_name : str, conflict_type : ConflictType): + def add_index(self, db_name: str, table_name: str, index_name: str, conflict_type: ConflictType): if conflict_type == ConflictType.Ignore: self.check_table_exist(db_name, table_name) if index_name in self.dbtable2index[(db_name, table_name)]: @@ -117,7 +120,7 @@ def add_index(self, db_name : str, table_name : str, index_name : str, conflict_ self.dbtable2index[(db_name, table_name)].add(index_name) - def drop_index(self, db_name : str, table_name : str, index_name : str, conflit_type : ConflictType): + def drop_index(self, db_name: str, table_name: str, index_name: str, conflit_type: ConflictType): if conflit_type == ConflictType.Ignore: self.check_table_exist(db_name, table_name) if index_name not in self.dbtable2index[(db_name, table_name)]: @@ -127,23 +130,24 @@ def drop_index(self, db_name : str, table_name : str, index_name : str, conflit_ self.dbtable2index[(db_name, table_name)].remove(index_name) - def get_table_df(self, db_name : str, table_name :str) -> pd.DataFrame | None: + def get_table_df(self, db_name: str, table_name: str) -> pd.DataFrame | None: self.check_table_exist(db_name, table_name) if (db_name, table_name) in self.dbtable2df: return self.dbtable2df[(db_name, table_name)] - else : + else: return None - def set_table_df(self, db_name : str, table_name :str, df : pd.DataFrame) : + def set_table_df(self, db_name: str, table_name: str, df: pd.DataFrame): df = df.reset_index(drop=True) # print(f"setting {db_name}.{table_name} = ") # print(df) self.check_table_exist(db_name, table_name) self.dbtable2df[(db_name, table_name)] = df + # this will clear a instance to its initial state: # only a default_db is remained -def clear_instance(state : instance_state, client : infinity_http.infinity_http): +def clear_instance(state: instance_state, client: infinity_http.infinity_http): for db_name, tables in state.db2tables.items(): if db_name == "default_db": db_obj = client.get_database(db_name) @@ -152,10 +156,12 @@ def clear_instance(state : instance_state, client : infinity_http.infinity_http) else: client.drop_database(db_name) -def check_instance_table_equal(state : instance_state, client : infinity_http.infinity_http, db_name : str, table_name : str): + +def check_instance_table_equal(state: instance_state, client: infinity_http.infinity_http, db_name: str, + table_name: str): db_obj = client.get_database(db_name) table_obj = db_obj.get_table(table_name) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() 
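# A minimal sketch of the unpacking convention this change applies across the
# suite (variable names are illustrative; `table_obj` stands for any table
# handle): to_result() now yields three values, while the converters yield a pair.
def read_all(table_obj):
    data_dict, data_type_dict, extra_result = table_obj.output(["*"]).to_result()
    df, _ = table_obj.output(["*"]).to_df()  # discard the extra result when unused
    return data_dict, df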
expected = state.get_table_df(db_name, table_name) # print("res = ") # print(res) @@ -163,30 +169,37 @@ def check_instance_table_equal(state : instance_state, client : infinity_http.in # print(expected) pd.testing.assert_frame_equal(res, expected) -def check_instance_equal(state : instance_state, client : infinity_http.infinity_http): + +def check_instance_equal(state: instance_state, client: infinity_http.infinity_http): client_state = instance_state(client) assert state.db2tables == client_state.db2tables assert state.dbtable2index == client_state.dbtable2index for db_name, tables in state.db2tables.items(): for table_name in tables: - pd.testing.assert_frame_equal(state.dbtable2df[(db_name, table_name)], client_state.dbtable2df[(db_name, table_name)]) + pd.testing.assert_frame_equal(state.dbtable2df[(db_name, table_name)], + client_state.dbtable2df[(db_name, table_name)]) + # do operations on a single node -def do_some_operations(client : infinity_http.infinity_http, state : instance_state): +def do_some_operations(client: infinity_http.infinity_http, state: instance_state): table_create_insert_delete_modify(client, state) + # do operations on a cluster of nodes -def do_some_operations_cluster(leader_client : infinity_http.infinity_http, other_clients : [infinity_http.infinity_http], leader_state : instance_state): +def do_some_operations_cluster(leader_client: infinity_http.infinity_http, other_clients: [infinity_http.infinity_http], + leader_state: instance_state): table_create_insert_delete_modify(leader_client, leader_state) time.sleep(1) for client in other_clients: table_create_insert_delete_modify_verify(client, leader_state) return -def table_create_insert_delete_modify_verify(client : infinity_http.infinity_http, leader_state : instance_state): + +def table_create_insert_delete_modify_verify(client: infinity_http.infinity_http, leader_state: instance_state): check_instance_equal(leader_state, client) -def table_create_insert_delete_modify(client : infinity_http.infinity_http, leader_state : instance_state): + +def table_create_insert_delete_modify(client: infinity_http.infinity_http, leader_state: instance_state): db = client.get_database("default_db") table = db.create_table("test_data", {"c1": {"type": "int"}, "c2": {"type": "vector,4,float"}}, ConflictType.Ignore) leader_state.add_table("default_db", "test_data", ConflictType.Ignore) @@ -198,10 +211,10 @@ def table_create_insert_delete_modify(client : infinity_http.infinity_http, lead table.insert([{"c1": i, "c2": [1.0, 2.0, 3.0, 4.0]}]) df_to_insert = pd.DataFrame( { - "c1" : [i], - "c2" : [[1.0, 2.0, 3.0, 4.0]] + "c1": [i], + "c2": [[1.0, 2.0, 3.0, 4.0]] } - ).astype({"c1" : dtype("int32"), "c2" : dtype("object")}) + ).astype({"c1": dtype("int32"), "c2": dtype("object")}) table_df = pd.concat([table_df, df_to_insert]) for i in range(0, 10, 2): diff --git a/python/test_cluster/infinity_cluster.py b/python/test_cluster/infinity_cluster.py index 2217321f9a..97cce44b69 100644 --- a/python/test_cluster/infinity_cluster.py +++ b/python/test_cluster/infinity_cluster.py @@ -34,9 +34,10 @@ def is_port_in_use(port: int) -> bool: class MinioParams: - def __init__(self, minio_dir: str, minio_port: int): + def __init__(self, minio_dir: str, minio_port: int, minio_console_port: int): self.minio_dir = minio_dir self.minio_port = minio_port + self.minio_console_port = minio_console_port class BaseInfinityRunner: @@ -244,7 +245,7 @@ def add_node(self, node_name: str, config_path: str, init=True): def add_minio(self, minio_params: 
MinioParams): minio_image_name = "quay.io/minio/minio" - minio_cmd = f'server /data --console-address ":{minio_params.minio_port}"' + minio_cmd = f'server /data --address ":{minio_params.minio_port}" --console-address ":{minio_params.minio_console_port}"' docker_client = docker.from_env() container_name = "minio_host" diff --git a/python/test_cluster/test_admin.py b/python/test_cluster/test_admin.py index 267af8eeca..87f67b0e62 100644 --- a/python/test_cluster/test_admin.py +++ b/python/test_cluster/test_admin.py @@ -13,7 +13,7 @@ def test_admin(cluster: InfinityCluster): res = infinity1.show_current_node() logger.info(f'{res.node_role}, {res.node_status}') assert (res.node_role == "admin") - assert (res.node_status == "starting") + assert (res.node_status == "started") res = infinity1.show_admin_variables() logger.info(res.data) diff --git a/python/test_cluster/test_basic.py b/python/test_cluster/test_basic.py index 90898397a4..3955d9aba2 100644 --- a/python/test_cluster/test_basic.py +++ b/python/test_cluster/test_basic.py @@ -42,29 +42,29 @@ def test_0(cluster: InfinityCluster): cluster.remove_node("node1") -def test_mock(mock_cluster: MockInfinityCluster): - cluster = mock_cluster - with cluster: - cluster.add_node("node1", "conf/leader.toml") - cluster.add_node("node2", "conf/follower.toml") - - cluster.set_leader("node1") - cluster.set_follower("node2") - - time.sleep(1) - - cluster.disconnect("node2") - time.sleep(0.1) - cluster.reconnect("node2") - - cluster.block_peer_net("node2") - time.sleep(0.1) - cluster.restore_peer_net("node2") - - time.sleep(1) - - cluster.remove_node("node2") - cluster.remove_node("node1") +# def test_mock(mock_cluster: MockInfinityCluster): +# cluster = mock_cluster +# with cluster: +# cluster.add_node("node1", "conf/leader.toml") +# cluster.add_node("node2", "conf/follower.toml") +# +# cluster.set_leader("node1") +# cluster.set_follower("node2") +# +# time.sleep(1) +# +# cluster.disconnect("node2") +# time.sleep(0.1) +# cluster.reconnect("node2") +# +# cluster.block_peer_net("node2") +# time.sleep(0.1) +# cluster.restore_peer_net("node2") +# +# time.sleep(1) +# +# cluster.remove_node("node2") +# cluster.remove_node("node1") @pytest.mark.docker diff --git a/python/test_cluster/test_delete.py b/python/test_cluster/test_delete.py index 703a21e15e..67db17e64b 100644 --- a/python/test_cluster/test_delete.py +++ b/python/test_cluster/test_delete.py @@ -41,25 +41,25 @@ def test_delete(self, cluster: InfinityCluster): res = table_obj.delete("c1 = 1") assert res.error_code == ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (2, 3, 4), 'c2': (20, 30, 40), 'c3': (200, 300, 400)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) time.sleep(1) db_obj_2 = infinity2.get_database("default_db") table_obj_2 = db_obj_2.get_table("test_delete") - res = table_obj_2.output(["*"]).to_df() + res, extra_result = table_obj_2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (2, 3, 4), 'c2': (20, 30, 40), 'c3': (200, 300, 400)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) res = table_obj.delete() assert res.error_code == ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (), 'c2': (), 'c3': ()}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) - 
res = table_obj_2.output(["*"]).to_df() + res, extra_result = table_obj_2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (), 'c2': (), 'c3': ()}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) @@ -104,11 +104,11 @@ def test_delete_on_follower(self, cluster: InfinityCluster): print(e) assert(e.error_code == 8007) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (1, 2, 3, 4), 'c2': (10, 20, 30, 40), 'c3': (100, 200, 300, 400)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) - res = table_obj_2.output(["*"]).to_df() + res, extra_result = table_obj_2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (1, 2, 3, 4), 'c2': (10, 20, 30, 40), 'c3': (100, 200, 300, 400)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) diff --git a/python/test_cluster/test_import.py b/python/test_cluster/test_import.py index 2e1c7b732d..c718c02aa8 100644 --- a/python/test_cluster/test_import.py +++ b/python/test_cluster/test_import.py @@ -45,7 +45,7 @@ def test1(self, cluster: InfinityCluster): } ).astype({"c1": dtype("int32"), "c2": dtype("object")}) - res = table_obj1.output(["*"]).to_df() + res, extra_result = table_obj1.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) time.sleep(1) @@ -54,7 +54,7 @@ def test1(self, cluster: InfinityCluster): infinity2 = cluster.client("node2") db_obj2 = infinity2.get_database("default_db") table_obj2 = db_obj2.get_table(table_name) - res = table_obj2.output(["*"]).to_df() + res, extra_result = table_obj2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) db_obj1.drop_table(table_name) diff --git a/python/test_cluster/test_index.py b/python/test_cluster/test_index.py index 520523475a..7f0035ae21 100644 --- a/python/test_cluster/test_index.py +++ b/python/test_cluster/test_index.py @@ -61,12 +61,12 @@ def test1(self, cluster: InfinityCluster): "c2": ("text1", "text2"), } ).astype({"c1": dtype("int32"), "c2": dtype("object")}) - res1 = table_obj1.output(["*"]).filter("c1 < 3").to_df() + res1, extra_result = table_obj1.output(["*"]).filter("c1 < 3").to_df() pd.testing.assert_frame_equal(res1, res_gt) print("select in node2") time.sleep(1) - res2 = table_obj2.output(["*"]).filter("c1 < 3").to_df() + res2, extra_result = table_obj2.output(["*"]).filter("c1 < 3").to_df() # print(res2) pd.testing.assert_frame_equal(res2, res_gt) diff --git a/python/test_cluster/test_insert.py b/python/test_cluster/test_insert.py index 8cde4d111b..1ad7559806 100644 --- a/python/test_cluster/test_insert.py +++ b/python/test_cluster/test_insert.py @@ -42,7 +42,7 @@ def __test_inner_1(self, cluster: InfinityCluster): } ).astype({"c1": dtype("int32"), "c2": dtype("object")}) - res = table1.output(["*"]).to_df() + res, extra_result = table1.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) time.sleep(1) @@ -51,7 +51,7 @@ def __test_inner_1(self, cluster: InfinityCluster): infinity2 = cluster.client("node2") db2 = infinity2.get_database("default_db") table2 = db2.get_table(table_name) - res = table2.output(["*"]).to_df() + res, extra_result = table2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) res = db1.drop_table(table_name) @@ -60,8 +60,8 @@ def __test_inner_1(self, cluster: InfinityCluster): def test_insert_11(self, cluster: InfinityCluster): self.__test_inner_1(cluster) - def test_insert_12(self, 
mock_cluster: MockInfinityCluster): - self.__test_inner_1(mock_cluster) + # def test_insert_12(self, mock_cluster: MockInfinityCluster): + # self.__test_inner_1(mock_cluster) @pytest.mark.docker def test_insert_13(self, docker_cluster: DockerInfinityCluster): @@ -104,7 +104,7 @@ def test_insert_2(self, docker_cluster: DockerInfinityCluster): infinity2 = docker_cluster.client("node2") db2 = infinity2.get_database("default_db") table2 = db2.get_table(table_name) - res = table2.output(["*"]).to_df() + res, extra_result = table2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) docker_cluster.disconnect("node2") @@ -121,7 +121,7 @@ def noreturn_request(): docker_cluster.reconnect("node2") docker_cluster.disconnect("node1") - res = table2.output(["*"]).to_df() + res, extra_result = table2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) docker_cluster.reconnect("node1") @@ -169,7 +169,7 @@ def test_insert_3(self, docker_cluster: DockerInfinityCluster): infinity2 = docker_cluster.client("node2") db2 = infinity2.get_database("default_db") table2 = db2.get_table(table_name) - res = table2.output(["*"]).to_df() + res, extra_result = table2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) # reconnect leader @@ -189,7 +189,7 @@ def test_insert_3(self, docker_cluster: DockerInfinityCluster): ), } ).astype({"c1": dtype("int32"), "c2": dtype("object")}) - res = table2.output(["*"]).to_df() + res, extra_result = table2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) db1.drop_table(table_name) diff --git a/python/test_cluster/test_knn.py b/python/test_cluster/test_knn.py index ea8a9f5a41..c4cbff528f 100644 --- a/python/test_cluster/test_knn.py +++ b/python/test_cluster/test_knn.py @@ -47,13 +47,13 @@ def test_knn(self, cluster: InfinityCluster): res = table_obj.import_data(test_csv_dir, None) assert res.error_code == ErrorCode.OK - res = table_obj.output(["variant_id", "_row_id"]).match_dense("gender_vector", [1.0] * 4, "float", "ip", 10).to_pl() + res, extra_result = table_obj.output(["variant_id", "_row_id"]).match_dense("gender_vector", [1.0] * 4, "float", "ip", 10).to_pl() print(res) time.sleep(1) db_obj_2 = infinity2.get_database("default_db") table_obj_2 = db_obj_2.get_table("test_knn") - res = table_obj_2.output(["variant_id", "_row_id"]).match_dense("gender_vector", [1.0] * 4, "float", "ip", 10).to_pl() + res, extra_result = table_obj_2.output(["variant_id", "_row_id"]).match_dense("gender_vector", [1.0] * 4, "float", "ip", 10).to_pl() print(res) res = db_obj.drop_table("test_knn", ConflictType.Error) diff --git a/python/test_cluster/test_member_change.py b/python/test_cluster/test_member_change.py index 99ae559c79..1645e82c65 100644 --- a/python/test_cluster/test_member_change.py +++ b/python/test_cluster/test_member_change.py @@ -148,7 +148,7 @@ def verify_data(node_name: str): nonlocal insert_line infinity: infinity_http = cluster.client(node_name) table = infinity.get_database("default_db").get_table(table_name) - res = table.output(["*"]).to_df() + res, extra_result = table.output(["*"]).to_df() if res.shape[0] == insert_line + 1: insert_line += 1 logger.debug(f"test_i: {i}, verify data, node_name: {node_name}") diff --git a/python/test_cluster/test_select.py b/python/test_cluster/test_select.py index bca4b6e1f5..135bd5542a 100644 --- a/python/test_cluster/test_select.py +++ b/python/test_cluster/test_select.py @@ -9,6 +9,7 @@ from infinity.errors import ErrorCode import common_values + class TestSelect: def test_select(self, 
cluster: InfinityCluster): with cluster: @@ -32,48 +33,49 @@ def test_select(self, cluster: InfinityCluster): "c2": {"type": "int", "constraints": ["not null"]}}, ConflictType.Error) res = table_obj.insert( - [{"c1": -3, "c2": -3}, {"c1": -2, "c2": -2}, {"c1": -1, "c2": -1}, {"c1": 0, "c2": 0}, {"c1": 1, "c2": 1}, - {"c1": 2, "c2": 2}, {"c1": 3, "c2": 3}]) + [{"c1": -3, "c2": -3}, {"c1": -2, "c2": -2}, {"c1": -1, "c2": -1}, {"c1": 0, "c2": 0}, + {"c1": 1, "c2": 1}, + {"c1": 2, "c2": 2}, {"c1": 3, "c2": 3}]) assert res.error_code == ErrorCode.OK res = table_obj.insert( - [{"c1": -8, "c2": -8}, {"c1": -7, "c2": -7}, {"c1": -6, "c2": -6}, {"c1": 7, "c2": 7}, {"c1": 8, "c2": 8}, - {"c1": 9, "c2": 9}]) + [{"c1": -8, "c2": -8}, {"c1": -7, "c2": -7}, {"c1": -6, "c2": -6}, {"c1": 7, "c2": 7}, + {"c1": 8, "c2": 8}, + {"c1": 9, "c2": 9}]) assert res.error_code == ErrorCode.OK - - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9), - 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) - .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) + 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) + .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) - res = table_obj.output(["c1", "c2"]).to_df() + res, extra_result = table_obj.output(["c1", "c2"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9), - 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) - .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) + 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) + .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) - res = table_obj.output( + res, extra_result = table_obj.output( ["c1 + c2"]).filter("c1 = 3").to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'(c1 + c2)': (6,)}) - .astype({'(c1 + c2)': dtype('int32')})) + .astype({'(c1 + c2)': dtype('int32')})) time.sleep(1) db_obj_2 = infinity2.get_database("default_db") table_obj_2 = db_obj_2.get_table("test_select") - res = table_obj_2.output(["*"]).to_df() + res, extra_result = table_obj_2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9), - 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) - .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) + 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) + .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) - res = table_obj_2.output(["c1", "c2"]).to_df() + res, extra_result = table_obj_2.output(["c1", "c2"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9), - 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) - .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) + 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) + .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) - res = table_obj_2.output( + res, extra_result = table_obj_2.output( ["c1 + c2"]).filter("c1 = 3").to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'(c1 + c2)': (6,)}) - .astype({'(c1 + c2)': dtype('int32')})) + .astype({'(c1 + c2)': dtype('int32')})) res = db_obj.drop_table("test_select", ConflictType.Error) assert res.error_code == ErrorCode.OK diff --git a/python/test_cluster/test_single_node.py b/python/test_cluster/test_single_node.py index 0f8b6ae43f..7a0af66afb 100644 --- a/python/test_cluster/test_single_node.py +++ b/python/test_cluster/test_single_node.py 
@@ -81,7 +81,7 @@ def test_standalone2admin2leader2admin(cluster: InfinityCluster): test: (standalone, operations)->admin->(leader, operations)->admin ''' with cluster: - cluster.add_node("test", "conf/infinity_conf.toml") + cluster.add_node("test", "conf/infinity_minio_conf.toml") test_client = cluster.client("test") state = instance_state(test_client) assert test_client.show_current_node().node_role == "standalone" diff --git a/python/test_cluster/test_tc.py b/python/test_cluster/test_tc.py index 633b42c5e0..2800be92db 100644 --- a/python/test_cluster/test_tc.py +++ b/python/test_cluster/test_tc.py @@ -78,7 +78,7 @@ def test_tc1(cluster: InfinityCluster): } ).astype({"c1": dtype("int32"), "c2": dtype("object")}) - res = table1.output(["*"]).to_df() + res, extra_result = table1.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) time.sleep(1) @@ -87,7 +87,7 @@ def test_tc1(cluster: InfinityCluster): infinity2 = cluster.client("node2") db2 = infinity2.get_database("default_db") table2 = db2.get_table(table_name) - res = table2.output(["*"]).to_df() + res, extra_result = table2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) try: @@ -154,12 +154,12 @@ def test_tc1(cluster: InfinityCluster): } ).astype({"c1": dtype("int32"), "c2": dtype("object")}) - res = table1.output(["*"]).to_df() + res, extra_result = table1.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) db2 = infinity2.get_database("default_db") table2 = db2.get_table(table_name) - res = table2.output(["*"]).to_df() + res, extra_result = table2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) cluster.set_admin("node2") @@ -185,9 +185,9 @@ def test_tc1(cluster: InfinityCluster): assert (res.node_role == "leader") assert (res.node_status == "alive") - res = table1.output(["*"]).to_df() + res, extra_result = table1.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) - res = table2.output(["*"]).to_df() + res, extra_result = table2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) res = db1.drop_table(table_name, ConflictType.Ignore) @@ -282,7 +282,7 @@ def test_tc2(cluster: InfinityCluster): for server in [infinity1, infinity2, infinity3, infinity4]: db = server.get_database("default_db") table = db.get_table(table_name) - res = table.output(["*"]).to_df() + res, extra_result = table.output(["*"]).to_df() pd.testing.assert_frame_equal(res, res_gt) res = db1.drop_table(table_name) diff --git a/python/test_cluster/test_update.py b/python/test_cluster/test_update.py index a4f06f242e..4ece414a5c 100644 --- a/python/test_cluster/test_update.py +++ b/python/test_cluster/test_update.py @@ -40,7 +40,7 @@ def test_update(self, cluster: InfinityCluster): res = table_obj.update("c1 = 1", {"c2": 90, "c3": 900}) assert res.error_code == ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (2, 3, 4, 1), 'c2': (20, 30, 40, 90), 'c3': (200, 300, 400, 900)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) @@ -48,7 +48,7 @@ def test_update(self, cluster: InfinityCluster): time.sleep(1) db_obj_2 = infinity2.get_database("default_db") table_obj_2 = db_obj_2.get_table("test_update") - res = table_obj_2.output(["*"]).to_df() + res, extra_result = table_obj_2.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (2, 3, 4, 1), 'c2': (20, 30, 40, 90), 'c3': (200, 300, 400, 900)}) .astype({'c1': dtype('int32'), 
'c2': dtype('int32'), 'c3': dtype('int32')})) diff --git a/python/test_pysdk/test_alter.py b/python/test_pysdk/test_alter.py index be52eb233e..8892313068 100644 --- a/python/test_pysdk/test_alter.py +++ b/python/test_pysdk/test_alter.py @@ -54,12 +54,12 @@ def test_simple_add_columns(self): res = table_obj.insert([{"c1": 1, "c2": 2}]) assert res.error_code == infinity.ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame({"c1": [1], "c2": [2]}).astype( {"c1": dtype("int32"), "c2": dtype("int32")} - ), + ) ) res = table_obj.add_columns({"c2": {"type": "varchar", "default": "default"}}) @@ -71,17 +71,17 @@ def test_simple_add_columns(self): res = table_obj.add_columns({"c3": {"type": "varchar", "default": "default"}}) assert res.error_code == infinity.ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame({"c1": [1], "c2": [2], "c3": ["default"]}).astype( {"c1": dtype("int32"), "c2": dtype("int32"), "c3": dtype("object")} - ), + ) ) table_obj.insert([{"c1": 2, "c2": 3, "c3": "test"}]) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame( @@ -110,7 +110,7 @@ def test_simple_drop_columns(self): res = table_obj.insert([{"c1": 1, "c2": 2, "c3": "test"}]) assert res.error_code == infinity.ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame({"c1": [1], "c2": [2], "c3": ["test"]}).astype( @@ -124,7 +124,7 @@ def test_simple_drop_columns(self): res = table_obj.drop_columns("c2") assert res.error_code == infinity.ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame({"c1": [1], "c3": ["test"]}).astype( @@ -134,7 +134,7 @@ def test_simple_drop_columns(self): table_obj.insert([{"c1": 2, "c3": "test2"}]) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame({"c1": [1, 2], "c3": ["test", "test2"]}).astype( @@ -187,7 +187,7 @@ def test_insert_after_drop_columns(self): ] ) - result = table_obj.output(["*"]).to_df() + result, extra_result = table_obj.output(["*"]).to_df() print(result) pd.testing.assert_frame_equal( result, @@ -252,7 +252,7 @@ def test_add_drop_column_with_index(self): res = table_obj.add_columns({"c2": {"type": "varchar", "default": "test"}}) assert res.error_code == infinity.ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame({"c1": [1], "c3": ["test"], "c2": ["test"]}).astype( @@ -262,7 +262,7 @@ def test_add_drop_column_with_index(self): table_obj.insert([{"c1": 1, "c2": "t1", "c3": "t2"}]) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal( res, pd.DataFrame( diff --git a/python/test_pysdk/test_basic.py b/python/test_pysdk/test_basic.py index 409fa4415d..143728138e 100644 --- a/python/test_pysdk/test_basic.py +++ b/python/test_pysdk/test_basic.py @@ -175,12 +175,12 @@ def test_basic(self, check_data ,suffix): [{"c1": 1, "c2": 1.1}, {"c1": 2, "c2": 2.2}]) assert res.error_code == ErrorCode.OK # search - res = table_obj.output(["c1 + 
0.1"]).to_df() + res, extra_result = table_obj.output(["c1 + 0.1"]).to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'(c1 + 0.1)': (1.1, 2.1)}).astype( {'(c1 + 0.1)': dtype('float64')})) - res = table_obj.output( + res, extra_result = table_obj.output( ["*"]).filter("c1 > 1").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (2,), 'c2': (2.2,)}).astype( @@ -237,7 +237,7 @@ def test_basic(self, check_data ,suffix): res = export_table_obj.import_data(common_values.TEST_TMP_DIR + suffix + test_export_csv_file) assert res.error_code == ErrorCode.OK - res = table_obj.output(["c1"]).filter("c1 > 1").to_df() + res, extra_result = table_obj.output(["c1"]).filter("c1 > 1").to_df() print(res) res = db_obj.drop_table("my_table_export"+suffix) @@ -249,7 +249,7 @@ def test_basic(self, check_data ,suffix): res = export_table_obj.import_data(common_values.TEST_TMP_DIR + suffix + test_export_jsonl_file, import_options={"file_type": "jsonl"}) assert res.error_code == ErrorCode.OK - res = table_obj.output(["c1"]).filter("c1 > 1").to_df() + res, extra_result = table_obj.output(["c1"]).filter("c1 > 1").to_df() print(res) res = db_obj.drop_table("my_table_export"+suffix) assert res.error_code == ErrorCode.OK @@ -261,7 +261,7 @@ def test_basic(self, check_data ,suffix): res = export_table_obj.import_data(common_values.TEST_TMP_DIR + suffix + test_export_csv_file, import_options={"file_type": "csv"}) assert res.error_code == ErrorCode.OK - res = table_obj.output(["c1"]).filter("c1 > 1").to_df() + res, extra_result = table_obj.output(["c1"]).filter("c1 > 1").to_df() print(res) res = db_obj.drop_table("my_table_export"+suffix) assert res.error_code == ErrorCode.OK @@ -272,7 +272,7 @@ def test_basic(self, check_data ,suffix): os.remove(common_values.TEST_TMP_DIR + suffix + test_export_jsonl_file_part) # search - res = table_obj.output( + res, extra_result = table_obj.output( ["c1"]).filter("c1 > 1").to_df() print(res) res = db_obj.drop_table("my_table4"+suffix) diff --git a/python/test_pysdk/test_convert.py b/python/test_pysdk/test_convert.py index 1c4b4803f7..9ed5c7ef35 100644 --- a/python/test_pysdk/test_convert.py +++ b/python/test_pysdk/test_convert.py @@ -49,11 +49,11 @@ def test_to_pl(self, suffix): table_obj = db_obj.get_table("test_to_pl"+suffix) table_obj.insert([{"c1": 1, "c2": 2}]) print() - res = table_obj.output(["c1", "c2"]).to_pl() + res, extra_result = table_obj.output(["c1", "c2"]).to_pl() print(res) - res = table_obj.output(["c1", "c1"]).to_pl() + res, extra_result = table_obj.output(["c1", "c1"]).to_pl() print(res) - res = table_obj.output(["c1", "c2", "c1"]).to_pl() + res, extra_result = table_obj.output(["c1", "c2", "c1"]).to_pl() print(res) db_obj.drop_table("test_to_pl"+suffix, ConflictType.Error) def test_to_pa(self, suffix): @@ -65,11 +65,11 @@ def test_to_pa(self, suffix): table_obj = db_obj.get_table("test_to_pa"+suffix) table_obj.insert([{"c1": 1, "c2": 2.0}]) print() - res = table_obj.output(["c1", "c2"]).to_arrow() + res, extra_result = table_obj.output(["c1", "c2"]).to_arrow() print(res) - res = table_obj.output(["c1", "c1"]).to_arrow() + res, extra_result = table_obj.output(["c1", "c1"]).to_arrow() print(res) - res = table_obj.output(["c1", "c2", "c1"]).to_arrow() + res, extra_result = table_obj.output(["c1", "c2", "c1"]).to_arrow() print(res) db_obj.drop_table("test_to_pa"+suffix, ConflictType.Error) def test_to_df(self, suffix): @@ -81,11 +81,11 @@ def test_to_df(self, suffix): table_obj = db_obj.get_table("test_to_df"+suffix) 
table_obj.insert([{"c1": 1, "c2": 2.0}]) print() - res = table_obj.output(["c1", "c2"]).to_df() + res, extra_result = table_obj.output(["c1", "c2"]).to_df() print(res) - res = table_obj.output(["c1", "c1"]).to_df() + res, extra_result = table_obj.output(["c1", "c1"]).to_df() print(res) - res = table_obj.output(["c1", "c2", "c1"]).to_df() + res, extra_result = table_obj.output(["c1", "c2", "c1"]).to_df() print(res) db_obj.drop_table("test_to_df"+suffix, ConflictType.Error) @@ -102,8 +102,8 @@ def test_without_output_select_list(self, suffix): with pytest.raises(InfinityException) as e: insert_res_df = table_obj.output([]).to_df() - insert_res_arrow = table_obj.output([]).to_arrow() - insert_res_pl = table_obj.output([]).to_pl() + insert_res_arrow, extra_result = table_obj.output([]).to_arrow() + insert_res_pl, extra_result = table_obj.output([]).to_pl() print(insert_res_df, insert_res_arrow, insert_res_pl) assert e.value.args[0] == ErrorCode.EMPTY_SELECT_FIELDS @@ -129,7 +129,7 @@ def test_convert_test_with_valid_select_list_output(self, condition_list, suffix {"c1": 1000, "c2": 2.0}, {"c1": 10000, "c2": 2.0}]) - insert_res_df = table_obj.output(["c1", condition_list]).to_pl() + insert_res_df, extra_result = table_obj.output(["c1", condition_list]).to_pl() print(insert_res_df) db_obj.drop_table("test_with_valid_select_list_output"+suffix, ConflictType.Error) @@ -151,7 +151,7 @@ def test_convert_test_with_invalid_select_list_output(self, condition_list, suff {"c1": 10000, "c2": 2.0}]) with pytest.raises(Exception): - insert_res_df = table_obj.output(["c1", condition_list]).to_pl() + insert_res_df, extra_result = table_obj.output(["c1", condition_list]).to_pl() print(insert_res_df) db_obj.drop_table("test_with_invalid_select_list_output"+suffix, ConflictType.Error) @@ -180,7 +180,7 @@ def test_convert_test_output_with_valid_filter_function(self, filter_list, suffi {"c1": 1000, "c2": 2.0}, {"c1": 10000, "c2": 2.0}]) # TODO add more filter function - insert_res_df = InfinityThriftQueryBuilder(table_obj).output(["*"]).filter(filter_list).to_pl() + insert_res_df, extra_result = InfinityThriftQueryBuilder(table_obj).output(["*"]).filter(filter_list).to_pl() print(str(insert_res_df)) db_obj.drop_table("test_output_with_valid_filter_function"+suffix, ConflictType.Error) @@ -209,7 +209,7 @@ def test_convert_test_output_with_invalid_filter_function(self, filter_list, suf {"c1": 10000, "c2": 2.0}]) # TODO add more filter function with pytest.raises(Exception) as e: - insert_res_df = InfinityThriftQueryBuilder(table_obj).output(["*"]).filter(filter_list).to_pl() + insert_res_df, extra_result = InfinityThriftQueryBuilder(table_obj).output(["*"]).filter(filter_list).to_pl() print(str(insert_res_df)) print(e.type) diff --git a/python/test_pysdk/test_delete.py b/python/test_pysdk/test_delete.py index 45e0e29d2a..ddf99d3829 100644 --- a/python/test_pysdk/test_delete.py +++ b/python/test_pysdk/test_delete.py @@ -100,7 +100,7 @@ def test_delete(self, suffix): assert res.error_code == ErrorCode.OK assert res.deleted_rows == 1 - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (2, 3, 4), 'c2': (20, 30, 40), 'c3': (200, 300, 400)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) @@ -108,7 +108,7 @@ def test_delete(self, suffix): assert res.error_code == ErrorCode.OK assert res.deleted_rows == 3 - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() 
pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (), 'c2': (), 'c3': ()}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) @@ -192,7 +192,7 @@ def test_delete_table_no_rows_met_condition(self,suffix): except Exception as e: print(e) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() print("{}:{}".format(common_values.types_array[i], res)) assert tb @@ -207,7 +207,7 @@ def test_delete_table_with_one_block(self, suffix): # insert values = [{"c1": 1} for _ in range(8192)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # delete @@ -215,7 +215,7 @@ def test_delete_table_with_one_block(self, suffix): assert res.error_code == ErrorCode.OK assert res.deleted_rows == 8192 - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) db_obj.drop_table("test_delete_table_with_one_block"+suffix, ConflictType.Error) @@ -230,13 +230,13 @@ def test_delete_table_with_one_segment(self, suffix): for i in range(1024): values = [{"c1": i} for _ in range(10)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # delete for i in range(1024): table_obj.delete("c1 = " + str(i)) - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() db_obj.drop_table("test_delete_table_with_one_segment"+suffix, ConflictType.Error) print(delete_res) @@ -250,7 +250,7 @@ def test_select_before_after_delete(self, suffix): for i in range(10): values = [{"c1": i} for _ in range(10)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # delete @@ -258,7 +258,7 @@ def test_select_before_after_delete(self, suffix): assert res.error_code == ErrorCode.OK assert res.deleted_rows == 10 - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) db_obj.drop_table("test_select_before_after_delete"+suffix, ConflictType.Error) @@ -277,7 +277,7 @@ def test_delete_insert_data(self, suffix): assert res.error_code == ErrorCode.OK assert res.deleted_rows == 10 - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) db_obj.drop_table("test_delete_insert_data"+suffix, ConflictType.Error) @@ -301,7 +301,7 @@ def test_delete_inserted_long_before_data(self, suffix): assert res.error_code == ErrorCode.OK assert res.deleted_rows == 5 - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) db_obj.drop_table("test_delete_inserted_long_before_data"+suffix, ConflictType.Error) @@ -346,11 +346,11 @@ def test_various_expression_in_where_clause(self, column_types, column_types_exa try: table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) table_obj.delete("c1 = " + str(column_types_example)) - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) except Exception as e: print(e) @@ -368,7 +368,7 @@ def test_delete_one_block_without_expression(self, suffix): # insert values = [{"c1": 1} for _ in range(8192)] 
table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # delete @@ -376,7 +376,7 @@ def test_delete_one_block_without_expression(self, suffix): assert res.error_code == ErrorCode.OK assert res.deleted_rows == 8192 - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) res = db_obj.drop_table("test_delete_one_block_without_expression"+suffix, ConflictType.Error) assert res.error_code == ErrorCode.OK @@ -392,7 +392,7 @@ def test_delete_one_segment_without_expression(self,suffix): for i in range(1024): values = [{"c1": i} for _ in range(10)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # delete @@ -400,7 +400,7 @@ def test_delete_one_segment_without_expression(self,suffix): assert res.error_code == ErrorCode.OK assert res.deleted_rows == 10240 - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) db_obj.drop_table("test_delete_one_segment_without_expression"+suffix, ConflictType.Error) @@ -424,14 +424,14 @@ def test_filter_with_valid_expression(self, filter_list, suffix): for i in range(10): values = [{"c1": i, "c2": 3.0} for _ in range(10)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # delete res = table_obj.delete(filter_list) assert res.error_code == ErrorCode.OK - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) db_obj.drop_table("test_filter_expression"+suffix, ConflictType.Error) @@ -455,13 +455,13 @@ def test_filter_with_invalid_expression(self, filter_list, suffix): for i in range(10): values = [{"c1": i, "c2": 3.0} for _ in range(10)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # delete # TODO: Detailed error information check with pytest.raises(Exception): table_obj.delete(filter_list) - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) db_obj.drop_table("test_filter_expression"+suffix, ConflictType.Error) \ No newline at end of file diff --git a/python/test_pysdk/test_export.py b/python/test_pysdk/test_export.py index d7ec702b97..6eb599b627 100644 --- a/python/test_pysdk/test_export.py +++ b/python/test_pysdk/test_export.py @@ -74,7 +74,7 @@ def test_export_csv(self, suffix): table_obj = db_obj.create_table("test_export_csv"+suffix, {"doctitle": {"type": "varchar"}, "docdate": {"type": "varchar"}, "body": {"type": "varchar"}, "num": {"type": "integer"}, "vec": {"type": "vector, 4, float"}}) res = table_obj.import_data(test_csv_dir, import_options={"file_type": "csv", "delimiter" : "\t"}) assert res.error_code == ErrorCode.OK - res = table_obj.output(["count(*)"]).to_pl() + res, extra_result = table_obj.output(["count(*)"]).to_pl() print(res) test_export_csv_file_path = common_values.TEST_TMP_DIR + suffix +"test_export_csv.csv" @@ -124,7 +124,7 @@ def test_export_jsonl(self, suffix): table_obj = db_obj.create_table("test_export_jsonl"+suffix, {"doctitle": {"type": "varchar"}, "docdate": {"type": "varchar"}, "body": {"type": "varchar"}, "num": {"type": "integer"}, "vec": {"type": "vector, 
4, float"}}) res = table_obj.import_data(test_csv_dir, import_options={"file_type": "csv", "delimiter" : "\t"}) assert res.error_code == ErrorCode.OK - res = table_obj.output(["count(*)"]).to_pl() + res, extra_result = table_obj.output(["count(*)"]).to_pl() print(res) test_export_jsonl_file_path = common_values.TEST_TMP_DIR + suffix + "test_export_jsonl.jsonl" @@ -174,7 +174,7 @@ def test_export_fvecs(self, suffix): table_obj = db_obj.create_table("test_export_fvecs"+suffix, {"doctitle": {"type": "varchar"}, "docdate": {"type": "varchar"}, "body": {"type": "varchar"}, "num": {"type": "integer"}, "vec": {"type": "vector, 4, float"}}) res = table_obj.import_data(test_csv_dir, import_options={"file_type": "csv", "delimiter" : "\t"}) assert res.error_code == ErrorCode.OK - res = table_obj.output(["count(*)"]).to_pl() + res, extra_result = table_obj.output(["count(*)"]).to_pl() print(res) test_export_fvecs_file_path = common_values.TEST_TMP_DIR + suffix + "test_export_fvecs.fvecs" diff --git a/python/test_pysdk/test_index.py b/python/test_pysdk/test_index.py index e4fb2eb796..6be7d62e82 100644 --- a/python/test_pysdk/test_index.py +++ b/python/test_pysdk/test_index.py @@ -166,7 +166,7 @@ def test_drop_index_fulltext(self, suffix): res = table_obj.create_index("my_index", index.IndexInfo("body", index.IndexType.FullText), ConflictType.Error) assert res.error_code == ErrorCode.OK # fulltext search when index is created: expect success - res = table_obj.output(["doctitle", "_score"]).match_text("body^5", "harmful chemical", 3).to_pl() + res, extra_result = table_obj.output(["doctitle", "_score"]).match_text("body^5", "harmful chemical", 3).to_pl() print(res) res = table_obj.drop_index("my_index") assert res.error_code == ErrorCode.OK @@ -640,13 +640,13 @@ def test_insert_data_fulltext_index_search(self, file_format, suffix): "docdate": data["docdate"][i], "body": data["body"][i]}) table_obj.insert(value) time.sleep(5) - res = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( + res, extra_result = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( "body^5", "harmful chemical", 3).to_pl() assert not res.is_empty() print(res) # Check if highlight work - res = table_obj.output(["doctitle", "docdate", "body", "_row_id", "_score"]).highlight(["body"]).match_text( + res, extra_result = table_obj.output(["doctitle", "docdate", "body", "_row_id", "_score"]).highlight(["body"]).match_text( "body^5", "harmful chemical", 3).to_pl() assert not res.is_empty() for body in res["body"].to_list(): @@ -700,12 +700,12 @@ def test_empty_fulltext_index(self, file_format, suffix): index.IndexType.FullText)) assert res.error_code == ErrorCode.OK - res = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( + res, extra_result = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( "body^5", "harmful chemical", 3).to_pl() assert not res.is_empty() print(res) - res = table_obj.output(["doctitle", "docdate", "body2", "_row_id", "_score"]).match_text( + res, extra_result = table_obj.output(["doctitle", "docdate", "body2", "_row_id", "_score"]).match_text( "body2^5", "harmful chemical", 3).to_pl() assert res.is_empty() print(res) @@ -750,12 +750,12 @@ def test_create_index_on_deleted_table(self, suffix): embedding_data = [i for i in range(128)] value = [{"c1": embedding_data} for _ in range(1024)] table_obj.insert(value) - res = table_obj.output(["*"]).to_pl() + res, extra_result = table_obj.output(["*"]).to_pl() print(res) # delete 
diff --git a/python/test_pysdk/test_insert.py b/python/test_pysdk/test_insert.py
index 143384eb13..ffcf908186 100644
--- a/python/test_pysdk/test_insert.py
+++ b/python/test_pysdk/test_insert.py
@@ -101,7 +101,7 @@ def _test_insert_basic(self, suffix):
         res = table_obj.insert([{"c2": 3, "c1": 3}, {"c1": 4, "c2": 4}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (0, 1, 2, 3, 4), 'c2': (0, 1, 2, 3, 4)})
                                       .astype({'c1': dtype('int32'), 'c2': dtype('int32')}))
@@ -121,7 +121,7 @@ def _test_insert_bool(self, suffix):
         assert table_obj
         res = table_obj.insert([{"c1": -1, "c2": True}, {"c1": 2, "c2": False}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-1, 2), 'c2': (True, False)}).astype(
             {'c1': dtype('float32'), 'c2': dtype('bool')}))
@@ -145,7 +145,7 @@ def _test_insert_bool(self, suffix):
         assert table_instance
         res = table_instance.insert({"c1": 1, "c7": "Tom"})
         assert res.error_code == ErrorCode.OK
-        res = table_instance.output(["*"]).to_df()
+        res, extra_result = table_instance.output(["*"]).to_df()
         print(res)
         pd.testing.assert_frame_equal(res, pd.DataFrame(
             {'c1': (1,), 'c2': (0,), 'c3': (0,), 'c4': (0,), 'c5': (0,), 'c6': (0,), 'c7': ("Tom",), 'c8': (1.0,),
@@ -171,7 +171,7 @@ def _test_insert_float16_bfloat16(self, suffix):
             [{"c1": -1, "c2": 1, "c3": -1}, {"c1": 2, "c2": -2, "c3": 2}, {"c1": -3, "c2": 3, "c3": -3},
              {"c1": 4, "c2": -4, "c3": 4}, {"c1": -5, "c2": 5, "c3": -5}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         pd.testing.assert_frame_equal(res, pd.DataFrame(
             {'c1': (-1, 2, -3, 4, -5), 'c2': (1, -2, 3, -4, 5), 'c3': (-1, 2, -3, 4, -5)}).astype(
@@ -197,7 +197,7 @@ def _test_insert_varchar(self, suffix):
         res = table_obj.insert([{"c1": "^789$ test insert varchar"}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ("test_insert_varchar", " test insert varchar ",
                                                                 "^789$ test insert varchar")}))
         res = db_obj.drop_table("test_insert_varchar"+suffix, ConflictType.Error)
@@ -217,7 +217,7 @@ def _test_insert_big_varchar(self, suffix):
             res = table_obj.insert([{"c1": "test_insert_big_varchar" * 1000}])
             assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame(
             {'c1': ["test_insert_big_varchar" * 1000] * 100}))
@@ -242,13 +242,13 @@ def _test_insert_embedding(self, suffix):
         assert res.error_code == ErrorCode.OK
         res = table_obj.insert([{"c1": [-7, -8, -9]}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame(
             {'c1': ([1, 2, 3], [4, 5, 6], [7, 8, 9], [-7, -8, -9])}))
         res = table_obj.insert([{"c1": [1, 2, 3]}, {"c1": [4, 5, 6]}, {
             "c1": [7, 8, 9]}, {"c1": [-7, -8, -9]}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ([1, 2, 3], [4, 5, 6], [7, 8, 9], [-7, -8, -9],
                                                                 [1, 2, 3], [4, 5, 6], [7, 8, 9], [-7, -8, -9])}))
@@ -269,7 +269,7 @@ def _test_insert_embedding(self, suffix):
         res = table_obj.insert([{"c1": embedding_insert_float[3]}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame(
             {'c1': embedding_insert_float}))
@@ -288,7 +288,7 @@ def _test_insert_embedding(self, suffix):
         assert res.error_code == ErrorCode.OK
         res = table_obj.insert([{"c1": embedding_insert_float[3]}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         pd.testing.assert_frame_equal(res, pd.DataFrame(
             {'c1': [np.array(x).astype(np.float16).tolist() for x in embedding_insert_float]}))
@@ -307,7 +307,7 @@ def _test_insert_embedding(self, suffix):
         assert res.error_code == ErrorCode.OK
         res = table_obj.insert([{"c1": embedding_insert_float[3]}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         tmp_bf16 = np.array(embedding_insert_float).astype(' 1").to_df()
+        res, extra_result = table_obj.output(["c1"]).filter("c1 > 1").to_df()
         print(res)
         res = db_obj.drop_table("test_import"+suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
@@ -107,14 +107,14 @@ def test_import_different_file_format_data(self, file_format, check_data, suffix
             table_obj = db_obj.create_table("test_import_different_file_format_data"+suffix,
                                             {"c1": {"type": "vector,128,float"}}, ConflictType.Error)
             table_obj.import_data(common_values.TEST_TMP_DIR + file_name, {"file_type": file_format})
-            res = table_obj.output(["*"]).to_df()
+            res, extra_result = table_obj.output(["*"]).to_df()
             print(res)
         else:
             print(common_values.TEST_DATA_DIR + file_format + "/pysdk_test." + file_format)
             table_obj.import_data(
                 os.getcwd() + common_values.TEST_DATA_DIR + file_format + "/pysdk_test." + file_format,
                 {"file_type": file_format})
-            res = table_obj.output(["*"]).to_df()
+            res, extra_result = table_obj.output(["*"]).to_df()
             print(res)
         res = db_obj.drop_table("test_import_different_file_format_data"+suffix, ConflictType.Error)
@@ -128,7 +128,7 @@ def test_import_empty_file_fvecs(self, file_format, suffix):
                                         {"c1": {"type": "vector,128,float"}}, ConflictType.Error)
         table_obj.import_data(os.getcwd() + common_values.TEST_DATA_DIR + file_format + "/test_empty." + file_format)
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_empty_file_fvecs"+suffix, ConflictType.Error)
@@ -140,7 +140,7 @@ def test_import_empty_file_csv(self, file_format, suffix):
                                         {"c1": {"type": "int"}, "c2": {"type": "vector,3,int"}}, ConflictType.Error)
         table_obj.import_data(os.getcwd() + common_values.TEST_DATA_DIR + file_format + "/test_empty." + file_format)
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_empty_file_csv"+suffix, ConflictType.Error)
@@ -152,7 +152,7 @@ def test_import_empty_file_jsonl(self, file_format, suffix):
                                         {"c1": {"type": "int"}, "c2": {"type": "vector,3,int"}}, ConflictType.Error)
         table_obj.import_data(os.getcwd() + common_values.TEST_DATA_DIR + file_format + "/test_empty." + file_format)
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_empty_file_jsonl"+suffix, ConflictType.Error)
@@ -170,7 +170,7 @@ def test_import_format_unrecognized_data(self, file_format, suffix):
                 os.getcwd() + common_values.TEST_DATA_DIR + file_format + "/pysdk_test." + file_format,
                 {"file_type": file_format})
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_format_unrecognized_data"+suffix, ConflictType.Error)
@@ -206,7 +206,7 @@ def test_csv_with_different_delimiter(self, check_data, delimiter, types, suffix
                                   import_options={
                                       "delimiter": delimiter[1]
                                   })
-            res = table_obj.output(["*"]).to_df()
+            res, extra_result = table_obj.output(["*"]).to_df()
             print(res)
             db_obj.drop_table("test_csv_with_different_delimiter"+suffix, ConflictType.Error)
         else:
@@ -235,7 +235,7 @@ def test_csv_with_different_delimiter_more_than_one_character(self, check_data,
         table_obj.import_data(common_values.TEST_TMP_DIR + "pysdk_test_" + delimiter + ".csv",
                               import_options={"delimiter": " "})
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_csv_with_different_delimiter_more_than_one_character"+suffix, ConflictType.Error)
@@ -251,7 +251,7 @@ def test_import_csv_with_headers(self, check_data, has_header, suffix):
                                         ConflictType.Error)
         table_obj.import_data(common_values.TEST_TMP_DIR + "pysdk_test_commas.csv",
                               import_options={"header": has_header})
-        res = table_obj.output(["*"]).to_pl()
+        res, extra_result = table_obj.output(["*"]).to_pl()
         print(res)
         db_obj.drop_table("test_import_csv_with_headers"+suffix, ConflictType.Error)
@@ -275,7 +275,7 @@ def test_import_fvecs_table_with_more_columns(self, check_data, suffix):
             assert e.type == InfinityException
             assert e.value.args[0] == ErrorCode.IMPORT_FILE_FORMAT_ERROR
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_fvecs_table_with_more_columns"+suffix, ConflictType.Error)
@@ -298,7 +298,7 @@ def test_import_embedding_with_not_match_definition(self, check_data, types, suf
         res = table_obj.import_data(test_csv_dir, import_options={"file_type": "csv"})
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_embedding_with_not_match_definition"+suffix, ConflictType.Error)
@@ -321,7 +321,7 @@ def test_import_embedding_with_dimension_unmatch(self, check_data, types, suffix
             assert e.type == InfinityException
             assert e.value.args[0] == ErrorCode.IMPORT_FILE_FORMAT_ERROR
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_embedding_with_not_match_definition"+suffix, ConflictType.Error)
@@ -341,7 +341,7 @@ def test_import_embedding_with_unmatched_elem_type(self, check_data, types, suff
         test_csv_dir = common_values.TEST_TMP_DIR + "embedding_int_dim3.csv"
         res = table_obj.import_data(test_csv_dir, import_options={"file_type": "csv"})
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_embedding_with_not_match_definition"+suffix, ConflictType.Ignore)
@@ -359,7 +359,7 @@ def test_import_varchar_with_not_match_definition(self, check_data, suffix):
         res = table_obj.import_data(test_csv_dir)
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_pl()
+        res, extra_result = table_obj.output(["*"]).to_pl()
         print(res)
         db_obj.drop_table("test_import_varchar_with_not_match_definition"+suffix, ConflictType.Error)
@@ -379,7 +379,7 @@ def test_import_10000_columns(self, check_data, suffix):
         res = table_obj.import_data(test_csv_dir)
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_import_10000_columns"+suffix, ConflictType.Error)
@@ -403,7 +403,7 @@ def test_table_with_not_matched_columns(self, columns, check_data, suffix):
             assert e.type == InfinityException
             assert e.value.args[0] == ErrorCode.COLUMN_COUNT_MISMATCH or e.value.args[0] == ErrorCode.IMPORT_FILE_FORMAT_ERROR
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         db_obj.drop_table("test_table_with_not_matched_columns"+suffix, ConflictType.Error)
@@ -423,7 +423,7 @@ def test_import_with_different_size(self, check_data, data_size, suffix):
         res = table_obj.import_data(test_csv_dir)
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["count(*)"]).to_pl()
+        res, extra_result = table_obj.output(["count(*)"]).to_pl()
         assert res.height == 1 and res.width == 1 and res.item(0, 0) == data_size
         db_obj.drop_table("test_import_with_different_size"+suffix, ConflictType.Ignore)
@@ -443,7 +443,7 @@ def test_import_exceeding_rows(self, check_data, suffix):
         res = table_obj.import_data(test_csv_dir)
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["count(*)"]).to_pl()
+        res, extra_result = table_obj.output(["count(*)"]).to_pl()
         assert res.height == 1 and res.width == 1 and res.item(0, 0) == 1024 * 8192
         db_obj.drop_table("test_import_exceeding_rows"+suffix, ConflictType.Error)
@@ -499,7 +499,7 @@ def test_import_jsonl_file_with_default(self, check_data, suffix):
         test_csv_dir = common_values.TEST_TMP_DIR + "test_default.jsonl"
         res = table_obj.import_data(test_csv_dir, import_options={"file_type": "jsonl"})
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_pl()
+        res, extra_result = table_obj.output(["*"]).to_pl()
         print(res)
         db_obj.drop_table("test_import_jsonl_file_with_default"+suffix, ConflictType.Error)
@@ -531,7 +531,7 @@ def test_import_csv_file_with_default(self, check_data, suffix):
         test_csv_dir = common_values.TEST_TMP_DIR + "pysdk_test_import_default.csv"
         res = table_obj.import_data(test_csv_dir, import_options={"file_type": "csv"})
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_pl()
+        res, extra_result = table_obj.output(["*"]).to_pl()
         print(res)
         db_obj.drop_table("test_import_csv_file_with_default"+suffix, ConflictType.Error)
@@ -566,6 +566,6 @@ def test_import_json_file_with_default(self, check_data, suffix):
         test_csv_dir = common_values.TEST_TMP_DIR + "pysdk_test_default.json"
         res = table_obj.import_data(test_csv_dir, import_options={"file_type": "json"})
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_pl()
+        res, extra_result = table_obj.output(["*"]).to_pl()
         print(res)
         db_obj.drop_table("test_import_json_file_with_default"+suffix, ConflictType.Error)
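The `count(*)` checks show why the unpack matters in `test_import.py`: `height`, `width`, and `item` are attributes of the polars frame, so they must be read off the first element of the pair. A hedged sketch (`data_size` stands in for the expected row count):

```python
res, extra_result = table_obj.output(["count(*)"]).to_pl()
assert res.height == 1 and res.width == 1
assert res.item(0, 0) == data_size  # the single aggregate cell
```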
res = table_obj.output(["*"]).to_pl() + res, extra_result = table_obj.output(["*"]).to_pl() print(res) db_obj.drop_table("test_import_csv_file_with_default"+suffix, ConflictType.Error) @@ -566,6 +566,6 @@ def test_import_json_file_with_default(self, check_data, suffix): test_csv_dir = common_values.TEST_TMP_DIR + "pysdk_test_default.json" res = table_obj.import_data(test_csv_dir, import_options={"file_type": "json"}) assert res.error_code == ErrorCode.OK - res = table_obj.output(["*"]).to_pl() + res, extra_result = table_obj.output(["*"]).to_pl() print(res) db_obj.drop_table("test_import_json_file_with_default"+suffix, ConflictType.Error) diff --git a/python/test_pysdk/test_query.py b/python/test_pysdk/test_query.py index 1e2789b830..140ffd12cb 100644 --- a/python/test_pysdk/test_query.py +++ b/python/test_pysdk/test_query.py @@ -85,7 +85,7 @@ def test_query(self): query_builder.match_dense('vec', [3.0] * 5, 'float', 'ip', 2) query_builder.match_text('body', 'harmful', 2, None) query_builder.fusion(method='rrf', topn=10, fusion_params=None) - res = query_builder.to_df() + res, extra_result = query_builder.to_df() print(res) res = table.drop_index("my_index", ConflictType.Error) assert res.error_code == ErrorCode.OK diff --git a/python/test_pysdk/test_select.py b/python/test_pysdk/test_select.py index 850fd842d7..18c41e4ff8 100644 --- a/python/test_pysdk/test_select.py +++ b/python/test_pysdk/test_select.py @@ -11,6 +11,7 @@ from numpy import dtype from infinity.errors import ErrorCode from infinity.common import ConflictType, SortType + current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) if parent_dir not in sys.path: @@ -19,14 +20,17 @@ from common.utils import copy_data from datetime import date, time, datetime + @pytest.fixture(scope="class") def local_infinity(request): return request.config.getoption("--local-infinity") + @pytest.fixture(scope="class") def http(request): return request.config.getoption("--http") + @pytest.fixture(scope="class") def setup_class(request, local_infinity, http): if local_infinity: @@ -49,6 +53,7 @@ def setup_class(request, local_infinity, http): yield request.cls.infinity_obj.disconnect() + @pytest.mark.usefixtures("setup_class") @pytest.mark.usefixtures("suffix") class TestInfinity: @@ -116,9 +121,9 @@ def test_select(self, suffix): db_obj = self.infinity_obj.get_database("default_db") # infinity - db_obj.drop_table("test_select"+suffix, ConflictType.Ignore) + db_obj.drop_table("test_select" + suffix, ConflictType.Ignore) table_obj = db_obj.create_table( - "test_select"+suffix, { + "test_select" + suffix, { "c1": {"type": "int", "constraints": ["primary key", "not null"]}, "c2": {"type": "int", "constraints": ["not null"]}}, ConflictType.Error) @@ -134,77 +139,75 @@ def test_select(self, suffix): {"c1": 9, "c2": 9}]) assert res.error_code == ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9), 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) - res = table_obj.output(["c1", "c2"]).to_df() + res, extra_result = table_obj.output(["c1", "c2"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9), 'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) - res = table_obj.output( + res, 
diff --git a/python/test_pysdk/test_select.py b/python/test_pysdk/test_select.py
index 850fd842d7..18c41e4ff8 100644
--- a/python/test_pysdk/test_select.py
+++ b/python/test_pysdk/test_select.py
@@ -11,6 +11,7 @@ from numpy import dtype
 from infinity.errors import ErrorCode
 from infinity.common import ConflictType, SortType
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 parent_dir = os.path.dirname(current_dir)
 if parent_dir not in sys.path:
@@ -19,14 +20,17 @@ from common.utils import copy_data
 from datetime import date, time, datetime
+
 @pytest.fixture(scope="class")
 def local_infinity(request):
     return request.config.getoption("--local-infinity")
+
 @pytest.fixture(scope="class")
 def http(request):
     return request.config.getoption("--http")
+
 @pytest.fixture(scope="class")
 def setup_class(request, local_infinity, http):
     if local_infinity:
@@ -49,6 +53,7 @@ def setup_class(request, local_infinity, http):
     yield
     request.cls.infinity_obj.disconnect()
+
 @pytest.mark.usefixtures("setup_class")
 @pytest.mark.usefixtures("suffix")
 class TestInfinity:
@@ -116,9 +121,9 @@ def test_select(self, suffix):
         db_obj = self.infinity_obj.get_database("default_db")
         # infinity
-        db_obj.drop_table("test_select"+suffix, ConflictType.Ignore)
+        db_obj.drop_table("test_select" + suffix, ConflictType.Ignore)
         table_obj = db_obj.create_table(
-            "test_select"+suffix, {
+            "test_select" + suffix, {
                 "c1": {"type": "int", "constraints": ["primary key", "not null"]},
                 "c2": {"type": "int", "constraints": ["not null"]}}, ConflictType.Error)
@@ -134,77 +139,75 @@ def test_select(self, suffix):
              {"c1": 9, "c2": 9}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9),
                                                          'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)})
                                       .astype({'c1': dtype('int32'), 'c2': dtype('int32')}))
-        res = table_obj.output(["c1", "c2"]).to_df()
+        res, extra_result = table_obj.output(["c1", "c2"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9),
                                                          'c2': (-3, -2, -1, 0, 1, 2, 3, -8, -7, -6, 7, 8, 9)})
                                       .astype({'c1': dtype('int32'), 'c2': dtype('int32')}))
-        res = table_obj.output(
+        res, extra_result = table_obj.output(
             ["c1 + c2"]).filter("c1 = 3").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'(c1 + c2)': (6,)})
                                       .astype({'(c1 + c2)': dtype('int32')}))
-        res = table_obj.output(
+        res, extra_result = table_obj.output(
            ["c1"]).filter("c1 > 2 and c2 < 4").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (3,)})
                                       .astype({'c1': dtype('int32')}))
-        res = table_obj.output(["c2"]).filter(
-            "(-7 < c1 or 9 <= c1) and (c1 = 3)").to_df()
+        res, extra_result = table_obj.output(["c2"]).filter("(-7 < c1 or 9 <= c1) and (c1 = 3)").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c2': (3,)})
                                       .astype({'c2': dtype('int32')}))
-        res = table_obj.output(["c2"]).filter(
-            "(-8 < c1 and c1 <= -7) or (c1 >= 1 and 2 > c1)").to_df()
+        res, extra_result = table_obj.output(["c2"]).filter("(-8 < c1 and c1 <= -7) or (c1 >= 1 and 2 > c1)").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c2': (1, -7)})
                                       .astype({'c2': dtype('int32')}))
-        res = table_obj.output(["c2"]).filter(
+        res, extra_result = table_obj.output(["c2"]).filter(
             "((c1 >= -8 and -4 >= c1) or (c1 >= 0 and 5 > c1)) and ((c1 > 0 and c1 <= 1) or (c1 > -8 and c1 < -6))").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c2': (1, -7)})
                                       .astype({'c2': dtype('int32')}))
-        res = table_obj.output(["c2"]).filter(
+        res, extra_result = table_obj.output(["c2"]).filter(
             "(-7 < c1 or 9 <= c1) and (c2 = 3)").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c2': (3,)})
                                       .astype({'c2': dtype('int32')}))
-        res = table_obj.output(["c2"]).filter(
+        res, extra_result = table_obj.output(["c2"]).filter(
             "(-8 < c1 and c2 <= -7) or (c1 >= 1 and 2 > c2)").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c2': (1, -7)})
                                       .astype({'c2': dtype('int32')}))
-        res = table_obj.output(["c2"]).filter(
+        res, extra_result = table_obj.output(["c2"]).filter(
             "((c2 >= -8 and -4 >= c1) or (c1 >= 0 and 5 > c2)) and ((c2 > 0 and c1 <= 1) or (c1 > -8 and c2 < -6))").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c2': (1, -7)})
                                       .astype({'c2': dtype('int32')}))
-        res = table_obj.output(["c2"]).filter(
+        res, extra_result = table_obj.output(["c2"]).filter(
             "(not(c2 < -8 or -4 < c1) or not(c1 < 0 or 5 <= c2)) and not((c2 <= 0 or c1 > 1) and (c1 <= -8 or c2 >= -6))").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c2': (1, -7)})
                                       .astype({'c2': dtype('int32')}))
-        res = table_obj.output(["*"]).filter("c1 in (1, 2, 3)").to_df()
+        res, extra_result = table_obj.output(["*"]).filter("c1 in (1, 2, 3)").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (1, 2, 3), 'c2': (1, 2, 3)})
                                       .astype({'c1': dtype('int32'), 'c2': dtype('int32')}))
-        res = table_obj.output(["*"]).filter("c1 in (1, 2, 3) and c2 in (1, 2, 3)").to_df()
+        res, extra_result = table_obj.output(["*"]).filter("c1 in (1, 2, 3) and c2 in (1, 2, 3)").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (1, 2, 3), 'c2': (1, 2, 3)})
                                       .astype({'c1': dtype('int32'), 'c2': dtype('int32')}))
-        res = table_obj.output(["*"]).filter("c1 not in (1, 2, 3)").to_df()
+        res, extra_result = table_obj.output(["*"]).filter("c1 not in (1, 2, 3)").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (-3, -2, -1, 0, -8, -7, -6, 7, 8, 9),
                                                          'c2': (-3, -2, -1, 0, -8, -7, -6, 7, 8, 9)})
                                       .astype({'c1': dtype('int32'), 'c2': dtype('int32')}))
-        res = table_obj.output(["*"]).filter("(c2 + 1) in (8, 9, 10)").to_df()
+        res, extra_result = table_obj.output(["*"]).filter("(c2 + 1) in (8, 9, 10)").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (7, 8, 9), 'c2': (7, 8, 9)})
                                       .astype({'c1': dtype('int32'), 'c2': dtype('int32')}))
@@ -214,7 +217,7 @@ def test_select(self, suffix):
         #                                                  'c2': (-3, -2, -1)})
         #                               .astype({'c1': dtype('int32'), 'c2': dtype('int32')}))
-        res = db_obj.drop_table("test_select"+suffix, ConflictType.Error)
+        res = db_obj.drop_table("test_select" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     def test_select_datetime(self, suffix):
@@ -239,14 +242,14 @@ def test_select_datetime(self, suffix):
         """
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_select_datetime"+suffix, ConflictType.Ignore)
+        db_obj.drop_table("test_select_datetime" + suffix, ConflictType.Ignore)
         table_obj = db_obj.create_table(
-            "test_select_datetime"+suffix, {
+            "test_select_datetime" + suffix, {
                 "c1": {"type": "date"},
                 "c2": {"type": "time"},
-                "c3" : {"type": "datetime"},
-                "c4" : {"type" : "timestamp"}},
-                ConflictType.Error)
+                "c3": {"type": "datetime"},
+                "c4": {"type": "timestamp"}},
+            ConflictType.Error)
         assert table_obj is not None
@@ -260,7 +263,7 @@ def test_select_datetime(self, suffix):
         dt_list.append("2024-09-23 20:45:11")
         ts_list.append("2024-09-23 20:45:11")
         res = table_obj.insert(
-            {"c1" : d_list[0], "c2" : t_list[0], "c3" : dt_list[0], "c4" : ts_list[0]}
+            {"c1": d_list[0], "c2": t_list[0], "c3": dt_list[0], "c4": ts_list[0]}
         )
         assert res.error_code == ErrorCode.OK
@@ -269,7 +272,7 @@ def test_select_datetime(self, suffix):
         dt_list.append("2022-05-26 21:44:33")
         ts_list.append("2022-05-26 21:44:33")
         res = table_obj.insert(
-            {"c1" : d_list[1], "c2" : t_list[1], "c3" : dt_list[1], "c4" : ts_list[1]}
+            {"c1": d_list[1], "c2": t_list[1], "c3": dt_list[1], "c4": ts_list[1]}
         )
         assert res.error_code == ErrorCode.OK
@@ -278,27 +281,31 @@ def test_select_datetime(self, suffix):
         dt_list.append("2021-03-04 20:58:59")
         ts_list.append("2021-03-04 20:58:59")
         res = table_obj.insert(
-            {"c1" : d_list[2], "c2" : t_list[2], "c3" : dt_list[2], "c4" : ts_list[2]}
+            {"c1": d_list[2], "c2": t_list[2], "c3": dt_list[2], "c4": ts_list[2]}
         )
         assert res.error_code == ErrorCode.OK
-
-        res = table_obj.output(["*"]).to_pl()
-        for i in range(3) :
-            assert res.item(i, 0) == d_list[i] and res.item(i, 1) == t_list[i] and res.item(i, 2) == dt_list[i] and res.item(i, 3) == ts_list[i]
-        res = table_obj.output(["c1", "c2"]).filter("c1='2024-09-23'").to_pl()
+        res, extra_result = table_obj.output(["*"]).to_pl()
+        for i in range(3):
+            assert res.item(i, 0) == d_list[i] and res.item(i, 1) == t_list[i] and res.item(i, 2) == dt_list[
+                i] and res.item(i, 3) == ts_list[i]
+
+        res, extra_result = table_obj.output(["c1", "c2"]).filter("c1='2024-09-23'").to_pl()
         assert res.item(0, 0) == d_list[0] and res.item(0, 1) == t_list[0]
-        res = table_obj.output(["*"]).filter("c2='21:44:33'").to_pl()
-        assert res.item(0, 0) == d_list[1] and res.item(0, 1) == t_list[1] and res.item(0, 2) == dt_list[1] and res.item(0, 3) == ts_list[1]
+        res, extra_result = table_obj.output(["*"]).filter("c2='21:44:33'").to_pl()
+        assert res.item(0, 0) == d_list[1] and res.item(0, 1) == t_list[1] and res.item(0, 2) == dt_list[
+            1] and res.item(0, 3) == ts_list[1]
-        res = table_obj.output(["*"]).filter("c3='2021-03-04 20:58:59'").to_pl()
-        assert res.item(0, 0) == d_list[2] and res.item(0, 1) == t_list[2] and res.item(0, 2) == dt_list[2] and res.item(0, 3) == ts_list[2]
+        res, extra_result = table_obj.output(["*"]).filter("c3='2021-03-04 20:58:59'").to_pl()
+        assert res.item(0, 0) == d_list[2] and res.item(0, 1) == t_list[2] and res.item(0, 2) == dt_list[
+            2] and res.item(0, 3) == ts_list[2]
-        res = table_obj.output(["*"]).filter("c4='2021-03-04 20:58:59'").to_pl()
-        assert res.item(0, 0) == d_list[2] and res.item(0, 1) == t_list[2] and res.item(0, 2) == dt_list[2] and res.item(0, 3) == ts_list[2]
+        res, extra_result = table_obj.output(["*"]).filter("c4='2021-03-04 20:58:59'").to_pl()
+        assert res.item(0, 0) == d_list[2] and res.item(0, 1) == t_list[2] and res.item(0, 2) == dt_list[
+            2] and res.item(0, 3) == ts_list[2]
-        res = db_obj.drop_table("test_select_datetime"+suffix, ConflictType.Ignore)
+        res = db_obj.drop_table("test_select_datetime" + suffix, ConflictType.Ignore)
         assert res.error_code == ErrorCode.OK
     def test_select_aggregate(self, suffix):
@@ -340,9 +347,9 @@ def test_select_aggregate(self, suffix):
         db_obj = self.infinity_obj.get_database("default_db")
         # infinity
-        db_obj.drop_table("test_select_aggregate"+suffix, ConflictType.Ignore)
+        db_obj.drop_table("test_select_aggregate" + suffix, ConflictType.Ignore)
         table_obj = db_obj.create_table(
-            "test_select_aggregate"+suffix, {
+            "test_select_aggregate" + suffix, {
                 "c1": {"type": "int", "constraints": ["primary key", "not null"]},
                 "c2": {"type": "float", "constraints": ["not null"]}}, ConflictType.Error)
@@ -359,20 +366,20 @@ def test_select_aggregate(self, suffix):
              {"c1": 90, "c2": -19}])
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["count(*)"]).to_pl()
+        res, extra_result = table_obj.output(["count(*)"]).to_pl()
         assert res.height == 1 and res.width == 1 and res.item(0, 0) == 13
-        res = table_obj.output(["max(c1)"]).to_pl()
+        res, extra_result = table_obj.output(["max(c1)"]).to_pl()
         assert res.height == 1 and res.width == 1 and res.item(0, 0) == 90
-        res = table_obj.output(["min(c2)"]).to_pl()
+        res, extra_result = table_obj.output(["min(c2)"]).to_pl()
         assert res.height == 1 and res.width == 1 and res.item(0, 0) == -19
-        res = table_obj.output(["min(c1) + max(c2)"]).to_pl()
+        res, extra_result = table_obj.output(["min(c1) + max(c2)"]).to_pl()
         print(res)
-        res = table_obj.output(["sum(c1)"]).to_pl()
+        res, extra_result = table_obj.output(["sum(c1)"]).to_pl()
         print(res)
-        res = table_obj.output(["avg(c2)"]).to_pl()
+        res, extra_result = table_obj.output(["avg(c2)"]).to_pl()
         print(res)
-        res = db_obj.drop_table("test_select_aggregate"+suffix, ConflictType.Error)
+        res = db_obj.drop_table("test_select_aggregate" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     def test_select_varchar(self, suffix):
@@ -425,11 +432,11 @@ def test_select_varchar(self, suffix):
         """
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_select_varchar"+suffix, ConflictType.Ignore)
-        db_obj.create_table("test_select_varchar"+suffix,
+        db_obj.drop_table("test_select_varchar" + suffix, ConflictType.Ignore)
+        db_obj.create_table("test_select_varchar" + suffix,
                             {"c1": {"type": "varchar", "constraints": ["primary key", "not null"]},
                              "c2": {"type": "varchar", "constraints": ["not null"]}}, ConflictType.Error)
-        table_obj = db_obj.get_table("test_select_varchar"+suffix)
+        table_obj = db_obj.get_table("test_select_varchar" + suffix)
         table_obj.insert(
             [{"c1": 'a', "c2": 'a'}, {"c1": 'b', "c2": 'b'}, {"c1": 'c', "c2": 'c'}, {"c1": 'd', "c2": 'd'},
              {"c1": 'e', "c2": 'e'}, {"c1": 'f', "c2": 'f'}, {
@@ -437,38 +444,38 @@ def test_select_varchar(self, suffix):
              {"c1": 'i', "c2": 'i'}, {"c1": 'j', "c2": 'j'}, {
                  "c1": 'k', "c2": 'k'}, {"c1": 'l', "c2": 'l'}, {"c1": 'm', "c2": 'm'}])
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'),
                                                          'c2': ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm')})
                                       .astype({'c1': dtype('O'), 'c2': dtype('O')}))
-        res = table_obj.output(
+        res, extra_result = table_obj.output(
            ["c1", "c2"]).filter("c1 = 'a'").to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('a',), 'c2': ('a',)}).astype(
             {'c1': dtype('O'), 'c2': dtype('O')}))
         # TODO NotImplement Error: Not implement: varchar > varchar
-        # res = table_obj.output(["c1"]).filter("c1 > 'a' and c2 < 'c'").to_df()
+        # res, extra_result = table_obj.output(["c1"]).filter("c1 > 'a' and c2 < 'c'").to_df()
         # pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('b',)}).astype({'c1': dtype('O')}))
-        res = db_obj.drop_table("test_select_varchar"+suffix)
+        res = db_obj.drop_table("test_select_varchar" + suffix)
         assert res.error_code == ErrorCode.OK
     def test_select_big(self, suffix):
         db_obj = self.infinity_obj.get_database("default_db")
-        res = db_obj.drop_table("test_select_big"+suffix, ConflictType.Ignore)
-        db_obj.create_table("test_select_big"+suffix, {
+        res = db_obj.drop_table("test_select_big" + suffix, ConflictType.Ignore)
+        db_obj.create_table("test_select_big" + suffix, {
             "c1": {"type": "varchar", "constraints": ["primary key", "not null"]},
             "c2": {"type": "varchar", "constraints": ["not null"]}}, ConflictType.Error)
-        table_obj = db_obj.get_table("test_select_big"+suffix)
+        table_obj = db_obj.get_table("test_select_big" + suffix)
         for i in range(1000):
             table_obj.insert(
                 [{"c1": 'a', "c2": 'a'}, {"c1": 'b', "c2": 'b'}, {"c1": 'c', "c2": 'c'}, {"c1": 'd', "c2": 'd'}])
-        res = db_obj.drop_table("test_select_big"+suffix, ConflictType.Error)
+        res = db_obj.drop_table("test_select_big" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     @pytest.mark.parametrize("check_data", [{"file_name": "embedding_int_dim3.csv",
@@ -491,12 +498,12 @@ def test_select_embedding_int32(self, check_data, suffix):
         """
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_select_embedding"+suffix, ConflictType.Ignore)
+        db_obj.drop_table("test_select_embedding" + suffix, ConflictType.Ignore)
-        res = db_obj.create_table("test_select_embedding"+suffix, {
+        res = db_obj.create_table("test_select_embedding" + suffix, {
             "c1": {"type": "int"}, "c2": {"type": "vector,3,int"}}, ConflictType.Error)
-        table_obj = db_obj.get_table("test_select_embedding"+suffix)
+        table_obj = db_obj.get_table("test_select_embedding" + suffix)
         if not check_data:
             copy_data("embedding_int_dim3.csv")
@@ -507,17 +514,17 @@ def test_select_embedding_int32(self, check_data, suffix):
         res = table_obj.import_data(test_csv_dir, None)
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["c2"]).to_df()
+        res, extra_result = table_obj.output(["c2"]).to_df()
         print(res)
         pd.testing.assert_frame_equal(res, pd.DataFrame(
             {'c2': ([2, 3, 4], [6, 7, 8], [10, 11, 12])}))
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (1, 5, 9),
                                                          'c2': ([2, 3, 4], [6, 7, 8], [10, 11, 12])})
                                       .astype({'c1': dtype('int32'), 'c2': dtype('O')}))
-        res = db_obj.drop_table("test_select_embedding"+suffix, ConflictType.Error)
+        res = db_obj.drop_table("test_select_embedding" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     @pytest.mark.parametrize("check_data", [{"file_name": "embedding_float_dim4.csv",
@@ -539,12 +546,12 @@ def test_select_embedding_float(self, check_data, suffix):
         """
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_select_embedding_float"+suffix, ConflictType.Ignore)
+        db_obj.drop_table("test_select_embedding_float" + suffix, ConflictType.Ignore)
-        res = db_obj.create_table("test_select_embedding_float"+suffix, {
+        res = db_obj.create_table("test_select_embedding_float" + suffix, {
            "c1": {"type": "float"}, "c2": {"type": "vector,4,float"}}, ConflictType.Error)
-        table_obj = db_obj.get_table("test_select_embedding_float"+suffix)
+        table_obj = db_obj.get_table("test_select_embedding_float" + suffix)
         test_dir = "/var/infinity/test_data/"
         test_csv_dir = test_dir + "embedding_float_dim4.csv"
@@ -555,13 +562,13 @@ def test_select_embedding_float(self, check_data, suffix):
         res = table_obj.import_data(test_csv_dir, None)
         assert res.error_code == ErrorCode.OK
-        res = table_obj.output(["c2"]).to_df()
+        res, extra_result = table_obj.output(["c2"]).to_df()
         print(res)
         pd.testing.assert_frame_equal(res, pd.DataFrame(
             {'c2': ([0.1, 0.2, 0.3, -0.2], [0.2, 0.1, 0.3, 0.4], [0.3, 0.2, 0.1, 0.4], [0.4, 0.3, 0.2, 0.1])}))
-        res = table_obj.output(["*"]).to_df()
+        res, extra_result = table_obj.output(["*"]).to_df()
         print(res)
         pd.testing.assert_frame_equal(res,
@@ -572,7 +579,7 @@ def test_select_embedding_float(self, check_data, suffix):
                                       .astype({'c1': dtype('float32'), 'c2': dtype('O')}))
         res = db_obj.drop_table(
-            "test_select_embedding_float"+suffix, ConflictType.Error)
+            "test_select_embedding_float" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     @pytest.mark.parametrize("check_data", [{"file_name": "embedding_int_dim3.csv",
@@ -597,12 +604,12 @@ def test_select_big_embedding(self, check_data, suffix):
         """
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_select_big_embedding"+suffix, ConflictType.Ignore)
+        db_obj.drop_table("test_select_big_embedding" + suffix, ConflictType.Ignore)
-        db_obj.create_table("test_select_big_embedding"+suffix, {
+        db_obj.create_table("test_select_big_embedding" + suffix, {
            "c1": {"type": "int"}, "c2": {"type": "vector,3,int"}}, ConflictType.Error)
-        table_obj = db_obj.get_table("test_select_big_embedding"+suffix)
+        table_obj = db_obj.get_table("test_select_big_embedding" + suffix)
         if not check_data:
             copy_data("embedding_int_dim3.csv")
@@ -615,49 +622,49 @@ def test_select_big_embedding(self, check_data, suffix):
             assert res.error_code == ErrorCode.OK
         res = db_obj.drop_table(
-            "test_select_big_embedding"+suffix, ConflictType.Error)
+            "test_select_big_embedding" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     @pytest.mark.usefixtures("skip_if_http")
     def test_select_same_output(self, suffix):
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_select_same_output"+suffix, ConflictType.Ignore)
-        db_obj.create_table("test_select_same_output"+suffix, {
+        db_obj.drop_table("test_select_same_output" + suffix, ConflictType.Ignore)
+        db_obj.create_table("test_select_same_output" + suffix, {
            "c1": {"type": "int"}, "c2": {"type": "int"}}, ConflictType.Error)
-        table_obj = db_obj.get_table("test_select_same_output"+suffix)
+        table_obj = db_obj.get_table("test_select_same_output" + suffix)
         table_obj.insert([{"c1": 1, "c2": 2}])
         print()
-        res = table_obj.output(["c1", "c2"]).to_df()
+        res, extra_result = table_obj.output(["c1", "c2"]).to_df()
         print(res)
-        res = table_obj.output(["c1", "c1"]).to_df()
+        res, extra_result = table_obj.output(["c1", "c1"]).to_df()
         print(res)
-        res = table_obj.output(["c1", "c2", "c1"]).to_df()
+        res, extra_result = table_obj.output(["c1", "c2", "c1"]).to_df()
         print(res)
-        res = db_obj.drop_table("test_select_same_output"+suffix, ConflictType.Error)
+        res = db_obj.drop_table("test_select_same_output" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     @pytest.mark.usefixtures("skip_if_http")
     def test_empty_table(self, suffix):
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_empty_table"+suffix, ConflictType.Ignore)
-        db_obj.create_table("test_empty_table"+suffix, {
+        db_obj.drop_table("test_empty_table" + suffix, ConflictType.Ignore)
+        db_obj.create_table("test_empty_table" + suffix, {
            "c1": {"type": "int"}, "c2": {"type": "int"}}, ConflictType.Error)
-        table_obj = db_obj.get_table("test_empty_table"+suffix)
+        table_obj = db_obj.get_table("test_empty_table" + suffix)
         print()
-        res = table_obj.output(["c1", "c2"]).to_df()
+        res, extra_result = table_obj.output(["c1", "c2"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (), 'c2': ()}).astype(
             {'c1': dtype('int32'), 'c2': dtype('int32')}))
-        res = table_obj.output(["c1", "c1"]).to_df()
+        res, extra_result = table_obj.output(["c1", "c1"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (), 'c1_2': ()}).astype(
             {'c1': dtype('int32'), 'c1_2': dtype('int32')}))
-        res = table_obj.output(["c1", "c2", "c1"]).to_df()
+        res, extra_result = table_obj.output(["c1", "c2", "c1"]).to_df()
         pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (), 'c2': (), 'c1_2': ()}).astype(
             {'c1': dtype('int32'), 'c2': dtype('int32'), 'c1_2': dtype('int32')}))
-        res = db_obj.drop_table("test_empty_table"+suffix, ConflictType.Error)
+        res = db_obj.drop_table("test_empty_table" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     @pytest.mark.parametrize("filter_list", [
@@ -672,8 +679,8 @@ def test_empty_table(self, suffix):
     def test_valid_filter_expression(self, filter_list, suffix):
         # connect
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_valid_filter_expression"+suffix, ConflictType.Ignore)
-        table_obj = db_obj.create_table("test_valid_filter_expression"+suffix, {
+        db_obj.drop_table("test_valid_filter_expression" + suffix, ConflictType.Ignore)
+        table_obj = db_obj.create_table("test_valid_filter_expression" + suffix, {
             "c1": {"type": "int"}, "c2": {"type": "float"}}, ConflictType.Error)
         table_obj.insert([{"c1": 1, "c2": 2.0},
                           {"c1": 10, "c2": 2.0},
@@ -681,11 +688,11 @@ def test_valid_filter_expression(self, filter_list, suffix):
                           {"c1": 1000, "c2": 2.0},
                           {"c1": 10000, "c2": 2.0}])
         # TODO add more filter function
-        select_res_df = table_obj.output(["*"]).filter(filter_list).to_pl()
-        print(str(select_res_df))
+        res, extra_result = table_obj.output(["*"]).filter(filter_list).to_pl()
+        print(str(res))
         res = db_obj.drop_table(
-            "test_valid_filter_expression"+suffix, ConflictType.Error)
+            "test_valid_filter_expression" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     @pytest.mark.parametrize("filter_list", [
@@ -700,9 +707,9 @@ def test_valid_filter_expression(self, filter_list, suffix):
     def test_invalid_filter_expression(self, filter_list, suffix):
         # connect
         db_obj = self.infinity_obj.get_database("default_db")
-        db_obj.drop_table("test_invalid_filter_expression"+suffix,
+        db_obj.drop_table("test_invalid_filter_expression" + suffix,
                           ConflictType.Ignore)
-        table_obj = db_obj.create_table("test_invalid_filter_expression"+suffix, {
+        table_obj = db_obj.create_table("test_invalid_filter_expression" + suffix, {
            "c1": {"type": "int"}, "c2": {"type": "float"}}, ConflictType.Error)
         table_obj.insert([{"c1": 1, "c2": 2.0},
                           {"c1": 10, "c2": 2.0},
@@ -711,11 +718,11 @@ def test_invalid_filter_expression(self, filter_list, suffix):
                           {"c1": 10000, "c2": 2.0}])
         # TODO add more filter function
         with pytest.raises(Exception):
-            select_res_df = table_obj.output(["*"]).filter(filter_list).to_pl()
-            print(str(select_res_df))
+            res, extra_result = table_obj.output(["*"]).filter(filter_list).to_pl()
+            print(str(res))
         res = db_obj.drop_table(
-            "test_invalid_filter_expression"+suffix, ConflictType.Error)
+            "test_invalid_filter_expression" + suffix, ConflictType.Error)
         assert res.error_code == ErrorCode.OK
     def test_filter_fulltext(self, suffix):
@@ -729,25 +736,39 @@ def test_filter_fulltext(self, suffix):
         def test_func():
             expect_result = pd.DataFrame({'num': (1,), "doc": "first text"}).astype({'num': dtype('int32')})
-            pd.testing.assert_frame_equal(expect_result, table_obj.output(["*"]).filter(
-                "filter_text('doc', 'first text', 'minimum_should_match=100%')").to_df())
-            pd.testing.assert_frame_equal(expect_result, table_obj.output(["*"]).filter(
-                "filter_text('', 'first second', 'default_field=doc;minimum_should_match=99%') and not num = 2").to_df())
-            pd.testing.assert_frame_equal(expect_result, table_obj.output(["*"]).filter(
-                "filter_text('doc', 'first OR second') and (num < 2 or num > 2)").to_df())
-            pd.testing.assert_frame_equal(expect_result, table_obj.output(["*"]).filter(
-                "(filter_text('doc', 'first') or filter_fulltext('doc', 'second')) and (num < 2 or num > 2)").to_df())
+            res, extra_result = table_obj.output(["*"]).filter(
+                "filter_text('doc', 'first text', 'minimum_should_match=100%')").to_df()
+            pd.testing.assert_frame_equal(expect_result, res)
+
+            res, extra_result = table_obj.output(["*"]).filter(
+                "filter_text('', 'first second', 'default_field=doc;minimum_should_match=99%') and not num = 2").to_df()
+            pd.testing.assert_frame_equal(expect_result, res)
+
+            res, extra_result = table_obj.output(["*"]).filter(
+                "filter_text('doc', 'first OR second') and (num < 2 or num > 2)").to_df()
+            pd.testing.assert_frame_equal(expect_result, res)
+
+            res, extra_result = table_obj.output(["*"]).filter(
+                "(filter_text('doc', 'first') or filter_fulltext('doc', 'second')) and (num < 2 or num > 2)").to_df()
+            pd.testing.assert_frame_equal(expect_result, res)
             expect_result = pd.DataFrame(
                 {'num': (1, 2, 3), "doc": ("first text", "second text multiple", "third text many words")}).astype(
                 {'num': dtype('int32')})
-            pd.testing.assert_frame_equal(expect_result, table_obj.output(["*"]).filter(
-                "filter_text('doc', 'first') or num >= 2").to_df())
-            pd.testing.assert_frame_equal(expect_result, table_obj.output(["*"]).filter(
-                "filter_fulltext('doc', 'second') or (num < 2 or num > 2)").to_df())
+
+            res, extra_result = table_obj.output(["*"]).filter(
+                "filter_text('doc', 'first') or num >= 2").to_df()
+            pd.testing.assert_frame_equal(expect_result, res)
+
+            res, extra_result = table_obj.output(["*"]).filter(
+                "filter_fulltext('doc', 'second') or (num < 2 or num > 2)").to_df()
+            pd.testing.assert_frame_equal(expect_result, res)
+
+            res, extra_result = table_obj.output(
+                ["filter_text('doc', 'second') or num > 2", "filter_text('doc', 'second')"]).to_df()
             pd.testing.assert_frame_equal(pd.DataFrame({
                 "(FILTER_FULLTEXT('doc', 'second') OR (num > 2))": (False, True, True),
                 "FILTER_FULLTEXT('doc', 'second')": (False, True, False)}),
-                table_obj.output(["filter_text('doc', 'second') or num > 2", "filter_text('doc', 'second')"]).to_df())
+                res)
         test_func()
         table_obj.create_index("my_sc_index", index.IndexInfo("num", index.IndexType.Secondary), ConflictType.Error)
"FILTER_FULLTEXT('doc', 'second')": (False, True, False)}), - table_obj.output(["filter_text('doc', 'second') or num > 2", "filter_text('doc', 'second')"]).to_df()) + res) test_func() table_obj.create_index("my_sc_index", index.IndexInfo("num", index.IndexType.Secondary), ConflictType.Error) @@ -760,7 +781,9 @@ def test_neg_func(self, suffix): db_obj.drop_table("test_neg_func" + suffix, ConflictType.Ignore) table_obj = db_obj.create_table("test_neg_func" + suffix, {"num": {"type": "float64"}}, ConflictType.Error) table_obj.insert([{"num": 1.0}, {"num": 2.0}, {"num": 3.0}]) - pd.testing.assert_frame_equal(table_obj.output(["-abs(num) - 1"]).filter("-abs(num) >= -2").to_df(), + + res, extra_result = table_obj.output(["-abs(num) - 1"]).filter("-abs(num) >= -2").to_df() + pd.testing.assert_frame_equal(res, pd.DataFrame({"(-(ABS(num)) - 1)": (-2.0, -3.0)})) res = db_obj.drop_table("test_neg_func" + suffix, ConflictType.Error) assert res.error_code == ErrorCode.OK @@ -769,9 +792,9 @@ def test_sort(self, suffix): db_obj = self.infinity_obj.get_database("default_db") # infinity - db_obj.drop_table("test_sort"+suffix, ConflictType.Ignore) + db_obj.drop_table("test_sort" + suffix, ConflictType.Ignore) table_obj = db_obj.create_table( - "test_sort"+suffix, { + "test_sort" + suffix, { "c1": {"type": "int", "constraints": ["primary key", "not null"]}, "c2": {"type": "int", "constraints": ["not null"]}}, ConflictType.Error) @@ -787,212 +810,216 @@ def test_sort(self, suffix): {"c1": 9, "c2": 9}]) assert res.error_code == ErrorCode.OK - res = table_obj.output(["c1", "c2"]).sort([["c2", SortType.Asc], ["c1", SortType.Desc]]).to_df() + res, extra_res = table_obj.output(["c1", "c2"]).sort([["c2", SortType.Asc], ["c1", SortType.Desc]]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (0, 1, -1, 2, -2, 3, -3, -6, 7, -7, 8, -8, 9), 'c2': (0, 1, 1, 2, 2, 3, 3, 6, 7, 7, 8, 8, 9)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) - res = table_obj.output(["c1", "c2"]).sort([["c2", SortType.Asc], ["c1", SortType.Asc]]).to_df() + res, extra_res = table_obj.output(["c1", "c2"]).sort([["c2", SortType.Asc], ["c1", SortType.Asc]]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (0, -1, 1, -2, 2, -3, 3, -6, -7, 7, -8, 8, 9), 'c2': (0, 1, 1, 2, 2, 3, 3, 6, 7, 7, 8, 8, 9)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32')})) - res = table_obj.output(["_row_id"]).sort([["_row_id", SortType.Desc]]).to_df() - #pd.testing.assert_frame_equal(res, pd.DataFrame({'ROW_ID': (12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)}) + res, extra_res = table_obj.output(["_row_id"]).sort([["_row_id", SortType.Desc]]).to_df() + # pd.testing.assert_frame_equal(res, pd.DataFrame({'ROW_ID': (12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)}) # .astype({'ROW_ID': dtype('int64')})) print(res) - res = db_obj.drop_table("test_sort"+suffix, ConflictType.Error) + res = db_obj.drop_table("test_sort" + suffix, ConflictType.Error) assert res.error_code == ErrorCode.OK def test_select_varchar_length(self, suffix): db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table("test_select_varchar_length"+suffix, ConflictType.Ignore) - db_obj.create_table("test_select_varchar_length"+suffix, + db_obj.drop_table("test_select_varchar_length" + suffix, ConflictType.Ignore) + db_obj.create_table("test_select_varchar_length" + suffix, {"c1": {"type": "varchar", "constraints": ["primary key", "not null"]}, "c2": {"type": "varchar", "constraints": ["not null"]}}, ConflictType.Error) - table_obj = 
db_obj.get_table("test_select_varchar_length"+suffix) + table_obj = db_obj.get_table("test_select_varchar_length" + suffix) table_obj.insert( [{"c1": 'a', "c2": 'a'}, {"c1": 'b', "c2": 'b'}, {"c1": 'c', "c2": 'c'}, {"c1": 'd', "c2": 'd'}, - {"c1": 'abc', "c2": 'abc'}, {"c1": 'bbc', "c2": 'bbc'}, {"c1": 'cbc', "c2": 'cbc'}, {"c1": 'dbc', "c2": 'dbc'}]) + {"c1": 'abc', "c2": 'abc'}, {"c1": 'bbc', "c2": 'bbc'}, {"c1": 'cbc', "c2": 'cbc'}, + {"c1": 'dbc', "c2": 'dbc'}]) - res = table_obj.output(["*"]).filter("char_length(c1) = 1").to_df() + res, extra_res = table_obj.output(["*"]).filter("char_length(c1) = 1").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('a', 'b', 'c', 'd'), 'c2': ('a', 'b', 'c', 'd')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - res = table_obj.output(["*"]).filter("char_length(c1) = 3").to_df() + res, extra_res = table_obj.output(["*"]).filter("char_length(c1) = 3").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('abc', 'bbc', 'cbc', 'dbc'), 'c2': ('abc', 'bbc', 'cbc', 'dbc')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - res = db_obj.drop_table("test_select_varchar_length"+suffix) + res = db_obj.drop_table("test_select_varchar_length" + suffix) assert res.error_code == ErrorCode.OK def test_select_regex(self, suffix): db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table("test_select_regex"+suffix, ConflictType.Ignore) - db_obj.create_table("test_select_regex"+suffix, + db_obj.drop_table("test_select_regex" + suffix, ConflictType.Ignore) + db_obj.create_table("test_select_regex" + suffix, {"c1": {"type": "varchar", "constraints": ["primary key", "not null"]}, "c2": {"type": "varchar", "constraints": ["not null"]}}, ConflictType.Error) - table_obj = db_obj.get_table("test_select_regex"+suffix) + table_obj = db_obj.get_table("test_select_regex" + suffix) table_obj.insert( [{"c1": 'a', "c2": 'a'}, {"c1": 'b', "c2": 'b'}, {"c1": 'c', "c2": 'c'}, {"c1": 'd', "c2": 'd'}, - {"c1": 'abc', "c2": 'abc'}, {"c1": 'bbc', "c2": 'bbc'}, {"c1": 'cbc', "c2": 'cbc'}, {"c1": 'dbc', "c2": 'dbc'},]) + {"c1": 'abc', "c2": 'abc'}, {"c1": 'bbc', "c2": 'bbc'}, {"c1": 'cbc', "c2": 'cbc'}, + {"c1": 'dbc', "c2": 'dbc'}, ]) - res = table_obj.output(["*"]).filter("regex(c1, 'bc')").to_df() + res, extra_res = table_obj.output(["*"]).filter("regex(c1, 'bc')").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('abc', 'bbc', 'cbc', 'dbc'), 'c2': ('abc', 'bbc', 'cbc', 'dbc')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - - res = db_obj.drop_table("test_select_regex"+suffix) + res = db_obj.drop_table("test_select_regex" + suffix) assert res.error_code == ErrorCode.OK def test_select_upper_lower(self, suffix): db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table("test_select_upper_lower"+suffix, ConflictType.Ignore) - db_obj.create_table("test_select_upper_lower"+suffix, + db_obj.drop_table("test_select_upper_lower" + suffix, ConflictType.Ignore) + db_obj.create_table("test_select_upper_lower" + suffix, {"c1": {"type": "varchar", "constraints": ["primary key", "not null"]}, "c2": {"type": "varchar", "constraints": ["not null"]}}, ConflictType.Error) - table_obj = db_obj.get_table("test_select_upper_lower"+suffix) + table_obj = db_obj.get_table("test_select_upper_lower" + suffix) table_obj.insert( [{"c1": 'a', "c2": 'A'}, {"c1": 'b', "c2": 'B'}, {"c1": 'c', "c2": 'C'}, {"c1": 'd', "c2": 'D'}, - {"c1": 'abc', "c2": 'ABC'}, {"c1": 'bbc', "c2": 'bbc'}, {"c1": 'cbc', "c2": 'cbc'}, 
{"c1": 'dbc', "c2": 'dbc'},]) + {"c1": 'abc', "c2": 'ABC'}, {"c1": 'bbc', "c2": 'bbc'}, {"c1": 'cbc', "c2": 'cbc'}, + {"c1": 'dbc', "c2": 'dbc'}, ]) - res = table_obj.output(["*"]).filter("upper(c1) = c2").to_df() + res, extra_res = table_obj.output(["*"]).filter("upper(c1) = c2").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('a', 'b', 'c', 'd', 'abc'), 'c2': ('A', 'B', 'C', 'D', 'ABC')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - - res = db_obj.drop_table("test_select_upper_lower"+suffix) + res = db_obj.drop_table("test_select_upper_lower" + suffix) assert res.error_code == ErrorCode.OK def test_select_substring(self, suffix): db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table("test_select_substring"+suffix, ConflictType.Ignore) - db_obj.create_table("test_select_substring"+suffix, + db_obj.drop_table("test_select_substring" + suffix, ConflictType.Ignore) + db_obj.create_table("test_select_substring" + suffix, {"c1": {"type": "varchar", "constraints": ["primary key", "not null"]}, "c2": {"type": "varchar", "constraints": ["not null"]}}, ConflictType.Error) - table_obj = db_obj.get_table("test_select_substring"+suffix) + table_obj = db_obj.get_table("test_select_substring" + suffix) table_obj.insert( [{"c1": 'a', "c2": 'A'}, {"c1": 'b', "c2": 'B'}, {"c1": 'c', "c2": 'C'}, {"c1": 'd', "c2": 'D'}, - {"c1": 'abc', "c2": 'ABC'}, {"c1": 'bbcc', "c2": 'bbc'}, {"c1": 'cbcc', "c2": 'cbc'}, {"c1": 'dbcc', "c2": 'dbc'},]) + {"c1": 'abc', "c2": 'ABC'}, {"c1": 'bbcc', "c2": 'bbc'}, {"c1": 'cbcc', "c2": 'cbc'}, + {"c1": 'dbcc', "c2": 'dbc'}, ]) - res = table_obj.output(["*"]).filter("substring(c1, 0, 3) = c2").to_df() + res, extra_res = table_obj.output(["*"]).filter("substring(c1, 0, 3) = c2").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('bbcc', 'cbcc', 'dbcc'), 'c2': ('bbc', 'cbc', 'dbc')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - res = db_obj.drop_table("test_select_substring"+suffix) + res = db_obj.drop_table("test_select_substring" + suffix) assert res.error_code == ErrorCode.OK def test_select_trim(self, suffix): db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table("test_select_trim"+suffix, ConflictType.Ignore) - db_obj.create_table("test_select_trim"+suffix, + db_obj.drop_table("test_select_trim" + suffix, ConflictType.Ignore) + db_obj.create_table("test_select_trim" + suffix, {"c1": {"type": "varchar", "constraints": ["primary key", "not null"]}, "c2": {"type": "varchar", "constraints": ["not null"]}}, ConflictType.Error) - table_obj = db_obj.get_table("test_select_trim"+suffix) + table_obj = db_obj.get_table("test_select_trim" + suffix) table_obj.insert( - [{"c1": ' a', "c2": 'a'}, {"c1": ' b', "c2": 'b'}, {"c1": ' c', "c2": 'c'}, - {"c1": 'ab ', "c2": 'ab'}, {"c1": 'bcc ', "c2": 'bcc'}, {"c1": 'cbc ', "c2": 'cbc'}, {"c1": ' dbc ', "c2": 'dbc'},]) + [{"c1": ' a', "c2": 'a'}, {"c1": ' b', "c2": 'b'}, {"c1": ' c', "c2": 'c'}, + {"c1": 'ab ', "c2": 'ab'}, {"c1": 'bcc ', "c2": 'bcc'}, {"c1": 'cbc ', "c2": 'cbc'}, + {"c1": ' dbc ', "c2": 'dbc'}, ]) - res = table_obj.output(["*"]).filter("ltrim(c1) = c2").to_df() + res, extra_res = table_obj.output(["*"]).filter("ltrim(c1) = c2").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (' a', ' b', ' c'), 'c2': ('a', 'b', 'c')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - - res = table_obj.output(["*"]).filter("rtrim(c1) = c2").to_df() + + res, extra_res = table_obj.output(["*"]).filter("rtrim(c1) = c2").to_df() 
print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('ab ', 'bcc ', 'cbc '), 'c2': ('ab', 'bcc', 'cbc')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - res = table_obj.output(["*"]).filter("trim(c1) = c2").to_df() + res, extra_res = table_obj.output(["*"]).filter("trim(c1) = c2").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (' a', ' b', ' c', 'ab ', 'bcc ', 'cbc ', ' dbc '), 'c2': ('a', 'b', 'c', 'ab', 'bcc', 'cbc', 'dbc')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - res = db_obj.drop_table("test_select_trim"+suffix) + res = db_obj.drop_table("test_select_trim" + suffix) assert res.error_code == ErrorCode.OK def test_select_position(self, suffix): db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table("test_select_position"+suffix, ConflictType.Ignore) - db_obj.create_table("test_select_position"+suffix, + db_obj.drop_table("test_select_position" + suffix, ConflictType.Ignore) + db_obj.create_table("test_select_position" + suffix, {"c1": {"type": "varchar", "constraints": ["primary key", "not null"]}, "c2": {"type": "varchar", "constraints": ["not null"]}}, ConflictType.Error) - table_obj = db_obj.get_table("test_select_position"+suffix) + table_obj = db_obj.get_table("test_select_position" + suffix) table_obj.insert( [{"c1": 'a', "c2": 'A'}, {"c1": 'b', "c2": 'B'}, {"c1": 'c', "c2": 'C'}, {"c1": 'd', "c2": 'D'}, - {"c1": 'abc', "c2": 'ABC'}, {"c1": 'bbcc', "c2": 'bbc'}, {"c1": 'cbcc', "c2": 'cbc'}, {"c1": 'dbcc', "c2": 'dbc'},]) + {"c1": 'abc', "c2": 'ABC'}, {"c1": 'bbcc', "c2": 'bbc'}, {"c1": 'cbcc', "c2": 'cbc'}, + {"c1": 'dbcc', "c2": 'dbc'}, ]) - res = table_obj.output(["*"]).filter("char_position(c1, c2) <> 0").to_df() + res, extra_res = table_obj.output(["*"]).filter("char_position(c1, c2) <> 0").to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': ('bbcc', 'cbcc', 'dbcc'), 'c2': ('bbc', 'cbc', 'dbc')}) .astype({'c1': dtype('O'), 'c2': dtype('O')})) - res = db_obj.drop_table("test_select_position"+suffix) + res = db_obj.drop_table("test_select_position" + suffix) assert res.error_code == ErrorCode.OK def test_select_sqrt(self, suffix): db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table("test_select_sqrt"+suffix, ConflictType.Ignore) - db_obj.create_table("test_select_sqrt"+suffix, + db_obj.drop_table("test_select_sqrt" + suffix, ConflictType.Ignore) + db_obj.create_table("test_select_sqrt" + suffix, {"c1": {"type": "integer"}, "c2": {"type": "double"}}, ConflictType.Error) - table_obj = db_obj.get_table("test_select_sqrt"+suffix) + table_obj = db_obj.get_table("test_select_sqrt" + suffix) table_obj.insert( [{"c1": '1', "c2": '2'}, {"c1": '4', "c2": '5'}, {"c1": '9', "c2": '10'}, {"c1": '16', "c2": '17'}]) - res = table_obj.output(["*", "sqrt(c1)", "sqrt(c2)"]).to_df() + res, extra_res = table_obj.output(["*", "sqrt(c1)", "sqrt(c2)"]).to_df() print(res) - res = table_obj.output(["*"]).filter("sqrt(c1) = 2").to_df() + res, extra_res = table_obj.output(["*"]).filter("sqrt(c1) = 2").to_df() pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (4,), 'c2': (5,)}) .astype({'c1': dtype('int32'), 'c2': dtype('double')})) - res = db_obj.drop_table("test_select_sqrt"+suffix) + res = db_obj.drop_table("test_select_sqrt" + suffix) assert res.error_code == ErrorCode.OK def test_select_round(self, suffix): db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table("test_select_round"+suffix, ConflictType.Ignore) - db_obj.create_table("test_select_round"+suffix, + 
db_obj.drop_table("test_select_round" + suffix, ConflictType.Ignore) + db_obj.create_table("test_select_round" + suffix, {"c1": {"type": "integer"}, "c2": {"type": "double"}}, ConflictType.Error) - table_obj = db_obj.get_table("test_select_round"+suffix) + table_obj = db_obj.get_table("test_select_round" + suffix) table_obj.insert( [{"c1": '1', "c2": '2.4'}, {"c1": '4', "c2": '-2.4'}, {"c1": '9', "c2": '2.5'}, {"c1": '16', "c2": '-2.5'}]) - res = table_obj.output(["c1", "round(c2)"]).to_df() + res, extra_res = table_obj.output(["c1", "round(c2)"]).to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (1, 4, 9, 16), 'round(c2)': (2, -2, 3, -3)}) .astype({'c1': dtype('int32'), 'round(c2)': dtype('double')})) - res = table_obj.output(["c1", "ceil(c2)"]).to_df() + res, extra_res = table_obj.output(["c1", "ceil(c2)"]).to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (1, 4, 9, 16), 'ceil(c2)': (3, -2, 3, -2)}) .astype({'c1': dtype('int32'), 'ceil(c2)': dtype('double')})) - res = table_obj.output(["c1", "floor(c2)"]).to_df() + res, extra_res = table_obj.output(["c1", "floor(c2)"]).to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame({'c1': (1, 4, 9, 16), 'floor(c2)': (2, -3, 2, -3)}) .astype({'c1': dtype('int32'), 'floor(c2)': dtype('double')})) - res = db_obj.drop_table("test_select_round"+suffix) - assert res.error_code == ErrorCode.OK \ No newline at end of file + res = db_obj.drop_table("test_select_round" + suffix) + assert res.error_code == ErrorCode.OK diff --git a/python/test_pysdk/test_update.py b/python/test_pysdk/test_update.py index c2365275cd..7c79b4cb75 100644 --- a/python/test_pysdk/test_update.py +++ b/python/test_pysdk/test_update.py @@ -103,7 +103,7 @@ def test_update(self, suffix): res = table_obj.update("c1 = 1", {"c2": 90, "c3": 900}) assert res.error_code == ErrorCode.OK - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (2, 3, 4, 1), 'c2': (20, 30, 40, 90), 'c3': (200, 300, 400, 900)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) @@ -111,7 +111,7 @@ def test_update(self, suffix): with pytest.raises(Exception): table_obj.update(None, {"c2": 90, "c3": 900}) - res = table_obj.output(["*"]).to_df() + res, extra_result = table_obj.output(["*"]).to_df() pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (2, 3, 4, 1), 'c2': (20, 30, 40, 90), 'c3': (200, 300, 400, 900)}) .astype({'c1': dtype('int32'), 'c2': dtype('int32'), 'c3': dtype('int32')})) @@ -187,7 +187,7 @@ def test_update_no_row_is_met_the_condition(self, suffix): try: tb_obj.insert([{"c1": common_values.types_example_array[i], "c2": common_values.types_example_array[i]}]) - res = tb_obj.output(["*"]).to_df() + res, extra_result = tb_obj.output(["*"]).to_df() print(res) print("insert c1 = " + str(common_values.types_example_array[i]) + ", c2 = " + str(common_values.types_example_array[i])) @@ -196,7 +196,7 @@ def test_update_no_row_is_met_the_condition(self, suffix): try: tb_obj.update("c1 = 2", {"c2": common_values.types_example_array[i]}) - res = tb_obj.output(["*"]).to_df() + res, extra_result = tb_obj.output(["*"]).to_df() print("update type: {} \n {}".format(common_values.types_array[i], res)) except Exception as e: @@ -224,7 +224,7 @@ def test_update_all_row_is_met_the_condition(self, suffix): try: tb_obj.insert([{"c1": common_values.types_example_array[i], "c2": common_values.types_example_array[i]}]) - res = 
tb_obj.output(["*"]).to_df() + res, extra_result = tb_obj.output(["*"]).to_df() print(res) print("insert c1 = " + str(common_values.types_example_array[i]) + ", c2 = " + str(common_values.types_example_array[i])) @@ -234,7 +234,7 @@ def test_update_all_row_is_met_the_condition(self, suffix): try: tb_obj.update("c1 = " + str(common_values.types_example_array[i]), {"c2": common_values.types_example_array[i]}) - res = tb_obj.output(["*"]).to_df() + res, extra_result = tb_obj.output(["*"]).to_df() print("update type: {} \n {}".format(common_values.types_array[i], res)) except Exception as e: @@ -255,12 +255,12 @@ def test_update_table_with_one_block(self, suffix): values = [{"c1": 1, "c2": 2} for _ in range(8192)] # values = [{"c1": 1, "c2": 2}] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # update table_obj.update("c1 = 1", {"c2": 20}) - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) res = db_obj.drop_table("test_update_table_with_one_block"+suffix, ConflictType.Error) @@ -278,12 +278,12 @@ def test_update_table_with_one_segment(self, suffix): for i in range(1024): values = [{"c1": 1, "c2": 2} for _ in range(8)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # update table_obj.update("c1 = 1", {"c2": 20}) - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) res = db_obj.drop_table("test_update_table_with_one_segment"+suffix, ConflictType.Error) @@ -299,17 +299,17 @@ def test_update_before_delete(self, suffix): # insert values = [{"c1": 1, "c2": 2} for _ in range(8)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # delete table_obj.delete("c1 = 1") - delete_res = table_obj.output(["*"]).to_df() + delete_res, extra_result = table_obj.output(["*"]).to_df() print(delete_res) # update table_obj.update("c1 = 1", {"c2": 20}) - update_res = table_obj.output(["*"]).to_df() + update_res, extra_result = table_obj.output(["*"]).to_df() print(update_res) res = db_obj.drop_table("test_update_before_delete"+suffix, ConflictType.Error) @@ -325,12 +325,12 @@ def test_update_inserted_data(self, suffix): # insert values = [{"c1": 1, "c2": 2} for _ in range(8)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) # update table_obj.update("c1 = 1", {"c2": 21}) - update_res = table_obj.output(["*"]).to_df() + update_res, extra_result = table_obj.output(["*"]).to_df() print(update_res) res = db_obj.drop_table("test_update_inserted_data"+suffix, ConflictType.Error) @@ -349,14 +349,14 @@ def test_update_inserted_long_before(self, suffix): # insert values = [{"c1": 1, "c2": 2} for _ in range(8)] table_obj.insert(values) - insert_res = table_obj.output(["*"]).to_df() + insert_res, extra_result = table_obj.output(["*"]).to_df() print(insert_res) time.sleep(60) # update table_obj.update("c1 = 1", {"c2": 21}) - update_res = table_obj.output(["*"]).to_df() + update_res, extra_result = table_obj.output(["*"]).to_df() print(update_res) res = db_obj.drop_table("test_update_inserted_long_before"+suffix, ConflictType.Error) @@ -374,7 +374,7 @@ def 
@@ -374,7 +374,7 @@ def test_update_dropped_table(self, suffix):
         # update
         with pytest.raises(InfinityException) as e:
             table_obj.update("c1 = 1", {"c2": 21})
-            update_res = table_obj.output(["*"]).to_df()
+            update_res, extra_result = table_obj.output(["*"]).to_df()
             print(update_res)
         assert e.type == InfinityException
@@ -390,7 +390,7 @@ def test_update_invalid_value_1(self, types, types_example, suffix):
                             ConflictType.Error)
         # update
         table_obj.update("c1 = 1", {"c2": types_example})
-        update_res = table_obj.output(["*"]).to_df()
+        update_res, extra_result = table_obj.output(["*"]).to_df()
         print(update_res)
         res = db_obj.drop_table("test_update_invalid_value"+suffix, ConflictType.Error)
@@ -411,7 +411,7 @@ def test_update_new_value(self, types, types_example, suffix):
         # update
         table_obj.update("c1 = 1", {"c2": types_example})
-        update_res = table_obj.output(["*"]).to_df()
+        update_res, extra_result = table_obj.output(["*"]).to_df()
         print(update_res)
         res = db_obj.drop_table("test_update_new_value"+suffix, ConflictType.Error)
@@ -435,7 +435,7 @@ def test_update_invalid_value_2(self, types, types_example, suffix):
         assert e.type == InfinityException
         assert e.value.args[0] == ErrorCode.NOT_SUPPORTED_TYPE_CONVERSION
-        update_res = table_obj.output(["*"]).to_df()
+        update_res, extra_result = table_obj.output(["*"]).to_df()
         print(update_res)
         res = db_obj.drop_table("test_update_invalid_value"+suffix, ConflictType.Error)
@@ -462,12 +462,12 @@ def test_valid_filter_expression(self, filter_list, types_example, suffix):
         for i in range(10):
             values = [{"c1": i, "c2": 3.0} for _ in range(10)]
             table_obj.insert(values)
-        insert_res = table_obj.output(["*"]).to_df()
+        insert_res, extra_result = table_obj.output(["*"]).to_df()
         print(insert_res)
         # delete
         table_obj.update(filter_list, {"c2": types_example})
-        delete_res = table_obj.output(["*"]).to_df()
+        delete_res, extra_result = table_obj.output(["*"]).to_df()
         print(delete_res)
         res = db_obj.drop_table("test_filter_expression"+suffix, ConflictType.Error)
@@ -495,14 +495,14 @@ def test_invalid_filter_expression(self, filter_list, types_example, suffix):
         for i in range(10):
             values = [{"c1": i, "c2": 3.0} for _ in range(10)]
             table_obj.insert(values)
-        insert_res = table_obj.output(["*"]).to_df()
+        insert_res, extra_result = table_obj.output(["*"]).to_df()
         print(insert_res)
         # delete
         with pytest.raises(Exception):
             table_obj.update(filter_list, {"c2": types_example})
-        delete_res = table_obj.output(["*"]).to_df()
+        delete_res, extra_result = table_obj.output(["*"]).to_df()
         print(delete_res)
         res = db_obj.drop_table("test_invalid_filter_expression"+suffix, ConflictType.Error)
@@ -531,12 +531,12 @@ def test_update_sparse_vector(self, suffix):
             }
         ])
-        res = table_instance.output(["*"]).to_pl()
+        res, extra_result = table_instance.output(["*"]).to_pl()
         print(res)
         table_instance.update("id = 1", {"content_demo_sparse":SparseVector([1, 2, 3], [1.1, 1.1, 1.1])})
-        res = table_instance.output(["*"]).to_pl()
+        res, extra_result = table_instance.output(["*"]).to_pl()
         print(res)
         res = db_obj.drop_table("test_update_sparse_vector"+suffix, ConflictType.Error)
diff --git a/src/admin/admin_executor.cpp b/src/admin/admin_executor.cpp
index efb92b1050..d81135ee33 100644
--- a/src/admin/admin_executor.cpp
+++ b/src/admin/admin_executor.cpp
@@ -4111,6 +4111,7 @@ QueryResult AdminExecutor::ShowCurrentNode(QueryContext *query_context, const Ad
             }
         }
     } else {
+        NodeRole server_role = InfinityContext::instance().GetServerRole();
         {
             SizeT column_id = 0;
             {
@@ -4121,7 +4122,7 @@ QueryResult AdminExecutor::ShowCurrentNode(QueryContext *query_context, const Ad
             ++column_id;
             {
-                Value value = Value::MakeVarchar(ToString(InfinityContext::instance().GetServerRole()));
+                Value value = Value::MakeVarchar(ToString(server_role));
                 ValueExpression value_expr(value);
                 value_expr.AppendToChunk(output_block_ptr->column_vectors[column_id]);
             }
@@ -4139,7 +4140,7 @@ QueryResult AdminExecutor::ShowCurrentNode(QueryContext *query_context, const Ad
             {
                 bool infinity_started = InfinityContext::instance().InfinityContextStarted();
                 String infinity_status("started");
-                if (!infinity_started) {
+                if (!infinity_started && server_role != NodeRole::kAdmin) {
                     infinity_status = "starting";
                 }
                 Value value = Value::MakeVarchar(infinity_status);
@@ -4253,10 +4254,12 @@ QueryResult AdminExecutor::SetRole(QueryContext *query_context, const AdminState
             status = InfinityContext::instance().ChangeServerRole(NodeRole::kFollower, false, node_name, leader_ip, leader_port);
             if (!status.ok()) {
-                LOG_INFO("Fail to change to FOLLOWER role");
-                Status restore_status = InfinityContext::instance().ChangeServerRole(NodeRole::kAdmin);
-                if (!restore_status.ok()) {
-                    UnrecoverableError(fmt::format("Fail to change node role to FOLLOWER, then fail to restore to ADMIN."));
+                if (status.code() != ErrorCode::kCantSwitchRole) {
+                    LOG_INFO("Fail to change to FOLLOWER role");
+                    Status restore_status = InfinityContext::instance().ChangeServerRole(NodeRole::kAdmin);
+                    if (!restore_status.ok()) {
+                        UnrecoverableError(fmt::format("Fail to change node role to FOLLOWER, then fail to restore to ADMIN."));
+                    }
                 }
             } else {
                 LOG_INFO("Start in FOLLOWER role");
@@ -4296,10 +4299,12 @@ QueryResult AdminExecutor::SetRole(QueryContext *query_context, const AdminState
             status = InfinityContext::instance().ChangeServerRole(NodeRole::kLearner, false, node_name, leader_ip, leader_port);
             if (!status.ok()) {
-                LOG_INFO("Fail to change to LEARNER role");
-                Status restore_status = InfinityContext::instance().ChangeServerRole(NodeRole::kAdmin);
-                if (!restore_status.ok()) {
-                    UnrecoverableError(fmt::format("Fail to change node role to FOLLOWER, then fail to restore to ADMIN."));
+                if (status.code() != ErrorCode::kCantSwitchRole) {
+                    LOG_INFO("Fail to change to LEARNER role");
+                    Status restore_status = InfinityContext::instance().ChangeServerRole(NodeRole::kAdmin);
+                    if (!restore_status.ok()) {
+                        UnrecoverableError(fmt::format("Fail to change node role to LEARNER, then fail to restore to ADMIN."));
+                    }
                 }
             } else {
                 LOG_INFO("Start in LEARNER role");
diff --git a/src/common/analyzer/analyzer.cppm b/src/common/analyzer/analyzer.cppm
index 1ee76ed136..34c8f197de 100644
--- a/src/common/analyzer/analyzer.cppm
+++ b/src/common/analyzer/analyzer.cppm
@@ -34,8 +34,6 @@ public:
     virtual ~Analyzer() = default;
 
-    void SetInnerAnalyzer(SharedPtr<Analyzer> &analyzer) { inner_analyzer_ = analyzer; }
-
     void SetExtractSpecialChar(bool extract_special_char, bool convert_to_placeholder = true) {
         extract_special_char_ = extract_special_char;
         convert_to_placeholder_ = convert_to_placeholder;
     }
@@ -43,6 +41,8 @@ public:
 
     void SetCharOffset(bool set) { get_char_offset_ = set; }
 
+    void SetTokenizerConfig(const TokenizeConfig &conf) { tokenizer_.SetConfig(conf); }
+
     int Analyze(const Term &input, TermList &output) {
         void *array[2] = {&output, this};
         return AnalyzeImpl(input, &array, &Analyzer::AppendTermList);
     }
@@ -84,7 +84,6 @@ protected:
 
     Tokenizer tokenizer_;
 
-    SharedPtr<Analyzer> inner_analyzer_;
     /// Whether including special characters (e.g. punctuations) in the result.
     bool extract_special_char_;
diff --git a/src/common/analyzer/analyzer_pool.cpp b/src/common/analyzer/analyzer_pool.cpp
index c8b60cc73f..660a1727eb 100644
--- a/src/common/analyzer/analyzer_pool.cpp
+++ b/src/common/analyzer/analyzer_pool.cpp
@@ -23,6 +23,7 @@ import third_party;
 import config;
 import infinity_context;
 import analyzer;
+import tokenizer;
 import stemmer;
 import chinese_analyzer;
 import traditional_chinese_analyzer;
@@ -31,7 +32,8 @@ import korean_analyzer;
 import standard_analyzer;
 import ngram_analyzer;
 import rag_analyzer;
-import keyword_analyzer;
+import whitespace_analyzer;
+import ik_analyzer;
 import logger;
 
 namespace infinity {
@@ -75,6 +77,7 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
             Config *config = InfinityContext::instance().config();
             if (config == nullptr) {
                 // InfinityContext has not been initialized.
+                // For unit test only
                 path = "/var/infinity/resource";
             } else {
                 path = config->ResourcePath();
@@ -107,6 +110,7 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
             Config *config = InfinityContext::instance().config();
             if (config == nullptr) {
                 // InfinityContext has not been initialized.
+                // For unit test only
                 path = "/var/infinity/resource";
             } else {
                 path = config->ResourcePath();
@@ -140,6 +144,7 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
             Config *config = InfinityContext::instance().config();
             if (config == nullptr) {
                 // InfinityContext has not been initialized.
+                // For unit test only
                 path = "/var/infinity/resource";
             } else {
                 path = config->ResourcePath();
@@ -164,6 +169,39 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
             analyzer->SetFineGrained(fine_grained);
             return {std::move(analyzer), Status::OK()};
         }
+        case Str2Int(IK.data()): {
+            //
+            Analyzer *prototype = cache_[IK].get();
+            if (prototype == nullptr) {
+                String path;
+                Config *config = InfinityContext::instance().config();
+                if (config == nullptr) {
+                    // InfinityContext has not been initialized.
+                    // For unit test only
+                    path = "/var/infinity/resource";
+                } else {
+                    path = config->ResourcePath();
+                }
+                UniquePtr<IKAnalyzer> analyzer = MakeUnique<IKAnalyzer>(std::move(path));
+                Status load_status = analyzer->Load();
+                if (!load_status.ok()) {
+                    return {nullptr, load_status};
+                }
+                prototype = analyzer.get();
+                cache_[IK] = std::move(analyzer);
+            }
+            bool fine_grained = false;
+            const char *str = name.data();
+            while (*str != '\0' && *str != '-') {
+                str++;
+            }
+            if (strcmp(str, "-fine") == 0) {
+                fine_grained = true;
+            }
+            UniquePtr<IKAnalyzer> analyzer = MakeUnique<IKAnalyzer>(*reinterpret_cast<IKAnalyzer *>(prototype));
+            analyzer->SetFineGrained(fine_grained);
+            return {std::move(analyzer), Status::OK()};
+        }
         case Str2Int(JAPANESE.data()): {
             Analyzer *prototype = cache_[JAPANESE].get();
             if (prototype == nullptr) {
@@ -171,6 +209,7 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
                 Config *config = InfinityContext::instance().config();
                 if (config == nullptr) {
                     // InfinityContext has not been initialized.
+                    // For unit test only
                     path = "/var/infinity/resource";
                 } else {
                     path = config->ResourcePath();
@@ -192,6 +231,7 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
                 Config *config = InfinityContext::instance().config();
                 if (config == nullptr) {
                     // InfinityContext has not been initialized.
+                    // For unit test only
                     path = "/var/infinity/resource";
                 } else {
                     path = config->ResourcePath();
@@ -208,6 +248,19 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
         }
         case Str2Int(STANDARD.data()): {
             UniquePtr<StandardAnalyzer> analyzer = MakeUnique<StandardAnalyzer>();
+
+            TokenizeConfig token_config;
+            // String allow_str("-");
+            String divide_str("@#$");
+            String unite_str("/");
+            // Allow("-"): 2012-02-14 => 2012-02-14
+            // Divide: delimiters
+            // Unite: 2012/02/14 => 20120214
+            // token_config.AddAllows(allow_str);
+            token_config.AddDivides(divide_str);
+            token_config.AddUnites(unite_str);
+            analyzer->SetTokenizerConfig(token_config);
+
             Language lang = STEM_LANG_ENGLISH;
             const char *str = name.data();
             while (*str != '\0' && *str != '-') {
@@ -269,7 +322,10 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
             return {MakeUnique<NGramAnalyzer>(ngram), Status::OK()};
         }
         case Str2Int(KEYWORD.data()): {
-            return {MakeUnique<KeywordAnalyzer>(), Status::OK()};
+            return {MakeUnique<WhitespaceAnalyzer>(), Status::OK()};
+        }
+        case Str2Int(WHITESPACE.data()): {
+            return {MakeUnique<WhitespaceAnalyzer>(), Status::OK()};
         }
         default: {
             if(std::filesystem::is_regular_file(name)) {
diff --git a/src/common/analyzer/analyzer_pool.cppm b/src/common/analyzer/analyzer_pool.cppm
index 6a7aa61a8f..46a2e0cf08 100644
--- a/src/common/analyzer/analyzer_pool.cppm
+++ b/src/common/analyzer/analyzer_pool.cppm
@@ -40,7 +40,9 @@ public:
     static constexpr std::string_view STANDARD = "standard";
     static constexpr std::string_view NGRAM = "ngram";
     static constexpr std::string_view RAG = "rag";
+    static constexpr std::string_view IK = "ik";
     static constexpr std::string_view KEYWORD = "keyword";
+    static constexpr std::string_view WHITESPACE = "whitespace";
 
 private:
     CacheType cache_{};
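Note on the lookup above: GetAnalyzer() keys off the name up to the first '-', so "ik" and "ik-fine" both land in the IK case, with the suffix only toggling fine-grained segmentation; "keyword" is kept as a backward-compatible alias for the new WhitespaceAnalyzer. A minimal usage sketch follows; the caller code is illustrative and not part of the patch, using only the interfaces visible in this diff:

```cpp
// Hypothetical caller: AnalyzerPool::GetAnalyzer() returns
// {UniquePtr<Analyzer>, Status}, and Analyzer::Analyze(const Term &, TermList &)
// fills the output term list.
void TokenizeWithIK(AnalyzerPool &pool) {
    auto [analyzer, status] = pool.GetAnalyzer("ik-fine"); // "-fine" => SetFineGrained(true)
    if (!status.ok()) {
        return; // e.g. the ik/*.dic files are missing under the resource path
    }
    Term input;
    input.text_ = "2024年全国人民代表大会"; // UTF-8 in, segmented terms out
    TermList terms;
    analyzer->Analyze(input, terms);
}
```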
diff --git a/src/common/analyzer/common_analyzer.cpp b/src/common/analyzer/common_analyzer.cpp
index 63b1a4b3de..6d45eda603 100644
--- a/src/common/analyzer/common_analyzer.cpp
+++ b/src/common/analyzer/common_analyzer.cpp
@@ -29,14 +29,7 @@ constexpr int MAX_TUPLE_LENGTH = 1024;
 
 CommonLanguageAnalyzer::CommonLanguageAnalyzer()
     : Analyzer(), lowercase_string_buffer_(term_string_buffer_limit_), stemmer_(MakeUnique<Stemmer>()), case_sensitive_(false), contain_lower_(false),
-      extract_eng_stem_(true), extract_synonym_(false), cjk_(false), remove_stopwords_(false) {
-    TokenizeConfig token_config;
-    String divide_str("@#$");
-    String unite_str("/");
-    token_config.AddDivides(divide_str);
-    token_config.AddUnites(unite_str);
-    tokenizer_.SetConfig(token_config);
-}
+      extract_eng_stem_(true), extract_synonym_(false), cjk_(false), remove_stopwords_(false) {}
 
 CommonLanguageAnalyzer::~CommonLanguageAnalyzer() {}
diff --git a/src/common/analyzer/ik/analyze_context.cpp b/src/common/analyzer/ik/analyze_context.cpp
index 8c21c3bc51..b98949e4eb 100644
--- a/src/common/analyzer/ik/analyze_context.cpp
+++ b/src/common/analyzer/ik/analyze_context.cpp
@@ -1,11 +1,9 @@
 module;
 
-#include
 #include
-#include
-#include
 #include
-#include
+
+module analyze_context;
 
 import stl;
 import quick_sort_set;
@@ -14,10 +12,8 @@ import lexeme;
 import lexeme_path;
 import ik_dict;
 
-module analyze_context;
-
 namespace infinity {
-AnalyzeContext::AnalyzeContext(Dictionary *dict) : dict_(dict) {
+AnalyzeContext::AnalyzeContext(Dictionary *dict, bool ik_smart) : dict_(dict), ik_smart_(ik_smart) {
     buff_offset_ = 0;
     cursor_ = 0;
     last_useless_char_num_ = 0;
@@ -52,11 +48,7 @@ bool AnalyzeContext::MoveCursor() {
     }
 }
 
-bool AnalyzeContext::NeedRefillBuffer() const {
-    return available_ == BUFF_SIZE && cursor_ < available_ - 1 && cursor_ > available_ - BUFF_EXHAUST_CRITICAL && !IsBufferLocked();
-}
-
-void AnalyzeContext::AddLexeme(Lexeme *lexeme) { org_lexemes_.AddLexeme(lexeme); }
+bool AnalyzeContext::AddLexeme(Lexeme *lexeme) { return org_lexemes_->AddLexeme(lexeme); }
 
 void AnalyzeContext::AddLexemePath(LexemePath *path) {
     if (path != nullptr) {
@@ -114,14 +106,16 @@ Lexeme *AnalyzeContext::GetNextLexeme() {
             result->SetLexemeText(
                 std::wstring(segment_buff_.begin() + result->GetBegin(), segment_buff_.begin() + result->GetBegin() + result->GetLength()));
             break;
+        } else {
+            delete result;
+            result = nullptr;
         }
     }
     return result;
 }
 
 void AnalyzeContext::Reset() {
-    buff_locker_.clear();
-    org_lexemes_ = QuickSortSet();
+    org_lexemes_ = MakeUnique<QuickSortSet>();
     available_ = 0;
     buff_offset_ = 0;
     char_types_.clear();
@@ -132,6 +126,8 @@
 }
 
 void AnalyzeContext::Compound(Lexeme *result) {
+    if (!ik_smart_)
+        return;
     if (!results_.empty()) {
         if (Lexeme::TYPE_ARABIC == result->GetLexemeType()) {
             Lexeme *next_lexeme = results_.front();
@@ -142,7 +138,9 @@ void AnalyzeContext::Compound(Lexeme *result) {
                 append_ok = result->Append(*next_lexeme, Lexeme::TYPE_CQUAN);
             }
             if (append_ok) {
+                Lexeme *r = results_.front();
                 results_.pop_front();
+                delete r;
             }
         }
         if (Lexeme::TYPE_CNUM == result->GetLexemeType() && !results_.empty()) {
@@ -152,7 +150,9 @@ void AnalyzeContext::Compound(Lexeme *result) {
                 append_ok = result->Append(*next_lexeme, Lexeme::TYPE_CQUAN);
             }
             if (append_ok) {
+                Lexeme *r = results_.front();
                 results_.pop_front();
+                delete r;
             }
         }
     }
diff --git a/src/common/analyzer/ik/analyze_context.cppm b/src/common/analyzer/ik/analyze_context.cppm
index f1d3ddb49d..8023cfbf35 100644
--- a/src/common/analyzer/ik/analyze_context.cppm
+++ b/src/common/analyzer/ik/analyze_context.cppm
@@ -31,9 +31,7 @@ public:
 
     int last_useless_char_num_;
 
-    HashSet<std::wstring> buff_locker_;
-
-    QuickSortSet org_lexemes_;
+    UniquePtr<QuickSortSet> org_lexemes_;
 
     HashMap<int, UniquePtr<LexemePath>> path_map_;
 
@@ -41,7 +39,9 @@ public:
 
     Dictionary *dict_{nullptr};
 
-    AnalyzeContext(Dictionary *dict);
+    bool ik_smart_{true};
+
+    AnalyzeContext(Dictionary *dict, bool is_smart = true);
 
     int GetCursor() const { return cursor_; }
@@ -59,23 +59,15 @@ public:
 
     bool MoveCursor();
 
-    void LockBuffer(const std::wstring &segmenter_name) { buff_locker_.insert(segmenter_name); }
-
-    void UnlockBuffer(const std::wstring &segmenter_name) { buff_locker_.erase(segmenter_name); }
-
-    bool IsBufferLocked() const { return !buff_locker_.empty(); }
-
     bool IsBufferConsumed() const { return cursor_ == available_ - 1; }
 
-    bool NeedRefillBuffer() const;
-
     void MarkBufferOffset() { buff_offset_ += cursor_; }
 
-    void AddLexeme(Lexeme *lexeme);
+    bool AddLexeme(Lexeme *lexeme);
 
     void AddLexemePath(LexemePath *path);
 
-    QuickSortSet *GetOrgLexemes() { return &(org_lexemes_); }
+    QuickSortSet *GetOrgLexemes() { return org_lexemes_.get(); }
 
     void OutputToResult();
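The AddLexeme change from void to bool is an ownership contract: the underlying QuickSortSet rejects duplicate lexemes, and a rejected raw pointer would otherwise leak. Every producer in this patch now follows the same pattern, sketched here with an illustrative helper (the name EmitLexeme is not in the patch):

```cpp
// AddLexeme() returns false when an equal lexeme is already stored, in which
// case the set did NOT take ownership and the caller must free the object.
void EmitLexeme(AnalyzeContext *context, int offset, int begin, int length, int type) {
    Lexeme *lexeme = new Lexeme(offset, begin, length, type);
    if (!context->AddLexeme(lexeme)) {
        delete lexeme; // duplicate: ownership stayed with the caller
    }
}
```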
diff --git a/src/common/analyzer/ik/arbitrator.cpp b/src/common/analyzer/ik/arbitrator.cpp
index 608233bbc3..050457e26a 100644
--- a/src/common/analyzer/ik/arbitrator.cpp
+++ b/src/common/analyzer/ik/arbitrator.cpp
@@ -2,13 +2,14 @@ module;
 
 #include
 
+module arbitrator;
+
 import stl;
 import analyze_context;
 import lexeme;
 import lexeme_path;
 import quick_sort_set;
-
-module arbitrator;
+import third_party;
 
 namespace infinity {
 
@@ -16,36 +17,37 @@ void IKArbitrator::Process(AnalyzeContext *context, bool use_smart) {
     QuickSortSet *org_lexemes = context->GetOrgLexemes();
     Lexeme *org_lexeme = org_lexemes->PollFirst();
 
-    LexemePath *cross_path = new LexemePath();
+    UniquePtr<LexemePath> cross_path = MakeUnique<LexemePath>();
     while (org_lexeme != nullptr) {
         if (!cross_path->AddCrossLexeme(org_lexeme)) {
             if (cross_path->Size() == 1 || !use_smart) {
-                context->AddLexemePath(cross_path);
+                context->AddLexemePath(cross_path.release());
             } else {
                 QuickSortSet::Cell *head_cell = cross_path->GetHead();
                 LexemePath *judge_result = Judge(head_cell, cross_path->GetPathLength());
                 context->AddLexemePath(judge_result);
-                delete cross_path;
             }
-
-            cross_path = new LexemePath();
+            cross_path = MakeUnique<LexemePath>();
             cross_path->AddCrossLexeme(org_lexeme);
         }
         org_lexeme = org_lexemes->PollFirst();
     }
 
     if (cross_path->Size() == 1 || !use_smart) {
-        context->AddLexemePath(cross_path);
+        context->AddLexemePath(cross_path.release());
     } else {
         QuickSortSet::Cell *head_cell = cross_path->GetHead();
         LexemePath *judge_result = Judge(head_cell, cross_path->GetPathLength());
         context->AddLexemePath(judge_result);
-        delete cross_path;
     }
 }
 
+struct CompareLexemePath {
+    bool operator()(const UniquePtr<LexemePath> &lhs, const UniquePtr<LexemePath> &rhs) const { return lhs->CompareTo(*rhs); }
+};
+
 LexemePath *IKArbitrator::Judge(QuickSortSet::Cell *lexeme_cell, int fulltext_length) {
-    Set<UniquePtr<LexemePath>> path_options;
+    std::set<UniquePtr<LexemePath>, CompareLexemePath> path_options;
     UniquePtr<LexemePath> option = MakeUnique<LexemePath>();
 
     std::stack<QuickSortSet::Cell *> lexeme_stack = ForwardPath(lexeme_cell, option.get());
@@ -68,7 +70,9 @@ std::stack<QuickSortSet::Cell *> IKArbitrator::ForwardPath(QuickSortSet::Cell *l
     std::stack<QuickSortSet::Cell *> conflict_stack;
     QuickSortSet::Cell *c = lexeme_cell;
     while (c != nullptr && c->GetLexeme() != nullptr) {
-        if (!option->AddNotCrossLexeme(c->GetLexeme())) {
+        Lexeme *lexeme = c->GetLexeme()->Copy();
+        if (!option->AddNotCrossLexeme(lexeme)) {
+            delete lexeme;
             conflict_stack.push(c);
         }
         c = c->GetNext();
@@ -78,7 +82,8 @@ std::stack<QuickSortSet::Cell *> IKArbitrator::ForwardPath(QuickSortSet::Cell *l
 
 void IKArbitrator::BackPath(Lexeme *l, LexemePath *option) {
     while (option->CheckCross(l)) {
-        option->RemoveTail();
+        Lexeme *lexeme = option->RemoveTail();
+        delete lexeme;
     }
 }
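A std::set keyed by UniquePtr would otherwise order its elements by pointer address, which is meaningless here. The custom comparator makes the set order candidate paths by the IK ranking rules (assuming LexemePath::CompareTo implements a strict less-than, as its use in a bool-returning comparator suggests), presumably so the first element is the preferred disambiguation. A self-contained illustration of the idea, with made-up types:

```cpp
// Standalone sketch: comparing through the pointee rather than the pointer
// gives begin() a well-defined "best path" meaning.
#include <memory>
#include <set>

struct Path {
    int score;
};

struct ComparePath {
    bool operator()(const std::unique_ptr<Path> &a, const std::unique_ptr<Path> &b) const {
        return a->score > b->score; // strict weak ordering: higher score first
    }
};

int main() {
    std::set<std::unique_ptr<Path>, ComparePath> options;
    options.insert(std::make_unique<Path>(Path{3}));
    options.insert(std::make_unique<Path>(Path{7}));
    return (*options.begin())->score; // 7: the best-ranked candidate
}
```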
diff --git a/src/common/analyzer/ik/character_util.cppm b/src/common/analyzer/ik/character_util.cppm
index 75be1e5c8d..6b06ad0286 100644
--- a/src/common/analyzer/ik/character_util.cppm
+++ b/src/common/analyzer/ik/character_util.cppm
@@ -1,7 +1,7 @@
 module;
 
-#include
-#include
+// #include
+// #include
 #include
 
 export module character_util;
@@ -81,9 +81,10 @@ public:
     }
 
     static std::wstring UTF8ToUTF16(const std::string &utf8_str) {
+        // std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
+        // return converter.from_bytes(utf8_str);
         std::wstring utf16_str;
         std::string_view utf8_view(utf8_str);
-
         while (!utf8_view.empty()) {
             if ((utf8_view[0] & 0x80) == 0) { // 1-byte character
                 utf16_str.push_back(static_cast<wchar_t>(utf8_view[0]));
@@ -120,9 +121,10 @@ public:
     }
 
     static std::string UTF16ToUTF8(const std::wstring &utf16_str) {
+        // std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
+        // return converter.to_bytes(utf16_str);
         std::string utf8_str;
         std::wstring_view utf16_view(utf16_str);
-
         while (!utf16_view.empty()) {
             if (utf16_view[0] < 0xD800 || utf16_view[0] > 0xDFFF) { // Basic Multilingual Plane
                 uint32_t code_point = utf16_view[0];
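The hand-rolled converters replace std::wstring_convert/std::codecvt (deprecated since C++17), decoding UTF-8 byte patterns directly. A worked example of the 3-byte branch, assuming wchar_t can hold a BMP code point as on the project's Linux targets:

```cpp
// U+4E2D ('中') is encoded in UTF-8 as E4 B8 AD. The decoder masks the payload
// bits of each byte and reassembles the code point, exactly as the 3-byte
// branch above does; characters beyond U+FFFF become UTF-16 surrogate pairs.
#include <cassert>

int main() {
    unsigned char b0 = 0xE4, b1 = 0xB8, b2 = 0xAD;
    unsigned code_point = ((b0 & 0x0Fu) << 12) | ((b1 & 0x3Fu) << 6) | (b2 & 0x3Fu);
    assert(code_point == 0x4E2D); // back to U+4E2D
    return 0;
}
```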
diff --git a/src/common/analyzer/ik/cjk_segmenter.cpp b/src/common/analyzer/ik/cjk_segmenter.cpp
index b9b728cd78..2ff6a9dc3e 100644
--- a/src/common/analyzer/ik/cjk_segmenter.cpp
+++ b/src/common/analyzer/ik/cjk_segmenter.cpp
@@ -2,6 +2,8 @@ module;
 
 #include
 
+module cjk_segmenter;
+
 import stl;
 import hit;
 import segmenter;
@@ -9,47 +11,52 @@ import analyze_context;
 import lexeme;
 import character_util;
 import ik_dict;
-
-module cjk_segmenter;
+import third_party;
 
 namespace infinity {
 
 const std::wstring CJKSegmenter::SEGMENTER_NAME = L"CJK_SEGMENTER";
 
-CJKSegmenter::CJKSegmenter(Dictionary *dict) : dict_(dict) { tmp_hits_ = List<Hit *>(); }
+CJKSegmenter::CJKSegmenter(Dictionary *dict) : dict_(dict) {}
 
 void CJKSegmenter::Analyze(AnalyzeContext *context) {
     if (CharacterUtil::CHAR_USELESS != context->GetCurrentCharType()) {
         if (!tmp_hits_.empty()) {
-            std::vector<Hit *> tmp_array(tmp_hits_.begin(), tmp_hits_.end());
-            for (Hit *hit : tmp_array) {
+            for (auto it = tmp_hits_.begin(); it != tmp_hits_.end();) {
+                Hit *hit = (*it).get();
                 hit = dict_->MatchWithHit(context->GetSegmentBuff(), context->GetCursor(), hit);
+
                 if (hit->IsMatch()) {
-                    Lexeme *newLexeme =
+                    Lexeme *new_lexeme =
                         new Lexeme(context->GetBufferOffset(), hit->GetBegin(), context->GetCursor() - hit->GetBegin() + 1, Lexeme::TYPE_CNWORD);
-                    context->AddLexeme(newLexeme);
+                    if (!context->AddLexeme(new_lexeme))
+                        delete new_lexeme;
 
                     if (!hit->IsPrefix()) {
-                        tmp_hits_.remove(hit);
+                        it = tmp_hits_.erase(it);
+                    } else {
+                        ++it;
                     }
                 } else if (hit->IsUnmatch()) {
-                    tmp_hits_.remove(hit);
+                    it = tmp_hits_.erase(it);
+                } else {
+                    ++it;
                 }
             }
         }
 
-        Hit *single_char_hit = dict_->MatchInMainDict(context->GetSegmentBuff(), context->GetCursor(), 1);
+        UniquePtr<Hit> single_char_hit(dict_->MatchInMainDict(context->GetSegmentBuff(), context->GetCursor(), 1));
         if (single_char_hit->IsMatch()) {
-            Lexeme *newLexeme = new Lexeme(context->GetBufferOffset(), context->GetCursor(), 1, Lexeme::TYPE_CNWORD);
-            context->AddLexeme(newLexeme);
+            Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), context->GetCursor(), 1, Lexeme::TYPE_CNWORD);
+            if (!context->AddLexeme(new_lexeme))
+                delete new_lexeme;
 
             if (single_char_hit->IsPrefix()) {
-                tmp_hits_.push_back(single_char_hit);
+                tmp_hits_.push_back(std::move(single_char_hit));
             }
         } else if (single_char_hit->IsPrefix()) {
-            tmp_hits_.push_back(single_char_hit);
+            tmp_hits_.push_back(std::move(single_char_hit));
         }
-
     } else {
         tmp_hits_.clear();
     }
@@ -57,12 +64,6 @@ void CJKSegmenter::Analyze(AnalyzeContext *context) {
     if (context->IsBufferConsumed()) {
         tmp_hits_.clear();
     }
-
-    if (tmp_hits_.empty()) {
-        context->UnlockBuffer(SEGMENTER_NAME);
-    } else {
-        context->LockBuffer(SEGMENTER_NAME);
-    }
 }
 
 void CJKSegmenter::Reset() { tmp_hits_.clear(); }
diff --git a/src/common/analyzer/ik/cjk_segmenter.cppm b/src/common/analyzer/ik/cjk_segmenter.cppm
index 054c8d5705..08486b37f5 100644
--- a/src/common/analyzer/ik/cjk_segmenter.cppm
+++ b/src/common/analyzer/ik/cjk_segmenter.cppm
@@ -16,7 +16,7 @@ export class CJKSegmenter : public Segmenter {
 public:
     static const std::wstring SEGMENTER_NAME;
 
-    List<Hit *> tmp_hits_;
+    List<UniquePtr<Hit>> tmp_hits_;
 
     Dictionary *dict_{nullptr};
diff --git a/src/common/analyzer/ik/cn_quantifier_segmenter.cpp b/src/common/analyzer/ik/cn_quantifier_segmenter.cpp
index 319e4de231..ebbf43048c 100644
--- a/src/common/analyzer/ik/cn_quantifier_segmenter.cpp
+++ b/src/common/analyzer/ik/cn_quantifier_segmenter.cpp
@@ -2,6 +2,8 @@ module;
 
 #include
 
+module cn_quantifier_segmenter;
+
 import stl;
 import hit;
 import segmenter;
@@ -10,8 +12,6 @@ import analyze_context;
 import lexeme;
 import character_util;
 import ik_dict;
 
-module cn_quantifier_segmenter;
-
 namespace infinity {
 
 const std::wstring CNQuantifierSegmenter::SEGMENTER_NAME = L"QUAN_SEGMENTER";
@@ -27,23 +27,12 @@ void CNQuantifierSegmenter::InitChnNumber() {
 
 CNQuantifierSegmenter::CNQuantifierSegmenter(Dictionary *dict) : dict_(dict) {
     nstart_ = -1;
     nend_ = -1;
-    count_hits_ = List<Hit *>();
     InitChnNumber();
 }
 
 void CNQuantifierSegmenter::Analyze(AnalyzeContext *context) {
-    // Process Chinese numerals
     ProcessCNumber(context);
-    // Process Chinese quantifiers
     ProcessCount(context);
-
-    // Decide whether the buffer must stay locked
-    if (nstart_ == -1 && nend_ == -1 && count_hits_.empty()) {
-        // Unlock the buffer
-        context->UnlockBuffer(SEGMENTER_NAME);
-    } else {
-        context->LockBuffer(SEGMENTER_NAME);
-    }
 }
 
 void CNQuantifierSegmenter::Reset() {
@@ -81,34 +70,40 @@ void CNQuantifierSegmenter::ProcessCount(AnalyzeContext *context) {
 
     if (CharacterUtil::CHAR_CHINESE == context->GetCurrentCharType()) {
         if (!count_hits_.empty()) {
-            std::vector<Hit *> tmp_array(count_hits_.begin(), count_hits_.end());
-            for (Hit *hit : tmp_array) {
+            for (auto it = count_hits_.begin(); it != count_hits_.end();) {
+                Hit *hit = (*it).get();
                 hit = dict_->MatchWithHit(context->GetSegmentBuff(), context->GetCursor(), hit);
                 if (hit->IsMatch()) {
                     Lexeme *new_lexeme =
                         new Lexeme(context->GetBufferOffset(), hit->GetBegin(), context->GetCursor() - hit->GetBegin() + 1, Lexeme::TYPE_COUNT);
-                    context->AddLexeme(new_lexeme);
+                    if (!context->AddLexeme(new_lexeme))
+                        delete new_lexeme;
 
                     if (!hit->IsPrefix()) {
-                        count_hits_.remove(hit);
+                        it = count_hits_.erase(it);
+                    } else {
+                        ++it;
                     }
                 } else if (hit->IsUnmatch()) {
-                    count_hits_.remove(hit);
+                    it = count_hits_.erase(it);
+                } else {
+                    ++it;
                 }
             }
         }
 
-        Hit *single_char_hit = dict_->MatchInQuantifierDict(context->GetSegmentBuff(), context->GetCursor(), 1);
+        UniquePtr<Hit> single_char_hit(dict_->MatchInQuantifierDict(context->GetSegmentBuff(), context->GetCursor(), 1));
         if (single_char_hit->IsMatch()) {
             Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), context->GetCursor(), 1, Lexeme::TYPE_COUNT);
-            context->AddLexeme(new_lexeme);
+            if (!context->AddLexeme(new_lexeme))
+                delete new_lexeme;
 
             if (single_char_hit->IsPrefix()) {
-                count_hits_.push_back(single_char_hit);
+                count_hits_.push_back(std::move(single_char_hit));
             }
         } else if (single_char_hit->IsPrefix()) {
-            count_hits_.push_back(single_char_hit);
+            count_hits_.push_back(std::move(single_char_hit));
         }
     } else {
@@ -138,7 +133,8 @@ bool CNQuantifierSegmenter::NeedCountScan(AnalyzeContext *context) {
 
 void CNQuantifierSegmenter::OutputNumLexeme(AnalyzeContext *context) {
     if (nstart_ > -1 && nend_ > -1) {
         Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), nstart_, nend_ - nstart_ + 1, Lexeme::TYPE_CNUM);
-        context->AddLexeme(new_lexeme);
+        if (!context->AddLexeme(new_lexeme))
+            delete new_lexeme;
     }
 }
diff --git a/src/common/analyzer/ik/cn_quantifier_segmenter.cppm b/src/common/analyzer/ik/cn_quantifier_segmenter.cppm
index a44b2712b2..51d03bb684 100644
--- a/src/common/analyzer/ik/cn_quantifier_segmenter.cppm
+++ b/src/common/analyzer/ik/cn_quantifier_segmenter.cppm
@@ -23,7 +23,7 @@ public:
     int nstart_;
     int nend_;
 
-    List<Hit *> count_hits_;
+    List<UniquePtr<Hit>> count_hits_;
 
     Dictionary *dict_{nullptr};
"ik/ext_stopwords"; +const String PATH_DIC_MAIN = "main.dic"; +const String PATH_DIC_SURNAME = "surname.dic"; +const String PATH_DIC_QUANTIFIER = "quantifier.dic"; +const String PATH_DIC_SUFFIX = "suffix.dic"; +const String PATH_DIC_PREP = "preposition.dic"; +const String PATH_DIC_STOP = "stopword.dic"; +const String FILE_NAME = "IKAnalyzer.cfg.xml"; +const String EXT_DICT = "ext_dict"; +const String EXT_STOP = "ext_stopwords"; + +bool IsSpaceOrNewline(char c) { return std::isspace(static_cast(c)) || c == '\n' || c == '\r'; } + +String Trim(const String &str) { + if (str.empty()) { + return str; + } + + std::size_t start = 0; + while (start < str.size() && IsSpaceOrNewline(str[start])) { + ++start; + } + + std::size_t end = str.size() - 1; + while (end > start && IsSpaceOrNewline(str[end])) { + --end; + } + return str.substr(start, end - start + 1); +} -Dictionary::Dictionary(const String &dir) : conf_dir_(dir) {} +Dictionary::Dictionary(const String &dir) { + fs::path root(dir); + fs::path ik_root = root / "ik"; + conf_dir_ = ik_root.string(); +} Status Dictionary::Load() { Status load_status; @@ -93,8 +118,9 @@ Status Dictionary::LoadDictFile(DictSegment *dict, const String &file_path, bool if (!is.is_open()) { return Status::InvalidAnalyzerFile(file_path); } - std::string line; + String line; while (std::getline(is, line)) { + line = Trim(line); std::wstring word = CharacterUtil::UTF8ToUTF16(line); if (!word.empty() && word[0] == L'\uFEFF') { word = word.substr(1); @@ -167,12 +193,13 @@ Hit *Dictionary::MatchInQuantifierDict(const Vector &char_array, int be } Hit *Dictionary::MatchWithHit(const Vector &char_array, int current_index, Hit *matched_hit) { - DictSegment *ds = matched_hit->getMatchedDictSegment(); + DictSegment *ds = matched_hit->GetMatchedDictSegment(); return ds->Match(char_array, current_index, 1, matched_hit); } bool Dictionary::IsStopWord(const Vector &char_array, int begin, int length) { - return stop_words_->Match(char_array, begin, length)->IsMatch(); + UniquePtr hit(stop_words_->Match(char_array, begin, length)); + return hit->IsMatch(); } Status Dictionary::LoadMainDict() { @@ -194,7 +221,6 @@ Status Dictionary::LoadExtDict() { Status load_status; if (!ext_dict_files.empty()) { for (const String &ext_dict_name : ext_dict_files) { - std::cout << "[Dict Loading] " << ext_dict_name << std::endl; String file = fs::path(conf_dir_) / fs::path(ext_dict_name).string(); load_status = LoadDictFile(main_dict_.get(), file, false, "Extra Dict"); if (!load_status.ok()) { @@ -217,7 +243,6 @@ Status Dictionary::LoadStopWordDict() { Vector ext_stopword_dict_files = GetExtStopWordDictionarys(); if (!ext_stopword_dict_files.empty()) { for (const String &ext_stopword_dict_file : ext_stopword_dict_files) { - std::cout << "[Dict Loading] " << ext_stopword_dict_file << std::endl; String file = fs::path(conf_dir_) / fs::path(ext_stopword_dict_file).string(); load_status = LoadDictFile(stop_words_.get(), file, false, "Extra Stopwords"); if (!load_status.ok()) { @@ -272,10 +297,11 @@ void Dictionary::ParseProperties(const String &content) { std::stringstream ss(content); String line; while (std::getline(ss, line)) { - size_t pos = line.find('='); - if (pos != String::npos) { - String key = line.substr(0, pos); - String value = line.substr(pos + 1); + std::regex attribute_regex(R"#(([^<]+))#"); + std::smatch match; + if (std::regex_search(line, match, attribute_regex)) { + std::string key = match[1].str(); + std::string value = match[2].str(); props_[key] = value; } } diff --git 
diff --git a/src/common/analyzer/ik/dict_segment.cpp b/src/common/analyzer/ik/dict_segment.cpp
index 52489a391e..969b4e27b3 100644
--- a/src/common/analyzer/ik/dict_segment.cpp
+++ b/src/common/analyzer/ik/dict_segment.cpp
@@ -1,15 +1,18 @@
 module;
 
-import stl;
-import hit;
-
 #include
 
 module ik_dict_segment;
 
+import stl;
+import hit;
+import character_util;
+import third_party;
+
 namespace infinity {
 
 HashMap<wchar_t, wchar_t> DictSegment::char_map_;
+
 DictSegment::DictSegment(wchar_t node_char) : node_char_(node_char) {}
 
 Hit *DictSegment::Match(const Vector<wchar_t> &char_array, int begin, int length, Hit *search_hit) {
@@ -24,20 +27,19 @@ Hit *DictSegment::Match(const Vector<wchar_t> &char_array, int begin, int length
     wchar_t key_char = char_array[begin];
     DictSegment *ds = nullptr;
 
-    Vector<UniquePtr<DictSegment>> &segment_array = children_array_;
-    HashMap<wchar_t, UniquePtr<DictSegment>> &segment_map = children_map_;
-
-    if (!segment_array.empty()) {
+    if (!children_array_.empty()) {
         UniquePtr<DictSegment> key_segment = MakeUnique<DictSegment>(key_char);
-        auto it = std::lower_bound(segment_array.begin(),
-                                   segment_array.begin() + store_size_,
+        auto it = std::lower_bound(children_array_.begin(),
+                                   children_array_.begin() + store_size_,
                                    key_segment,
                                    [](const UniquePtr<DictSegment> &a, const UniquePtr<DictSegment> &b) { return a->node_char_ < b->node_char_; });
-        if (it != segment_array.begin() + store_size_ && (*it)->node_char_ == key_char) {
+        if (it != children_array_.begin() + store_size_ && (*it)->node_char_ == key_char) {
             ds = (*it).get();
         }
-    } else if (!segment_map.empty()) {
-        ds = segment_map[key_char].get();
+    } else if (!children_map_.empty()) {
+        auto it = children_map_.find(key_char);
+        if (it != children_map_.end())
+            ds = it->second.get();
     }
 
     if (ds != nullptr) {
@@ -59,12 +61,14 @@ Hit *DictSegment::Match(const Vector<wchar_t> &char_array, int begin, int length
 
 void DictSegment::FillSegment(const Vector<wchar_t> &char_array, int begin, int length, int enabled) {
     wchar_t begin_char = char_array[begin];
-    wchar_t key_char = char_map_[begin_char];
-    if (key_char == L'\0') {
+    wchar_t key_char;
+    HashMap<wchar_t, wchar_t>::iterator it = char_map_.find(begin_char);
+    if (it == char_map_.end()) {
         char_map_[begin_char] = begin_char;
         key_char = begin_char;
+    } else {
+        key_char = it->second;
     }
-
     DictSegment *ds = LookforSegment(key_char, enabled);
     if (ds != nullptr) {
         if (length > 1) {
@@ -79,13 +83,13 @@ DictSegment *DictSegment::LookforSegment(wchar_t key_char, int create) {
     DictSegment *ds = nullptr;
 
     if (store_size_ <= ARRAY_LENGTH_LIMIT) {
-        Vector<UniquePtr<DictSegment>> &segment_array = GetChildrenArray();
+        Vector<UniquePtr<DictSegment>> &children_array_ = GetChildrenArray();
         UniquePtr<DictSegment> key_segment = MakeUnique<DictSegment>(key_char);
-        auto it = std::lower_bound(segment_array.begin(),
-                                   segment_array.begin() + store_size_,
+        auto it = std::lower_bound(children_array_.begin(),
+                                   children_array_.begin() + store_size_,
                                    key_segment,
                                    [](const UniquePtr<DictSegment> &a, const UniquePtr<DictSegment> &b) { return a->node_char_ < b->node_char_; });
-        if (it != segment_array.begin() + store_size_ && (*it)->node_char_ == key_char) {
+        if (it != children_array_.begin() + store_size_ && (*it)->node_char_ == key_char) {
             ds = (*it).get();
         }
 
@@ -93,26 +97,28 @@ DictSegment *DictSegment::LookforSegment(wchar_t key_char, int create) {
             UniquePtr<DictSegment> ds_ptr = MakeUnique<DictSegment>(key_char);
             ds = ds_ptr.get();
             if (store_size_ < ARRAY_LENGTH_LIMIT) {
-                segment_array[store_size_] = std::move(ds_ptr);
+                children_array_[store_size_] = std::move(ds_ptr);
                 store_size_++;
-                std::sort(segment_array.begin(),
-                          segment_array.begin() + store_size_,
+                std::sort(children_array_.begin(),
+                          children_array_.begin() + store_size_,
                           [](const UniquePtr<DictSegment> &a, const UniquePtr<DictSegment> &b) { return a->node_char_ < b->node_char_; });
             } else {
-                HashMap<wchar_t, UniquePtr<DictSegment>> &segment_map = GetChildrenMap();
-                Migrate(segment_array, segment_map);
-                segment_map[key_char] = std::move(ds_ptr);
+                for (auto &segment : children_array_) {
+                    if (segment.get() != nullptr) {
+                        children_map_[segment->node_char_] = std::move(segment);
+                    }
+                }
+                children_map_[key_char] = std::move(ds_ptr);
                 store_size_++;
                 children_array_.clear();
             }
         }
     } else {
-        HashMap<wchar_t, UniquePtr<DictSegment>> &segment_map = GetChildrenMap();
-        ds = segment_map[key_char].get();
+        ds = children_map_[key_char].get();
         if (ds == nullptr && create == 1) {
             UniquePtr<DictSegment> ds_ptr = MakeUnique<DictSegment>(key_char);
             ds = ds_ptr.get();
-            segment_map[key_char] = std::move(ds_ptr);
+            children_map_[key_char] = std::move(ds_ptr);
             store_size_++;
         }
     }
@@ -120,12 +126,4 @@ DictSegment *DictSegment::LookforSegment(wchar_t key_char, int create) {
 
     return ds;
 }
 
-void DictSegment::Migrate(Vector<UniquePtr<DictSegment>> &segment_array, HashMap<wchar_t, UniquePtr<DictSegment>> &segment_map) {
-    for (auto &segment : segment_array) {
-        if (segment.get() != nullptr) {
-            segment_map[segment->node_char_] = std::move(segment);
-        }
-    }
-}
-
 } // namespace infinity
diff --git a/src/common/analyzer/ik/dict_segment.cppm b/src/common/analyzer/ik/dict_segment.cppm
index a9b46c12d4..76b4a39f47 100644
--- a/src/common/analyzer/ik/dict_segment.cppm
+++ b/src/common/analyzer/ik/dict_segment.cppm
@@ -54,10 +54,6 @@ private:
         }
         return children_map_;
     }
-
-    void Migrate(Vector<UniquePtr<DictSegment>> &segment_array, HashMap<wchar_t, UniquePtr<DictSegment>> &segment_map);
-
-    int CompareTo(const DictSegment &o) const { return node_char_ - o.node_char_; }
 };
 
 } // namespace infinity
\ No newline at end of file
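DictSegment keeps a node's children in a small sorted array (binary search via std::lower_bound) until ARRAY_LENGTH_LIMIT is exceeded, then spills them into a hash map; the old Migrate() helper is now inlined at the single spill site. The hybrid layout in isolation, with illustrative names rather than the project's:

```cpp
#include <algorithm>
#include <memory>
#include <unordered_map>
#include <vector>

// Trie node with hybrid child storage: a sorted vector for small fan-out,
// a hash map once the node grows past the array limit.
struct Node {
    wchar_t ch{};
    std::vector<std::unique_ptr<Node>> array_children; // kept sorted by ch
    std::unordered_map<wchar_t, std::unique_ptr<Node>> map_children;

    Node *Child(wchar_t key) {
        if (!array_children.empty()) {
            auto it = std::lower_bound(array_children.begin(), array_children.end(), key,
                                       [](const std::unique_ptr<Node> &n, wchar_t k) { return n->ch < k; });
            return (it != array_children.end() && (*it)->ch == key) ? it->get() : nullptr;
        }
        auto it = map_children.find(key);
        return it == map_children.end() ? nullptr : it->second.get();
    }
};

int main() { return 0; }
```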
diff --git a/src/common/analyzer/ik/hit.cpp b/src/common/analyzer/ik/hit.cpp
index cbb6984ca4..50747ece83 100644
--- a/src/common/analyzer/ik/hit.cpp
+++ b/src/common/analyzer/ik/hit.cpp
@@ -1,9 +1,9 @@
 module;
 
-import ik_dict_segment;
-
 module hit;
 
+import ik_dict_segment;
+
 namespace infinity {
 
 void Hit::SetMatchedDictSegment(DictSegment *matched_dict_segment) { matched_dict_segment_ = matched_dict_segment; }
diff --git a/src/common/analyzer/ik/hit.cppm b/src/common/analyzer/ik/hit.cppm
index 4ca2a0713a..ed2c793d11 100644
--- a/src/common/analyzer/ik/hit.cppm
+++ b/src/common/analyzer/ik/hit.cppm
@@ -33,7 +33,7 @@ public:
 
     void SetUnmatch() { hit_state_ = UNMATCH; }
 
-    DictSegment *getMatchedDictSegment() const { return matched_dict_segment_; }
+    DictSegment *GetMatchedDictSegment() const { return matched_dict_segment_; }
 
     void SetMatchedDictSegment(DictSegment *matched_dict_segment_);
 
diff --git a/src/common/analyzer/ik/ik_analyzer.cpp b/src/common/analyzer/ik/ik_analyzer.cpp
index 1e57c8d61b..21dd3ab8ac 100644
--- a/src/common/analyzer/ik/ik_analyzer.cpp
+++ b/src/common/analyzer/ik/ik_analyzer.cpp
@@ -2,6 +2,8 @@ module;
 
 #include
 
+module ik_analyzer;
+
 import stl;
 import segmenter;
 import cjk_segmenter;
@@ -13,14 +15,13 @@ import arbitrator;
 import term;
 import status;
 import character_util;
-
-module ik_analyzer;
+import third_party;
 
 namespace infinity {
 
 IKAnalyzer::IKAnalyzer(const String &path) : dict_path_(path) {}
 
-IKAnalyzer::IKAnalyzer(const IKAnalyzer &other) : own_dict_(false), dict_(other.dict_) { Init(); }
+IKAnalyzer::IKAnalyzer(const IKAnalyzer &other) : own_dict_(false), ik_smart_(other.ik_smart_), dict_(other.dict_) { Init(); }
 
 IKAnalyzer::~IKAnalyzer() {
     if (own_dict_) {
@@ -29,11 +30,18 @@ IKAnalyzer::~IKAnalyzer() {
 }
 
 void IKAnalyzer::Init() {
-    context_ = MakeUnique<AnalyzeContext>(dict_);
+    context_ = MakeUnique<AnalyzeContext>(dict_, ik_smart_);
     LoadSegmenters();
     arbitrator_ = MakeUnique<IKArbitrator>();
 }
 
+void IKAnalyzer::SetFineGrained(bool fine_grained) {
+    ik_smart_ = !fine_grained;
+    if (context_.get()) {
+        context_->ik_smart_ = ik_smart_;
+    }
+}
+
 Status IKAnalyzer::Load() {
     dict_ = new Dictionary(dict_path_);
     Status load_status = dict_->Load();
@@ -52,13 +60,6 @@ void IKAnalyzer::LoadSegmenters() {
     segmenters_.push_back(MakeUnique<CJKSegmenter>(dict_));
 }
 
-Lexeme *IKAnalyzer::Next() {
-    Lexeme *l = context_->GetNextLexeme();
-    while (l == nullptr) {
-    }
-    return l;
-}
-
 void IKAnalyzer::Reset() {
     context_->Reset();
     for (auto &segmenter : segmenters_) {
@@ -72,20 +73,18 @@ int IKAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
     unsigned level = 0;
     unsigned offset = 0;
     std::wstring line = CharacterUtil::UTF8ToUTF16(input.text_);
+    context_->Reset();
     context_->FillBuffer(line);
     context_->InitCursor();
     do {
         for (auto &segmenter : segmenters_) {
             segmenter->Analyze(context_.get());
         }
-        if (context_->NeedRefillBuffer()) {
-            break;
-        }
     } while (context_->MoveCursor());
     for (auto &segmenter : segmenters_) {
         segmenter->Reset();
     }
-    arbitrator_->Process(context_.get(), true);
+    arbitrator_->Process(context_.get(), ik_smart_);
     context_->OutputToResult();
     context_->MarkBufferOffset();
     Lexeme *lexeme = nullptr;
diff --git a/src/common/analyzer/ik/ik_analyzer.cppm b/src/common/analyzer/ik/ik_analyzer.cppm
index aea48938cf..f944fecc38 100644
--- a/src/common/analyzer/ik/ik_analyzer.cppm
+++ b/src/common/analyzer/ik/ik_analyzer.cppm
@@ -26,6 +26,8 @@ public:
 
     Status Load();
 
+    void SetFineGrained(bool fine_grained);
+
 protected:
     int AnalyzeImpl(const Term &input, void *data, HookType func) override;
 
@@ -34,8 +36,6 @@ private:
 
     void LoadSegmenters();
 
-    Lexeme *Next();
-
     void Reset();
 
     int GetLastUselessCharNum();
@@ -45,6 +45,8 @@ private:
 
     bool own_dict_{};
 
+    bool ik_smart_{true};
+
     Dictionary *dict_{nullptr};
 
     UniquePtr<AnalyzeContext> context_;
diff --git a/src/common/analyzer/ik/letter_segmenter.cpp b/src/common/analyzer/ik/letter_segmenter.cpp
index b50620c035..2179c5c535 100644
--- a/src/common/analyzer/ik/letter_segmenter.cpp
+++ b/src/common/analyzer/ik/letter_segmenter.cpp
@@ -2,14 +2,14 @@ module;
 
 #include
 
+module letter_segmenter;
+
 import stl;
 import segmenter;
 import analyze_context;
 import lexeme;
 import character_util;
 
-module letter_segmenter;
-
 namespace infinity {
 
 const std::wstring LetterSegmenter::SEGMENTER_NAME = L"LETTER_SEGMENTER";
 Vector<wchar_t> LetterSegmenter::Letter_Connector = {L'#', L'&', L'+', L'-', L'.', L'@', L'_'};
@@ -27,16 +27,9 @@ LetterSegmenter::LetterSegmenter() {
 }
 
 void LetterSegmenter::Analyze(AnalyzeContext *context) {
-    bool buffer_lock_flag = false;
-    buffer_lock_flag = ProcessEnglishLetter(context) || buffer_lock_flag;
-    buffer_lock_flag = ProcessArabicLetter(context) || buffer_lock_flag;
-    buffer_lock_flag = ProcessMixLetter(context) || buffer_lock_flag;
-
-    if (buffer_lock_flag) {
-        context->LockBuffer(SEGMENTER_NAME);
-    } else {
-        context->UnlockBuffer(SEGMENTER_NAME);
-    }
+    ProcessEnglishLetter(context);
+    ProcessArabicLetter(context);
+    ProcessMixLetter(context);
 }
 
 void LetterSegmenter::Reset() {
@@ -63,7 +56,8 @@ bool LetterSegmenter::ProcessMixLetter(AnalyzeContext *context) {
             end_ = context->GetCursor();
         } else {
             Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), start_, end_ - start_ + 1, Lexeme::TYPE_LETTER);
-            context->AddLexeme(new_lexeme);
+            if (!context->AddLexeme(new_lexeme))
+                delete new_lexeme;
             start_ = -1;
             end_ = -1;
         }
@@ -71,7 +65,8 @@ bool LetterSegmenter::ProcessMixLetter(AnalyzeContext *context) {
     if (context->IsBufferConsumed() && (start_ != -1 && end_ != -1)) {
         Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), start_, end_ - start_ + 1, Lexeme::TYPE_LETTER);
-        context->AddLexeme(new_lexeme);
+        if (!context->AddLexeme(new_lexeme))
+            delete new_lexeme;
         start_ = -1;
         end_ = -1;
     }
@@ -97,7 +92,8 @@ bool LetterSegmenter::ProcessEnglishLetter(AnalyzeContext *context) {
             english_end_ = context->GetCursor();
         } else {
             Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), english_start_, english_end_ - english_start_ + 1, Lexeme::TYPE_ENGLISH);
-            context->AddLexeme(new_lexeme);
+            if (!context->AddLexeme(new_lexeme))
+                delete new_lexeme;
             english_start_ = -1;
             english_end_ = -1;
         }
@@ -105,7 +101,8 @@ bool LetterSegmenter::ProcessEnglishLetter(AnalyzeContext *context) {
     if (context->IsBufferConsumed() && (english_start_ != -1 && english_end_ != -1)) {
         Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), english_start_, english_end_ - english_start_ + 1, Lexeme::TYPE_ENGLISH);
-        context->AddLexeme(new_lexeme);
+        if (!context->AddLexeme(new_lexeme))
+            delete new_lexeme;
         english_start_ = -1;
         english_end_ = -1;
     }
@@ -132,7 +129,8 @@ bool LetterSegmenter::ProcessArabicLetter(AnalyzeContext *context) {
         } else if (CharacterUtil::CHAR_USELESS == context->GetCurrentCharType() && IsNumConnector(context->GetCurrentChar())) {
         } else {
             Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), arabic_start_, arabic_end_ - arabic_start_ + 1, Lexeme::TYPE_ARABIC);
-            context->AddLexeme(new_lexeme);
+            if (!context->AddLexeme(new_lexeme))
+                delete new_lexeme;
             arabic_start_ = -1;
             arabic_end_ = -1;
         }
@@ -140,7 +138,8 @@ bool LetterSegmenter::ProcessArabicLetter(AnalyzeContext *context) {
     if (context->IsBufferConsumed() && (arabic_start_ != -1 && arabic_end_ != -1)) {
         Lexeme *new_lexeme = new Lexeme(context->GetBufferOffset(), arabic_start_, arabic_end_ - arabic_start_ + 1, Lexeme::TYPE_ARABIC);
-        context->AddLexeme(new_lexeme);
+        if (!context->AddLexeme(new_lexeme))
+            delete new_lexeme;
         arabic_start_ = -1;
         arabic_end_ = -1;
     }
diff --git a/src/common/analyzer/ik/lexeme.cpp b/src/common/analyzer/ik/lexeme.cpp
index e13386dd4f..5c98194492 100644
--- a/src/common/analyzer/ik/lexeme.cpp
+++ b/src/common/analyzer/ik/lexeme.cpp
@@ -15,6 +15,11 @@ Lexeme::Lexeme(int offset, int begin, int length, int lexeme_type) {
     lexeme_type_ = lexeme_type;
 }
 
+Lexeme *Lexeme::Copy() {
+    Lexeme *copy = new Lexeme(offset_, begin_, length_, lexeme_type_);
+    return copy;
+}
+
 bool Lexeme::Append(const Lexeme &l, int lexeme_type) {
     if (!l.lexeme_text_.empty() && GetEndPosition() == l.GetBeginPosition()) {
         length_ += l.length_;
diff --git a/src/common/analyzer/ik/lexeme.cppm b/src/common/analyzer/ik/lexeme.cppm
index 7187e9e536..60549b631f 100644
--- a/src/common/analyzer/ik/lexeme.cppm
+++ b/src/common/analyzer/ik/lexeme.cppm
@@ -32,6 +32,8 @@ public:
 
     Lexeme(int offset, int begin, int length, int lexeme_type);
 
+    Lexeme *Copy();
+
     bool Equals(const Lexeme &other) const { return offset_ == other.offset_ && begin_ == other.begin_ && length_ == other.length_; }
 
     int Hash() const {
@@ -80,7 +82,7 @@ public:
     std::wstring GetLexemeText() const { return lexeme_text_; }
 
     void SetLexemeText(const std::wstring &lexeme_text) {
-        if (lexeme_text_.empty()) {
+        if (lexeme_text.empty()) {
             lexeme_text_ = L"";
             length_ = 0;
         } else {
diff --git a/src/common/analyzer/ik/lexeme_path.cpp b/src/common/analyzer/ik/lexeme_path.cpp
index ad1237907d..d6ac709f77 100644
--- a/src/common/analyzer/ik/lexeme_path.cpp
+++ b/src/common/analyzer/ik/lexeme_path.cpp
@@ -2,7 +2,6 @@ module;
 
 #include
 #include
-#include
 
 module lexeme_path;
 
@@ -104,7 +103,9 @@ LexemePath *LexemePath::Copy() const {
     the_copy->payload_length_ = payload_length_;
     Cell *c = GetHead();
     while (c != nullptr && c->GetLexeme() != nullptr) {
-        the_copy->AddLexeme(c->GetLexeme());
+        Lexeme *lexeme = c->GetLexeme()->Copy();
+        if (!(the_copy->AddLexeme(lexeme)))
+            delete lexeme;
         c = c->GetNext();
     }
     return the_copy;
diff --git a/src/common/analyzer/ik/quick_sort_set.cpp b/src/common/analyzer/ik/quick_sort_set.cpp
index 05a9c4ef4d..47c1cfb1aa 100644
--- a/src/common/analyzer/ik/quick_sort_set.cpp
+++ b/src/common/analyzer/ik/quick_sort_set.cpp
@@ -5,58 +5,59 @@ import lexeme;
 module quick_sort_set;
 
 namespace infinity {
+
 QuickSortSet::QuickSortSet() {}
 
 QuickSortSet::~QuickSortSet() {
     while (size_ > 0) {
         Lexeme *tail = PollLast();
         delete tail;
-        size_--;
     }
 }
 
 bool QuickSortSet::AddLexeme(Lexeme *lexeme) {
-    Cell *new_cell = new Cell(lexeme);
+    UniquePtr<Cell> new_cell = MakeUnique<Cell>(lexeme);
     if (size_ == 0) {
-        head_ = new_cell;
-        tail_ = new_cell;
+        Cell *cell_ptr = new_cell.release();
+        head_ = cell_ptr;
+        tail_ = cell_ptr;
         size_++;
         return true;
     } else {
-        if (tail_->CompareTo(new_cell) == 0) {
-            delete new_cell;
+        if (tail_->CompareTo(new_cell.get()) == 0) {
             return false;
-        } else if (tail_->CompareTo(new_cell) < 0) {
-            tail_->next_ = new_cell;
-            new_cell->prev_ = tail_;
-            tail_ = new_cell;
+        } else if (tail_->CompareTo(new_cell.get()) < 0) {
+            Cell *cell_ptr = new_cell.release();
+            tail_->next_ = cell_ptr;
+            cell_ptr->prev_ = tail_;
+            tail_ = cell_ptr;
             size_++;
             return true;
-        } else if (head_->CompareTo(new_cell) > 0) {
-            head_->prev_ = new_cell;
-            new_cell->next_ = head_;
-            head_ = new_cell;
+        } else if (head_->CompareTo(new_cell.get()) > 0) {
+            Cell *cell_ptr = new_cell.release();
+            head_->prev_ = cell_ptr;
+            cell_ptr->next_ = head_;
+            head_ = cell_ptr;
             size_++;
             return true;
         } else {
             Cell *index = tail_;
-            while (index != nullptr && index->CompareTo(new_cell) > 0) {
+            while (index != nullptr && index->CompareTo(new_cell.get()) > 0) {
                 index = index->prev_;
             }
-            if (index->CompareTo(new_cell) == 0) {
-                delete new_cell;
+            if (index->CompareTo(new_cell.get()) == 0) {
                 return false;
-            } else if (index->CompareTo(new_cell) < 0) {
-                new_cell->prev_ = index;
-                new_cell->next_ = index->next_;
-                index->next_->prev_ = new_cell;
-                index->next_ = new_cell;
+            } else if (index->CompareTo(new_cell.get()) < 0) {
+                Cell *cell_ptr = new_cell.release();
+                cell_ptr->prev_ = index;
+                cell_ptr->next_ = index->next_;
+                index->next_->prev_ = cell_ptr;
+                index->next_ = cell_ptr;
                 size_++;
                 return true;
             }
         }
     }
-    delete new_cell;
     return false;
 }
 
 } // namespace infinity
diff --git a/src/common/analyzer/keyword_analyzer.cpp b/src/common/analyzer/whitespace_analyzer.cpp
similarity index 89%
rename from src/common/analyzer/keyword_analyzer.cpp
rename to src/common/analyzer/whitespace_analyzer.cpp
index 9e539c44cc..fc76172a30 100644
--- a/src/common/analyzer/keyword_analyzer.cpp
+++ b/src/common/analyzer/whitespace_analyzer.cpp
@@ -16,15 +16,14 @@ module;
 
 #include
 #include
 
-module keyword_analyzer;
-
+module whitespace_analyzer;
 import stl;
 import term;
 import analyzer;
 
 namespace infinity {
 
-int KeywordAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
+int WhitespaceAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
     std::istringstream is(input.text_);
     std::string t;
     u32 offset = 0;
diff --git a/src/common/analyzer/keyword_analyzer.cppm b/src/common/analyzer/whitespace_analyzer.cppm
similarity index 76%
rename from src/common/analyzer/keyword_analyzer.cppm
rename to src/common/analyzer/whitespace_analyzer.cppm
index bddf389714..d262b07c7f 100644
--- a/src/common/analyzer/keyword_analyzer.cppm
+++ b/src/common/analyzer/whitespace_analyzer.cppm
@@ -1,4 +1,4 @@
-// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,16 +14,17 @@ module;
 
-export module keyword_analyzer;
+export module whitespace_analyzer;
 
 import stl;
 import term;
 import analyzer;
 
 namespace infinity {
+
-export class KeywordAnalyzer : public Analyzer {
+export class WhitespaceAnalyzer : public Analyzer {
 public:
-    KeywordAnalyzer() = default;
-    ~KeywordAnalyzer() override = default;
+    WhitespaceAnalyzer() = default;
+    ~WhitespaceAnalyzer() override = default;
 
 protected:
     int AnalyzeImpl(const Term &input, void *data, HookType func) override;
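The rename makes the behavior match the name: this analyzer splits on runs of whitespace and emits everything else verbatim, which is not what "keyword" usually means in search engines (a single untokenized term). The core of AnalyzeImpl reduces to istringstream extraction, shown standalone:

```cpp
#include <iostream>
#include <sstream>
#include <string>

int main() {
    std::istringstream is("hello   world\tinfinity");
    std::string token;
    unsigned offset = 0;
    while (is >> token) { // operator>> skips any run of whitespace
        std::cout << offset++ << ": " << token << "\n"; // 0: hello  1: world  2: infinity
    }
    return 0;
}
```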
diff --git a/src/common/default_values.cppm b/src/common/default_values.cppm
index bbe4b4ac0d..d962ab620f 100644
--- a/src/common/default_values.cppm
+++ b/src/common/default_values.cppm
@@ -278,6 +278,9 @@ export {
     constexpr std::string_view MEMINDEX_MEMORY_QUOTA_OPTION_NAME = "memindex_memory_quota";
     constexpr std::string_view RESULT_CACHE_OPTION_NAME = "result_cache";
     constexpr std::string_view CACHE_RESULT_CAPACITY_OPTION_NAME = "cache_result_capacity";
+    constexpr std::string_view DENSE_INDEX_BUILDING_WORKER_OPTION_NAME = "dense_index_building_worker";
+    constexpr std::string_view SPARSE_INDEX_BUILDING_WORKER_OPTION_NAME = "sparse_index_building_worker";
+    constexpr std::string_view FULLTEXT_INDEX_BUILDING_WORKER_OPTION_NAME = "fulltext_index_building_worker";
 
     constexpr std::string_view WAL_DIR_OPTION_NAME = "wal_dir";
     constexpr std::string_view WAL_COMPACT_THRESHOLD_OPTION_NAME = "wal_compact_threshold";
@@ -314,8 +317,10 @@ export {
     constexpr std::string_view SYSTEM_MEMORY_USAGE_VAR_NAME = "system_memory_usage"; // global
     constexpr std::string_view OPEN_FILE_COUNT_VAR_NAME = "open_file_count";         // global
     constexpr std::string_view CPU_USAGE_VAR_NAME = "cpu_usage";                     // global
-    constexpr std::string_view FOLLOWER_NUMBER = "follower_number";                  // global
+    constexpr std::string_view FOLLOWER_NUMBER_VAR_NAME = "follower_number";         // global
     constexpr std::string_view CACHE_RESULT_NUM_VAR_NAME = "cache_result_num";       // global
+    constexpr std::string_view MEMORY_CACHE_MISS_VAR_NAME = "memory_cache_miss";     // global
+    constexpr std::string_view DISK_CACHE_MISS_VAR_NAME = "disk_cache_miss";         // global
 
     // IO related
     constexpr SizeT DEFAULT_READ_BUFFER_SIZE = 4096;
diff --git a/src/common/status.cpp b/src/common/status.cpp
index 158d1e7735..e5bfb74d7a 100644
--- a/src/common/status.cpp
+++ b/src/common/status.cpp
@@ -448,6 +448,10 @@ Status Status::FileIsOpen(const String &filename) { return Status(ErrorCode::kFi
 
 Status Status::Unknown(const String &name) { return Status(ErrorCode::kUnknown, MakeUnique<String>(fmt::format("Unknown {}", name))); }
 
+Status Status::InvalidQueryOption(const String &detail) {
+    return Status(ErrorCode::kInvalidQueryOption, MakeUnique<String>(fmt::format("Invalid query option: {}", detail)));
+}
+
 // 4. TXN fail
 Status Status::TxnRollback(u64 txn_id, const String &rollback_reason) {
     return Status(ErrorCode::kTxnRollback, MakeUnique<String>(fmt::format("Transaction: {} is rollback. {}", txn_id, rollback_reason)));
@@ -599,6 +603,12 @@ Status Status::NotRegistered(const String &node_info) {
 
 Status Status::CantSwitchRole(const String &detailed_info) { return Status(ErrorCode::kCantSwitchRole, MakeUnique<String>(detailed_info)); }
 
+Status Status::TooManyFollower(infinity::u8 follower_limit) {
+    return Status(ErrorCode::kTooManyFollower, MakeUnique<String>(fmt::format("Too many followers, limit: {}", follower_limit)));
+}
+
+Status Status::TooManyLearner() { return Status(ErrorCode::kTooManyLearner, MakeUnique<String>("Too many learners, limit: 255")); }
+
 // meta
 Status Status::InvalidEntry() { return Status(ErrorCode::kInvalidEntry, MakeUnique<String>("Invalid entry")); }
 
diff --git a/src/common/status.cppm b/src/common/status.cppm
index 19bb71b379..b953e7d013 100644
--- a/src/common/status.cppm
+++ b/src/common/status.cppm
@@ -138,6 +138,7 @@ export enum class ErrorCode : long {
     kErrorInit = 3089,
     kFileIsOpen = 3090,
     kUnknown = 3091,
+    kInvalidQueryOption = 3092,
 
     // 4. Txn fail
     kTxnRollback = 4001,
@@ -183,6 +184,8 @@ export enum class ErrorCode : long {
     kInvalidStorageType = 7024,
     kNotRegistered = 7025,
     kCantSwitchRole = 7026,
+    kTooManyFollower = 7027,
+    kTooManyLearner = 7028,
 
     // 8. meta error
     kInvalidEntry = 8001,
@@ -311,6 +314,7 @@ public:
     static Status ErrorInit(const String &detailed_info);
     static Status FileIsOpen(const String &filename);
     static Status Unknown(const String &name);
+    static Status InvalidQueryOption(const String &detail);
 
     // 4. TXN fail
     static Status TxnRollback(u64 txn_id, const String &rollback_reason = "no reason given");
@@ -356,6 +360,8 @@ public:
     static Status InvalidStorageType(const String &expected, const String &actual);
     static Status NotRegistered(const String &node_info);
     static Status CantSwitchRole(const String &detailed_info);
+    static Status TooManyFollower(u8 follower_limit);
+    static Status TooManyLearner();
 
     // meta
     static Status InvalidEntry();
{}", txn_id, rollback_reason))); @@ -599,6 +603,12 @@ Status Status::NotRegistered(const String &node_info) { Status Status::CantSwitchRole(const String &detailed_info) { return Status(ErrorCode::kCantSwitchRole, MakeUnique(detailed_info)); } +Status Status::TooManyFollower(infinity::u8 follower_limit) { + return Status(ErrorCode::kTooManyFollower, MakeUnique(fmt::format("Too many followers, limit: {}", follower_limit))); +} + +Status Status::TooManyLearner() { return Status(ErrorCode::kTooManyLearner, MakeUnique("Too many learner, limit: 255")); } + // meta Status Status::InvalidEntry() { return Status(ErrorCode::kInvalidEntry, MakeUnique("Invalid entry")); } diff --git a/src/common/status.cppm b/src/common/status.cppm index 19bb71b379..b953e7d013 100644 --- a/src/common/status.cppm +++ b/src/common/status.cppm @@ -138,6 +138,7 @@ export enum class ErrorCode : long { kErrorInit = 3089, kFileIsOpen = 3090, kUnknown = 3091, + kInvalidQueryOption = 3092, // 4. Txn fail kTxnRollback = 4001, @@ -183,6 +184,8 @@ export enum class ErrorCode : long { kInvalidStorageType = 7024, kNotRegistered = 7025, kCantSwitchRole = 7026, + kTooManyFollower = 7027, + kTooManyLearner = 7028, // 8. meta error kInvalidEntry = 8001, @@ -311,6 +314,7 @@ public: static Status ErrorInit(const String &detailed_info); static Status FileIsOpen(const String &filename); static Status Unknown(const String &name); + static Status InvalidQueryOption(const String& detail); // 4. TXN fail static Status TxnRollback(u64 txn_id, const String &rollback_reason = "no reanson gived"); @@ -356,6 +360,8 @@ public: static Status InvalidStorageType(const String &expected, const String &actual); static Status NotRegistered(const String &node_info); static Status CantSwitchRole(const String &detailed_info); + static Status TooManyFollower(u8 follower_limit); + static Status TooManyLearner(); // meta static Status InvalidEntry(); diff --git a/src/embedded_infinity/wrap_infinity.cpp b/src/embedded_infinity/wrap_infinity.cpp index 93e1d4afca..a782925a16 100644 --- a/src/embedded_infinity/wrap_infinity.cpp +++ b/src/embedded_infinity/wrap_infinity.cpp @@ -1358,6 +1358,13 @@ void ProcessDataBlocks(QueryResult &query_result, WrapQueryResult &wrap_query_re auto data_block = query_result.result_table_->GetDataBlockById(block_idx); ProcessColumns(data_block, query_result.result_table_->ColumnCount(), columns); } + + if(query_result.result_table_->total_hits_count_flag_) { + nlohmann::json json_response; + json_response["total_hits_count"] = query_result.result_table_->total_hits_count_; + wrap_query_result.extra_result = json_response.dump(); + } + HandleColumnDef(wrap_query_result, query_result.result_table_->ColumnCount(), query_result.result_table_->definition_ptr_, columns); } @@ -1368,6 +1375,7 @@ WrapQueryResult WrapSearch(Infinity &instance, Vector highlight_list, Vector order_by_list, Vector group_by_list, + bool total_hits_count_flag, WrapSearchExpr *wrap_search_expr, WrapParsedExpr *filter_expr, WrapParsedExpr *limit_expr, @@ -1531,8 +1539,17 @@ WrapQueryResult WrapSearch(Infinity &instance, } } - auto query_result = - instance.Search(db_name, table_name, search_expr, filter, limit, offset, output_columns, highlight, order_by_exprs, group_by_exprs); + auto query_result = instance.Search(db_name, + table_name, + search_expr, + filter, + limit, + offset, + output_columns, + highlight, + order_by_exprs, + group_by_exprs, + total_hits_count_flag); search_expr = nullptr; filter = nullptr; limit = nullptr; diff --git 
a/src/embedded_infinity/wrap_infinity.cppm b/src/embedded_infinity/wrap_infinity.cppm index 93e1147746..0e1181ef76 100644 --- a/src/embedded_infinity/wrap_infinity.cppm +++ b/src/embedded_infinity/wrap_infinity.cppm @@ -125,6 +125,7 @@ export struct WrapQueryResult { Vector names; Vector column_defs; Vector column_fields; + String extra_result; // show database String database_name; String store_dir; @@ -432,6 +433,7 @@ export WrapQueryResult WrapSearch(Infinity &instance, Vector highlight_list, Vector order_by_list, Vector group_by_list, + bool total_hits_count_flag, WrapSearchExpr *wrap_search_expr = nullptr, WrapParsedExpr *where_expr = nullptr, WrapParsedExpr *limit_expr = nullptr, diff --git a/src/embedded_infinity_ext.cpp b/src/embedded_infinity_ext.cpp index cb92ed61b2..6124856de2 100644 --- a/src/embedded_infinity_ext.cpp +++ b/src/embedded_infinity_ext.cpp @@ -48,6 +48,7 @@ NB_MODULE(embedded_infinity_ext, m) { .def_rw("names", &WrapQueryResult::names) .def_rw("column_defs", &WrapQueryResult::column_defs) .def_rw("column_fields", &WrapQueryResult::column_fields) + .def_rw("extra_result", &WrapQueryResult::extra_result) .def_rw("database_name", &WrapQueryResult::database_name) .def_rw("store_dir", &WrapQueryResult::store_dir) .def_rw("table_count", &WrapQueryResult::table_count) @@ -325,6 +326,7 @@ NB_MODULE(embedded_infinity_ext, m) { nb::arg("highlight_list"), nb::arg("order_by_list"), nb::arg("group_by_list"), + nb::arg("total_hits_count_flag"), nb::arg("wrap_search_expr") = nullptr, nb::arg("where_expr") = nullptr, nb::arg("limit_expr") = nullptr, diff --git a/src/executor/operator/physical_compact_finish.cpp b/src/executor/operator/physical_compact_finish.cpp index 852dbae8fa..341352b57e 100644 --- a/src/executor/operator/physical_compact_finish.cpp +++ b/src/executor/operator/physical_compact_finish.cpp @@ -33,6 +33,8 @@ import internal_types; import infinity_context; import infinity_exception; import status; +import txn_store; +import segment_index_entry; namespace infinity { @@ -79,6 +81,29 @@ void PhysicalCompactFinish::SaveSegmentData(QueryContext *query_context, const C } LOG_DEBUG(ss.str()); + for (const auto &compact_segment_data : compact_state_data->segment_data_list_) { + TxnStore *txn_store = txn->txn_store(); + TxnTableStore *txn_table_store = txn_store->GetTxnTableStore(table_entry); + auto index_map = table_entry->IndexMetaMap(); + for (const auto &[index_name, index_meta] : *index_map) { + auto [table_index_entry, status] = index_meta->GetEntryNolock(txn->TxnID(), txn->BeginTS()); + if (!status.ok()) { + continue; + } + Vector segment_index_entries; + auto &segment_index_map = table_index_entry->index_by_segment(); + for (const auto *old_segment : compact_segment_data.old_segments_) { + auto iter = segment_index_map.find(old_segment->segment_id()); + if (iter == segment_index_map.end()) { + continue; + } + auto *segment_index_entry = iter->second.get(); + segment_index_entries.push_back(segment_index_entry); + } + txn_table_store->AddSegmentIndexesStore(table_index_entry, std::move(segment_index_entries)); + } + } + txn->Compact(table_entry, std::move(segment_data), compact_type_); } diff --git a/src/executor/operator/physical_import.cpp b/src/executor/operator/physical_import.cpp index 1912b5ea76..62af0ee72b 100644 --- a/src/executor/operator/physical_import.cpp +++ b/src/executor/operator/physical_import.cpp @@ -1280,6 +1280,34 @@ void PhysicalImport::JSONLRowHandler(const nlohmann::json &line_json, Vector +namespace { + +Status CheckParquetColumns(TableEntry *table_entry, parquet::arrow::FileReader *arrow_reader) { + std::shared_ptr schema; + arrow::Status status = 
arrow_reader->GetSchema(&schema); + if (!status.ok()) { + return Status::ImportFileFormatError(status.ToString()); + } + const arrow::FieldVector &fields = schema->fields(); + const Vector> &column_defs = table_entry->column_defs(); + if (fields.size() != column_defs.size()) { + return Status::ColumnCountMismatch(fmt::format("Column count mismatch: {} != {}", fields.size(), column_defs.size())); + } + for (SizeT i = 0; i < fields.size(); ++i) { + const auto &field = fields[i]; + const auto &column_def = column_defs[i]; + + if (*column_def->type() != *field->type()) { + return Status::ImportFileFormatError( + fmt::format("Column {} mismatch, {} != {}", i, column_def->type()->ToString(), field->type()->ToString())); + } + } + + return Status::OK(); +} + +} // namespace + void PhysicalImport::ImportPARQUET(QueryContext *query_context, ImportOperatorState *import_op_state) { arrow::MemoryPool *pool = arrow::DefaultMemoryPool(); @@ -1304,6 +1332,10 @@ void PhysicalImport::ImportPARQUET(QueryContext *query_context, ImportOperatorSt } std::unique_ptr arrow_reader = build_result.MoveValueUnsafe(); + if (Status status = CheckParquetColumns(table_entry_, arrow_reader.get()); !status.ok()) { + RecoverableError(status); + } + std::shared_ptr rb_reader; if (auto status = arrow_reader->GetRecordBatchReader(&rb_reader); !status.ok()) { RecoverableError(Status::ImportFileFormatError(status.ToString())); @@ -1330,10 +1362,6 @@ void PhysicalImport::ImportPARQUET(QueryContext *query_context, ImportOperatorSt auto batch = maybe_batch.MoveValueUnsafe(); const auto batch_row_count = batch->num_rows(); const auto batch_col_count = batch->num_columns(); - if (static_cast(batch_col_count) != table_entry_->ColumnCount()) { - RecoverableError( - Status::ColumnCountMismatch(fmt::format("Column count mismatch: {} != {}", batch_col_count, table_entry_->ColumnCount()))); - } for (i64 batch_row_id = 0; batch_row_id < batch_row_count; ++batch_row_id) { for (int column_idx = 0; column_idx < batch_col_count; ++column_idx) { SharedPtr column = batch->column(column_idx); @@ -1408,14 +1436,8 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, ColumnVector &column_vector, i64 start_offset, i64 end_offset) { - if (sparse_info->DataType() != EmbeddingDataType::kElemBit && data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } switch (sparse_info->DataType()) { case EmbeddingDataType::kElemBit: { - if (data_array.get() != nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } ParquetSparseValueHandler(sparse_info, index_array, nullptr, @@ -1425,13 +1447,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemUInt8: { - if (data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto uint8_value_array = std::dynamic_pointer_cast(data_array->values()); - if (uint8_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto uint8_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, uint8_value_array, @@ -1441,13 +1457,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemInt8: { - if (data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto int8_value_array = 
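// The CheckParquetColumns pass above validates, once per file, that every
// Parquet field type equals the corresponding table column type. That invariant
// is what lets the handlers below replace checked dynamic_pointer_cast with
// static_pointer_cast on the per-value hot path. An illustrative standalone
// sketch (the helper name is hypothetical, not from this patch):
#include <arrow/api.h>
int32_t ReadValidatedInt32(const std::shared_ptr<arrow::Array> &array, int64_t row) {
    // Safe only because the file schema was checked against the table schema
    // before any row was read; otherwise this cast would need a runtime check.
    return std::static_pointer_cast<arrow::Int32Array>(array)->Value(row);
}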
std::dynamic_pointer_cast(data_array->values()); - if (int8_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int8_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, int8_value_array, @@ -1457,13 +1467,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemInt16: { - if (data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto int16_value_array = std::dynamic_pointer_cast(data_array->values()); - if (int16_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int16_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, int16_value_array, @@ -1473,13 +1477,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemInt32: { - if (data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto int32_value_array = std::dynamic_pointer_cast(data_array->values()); - if (int32_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int32_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, int32_value_array, @@ -1489,13 +1487,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemInt64: { - if (data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto int64_value_array = std::dynamic_pointer_cast(data_array->values()); - if (int64_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int64_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, int64_value_array, @@ -1505,13 +1497,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemFloat16: { - if (data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto float16_value_array = std::dynamic_pointer_cast(data_array->values()); - if (float16_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto float16_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, float16_value_array, @@ -1521,13 +1507,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemBFloat16: { - if (data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto float_value_array = std::dynamic_pointer_cast(data_array->values()); - if (float_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto float_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, float_value_array, @@ -1537,13 +1517,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemFloat: { - if (data_array.get() == nullptr) { - 
RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto float_value_array = std::dynamic_pointer_cast(data_array->values()); - if (float_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto float_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, float_value_array, @@ -1553,13 +1527,7 @@ void ParquetSparseValueHandler(const SparseInfo *sparse_info, break; } case EmbeddingDataType::kElemDouble: { - if (data_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto double_value_array = std::dynamic_pointer_cast(data_array->values()); - if (double_value_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto double_value_array = std::static_pointer_cast(data_array->values()); ParquetSparseValueHandler(sparse_info, index_array, double_value_array, @@ -1595,10 +1563,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } const SizeT byte_size = dim / 8; auto embedding = MakeUnique(byte_size); - auto bool_array = std::dynamic_pointer_cast(list_array->values()); - if (bool_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto bool_array = std::static_pointer_cast(list_array->values()); auto *raw_u8_ptr = reinterpret_cast(embedding.get()); for (i64 j = start_offset; j < end_offset; ++j) { if (bool_array->Value(j)) { @@ -1610,10 +1575,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemUInt8: { auto embedding = MakeUnique(dim * sizeof(u8)); - auto uint8_array = std::dynamic_pointer_cast(list_array->values()); - if (uint8_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto uint8_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { const u8 value = uint8_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1622,10 +1584,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemInt8: { auto embedding = MakeUnique(dim * sizeof(i8)); - auto int8_array = std::dynamic_pointer_cast(list_array->values()); - if (int8_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int8_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { i8 value = int8_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1634,10 +1593,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemInt16: { auto embedding = MakeUnique(dim * sizeof(i16)); - auto int16_array = std::dynamic_pointer_cast(list_array->values()); - if (int16_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int16_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { i16 value = int16_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1646,10 +1602,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemInt32: { auto embedding = MakeUnique(dim * sizeof(i32)); - auto int32_array = std::dynamic_pointer_cast(list_array->values()); - if 
(int32_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int32_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { i32 value = int32_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1658,10 +1611,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemInt64: { auto embedding = MakeUnique(dim * sizeof(i64)); - auto int64_array = std::dynamic_pointer_cast(list_array->values()); - if (int64_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int64_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { i64 value = int64_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1670,10 +1620,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemFloat16: { auto embedding = MakeUnique(dim * sizeof(Float16T)); - auto float16_array = std::dynamic_pointer_cast(list_array->values()); - if (float16_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto float16_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { const u16 value = float16_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1682,10 +1629,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemBFloat16: { auto embedding = MakeUnique(dim * sizeof(BFloat16T)); - auto float_array = std::dynamic_pointer_cast(list_array->values()); - if (float_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto float_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { const float value = float_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1694,10 +1638,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemFloat: { auto embedding = MakeUnique(dim * sizeof(float)); - auto float_array = std::dynamic_pointer_cast(list_array->values()); - if (float_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto float_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { float value = float_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1706,10 +1647,7 @@ Pair, SizeT> ParquetEmbeddingHandler(SharedPtr list_ } case EmbeddingDataType::kElemDouble: { auto embedding = MakeUnique(dim * sizeof(double)); - auto double_array = std::dynamic_pointer_cast(list_array->values()); - if (double_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto double_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { double value = double_array->Value(j); reinterpret_cast(embedding.get())[j - start_offset] = value; @@ -1736,10 +1674,7 @@ ParquetTensorHandler(SharedPtr list_array, const EmbeddingInfo embedding_vec.push_back(std::move(data)); } } else { - auto tensor_ele_array = std::dynamic_pointer_cast(list_array->values()); - if (tensor_ele_array.get() == nullptr) { - 
RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto tensor_ele_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { auto data = ParquetEmbeddingHandler(tensor_ele_array, embedding_info, j); embedding_vec.push_back(std::move(data)); @@ -1753,78 +1688,78 @@ ParquetTensorHandler(SharedPtr list_array, const EmbeddingInfo void PhysicalImport::ParquetValueHandler(const SharedPtr &array, ColumnVector &column_vector, u64 value_idx) { switch (const auto column_data_logical_type = column_vector.data_type()->type(); column_data_logical_type) { case LogicalType::kBoolean: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); column_vector.AppendByPtr(reinterpret_cast(&value)); break; } case LogicalType::kTinyInt: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); column_vector.AppendByPtr(reinterpret_cast(&value)); break; } case LogicalType::kSmallInt: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); column_vector.AppendByPtr(reinterpret_cast(&value)); break; } case LogicalType::kInteger: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); column_vector.AppendByPtr(reinterpret_cast(&value)); break; } case LogicalType::kBigInt: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); column_vector.AppendByPtr(reinterpret_cast(&value)); break; } case LogicalType::kFloat16: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); const Float16T float16_value(value); column_vector.AppendByPtr(reinterpret_cast(&float16_value)); break; } case LogicalType::kBFloat16: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); const BFloat16T bfloat16_value(value); column_vector.AppendByPtr(reinterpret_cast(&bfloat16_value)); break; } case LogicalType::kFloat: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); column_vector.AppendByPtr(reinterpret_cast(&value)); break; } case LogicalType::kDouble: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); column_vector.AppendByPtr(reinterpret_cast(&value)); break; } case LogicalType::kDate: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); const DateT date_value(value); column_vector.AppendByPtr(reinterpret_cast(&date_value)); break; } case LogicalType::kTime: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); const TimeT time_value(value); column_vector.AppendByPtr(reinterpret_cast(&time_value)); break; } case LogicalType::kDateTime: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); const DateTimeT datetime_value(value); column_vector.AppendByPtr(reinterpret_cast(&datetime_value)); break; } case 
LogicalType::kTimestamp: { - auto value = std::dynamic_pointer_cast(array)->Value(value_idx); + auto value = std::static_pointer_cast(array)->Value(value_idx); const TimestampT timestamp_value(value); column_vector.AppendByPtr(reinterpret_cast(&timestamp_value)); break; } case LogicalType::kVarchar: { - String value_str = std::dynamic_pointer_cast(array)->GetString(value_idx); + String value_str = std::static_pointer_cast(array)->GetString(value_idx); std::string_view value(value_str); column_vector.AppendByStringView(value); break; @@ -1835,10 +1770,7 @@ void PhysicalImport::ParquetValueHandler(const SharedPtr &array, C auto [data, size] = ParquetEmbeddingHandler(fixed_list_array, embedding_info, value_idx); column_vector.AppendByPtr(reinterpret_cast(data.get())); } else { - auto list_array = std::dynamic_pointer_cast(array); - if (list_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto list_array = std::static_pointer_cast(array); auto [data, size] = ParquetEmbeddingHandler(list_array, embedding_info, value_idx); column_vector.AppendByPtr(reinterpret_cast(data.get())); } @@ -1847,26 +1779,17 @@ case LogicalType::kSparse: { const auto *sparse_info = static_cast(column_vector.data_type()->type_info().get()); - auto struct_array = std::dynamic_pointer_cast(array); - if (struct_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto struct_array = std::static_pointer_cast(array); auto index_raw_array = struct_array->GetFieldByName("index"); - if (index_raw_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } - auto index_array = std::dynamic_pointer_cast(index_raw_array); - if (index_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto index_array = std::static_pointer_cast(index_raw_array); i64 start_offset = index_array->value_offset(value_idx); i64 end_offset = index_array->value_offset(value_idx + 1); SharedPtr data_array; auto value_raw_array = struct_array->GetFieldByName("value"); if (value_raw_array.get() != nullptr) { - data_array = std::dynamic_pointer_cast(value_raw_array); + data_array = std::static_pointer_cast(value_raw_array); i64 start_offset1 = index_array->value_offset(value_idx); i64 end_offset1 = index_array->value_offset(value_idx + 1); @@ -1877,10 +1800,7 @@ void PhysicalImport::ParquetValueHandler(const SharedPtr &array, C switch (sparse_info->IndexType()) { case EmbeddingDataType::kElemInt8: { - auto int8_index_array = std::dynamic_pointer_cast(index_array->values()); - if (int8_index_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int8_index_array = std::static_pointer_cast(index_array->values()); ParquetSparseValueHandler(sparse_info, int8_index_array, data_array, @@ -1890,10 +1810,7 @@ void PhysicalImport::ParquetValueHandler(const SharedPtr &array, C break; } case EmbeddingDataType::kElemInt16: { - auto int16_index_array = std::dynamic_pointer_cast(index_array->values()); - if (int16_index_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int16_index_array = std::static_pointer_cast(index_array->values()); ParquetSparseValueHandler(sparse_info, int16_index_array, data_array, @@ -1903,10 +1820,7 @@ void 
PhysicalImport::ParquetValueHandler(const SharedPtr &array, C break; } case EmbeddingDataType::kElemInt32: { - auto int32_index_array = std::dynamic_pointer_cast(index_array->values()); - if (int32_index_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int32_index_array = std::static_pointer_cast(index_array->values()); ParquetSparseValueHandler(sparse_info, int32_index_array, data_array, @@ -1916,10 +1830,7 @@ void PhysicalImport::ParquetValueHandler(const SharedPtr &array, C break; } case EmbeddingDataType::kElemInt64: { - auto int64_index_array = std::dynamic_pointer_cast(index_array->values()); - if (int64_index_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto int64_index_array = std::static_pointer_cast(index_array->values()); ParquetSparseValueHandler(sparse_info, int64_index_array, data_array, @@ -1937,10 +1848,7 @@ void PhysicalImport::ParquetValueHandler(const SharedPtr &array, C case LogicalType::kMultiVector: case LogicalType::kTensor: { auto embedding_info = std::static_pointer_cast(column_vector.data_type()->type_info()); - auto list_array = std::dynamic_pointer_cast(array); - if (list_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto list_array = std::static_pointer_cast(array); Vector, SizeT>> embedding_vec = ParquetTensorHandler(list_array, embedding_info.get(), value_idx); Vector> embedding_data; for (const auto &[data_ptr, data_bytes] : embedding_vec) { @@ -1958,16 +1866,10 @@ void PhysicalImport::ParquetValueHandler(const SharedPtr &array, C case LogicalType::kTensorArray: { const auto embedding_info = std::static_pointer_cast(column_vector.data_type()->type_info()); auto value = Value::MakeTensorArray(embedding_info); - auto list_array = std::dynamic_pointer_cast(array); - if (list_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto list_array = std::static_pointer_cast(array); i64 start_offset = list_array->value_offset(value_idx); i64 end_offset = list_array->value_offset(value_idx + 1); - auto tensor_array = std::dynamic_pointer_cast(list_array->values()); - if (tensor_array.get() == nullptr) { - RecoverableError(Status::ImportFileFormatError("Invalid parquet file format.")); - } + auto tensor_array = std::static_pointer_cast(list_array->values()); for (i64 j = start_offset; j < end_offset; ++j) { Vector, SizeT>> embedding_vec = ParquetTensorHandler(tensor_array, embedding_info.get(), j); Vector> embedding_data; diff --git a/src/executor/operator/physical_limit.cpp b/src/executor/operator/physical_limit.cpp index 62a39471cb..ccd928d271 100644 --- a/src/executor/operator/physical_limit.cpp +++ b/src/executor/operator/physical_limit.cpp @@ -37,6 +37,8 @@ import logger; namespace infinity { +void LimitCounter::AddHitsCount(u64 row_count) { total_hits_count_ += row_count; } + SizeT AtomicCounter::Offset(SizeT row_count) { auto success = false; SizeT result = 0; @@ -141,9 +143,10 @@ PhysicalLimit::PhysicalLimit(u64 id, UniquePtr left, SharedPtr limit_expr, SharedPtr offset_expr, - SharedPtr> load_metas) + SharedPtr> load_metas, + bool total_hits_count_flag) : PhysicalOperator(PhysicalOperatorType::kLimit, std::move(left), nullptr, id, load_metas), limit_expr_(std::move(limit_expr)), - offset_expr_(std::move(offset_expr)) { + offset_expr_(std::move(offset_expr)), 
total_hits_count_flag_(total_hits_count_flag) { i64 offset = 0; i64 limit = (static_pointer_cast(limit_expr_))->GetValue().value_.big_int; @@ -162,10 +165,11 @@ void PhysicalLimit::Init() {} bool PhysicalLimit::Execute(QueryContext *query_context, const Vector> &input_blocks, Vector> &output_blocks, - LimitCounter *counter) { + LimitCounter *counter, + bool total_hits_count_flag) { SizeT input_row_count = 0; - for (SizeT block_id = 0; block_id < input_blocks.size(); block_id++) { + for (SizeT block_id = 0; block_id < input_blocks.size(); ++block_id) { input_row_count += input_blocks[block_id]->row_count(); } @@ -191,7 +195,7 @@ bool PhysicalLimit::Execute(QueryContext *query_context, } } - for (SizeT block_id = block_start_idx; block_id < input_blocks.size(); block_id++) { + for (SizeT block_id = block_start_idx; block_id < input_blocks.size(); ++block_id) { auto &input_block = input_blocks[block_id]; auto row_count = input_block->row_count(); if (row_count == 0) { @@ -216,14 +220,27 @@ bool PhysicalLimit::Execute(QueryContext *query_context, } } + if (total_hits_count_flag) { + counter->AddHitsCount(input_row_count); + } + return true; } bool PhysicalLimit::Execute(QueryContext *query_context, OperatorState *operator_state) { - auto result = Execute(query_context, operator_state->prev_op_state_->data_block_array_, operator_state->data_block_array_, counter_.get()); + auto result = Execute(query_context, + operator_state->prev_op_state_->data_block_array_, + operator_state->data_block_array_, + counter_.get(), + total_hits_count_flag_); operator_state->prev_op_state_->data_block_array_.clear(); if (counter_->IsLimitOver() || operator_state->prev_op_state_->Complete()) { + if (total_hits_count_flag_) { + LimitOperatorState *limit_operator_state = (LimitOperatorState *)operator_state; + limit_operator_state->total_hits_count_flag_ = true; + limit_operator_state->total_hits_count_ = counter_->TotalHitsCount(); + } operator_state->SetComplete(); } return result; diff --git a/src/executor/operator/physical_limit.cppm b/src/executor/operator/physical_limit.cppm index 628601902b..f8eff547f8 100644 --- a/src/executor/operator/physical_limit.cppm +++ b/src/executor/operator/physical_limit.cppm @@ -45,6 +45,13 @@ public: virtual SizeT Limit(SizeT row_count) = 0; virtual bool IsLimitOver() = 0; + + SizeT TotalHitsCount() const { return total_hits_count_; } + + void AddHitsCount(u64 row_count); + +private: + Atomic total_hits_count_{}; }; export class AtomicCounter final : public LimitCounter { @@ -87,7 +94,8 @@ public: UniquePtr left, SharedPtr limit_expr, SharedPtr offset_expr, - SharedPtr> load_metas); + SharedPtr> load_metas, + bool total_hits_count_flag); ~PhysicalLimit() final = default; @@ -96,7 +104,8 @@ public: static bool Execute(QueryContext *query_context, const Vector> &input_blocks, Vector> &output_blocks, - LimitCounter *counter); + LimitCounter *counter, + bool total_hits_count_flag); bool Execute(QueryContext *query_context, OperatorState *operator_state) final; @@ -115,6 +124,7 @@ private: SharedPtr offset_expr_{}; UniquePtr counter_{}; + bool total_hits_count_flag_{}; }; } // namespace infinity diff --git a/src/executor/operator/physical_match.cpp b/src/executor/operator/physical_match.cpp index 151a3deedb..6173cd1c5e 100644 --- a/src/executor/operator/physical_match.cpp +++ b/src/executor/operator/physical_match.cpp @@ -74,213 +74,191 @@ import score_threshold_iterator; namespace infinity { -void ASSERT_FLOAT_EQ(float bar, u32 i, float a, float b) { - float diff_percent = 
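// Sketch of the hit-count bookkeeping introduced above: LimitCounter keeps an
// atomic running total so parallel limit tasks can accumulate their input row
// counts without a lock, and the flag keeps the common no-count path free.
// Standalone, simplified version (the flag check is folded into the counter
// here; in the patch it lives in PhysicalLimit::Execute):
#include <atomic>
#include <cstddef>
struct HitsCounterSketch {
    std::atomic<std::size_t> total_hits_count{0};
    void AddHitsCount(std::size_t row_count, bool total_hits_count_flag) {
        if (total_hits_count_flag) {
            total_hits_count.fetch_add(row_count, std::memory_order_relaxed);
        }
    }
    std::size_t TotalHitsCount() const { return total_hits_count.load(std::memory_order_relaxed); }
};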
std::abs(a - b) / std::max(std::abs(a), std::abs(b)); +struct QueryIterators { + UniquePtr query_iter{}; + // for comparison + UniquePtr bmw_iter{}; + UniquePtr batch_iter{}; + UniquePtr naive_iter{}; +}; + +QueryIterators CreateQueryIterators(QueryBuilder &query_builder, + FullTextQueryContext &context, + const EarlyTermAlgo early_term_algo, + const float begin_threshold, + const float score_threshold) { + auto get_iter = [&query_builder, &context, begin_threshold, score_threshold, function_name = __func__](const EarlyTermAlgo algo) { + switch (algo) { + case EarlyTermAlgo::kAuto: + case EarlyTermAlgo::kNaive: + case EarlyTermAlgo::kBatch: + case EarlyTermAlgo::kBMW: { + // ok + break; + } + case EarlyTermAlgo::kCompare: { + UnrecoverableError(std::format("{}: Wrong parameter!", function_name)); + break; + } + } + context.early_term_algo_ = algo; + auto iter = query_builder.CreateSearch(context); + // iter is nullptr if fulltext index is present but there's no data + if (iter) { + iter->UpdateScoreThreshold(std::max(begin_threshold, score_threshold)); + if (score_threshold > 0.0f) { + auto new_iter = MakeUnique(std::move(iter), score_threshold); + iter = std::move(new_iter); + } + } + return iter; + }; + QueryIterators query_iterators; + switch (early_term_algo) { + case EarlyTermAlgo::kAuto: + case EarlyTermAlgo::kNaive: + case EarlyTermAlgo::kBatch: + case EarlyTermAlgo::kBMW: { + query_iterators.query_iter = get_iter(early_term_algo); + break; + } + case EarlyTermAlgo::kCompare: { + query_iterators.bmw_iter = get_iter(EarlyTermAlgo::kBMW); + query_iterators.batch_iter = get_iter(EarlyTermAlgo::kBatch); + query_iterators.naive_iter = get_iter(EarlyTermAlgo::kNaive); + break; + } + } + return query_iterators; +} + +void ASSERT_FLOAT_EQ(const float bar, const u32 i, const float a, const float b) { + const float diff_percent = std::abs(a - b) / std::max(std::abs(a), std::abs(b)); if (diff_percent > bar) { OStringStream oss; - oss << "result mismatch at " << i << " : a: " << a << ", b: " << b << ", diff_percent: " << diff_percent << std::endl; - Status status = Status::SyntaxError("Debug Info: " + std::move(oss).str()); - RecoverableError(status); + oss << "Debug Info: result mismatch at " << i << " : a: " << a << ", b: " << b << ", diff_percent: " << diff_percent << std::endl; + LOG_ERROR(std::move(oss).str()); } } -void ExecuteFTSearch(UniquePtr &et_iter, FullTextScoreResultHeap &result_heap, u32 &blockmax_loop_cnt) { - // et_iter is nullptr if fulltext index is present but there's no data - if (et_iter == nullptr) { - LOG_DEBUG(fmt::format("et_iter is nullptr")); - return; +u32 ExecuteFTSearch(DocIterator *iter, FullTextScoreResultHeap &result_heap) { + u32 loop_cnt = 0; + // iter is nullptr if fulltext index is present but there's no data + if (!iter) { + LOG_DEBUG("iter is nullptr"); + return loop_cnt; } while (true) { - ++blockmax_loop_cnt; - bool ok = et_iter->Next(); - if (!ok) [[unlikely]] { + ++loop_cnt; + if (!(iter->Next())) [[unlikely]] { break; } - RowID id = et_iter->DocID(); - float et_score = et_iter->Score(); - if (SHOULD_LOG_DEBUG()) { - OStringStream oss; - et_iter->PrintTree(oss, "", true); - String msg = fmt::format("Found candidate doc_id {} score {}\n", id.ToUint64(), et_score); - msg += oss.str(); - LOG_DEBUG(msg); - } - if (result_heap.AddResult(et_score, id)) { + if (result_heap.AddResult(iter->Score(), iter->DocID())) { // update threshold - et_iter->UpdateScoreThreshold(result_heap.GetScoreThreshold()); + 
iter->UpdateScoreThreshold(result_heap.GetScoreThreshold()); } - if (blockmax_loop_cnt % 10 == 0) { - LOG_DEBUG(fmt::format("ExecuteFTSearch has evaluated {} candidates", blockmax_loop_cnt)); + } + return loop_cnt; +} + +auto ExecuteFTSearch(const QueryIterators &query_iterators, const u32 topn) { + struct FTSearchResultType { + u32 result_count{}; + UniquePtr score_result{}; + UniquePtr row_id_result{}; + }; + auto GetFTSearchResult = [topn](const UniquePtr &iter) { + FTSearchResultType result; + result.score_result = MakeUniqueForOverwrite(topn); + result.row_id_result = MakeUniqueForOverwrite(topn); + FullTextScoreResultHeap result_heap(topn, result.score_result.get(), result.row_id_result.get()); + [[maybe_unused]] const auto loop_cnt = ExecuteFTSearch(iter.get(), result_heap); + result_heap.Sort(); + result.result_count = result_heap.GetResultSize(); + return result; + }; + if (query_iterators.query_iter) [[likely]] { + return GetFTSearchResult(query_iterators.query_iter); + } + // compare + auto bmw_result = GetFTSearchResult(query_iterators.bmw_iter); + FTSearchResultType naive_result; + { + naive_result.score_result = MakeUniqueForOverwrite(topn); + naive_result.row_id_result = MakeUniqueForOverwrite(topn); + FullTextScoreResultHeap result_heap(topn, naive_result.score_result.get(), naive_result.row_id_result.get()); + if (query_iterators.batch_iter || query_iterators.naive_iter) { + if (!query_iterators.batch_iter || !query_iterators.naive_iter) { + UnrecoverableError("batch_iter and naive_iter should be both nullptr or both not nullptr"); + } + while (true) { + const auto batch_next = query_iterators.batch_iter->Next(); + const auto naive_next = query_iterators.naive_iter->Next(); + if (!batch_next && !naive_next) { + break; + } + if (!batch_next || !naive_next) { + LOG_ERROR("Conflict: batch_next and naive_next should be both true or both false"); + break; + } + const auto batch_score = query_iterators.batch_iter->Score(); + const auto naive_score = query_iterators.naive_iter->Score(); + const auto batch_doc_id = query_iterators.batch_iter->DocID(); + const auto naive_doc_id = query_iterators.naive_iter->DocID(); + const auto batch_match_count = query_iterators.batch_iter->MatchCount(); + const auto naive_match_count = query_iterators.naive_iter->MatchCount(); + if (batch_doc_id != naive_doc_id) { + LOG_ERROR(fmt::format("doc_id mismatch between batch and naive: {} vs {}", batch_doc_id.ToString(), naive_doc_id.ToString())); + } + if (batch_match_count != naive_match_count) { + LOG_ERROR(fmt::format("match count mismatch between batch and naive: {} vs {}", batch_match_count, naive_match_count)); + } + if (std::abs(batch_score - naive_score) / std::max(std::abs(batch_score), std::abs(naive_score)) > 1e-4) { + LOG_ERROR(fmt::format("score mismatch between batch and naive: {} vs {}", batch_score, naive_score)); + } + result_heap.AddResult(naive_score, naive_doc_id); + } } + result_heap.Sort(); + naive_result.result_count = result_heap.GetResultSize(); + } + if (bmw_result.result_count != naive_result.result_count) { + LOG_ERROR(fmt::format("result count mismatch between bmw and naive: {} vs {}", bmw_result.result_count, naive_result.result_count)); } + for (u32 i = 0; i < std::min(bmw_result.result_count, naive_result.result_count); ++i) { + ASSERT_FLOAT_EQ(1e-4, i, bmw_result.score_result[i], naive_result.score_result[i]); + } + return bmw_result; } -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored 
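// The loop above is the reusable top-k pattern behind all the iterator kinds:
// whenever the heap accepts a hit, its current k-th best score becomes the new
// pruning threshold fed back to the iterator, which is what lets the BMW/batch
// variants skip whole blocks. A generic sketch with illustrative type
// parameters standing in for DocIterator and FullTextScoreResultHeap:
#include <cstdint>
template <typename Iter, typename Heap>
uint32_t CollectTopK(Iter &iter, Heap &heap) {
    uint32_t loop_cnt = 0;
    while (iter.Next()) {
        ++loop_cnt;
        if (heap.AddResult(iter.Score(), iter.DocID())) {
            // The heap is full and its floor rose: raise the iterator's bar too.
            iter.UpdateScoreThreshold(heap.GetScoreThreshold());
        }
    }
    return loop_cnt;
}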
"-Wunused-but-set-variable" -bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, OperatorState *operator_state) { +bool PhysicalMatch::ExecuteInner(QueryContext *query_context, OperatorState *operator_state) { + if (!common_query_filter_) { + UnrecoverableError(fmt::format("{}: common_query_filter_ is nullptr", __func__)); + } using TimeDurationType = std::chrono::duration; - auto execute_start_time = std::chrono::high_resolution_clock::now(); + const auto execute_start_time = std::chrono::high_resolution_clock::now(); // 1. build QueryNode tree - // 1.1 populate column2analyzer - // Txn *txn = query_context->GetTxn(); QueryBuilder query_builder(base_table_ref_.get()); query_builder.Init(index_reader_); - auto finish_init_query_builder_time = std::chrono::high_resolution_clock::now(); - TimeDurationType query_builder_init_duration = finish_init_query_builder_time - execute_start_time; - LOG_DEBUG(fmt::format("PhysicalMatch 0: Init QueryBuilder time: {} ms", query_builder_init_duration.count())); - - // 1.2 parse options into map, populate default_field - bool use_ordinary_iter = false; - bool use_block_max_iter = false; - - switch (early_term_algo_) { - case EarlyTermAlgo::kBMW: { - use_block_max_iter = true; - break; - } - case EarlyTermAlgo::kNaive: { - use_ordinary_iter = true; - break; - } - case EarlyTermAlgo::kCompare: { - use_ordinary_iter = true; - use_block_max_iter = true; - break; - } - default: { - use_block_max_iter = true; - break; - } - } - - auto finish_parse_query_tree_time = std::chrono::high_resolution_clock::now(); - TimeDurationType parse_query_tree_duration = finish_parse_query_tree_time - finish_init_query_builder_time; - LOG_DEBUG(fmt::format("PhysicalMatch 1: Parse QueryNode tree time: {} ms", parse_query_tree_duration.count())); + const auto finish_init_query_builder_time = std::chrono::high_resolution_clock::now(); + LOG_DEBUG(fmt::format("PhysicalMatch 1: Init QueryBuilder time: {} ms", + static_cast(finish_init_query_builder_time - execute_start_time).count())); // 2 build query iterator - // result - FullTextQueryContext full_text_query_context(ft_similarity_, minimum_should_match_option_, match_expr_->index_names_); - u32 result_count = 0; - const float *score_result = nullptr; - const RowID *row_id_result = nullptr; - // for comparison - UniquePtr et_iter; - UniquePtr doc_iterator; - u32 ordinary_loop_cnt = 0; - u32 blockmax_loop_cnt = 0; - u32 ordinary_result_count = 0; - u32 blockmax_result_count = 0; - UniquePtr ordinary_score_result; - UniquePtr ordinary_row_id_result; - UniquePtr blockmax_score_result; - UniquePtr blockmax_row_id_result; - TimeDurationType ordinary_duration = {}; - TimeDurationType blockmax_duration = {}; - assert(common_query_filter_); + FullTextQueryContext full_text_query_context(ft_similarity_, minimum_should_match_option_, top_n_, match_expr_->index_names_); full_text_query_context.query_tree_ = MakeUnique(common_query_filter_.get(), std::move(query_tree_)); - - if (use_block_max_iter) { - full_text_query_context.early_term_algo_ = EarlyTermAlgo::kBMW; - et_iter = query_builder.CreateSearch(full_text_query_context); - // et_iter is nullptr if fulltext index is present but there's no data - if (et_iter != nullptr) { - et_iter->UpdateScoreThreshold(std::max(begin_threshold_, score_threshold_)); - if (score_threshold_ > 0.0f) { - auto new_et_iter = MakeUnique(std::move(et_iter), score_threshold_); - et_iter = std::move(new_et_iter); - } - } - } - if (use_ordinary_iter) { - full_text_query_context.early_term_algo_ = 
EarlyTermAlgo::kNaive; - doc_iterator = query_builder.CreateSearch(full_text_query_context); - if (doc_iterator && score_threshold_ > 0.0f) { - auto new_doc_iter = MakeUnique(std::move(doc_iterator), score_threshold_); - doc_iterator = std::move(new_doc_iter); - } - } + const auto query_iterators = CreateQueryIterators(query_builder, full_text_query_context, early_term_algo_, begin_threshold_, score_threshold_); + const auto finish_query_builder_time = std::chrono::high_resolution_clock::now(); + LOG_DEBUG(fmt::format("PhysicalMatch Part 2: Build Query iterator time: {} ms", + static_cast(finish_query_builder_time - finish_init_query_builder_time).count())); // 3 full text search - - auto finish_query_builder_time = std::chrono::high_resolution_clock::now(); - TimeDurationType query_builder_duration = finish_query_builder_time - finish_parse_query_tree_time; - LOG_DEBUG(fmt::format("PhysicalMatch Part 2: Build Query iterator time: {} ms", query_builder_duration.count())); - if (use_block_max_iter) { - blockmax_score_result = MakeUniqueForOverwrite(top_n_); - blockmax_row_id_result = MakeUniqueForOverwrite(top_n_); - FullTextScoreResultHeap result_heap(top_n_, blockmax_score_result.get(), blockmax_row_id_result.get()); -#ifdef INFINITY_DEBUG - auto blockmax_begin_ts = std::chrono::high_resolution_clock::now(); -#endif - ExecuteFTSearch(et_iter, result_heap, blockmax_loop_cnt); - result_heap.Sort(); - blockmax_result_count = result_heap.GetResultSize(); -#ifdef INFINITY_DEBUG - auto blockmax_end_ts = std::chrono::high_resolution_clock::now(); - blockmax_duration = blockmax_end_ts - blockmax_begin_ts; -#endif - } - if (use_ordinary_iter) { - ordinary_score_result = MakeUniqueForOverwrite(top_n_); - ordinary_row_id_result = MakeUniqueForOverwrite(top_n_); - FullTextScoreResultHeap result_heap(top_n_, ordinary_score_result.get(), ordinary_row_id_result.get()); -#ifdef INFINITY_DEBUG - auto ordinary_begin_ts = std::chrono::high_resolution_clock::now(); -#endif - ExecuteFTSearch(doc_iterator, result_heap, ordinary_loop_cnt); - result_heap.Sort(); - ordinary_result_count = result_heap.GetResultSize(); -#ifdef INFINITY_DEBUG - auto ordinary_end_ts = std::chrono::high_resolution_clock::now(); - ordinary_duration = ordinary_end_ts - ordinary_begin_ts; -#endif - } - if (use_block_max_iter) { - result_count = blockmax_result_count; - score_result = blockmax_score_result.get(); - row_id_result = blockmax_row_id_result.get(); - } else { - result_count = ordinary_result_count; - score_result = ordinary_score_result.get(); - row_id_result = ordinary_row_id_result.get(); - } + const auto [result_count, score_result, row_id_result] = ExecuteFTSearch(query_iterators, top_n_); auto finish_query_time = std::chrono::high_resolution_clock::now(); - TimeDurationType query_duration = finish_query_time - finish_query_builder_time; - LOG_DEBUG(fmt::format("PhysicalMatch Part 3: Full text search time: {} ms", query_duration.count())); -#ifdef INFINITY_DEBUG - { - OStringStream stat_info; - stat_info << "Full text search stat:\n"; - if (use_block_max_iter) { - stat_info << "blockmax_duration: " << blockmax_duration << std::endl; - stat_info << "blockmax_loop_cnt: " << blockmax_loop_cnt << std::endl; - } - if (use_ordinary_iter) { - stat_info << "ordinary_duration: " << ordinary_duration << std::endl; - stat_info << "ordinary_loop_cnt: " << ordinary_loop_cnt << std::endl; - } - LOG_DEBUG(std::move(stat_info).str()); - } - if (use_ordinary_iter and use_block_max_iter) { - OStringStream compare_info; - compare_info << 
"Compare ordinary and blockmax:\n"; - compare_info << "duration ratio: " << blockmax_duration.count() / ordinary_duration.count() << std::endl; - compare_info << "loop count ratio: " << (static_cast(blockmax_loop_cnt) / ordinary_loop_cnt) << std::endl; - LOG_DEBUG(std::move(compare_info).str()); - if (ordinary_result_count != blockmax_result_count) { - Status status = Status::SyntaxError("Debug Info: result count mismatch!"); - RecoverableError(status); - } - for (u32 i = 0; i < result_count; ++i) { - ASSERT_FLOAT_EQ(1e-4, i, ordinary_score_result[i], blockmax_score_result[i]); - } - } -#endif + LOG_DEBUG(fmt::format("PhysicalMatch Part 3: Full text search time: {} ms", + static_cast(finish_query_time - finish_query_builder_time).count())); LOG_DEBUG(fmt::format("Full text search result count: {}", result_count)); - auto begin_output_time = std::chrono::high_resolution_clock::now(); - TimeDurationType output_info_duration = begin_output_time - finish_query_time; - LOG_DEBUG(fmt::format("PhysicalMatch Part 4: Output stat info time: {} ms", output_info_duration.count())); + // 4 populate result DataBlock // 4.1 prepare first output_data_block auto &output_data_blocks = operator_state->data_block_array_; @@ -294,6 +272,8 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator append_data_block(); // 4.2 output { + OutputToDataBlockHelper output_to_data_block_helper; + u32 output_block_idx = output_data_blocks.size() - 1; Vector &column_ids = base_table_ref_->column_ids_; SizeT column_n = column_ids.size(); u32 block_capacity = DEFAULT_BLOCK_CAPACITY; @@ -305,6 +285,7 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator output_block_ptr->Finalize(); append_data_block(); output_block_ptr = output_data_blocks.back().get(); + ++output_block_idx; output_block_row_id = 0; } const RowID &row_id = row_id_result[output_id]; @@ -312,13 +293,11 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator u32 segment_offset = row_id.segment_offset_; u16 block_id = segment_offset / DEFAULT_BLOCK_CAPACITY; u16 block_offset = segment_offset % DEFAULT_BLOCK_CAPACITY; - BlockEntry *block_entry = base_table_ref_->block_index_->GetBlockEntry(segment_id, block_id); - assert(block_entry != nullptr); SizeT column_id = 0; - for (; column_id < column_n; ++column_id) { - ColumnVector column_vector = block_entry->GetConstColumnVector(query_context->storage()->buffer_manager(), column_ids[column_id]); - output_block_ptr->column_vectors[column_id]->AppendWith(column_vector, block_offset, 1); + output_to_data_block_helper + .AddOutputJobInfo(segment_id, block_id, column_ids[column_id], block_offset, output_block_idx, column_id, output_block_row_id); + output_block_ptr->column_vectors[column_id]->Finalize(output_block_ptr->column_vectors[column_id]->Size() + 1); } Value v = Value::MakeFloat(score_result[output_id]); output_block_ptr->column_vectors[column_id++]->AppendValue(v); @@ -326,19 +305,19 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator ++output_block_row_id; } output_block_ptr->Finalize(); + output_to_data_block_helper.OutputToDataBlock(query_context->storage()->buffer_manager(), + base_table_ref_->block_index_.get(), + output_data_blocks); } - operator_state->SetComplete(); ResultCacheManager *cache_mgr = query_context->storage()->result_cache_manager(); if (cache_result_ && cache_mgr != nullptr) { AddCache(query_context, cache_mgr, output_data_blocks); } - auto finish_output_time = 
std::chrono::high_resolution_clock::now(); - TimeDurationType output_duration = finish_output_time - begin_output_time; - LOG_DEBUG(fmt::format("PhysicalMatch Part 5: Output data time: {} ms", output_duration.count())); + LOG_DEBUG(fmt::format("PhysicalMatch Part 4: Output data time: {} ms", + static_cast(std::chrono::high_resolution_clock::now() - finish_query_time).count())); return true; } -#pragma clang diagnostic pop PhysicalMatch::PhysicalMatch(const u64 id, SharedPtr base_table_ref, @@ -379,7 +358,7 @@ bool PhysicalMatch::Execute(QueryContext *query_context, OperatorState *operator return true; } } - bool return_value = ExecuteInnerHomebrewed(query_context, operator_state); + bool return_value = ExecuteInner(query_context, operator_state); auto end_time = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end_time - start_time; LOG_DEBUG(fmt::format("PhysicalMatch Execute time: {} ms", duration.count())); diff --git a/src/executor/operator/physical_match.cppm b/src/executor/operator/physical_match.cppm index b49bd0c5e7..3a2fedae1f 100644 --- a/src/executor/operator/physical_match.cppm +++ b/src/executor/operator/physical_match.cppm @@ -106,7 +106,7 @@ private: IndexReader index_reader_; UniquePtr query_tree_; float begin_threshold_; - EarlyTermAlgo early_term_algo_{EarlyTermAlgo::kBMW}; + EarlyTermAlgo early_term_algo_{EarlyTermAlgo::kAuto}; u32 top_n_{1}; // for filter @@ -117,7 +117,6 @@ private: FulltextSimilarity ft_similarity_{FulltextSimilarity::kBM25}; bool ExecuteInner(QueryContext *query_context, OperatorState *operator_state); - bool ExecuteInnerHomebrewed(QueryContext *query_context, OperatorState *operator_state); }; } // namespace infinity diff --git a/src/executor/operator/physical_merge_limit.cpp b/src/executor/operator/physical_merge_limit.cpp index cc5730094f..6a3069e0e2 100644 --- a/src/executor/operator/physical_merge_limit.cpp +++ b/src/executor/operator/physical_merge_limit.cpp @@ -51,7 +51,11 @@ bool PhysicalMergeLimit::Execute(QueryContext *query_context, OperatorState *ope if (limit_op_state->input_data_blocks_.empty()) { return false; } - auto result = PhysicalLimit::Execute(query_context, limit_op_state->input_data_blocks_, limit_op_state->data_block_array_, counter_.get()); + auto result = PhysicalLimit::Execute(query_context, + limit_op_state->input_data_blocks_, + limit_op_state->data_block_array_, + counter_.get(), + false); if (counter_->IsLimitOver() || limit_op_state->input_complete_) { limit_op_state->input_complete_ = true; diff --git a/src/executor/operator/physical_merge_limit.cppm b/src/executor/operator/physical_merge_limit.cppm index 2abd981359..122310fc1a 100644 --- a/src/executor/operator/physical_merge_limit.cppm +++ b/src/executor/operator/physical_merge_limit.cppm @@ -61,6 +61,7 @@ private: SharedPtr offset_expr_{}; UniquePtr counter_{}; + bool total_hits_count_flag_{}; }; } // namespace infinity diff --git a/src/executor/operator/physical_project.cpp b/src/executor/operator/physical_project.cpp index c5e04a435c..b8450aaeb3 100644 --- a/src/executor/operator/physical_project.cpp +++ b/src/executor/operator/physical_project.cpp @@ -50,9 +50,9 @@ void PhysicalProject::Init() { bool PhysicalProject::Execute(QueryContext *, OperatorState *operator_state) { auto *project_operator_state = static_cast(operator_state); - if(project_operator_state->empty_source_) { + if (project_operator_state->empty_source_) { project_operator_state->data_block_array_.emplace_back(DataBlock::MakeUniquePtr()); - DataBlock* 
output_data_block = project_operator_state->data_block_array_.back().get(); + DataBlock *output_data_block = project_operator_state->data_block_array_.back().get(); output_data_block->Init(*GetOutputTypes()); ExpressionEvaluator evaluator; @@ -77,14 +77,14 @@ bool PhysicalProject::Execute(QueryContext *, OperatorState *operator_state) { output_data_block->Finalize(); project_operator_state->SetComplete(); } else { - OperatorState* prev_op_state = operator_state->prev_op_state_; + OperatorState *prev_op_state = operator_state->prev_op_state_; SizeT input_block_count = prev_op_state->data_block_array_.size(); - for(SizeT block_idx = 0; block_idx < input_block_count; ++ block_idx) { - DataBlock* input_data_block = prev_op_state->data_block_array_[block_idx].get(); + for (SizeT block_idx = 0; block_idx < input_block_count; ++block_idx) { + DataBlock *input_data_block = prev_op_state->data_block_array_[block_idx].get(); project_operator_state->data_block_array_.emplace_back(DataBlock::MakeUniquePtr()); - DataBlock* output_data_block = project_operator_state->data_block_array_.back().get(); + DataBlock *output_data_block = project_operator_state->data_block_array_.back().get(); output_data_block->Init(*GetOutputTypes()); ExpressionEvaluator evaluator; @@ -139,7 +139,6 @@ bool PhysicalProject::Execute(QueryContext *, OperatorState *operator_state) { output_data_block->Finalize(); } - // if (prev_op_state->Complete() && !prev_op_state->data_block_->Finalized()) { // project_operator_state->data_block_->Finalize(); // project_operator_state->SetComplete(); @@ -148,6 +147,10 @@ bool PhysicalProject::Execute(QueryContext *, OperatorState *operator_state) { prev_op_state->data_block_array_.clear(); if (prev_op_state->Complete()) { + if(prev_op_state->total_hits_count_flag_) { + project_operator_state->total_hits_count_flag_ = true; + project_operator_state->total_hits_count_ = prev_op_state->total_hits_count_; + } project_operator_state->SetComplete(); } } diff --git a/src/executor/operator/physical_project.cppm b/src/executor/operator/physical_project.cppm index 3d27b869e0..738d70a5b5 100644 --- a/src/executor/operator/physical_project.cppm +++ b/src/executor/operator/physical_project.cppm @@ -52,9 +52,7 @@ public: SharedPtr>> GetOutputTypes() const final; - SizeT TaskletCount() override { - return left_->TaskletCount(); - } + SizeT TaskletCount() override { return left_->TaskletCount(); } Vector> expressions_{}; diff --git a/src/executor/operator/physical_scan/physical_match_tensor_scan.cpp b/src/executor/operator/physical_scan/physical_match_tensor_scan.cpp index 806ff9acec..8fbede0534 100644 --- a/src/executor/operator/physical_scan/physical_match_tensor_scan.cpp +++ b/src/executor/operator/physical_scan/physical_match_tensor_scan.cpp @@ -431,55 +431,14 @@ void PhysicalMatchTensorScan::ExecuteInner(QueryContext *query_context, MatchTen } else { // all task Complete const u32 result_n = function_data.End(); - const auto output_type_ptr = GetOutputTypes(); - { - // prepare output data block - const u32 total_data_row_count = result_n; - u32 row_idx = 0; - do { - auto data_block = DataBlock::MakeUniquePtr(); - data_block->Init(*output_type_ptr); - operator_state->data_block_array_.emplace_back(std::move(data_block)); - row_idx += DEFAULT_BLOCK_CAPACITY; - } while (row_idx < total_data_row_count); - } - u32 output_block_row_id = 0; - u32 output_block_idx = 0; - DataBlock *output_block_ptr = operator_state->data_block_array_[output_block_idx].get(); - const float *result_scores = 
function_data.score_result_.get(); - const RowID *result_row_ids = function_data.row_id_result_.get(); - for (u32 top_idx = 0; top_idx < result_n; ++top_idx) { - const SegmentID segment_id = result_row_ids[top_idx].segment_id_; - const SegmentOffset segment_offset = result_row_ids[top_idx].segment_offset_; - const BlockID block_id = segment_offset / DEFAULT_BLOCK_CAPACITY; - const BlockOffset block_offset = segment_offset % DEFAULT_BLOCK_CAPACITY; - BlockEntry *block_entry = block_index->GetBlockEntry(segment_id, block_id); - if (block_entry == nullptr) { - String error_message = fmt::format("Cannot find segment id: {}, block id: {}", segment_id, block_id); - UnrecoverableError(error_message); - } - if (output_block_row_id == DEFAULT_BLOCK_CAPACITY) { - output_block_ptr->Finalize(); - ++output_block_idx; - output_block_ptr = operator_state->data_block_array_[output_block_idx].get(); - output_block_row_id = 0; - } - const SizeT column_n = base_table_ref_->column_ids_.size(); - for (SizeT i = 0; i < column_n; ++i) { - const auto column_id = base_table_ref_->column_ids_[i]; - auto column_vector = block_entry->GetConstColumnVector(buffer_mgr, column_id); - output_block_ptr->column_vectors[i]->AppendWith(column_vector, block_offset, 1); - } - output_block_ptr->AppendValueByPtr(column_n, (ptr_t)&result_scores[top_idx]); - output_block_ptr->AppendValueByPtr(column_n + 1, (ptr_t)&result_row_ids[top_idx]); - ++output_block_row_id; - } - output_block_ptr->Finalize(); - - ResultCacheManager *cache_mgr = query_context->storage()->result_cache_manager(); - if (cache_result_ && cache_mgr != nullptr) { - AddCache(query_context, cache_mgr, operator_state->data_block_array_); - } + float *result_scores = function_data.score_result_.get(); + RowID *result_row_ids = function_data.row_id_result_.get(); + SetOutput(Vector{reinterpret_cast(result_scores)}, + Vector{result_row_ids}, + sizeof(std::remove_pointer_t), + result_n, + query_context, + operator_state); operator_state->SetComplete(); } } diff --git a/src/executor/operator/physical_scan/physical_scan_base.cpp b/src/executor/operator/physical_scan/physical_scan_base.cpp index 0171589b40..ea772504df 100644 --- a/src/executor/operator/physical_scan/physical_scan_base.cpp +++ b/src/executor/operator/physical_scan/physical_scan_base.cpp @@ -90,9 +90,12 @@ void PhysicalScanBase::SetOutput(const Vector &raw_result_dists_list, SizeT result_size, i64 result_n, QueryContext *query_context, - OperatorState *operator_state) { + OperatorState *operator_state) const { BlockIndex *block_index = base_table_ref_->block_index_.get(); SizeT query_n = raw_result_dists_list.size(); + if (query_n != 1u) { + UnrecoverableError(fmt::format("{}: Unexpected: more than 1 query?", __func__)); + } { SizeT total_data_row_count = query_n * result_n; @@ -104,8 +107,8 @@ void PhysicalScanBase::SetOutput(const Vector &raw_result_dists_list, row_idx += DEFAULT_BLOCK_CAPACITY; } while (row_idx < total_data_row_count); } - auto *buffer_mgr = query_context->storage()->buffer_manager(); + OutputToDataBlockHelper output_to_data_block_helper; SizeT output_block_row_id = 0; SizeT output_block_idx = 0; DataBlock *output_block_ptr = operator_state->data_block_array_[output_block_idx].get(); @@ -113,19 +116,12 @@ void PhysicalScanBase::SetOutput(const Vector &raw_result_dists_list, char *raw_result_dists = raw_result_dists_list[query_idx]; RowID *row_ids = row_ids_list[query_idx]; for (i64 top_idx = 0; top_idx < result_n; ++top_idx) { - SizeT id = query_n * query_idx + top_idx; SegmentID segment_id 
= row_ids[top_idx].segment_id_; SegmentOffset segment_offset = row_ids[top_idx].segment_offset_; BlockID block_id = segment_offset / DEFAULT_BLOCK_CAPACITY; BlockOffset block_offset = segment_offset % DEFAULT_BLOCK_CAPACITY; - BlockEntry *block_entry = block_index->GetBlockEntry(segment_id, block_id); - if (block_entry == nullptr) { - String error_message = fmt::format("Cannot find segment id: {}, block id: {}", segment_id, block_id); - UnrecoverableError(error_message); - } - if (output_block_row_id == DEFAULT_BLOCK_CAPACITY) { output_block_ptr->Finalize(); ++output_block_idx; @@ -136,18 +132,17 @@ void PhysicalScanBase::SetOutput(const Vector &raw_result_dists_list, SizeT column_n = base_table_ref_->column_ids_.size(); for (SizeT i = 0; i < column_n; ++i) { SizeT column_id = base_table_ref_->column_ids_[i]; - ColumnVector &&column_vector = block_entry->GetConstColumnVector(buffer_mgr, column_id); - - output_block_ptr->column_vectors[i]->AppendWith(column_vector, block_offset, 1); + output_to_data_block_helper.AddOutputJobInfo(segment_id, block_id, column_id, block_offset, output_block_idx, i, output_block_row_id); + output_block_ptr->column_vectors[i]->Finalize(output_block_ptr->column_vectors[i]->Size() + 1); } - output_block_ptr->AppendValueByPtr(column_n, raw_result_dists + id * result_size); - output_block_ptr->AppendValueByPtr(column_n + 1, (ptr_t)&row_ids[id]); + output_block_ptr->AppendValueByPtr(column_n, raw_result_dists + top_idx * result_size); + output_block_ptr->AppendValueByPtr(column_n + 1, (ptr_t)&row_ids[top_idx]); ++output_block_row_id; } } output_block_ptr->Finalize(); - + output_to_data_block_helper.OutputToDataBlock(query_context->storage()->buffer_manager(), block_index, operator_state->data_block_array_); ResultCacheManager *cache_mgr = query_context->storage()->result_cache_manager(); if (cache_result_ && cache_mgr != nullptr) { AddCache(query_context, cache_mgr, operator_state->data_block_array_); diff --git a/src/executor/operator/physical_scan/physical_scan_base.cppm b/src/executor/operator/physical_scan/physical_scan_base.cppm index 32a900497a..f753174f1d 100644 --- a/src/executor/operator/physical_scan/physical_scan_base.cppm +++ b/src/executor/operator/physical_scan/physical_scan_base.cppm @@ -68,7 +68,7 @@ protected: SizeT result_size, i64 result_n, QueryContext *query_context, - OperatorState *operator_state); + OperatorState *operator_state) const; void AddCache(QueryContext *query_context, ResultCacheManager *cache_mgr, const Vector> &output_data_blocks) const; diff --git a/src/executor/operator/physical_show.cpp b/src/executor/operator/physical_show.cpp index e1df03ae84..1d65e4f884 100644 --- a/src/executor/operator/physical_show.cpp +++ b/src/executor/operator/physical_show.cpp @@ -3185,6 +3185,69 @@ void PhysicalShow::ExecuteShowConfigs(QueryContext *query_context, ShowOperatorS } } + { + { + // option name + Value value = Value::MakeVarchar(DENSE_INDEX_BUILDING_WORKER_OPTION_NAME); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); + } + { + // option name type + Value value = Value::MakeVarchar(std::to_string(global_config->DenseIndexBuildingWorker())); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[1]); + } + { + // option name type + Value value = Value::MakeVarchar("Dense vector index building worker count"); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[2]); + } + } + + { + { + // option name + 
Value value = Value::MakeVarchar(SPARSE_INDEX_BUILDING_WORKER_OPTION_NAME); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); + } + { + // option name type + Value value = Value::MakeVarchar(std::to_string(global_config->SparseIndexBuildingWorker())); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[1]); + } + { + // option name type + Value value = Value::MakeVarchar("Sparse vector index building worker count"); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[2]); + } + } + + { + { + // option name + Value value = Value::MakeVarchar(FULLTEXT_INDEX_BUILDING_WORKER_OPTION_NAME); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); + } + { + // option name type + Value value = Value::MakeVarchar(std::to_string(global_config->FulltextIndexBuildingWorker())); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[1]); + } + { + // option name type + Value value = Value::MakeVarchar("Full-text index building worker count"); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[2]); + } + } + { { // option name @@ -3985,6 +4048,48 @@ void PhysicalShow::ExecuteShowGlobalVariable(QueryContext *query_context, ShowOp value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); break; } + case GlobalVariable::kMemoryCacheMiss: { + Vector> output_column_defs = { + MakeShared(0, varchar_type, "value", std::set()), + }; + + SharedPtr table_def = + TableDef::Make(MakeShared("default_db"), MakeShared("variables"), nullptr, output_column_defs); + output_ = MakeShared(table_def, TableType::kResult); + + Vector> output_column_types{ + varchar_type, + }; + + BufferManager *buffer_manager = query_context->storage()->buffer_manager(); + u64 total_request_count = buffer_manager->TotalRequestCount(); + u64 cache_miss_count = buffer_manager->CacheMissCount(); + + output_block_ptr->Init(output_column_types); + Value value = Value::MakeVarchar(fmt::format("{}/{}", cache_miss_count, total_request_count)); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); + break; + } + case GlobalVariable::kDiskCacheMiss: { + Vector> output_column_defs = { + MakeShared(0, varchar_type, "value", std::set()), + }; + + SharedPtr table_def = + TableDef::Make(MakeShared("default_db"), MakeShared("variables"), nullptr, output_column_defs); + output_ = MakeShared(table_def, TableType::kResult); + + Vector> output_column_types{ + varchar_type, + }; + + output_block_ptr->Init(output_column_types); + Value value = Value::MakeVarchar(fmt::format("{}/{}", VirtualStore::CacheMissCount(), VirtualStore::TotalRequestCount())); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); + break; + } case GlobalVariable::kQueryCount: { Vector> output_column_defs = { MakeShared(0, integer_type, "value", std::set()), @@ -4443,7 +4548,7 @@ void PhysicalShow::ExecuteShowGlobalVariable(QueryContext *query_context, ShowOp output_block_ptr->Init(output_column_types); - i64 follower_number = InfinityContext::instance().cluster_manager()->GetFollowerNumber(); + i64 follower_number = InfinityContext::instance().cluster_manager()->GetFollowerLimit(); Value value = Value::MakeBigInt(follower_number); ValueExpression value_expr(value); 
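The kMemoryCacheMiss and kDiskCacheMiss branches above render cache effectiveness as a "miss/total" string, reading BufferManager's CacheMissCount() and TotalRequestCount() for memory and the VirtualStore counterparts for disk. A minimal, self-contained sketch of the counter pattern behind those numbers; CacheStats and OnRequest are illustrative names, not the engine's API:

#include <atomic>
#include <cstdio>

// Illustrative stand-in for the counters behind SHOW GLOBAL VARIABLE
// memory_cache_miss / disk_cache_miss; not the engine's BufferManager API.
struct CacheStats {
    std::atomic<unsigned long long> total_request_count{0};
    std::atomic<unsigned long long> cache_miss_count{0};

    void OnRequest(bool missed) {
        total_request_count.fetch_add(1, std::memory_order_relaxed);
        if (missed) {
            cache_miss_count.fetch_add(1, std::memory_order_relaxed);
        }
    }
};

int main() {
    CacheStats stats;
    stats.OnRequest(true);  // cold read: miss
    stats.OnRequest(false); // warm read: hit
    // Same "{miss}/{total}" rendering the SHOW variable produces.
    std::printf("%llu/%llu\n", stats.cache_miss_count.load(), stats.total_request_count.load());
    return 0;
}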
value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); @@ -4564,6 +4669,51 @@ void PhysicalShow::ExecuteShowGlobalVariables(QueryContext *query_context, ShowO } break; } + case GlobalVariable::kMemoryCacheMiss: { + BufferManager *buffer_manager = query_context->storage()->buffer_manager(); + u64 total_request_count = buffer_manager->TotalRequestCount(); + u64 cache_miss_count = buffer_manager->CacheMissCount(); + { + // option name + Value value = Value::MakeVarchar(var_name); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); + } + { + // option value + Value value = Value::MakeVarchar(fmt::format("{}/{}", cache_miss_count, total_request_count)); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[1]); + } + { + // option description + Value value = Value::MakeVarchar("Memory cache miss"); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[2]); + } + break; + } + case GlobalVariable::kDiskCacheMiss: { + { + // option name + Value value = Value::MakeVarchar(var_name); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[0]); + } + { + // option value + Value value = Value::MakeVarchar(fmt::format("{}/{}", VirtualStore::CacheMissCount(), VirtualStore::TotalRequestCount())); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[1]); + } + { + // option description + Value value = Value::MakeVarchar("Disk cache miss"); + ValueExpression value_expr(value); + value_expr.AppendToChunk(output_block_ptr->column_vectors[2]); + } + break; + } case GlobalVariable::kQueryCount: { { // option name @@ -5050,7 +5200,7 @@ void PhysicalShow::ExecuteShowGlobalVariables(QueryContext *query_context, ShowO } { // option value - SizeT follower_count = InfinityContext::instance().cluster_manager()->GetFollowerNumber(); + SizeT follower_count = InfinityContext::instance().cluster_manager()->GetFollowerLimit(); Value value = Value::MakeVarchar(std::to_string(follower_count)); ValueExpression value_expr(value); value_expr.AppendToChunk(output_block_ptr->column_vectors[1]); diff --git a/src/executor/operator/physical_sink.cpp b/src/executor/operator/physical_sink.cpp index 363eeebcff..0128ab9448 100644 --- a/src/executor/operator/physical_sink.cpp +++ b/src/executor/operator/physical_sink.cpp @@ -108,6 +108,8 @@ void PhysicalSink::FillSinkStateFromLastOperatorState(MaterializeSinkState *mate } case PhysicalOperatorType::kProjection: { ProjectionOperatorState *projection_output_state = static_cast(task_op_state); + materialize_sink_state->total_hits_count_flag_ = projection_output_state->total_hits_count_flag_; + materialize_sink_state->total_hits_count_ = projection_output_state->total_hits_count_; if (projection_output_state->data_block_array_.empty()) { materialize_sink_state->empty_result_ = true; } else { @@ -474,7 +476,9 @@ void PhysicalSink::FillSinkStateFromLastOperatorState(FragmentContext *fragment_ queue_sink_state->task_id_, idx, output_data_block_count, - task_operator_state->Complete()); + task_operator_state->Complete(), + task_operator_state->total_hits_count_flag_, + task_operator_state->total_hits_count_); if (task_operator_state->Complete() && !fragment_context->IsMaterialize()) { fragment_data->data_idx_ = None; } diff --git a/src/executor/operator/physical_top.cpp b/src/executor/operator/physical_top.cpp index ac4865b5c0..fa5d917832 100644 --- 
a/src/executor/operator/physical_top.cpp +++ b/src/executor/operator/physical_top.cpp @@ -332,19 +332,26 @@ void PhysicalTop::Init() { // Behavior now: always sort the output results bool PhysicalTop::Execute(QueryContext *, OperatorState *operator_state) { - auto prev_op_state = operator_state->prev_op_state_; + TopOperatorState *top_operator_state = (TopOperatorState *)operator_state; + auto prev_op_state = top_operator_state->prev_op_state_; if ((offset_ != 0) and !(prev_op_state->Complete())) { String error_message = "Only 1 PhysicalTop job but !(prev_op_state->Complete())"; UnrecoverableError(error_message); } auto &input_data_block_array = prev_op_state->data_block_array_; - auto &output_data_block_array = operator_state->data_block_array_; + auto &output_data_block_array = top_operator_state->data_block_array_; + + SizeT total_hits_row_count = std::accumulate(input_data_block_array.begin(), input_data_block_array.end(), 0, [](u32 x, const auto &y) -> u32 { + return x + y->row_count(); + }); // sometimes the input_data_block_array is empty, but the operator is not complete - if (std::accumulate(input_data_block_array.begin(), input_data_block_array.end(), 0, [](u32 x, const auto &y) -> u32 { - return x + y->row_count(); - }) == 0) { + if (total_hits_row_count == 0) { if (prev_op_state->Complete()) { - operator_state->SetComplete(); + if (total_hits_count_flag_) { + top_operator_state->total_hits_count_flag_ = true; + top_operator_state->total_hits_count_ = 0; + } + top_operator_state->SetComplete(); } return true; } @@ -352,13 +359,17 @@ bool PhysicalTop::Execute(QueryContext *, OperatorState *operator_state) { String error_message = "output data_block_array_ is not empty"; UnrecoverableError(error_message); } - auto eval_columns = GetEvalColumns(sort_expressions_, (static_cast(operator_state))->expr_states_, input_data_block_array); + auto eval_columns = GetEvalColumns(sort_expressions_, top_operator_state->expr_states_, input_data_block_array); TopSolver solve_top(limit_, prefer_left_function_); auto output_row_cnt = solve_top.WriteTopResultsToOutput(eval_columns, input_data_block_array, output_data_block_array); input_data_block_array.clear(); HandleOutputOffset(output_row_cnt, offset_, output_data_block_array); if (prev_op_state->Complete()) { - operator_state->SetComplete(); + if (total_hits_count_flag_) { + top_operator_state->total_hits_count_flag_ = true; + top_operator_state->total_hits_count_ = total_hits_row_count; + } + top_operator_state->SetComplete(); } return true; } diff --git a/src/executor/operator/physical_top.cppm b/src/executor/operator/physical_top.cppm index 850286b2f5..1e7a38921e 100644 --- a/src/executor/operator/physical_top.cppm +++ b/src/executor/operator/physical_top.cppm @@ -72,9 +72,10 @@ public: u32 offset, Vector> sort_expressions, Vector order_by_types, - SharedPtr> load_metas) + SharedPtr> load_metas, + bool total_hits_count_flag) : PhysicalOperator(PhysicalOperatorType::kTop, std::move(left), nullptr, id, load_metas), limit_(limit), offset_(offset), - order_by_types_(std::move(order_by_types)), sort_expressions_(std::move(sort_expressions)) {} + order_by_types_(std::move(order_by_types)), sort_expressions_(std::move(sort_expressions)), total_hits_count_flag_(total_hits_count_flag) {} ~PhysicalTop() override = default; @@ -122,6 +123,7 @@ private: Vector order_by_types_; // ASC or DESC Vector> sort_expressions_; // expressions to sort CompareTwoRowAndPreferLeft prefer_left_function_; // compare function + bool total_hits_count_flag_{}; // TODO: 
save a common threshold value for all tasks }; diff --git a/src/executor/operator_state.cpp b/src/executor/operator_state.cpp index 9b209d6879..f7b95e3974 100644 --- a/src/executor/operator_state.cpp +++ b/src/executor/operator_state.cpp @@ -117,6 +117,10 @@ bool QueueSourceState::GetData() { auto *fragment_data = static_cast(fragment_data_base.get()); MergeLimitOperatorState *limit_op_state = (MergeLimitOperatorState *)next_op_state; limit_op_state->input_data_blocks_.push_back(std::move(fragment_data->data_block_)); + if (fragment_data->total_hits_count_flag_) { + limit_op_state->total_hits_count_flag_ = true; + limit_op_state->total_hits_count_ += fragment_data->total_hits_count_; + } if (!limit_op_state->input_complete_) { limit_op_state->input_complete_ = completed; } @@ -129,6 +133,12 @@ bool QueueSourceState::GetData() { auto *fragment_data = static_cast(fragment_data_base.get()); auto top_op_state = (MergeTopOperatorState *)next_op_state; top_op_state->input_data_blocks_.push_back(std::move(fragment_data->data_block_)); + + if (fragment_data->total_hits_count_flag_) { + top_op_state->total_hits_count_flag_ = true; + top_op_state->total_hits_count_ += fragment_data->total_hits_count_; + } + if (!top_op_state->input_complete_) { top_op_state->input_complete_ = completed; } diff --git a/src/executor/operator_state.cppm b/src/executor/operator_state.cppm index fbb16e38fd..6650c53c51 100644 --- a/src/executor/operator_state.cppm +++ b/src/executor/operator_state.cppm @@ -60,6 +60,9 @@ export struct OperatorState { bool complete_{false}; + bool total_hits_count_flag_{}; + SizeT total_hits_count_{}; + inline void SetComplete() { complete_ = true; } inline bool Complete() const { return complete_; } @@ -585,6 +588,8 @@ export struct MaterializeSinkState : public SinkState { Vector> data_block_array_{}; bool empty_result_{false}; + bool total_hits_count_flag_{false}; + SizeT total_hits_count_{}; }; export struct ResultSinkState : public SinkState { diff --git a/src/executor/physical_operator.cpp b/src/executor/physical_operator.cpp index ede67b950c..5da8190a57 100644 --- a/src/executor/physical_operator.cpp +++ b/src/executor/physical_operator.cpp @@ -37,6 +37,8 @@ import data_block; import txn; import table_entry; import cached_match; +import buffer_manager; +import block_index; namespace infinity { @@ -52,10 +54,10 @@ void PhysicalOperator::InputLoad(QueryContext *query_context, OperatorState *ope // FIXME: After columnar reading is supported, use a different table_ref for each LoadMetas auto table_ref = table_refs[load_metas[0].binding_.table_idx]; if (table_ref.get() == nullptr) { - String error_message = "TableRef not found"; - UnrecoverableError(error_message); + UnrecoverableError("TableRef not found"); } + OutputToDataBlockHelper output_to_data_block_helper; for (SizeT i = 0; i < operator_state->prev_op_state_->data_block_array_.size(); ++i) { auto input_block = operator_state->prev_op_state_->data_block_array_[i].get(); SizeT load_column_count = load_metas_->size(); @@ -69,7 +71,7 @@ void PhysicalOperator::InputLoad(QueryContext *query_context, OperatorState *ope auto column_vector_type = (load_metas[j].type_->type() == LogicalType::kBoolean) ? 
ColumnVectorType::kCompactBit : ColumnVectorType::kFlat;
             column_vector->Initialize(column_vector_type, capacity);
-
+            column_vector->Finalize(row_count);
             input_block->InsertVector(column_vector, load_metas[j].index_);
         }
@@ -82,16 +84,15 @@
             u32 segment_offset = row_id.segment_offset_;
             u16 block_id = segment_offset / DEFAULT_BLOCK_CAPACITY;
             u16 block_offset = segment_offset % DEFAULT_BLOCK_CAPACITY;
-
-            BlockEntry *block_entry = table_ref->block_index_->GetBlockEntry(segment_id, block_id);
             for (SizeT k = 0; k < load_column_count; ++k) {
-                auto binding = load_metas[k].binding_;
-
-                ColumnVector column_vector = block_entry->GetConstColumnVector(query_context->storage()->buffer_manager(), binding.column_idx);
-                input_block->column_vectors[load_metas[k].index_]->AppendWith(column_vector, block_offset, 1);
+                output_to_data_block_helper
+                    .AddOutputJobInfo(segment_id, block_id, load_metas[k].binding_.column_idx, block_offset, i, load_metas[k].index_, j);
             }
         }
     }
+    output_to_data_block_helper.OutputToDataBlock(query_context->storage()->buffer_manager(),
+                                                  table_ref->block_index_.get(),
+                                                  operator_state->prev_op_state_->data_block_array_);
 }
 
 SharedPtr<Vector<String>> PhysicalCommonFunctionUsingLoadMeta::GetOutputNames(const PhysicalOperator &op) {
@@ -116,4 +117,29 @@ SharedPtr<Vector<SharedPtr<DataType>>> PhysicalCommonFunctionUsingLoadMeta::GetOutputTypes(const PhysicalOperator &op) {
     return output_types;
 }
 
+void OutputToDataBlockHelper::OutputToDataBlock(BufferManager *buffer_mgr,
+                                                const BlockIndex *block_index,
+                                                const Vector<UniquePtr<DataBlock>> &output_data_blocks) {
+    std::sort(output_job_infos.begin(), output_job_infos.end());
+    auto cache_segment_id = std::numeric_limits<SegmentID>::max();
+    auto cache_block_id = std::numeric_limits<BlockID>::max();
+    BlockEntry *cache_block_entry = nullptr;
+    auto cache_column_id = std::numeric_limits<ColumnID>::max();
+    ColumnVector cache_column_vector;
+    for (const auto [segment_id, block_id, column_id, block_offset, output_block_id, output_column_id, output_row_id] : output_job_infos) {
+        if (segment_id != cache_segment_id || block_id != cache_block_id) {
+            cache_segment_id = segment_id;
+            cache_block_id = block_id;
+            cache_block_entry = block_index->GetBlockEntry(segment_id, block_id);
+            cache_column_id = std::numeric_limits<ColumnID>::max();
+        }
+        if (column_id != cache_column_id) {
+            cache_column_id = column_id;
+            cache_column_vector = cache_block_entry->GetConstColumnVector(buffer_mgr, column_id);
+        }
+        auto val_for_update = cache_column_vector.GetValue(block_offset);
+        output_data_blocks[output_block_id]->column_vectors[output_column_id]->SetValue(output_row_id, val_for_update);
+    }
+    output_job_infos.clear();
+}
+
 } // namespace infinity
diff --git a/src/executor/physical_operator.cppm b/src/executor/physical_operator.cppm
index 21a304a30a..54bbec0c4f 100644
--- a/src/executor/physical_operator.cppm
+++ b/src/executor/physical_operator.cppm
@@ -124,4 +124,34 @@ export struct PhysicalCommonFunctionUsingLoadMeta {
     static SharedPtr<Vector<SharedPtr<DataType>>> GetOutputTypes(const PhysicalOperator &op);
 };
 
+struct OutputJobInfo {
+    // src data info
+    SegmentID segment_id_{};
+    BlockID block_id_{};
+    ColumnID column_id_{};
+    BlockOffset block_offset_{};
+    // target position
+    u32 output_block_id_{};
+    u32 output_column_id_{};
+    u32 output_row_id_{};
+    friend auto operator<=>(const OutputJobInfo &, const OutputJobInfo &) = default;
+};
+
+class BufferManager;
+struct BlockIndex;
+struct DataBlock;
+export struct OutputToDataBlockHelper {
+    Vector<OutputJobInfo> output_job_infos;
+    void AddOutputJobInfo(const SegmentID segment_id,
+                          const BlockID block_id,
+                          const ColumnID column_id,
+
const BlockOffset block_offset, + const u32 output_block_id, + const u32 output_column_id, + const u32 output_row_id) { + output_job_infos.emplace_back(segment_id, block_id, column_id, block_offset, output_block_id, output_column_id, output_row_id); + } + void OutputToDataBlock(BufferManager *buffer_mgr, const BlockIndex *block_index, const Vector> &output_data_blocks); +}; + } // namespace infinity diff --git a/src/executor/physical_planner.cpp b/src/executor/physical_planner.cpp index 1c225de9ed..07abfa78ba 100644 --- a/src/executor/physical_planner.cpp +++ b/src/executor/physical_planner.cpp @@ -752,7 +752,8 @@ UniquePtr PhysicalPlanner::BuildLimit(const SharedPtrlimit_expression_, logical_limit->offset_expression_, - logical_operator->load_metas()); + logical_operator->load_metas(), + logical_limit->total_hits_count_flag_); } else { i64 child_limit = (static_pointer_cast(logical_limit->limit_expression_))->GetValue().value_.big_int; @@ -763,7 +764,8 @@ UniquePtr PhysicalPlanner::BuildLimit(const SharedPtr(Value::MakeBigInt(child_limit)), nullptr, - logical_operator->load_metas()); + logical_operator->load_metas(), + logical_limit->total_hits_count_flag_); return MakeUnique(query_context_ptr_->GetNextNodeID(), std::move(child_limit_op), logical_limit->limit_expression_, @@ -810,7 +812,8 @@ UniquePtr PhysicalPlanner::BuildTop(const SharedPtrsort_expressions_, logical_operator_top->order_by_types_, - logical_operator_top->load_metas()); + logical_operator_top->load_metas(), + logical_operator_top->total_hits_count_flag_); } else { // need MergeTop auto child_top_op = MakeUnique(logical_operator_top->node_id(), @@ -819,7 +822,8 @@ UniquePtr PhysicalPlanner::BuildTop(const SharedPtrsort_expressions_, logical_operator_top->order_by_types_, - logical_operator_top->load_metas()); + logical_operator_top->load_metas(), + logical_operator_top->total_hits_count_flag_); return MakeUnique(query_context_ptr_->GetNextNodeID(), logical_operator_top->base_table_ref_, std::move(child_top_op), diff --git a/src/function/builtin_functions.cpp b/src/function/builtin_functions.cpp index 60d58b34cc..bf7eda0b32 100644 --- a/src/function/builtin_functions.cpp +++ b/src/function/builtin_functions.cpp @@ -165,6 +165,18 @@ void BuiltinFunctions::RegisterSpecialFunction() { SharedPtr score_function = MakeShared("SCORE", DataType(LogicalType::kFloat), 4, SpecialType::kScore); Catalog::AddSpecialFunction(catalog_ptr_.get(), score_function); + SharedPtr distance_factors_function = + MakeShared("DISTANCE_FACTORS", DataType(LogicalType::kVarchar), 5, SpecialType::kDistanceFactors); + Catalog::AddSpecialFunction(catalog_ptr_.get(), distance_factors_function); + + SharedPtr similarity_factors_function = + MakeShared("SIMILARITY_FACTORS", DataType(LogicalType::kVarchar), 6, SpecialType::kSimilarityFactors); + Catalog::AddSpecialFunction(catalog_ptr_.get(), similarity_factors_function); + + SharedPtr score_factors_function = + MakeShared("SCORE_FACTORS", DataType(LogicalType::kVarchar), 7, SpecialType::kScoreFactors); + Catalog::AddSpecialFunction(catalog_ptr_.get(), score_factors_function); + auto createts_function = MakeShared("CREATE_TIMESTAMP", DataType(LogicalType::kBigInt), COLUMN_IDENTIFIER_CREATE, SpecialType::kCreateTs); Catalog::AddSpecialFunction(catalog_ptr_.get(), createts_function); diff --git a/src/function/special_function.cppm b/src/function/special_function.cppm index 2d1ead8e24..472301c835 100644 --- a/src/function/special_function.cppm +++ b/src/function/special_function.cppm @@ -26,8 +26,11 @@ namespace 
infinity { export enum class SpecialType { kRowID, kDistance, + kDistanceFactors, kSimilarity, + kSimilarityFactors, kScore, + kScoreFactors, kCreateTs, kDeleteTs, kFilterFullText, diff --git a/src/main/cluster_manager.cppm b/src/main/cluster_manager.cppm index 0d65cef8cc..abd0b08466 100644 --- a/src/main/cluster_manager.cppm +++ b/src/main/cluster_manager.cppm @@ -88,7 +88,7 @@ public: Status SyncLogs(); // Used by leader to control the number of follower Status SetFollowerNumber(SizeT new_follower_number); - SizeT GetFollowerNumber() const; + SizeT GetFollowerLimit() const; private: void CheckHeartBeatThread(); @@ -104,7 +104,7 @@ private: // Leader clients to followers and learners Map> reader_client_map_{}; // Used by leader; Vector> logs_to_sync_{}; - Atomic follower_count_{1}; + Atomic follower_limit_{4}; Vector> clients_for_cleanup_; // Follower and Learner diff --git a/src/main/cluster_manager_leader.cpp b/src/main/cluster_manager_leader.cpp index 11c308a7db..748e29962e 100644 --- a/src/main/cluster_manager_leader.cpp +++ b/src/main/cluster_manager_leader.cpp @@ -26,6 +26,7 @@ import storage; import logger; import infinity_exception; import wal_manager; +import admin_statement; namespace infinity { @@ -109,6 +110,10 @@ Status ClusterManager::AddNodeInfo(const SharedPtr &other_node) { UnrecoverableError(error_message); } + if (other_node_name == this_node_->node_name()) { + return Status::DuplicateNode(other_node_name); + } + // Add by register auto iter = other_node_map_.find(other_node_name); if (iter != other_node_map_.end()) { @@ -116,6 +121,44 @@ Status ClusterManager::AddNodeInfo(const SharedPtr &other_node) { // TODO: Update node info and not throw error. return Status::DuplicateNode(other_node_name); } + + u32 follower_count = 0; + u32 learner_count = 0; + for (auto &other_node_pair : other_node_map_) { + switch (other_node_pair.second->node_role()) { + case NodeRole::kFollower: { + follower_count += 1; + break; + } + case NodeRole::kLearner: { + learner_count += 1; + break; + } + default: { + String error_message = "Non-follower / learner role should be here."; + UnrecoverableError(error_message); + } + } + } + + // Add learner and follower limit + switch (other_node->node_role()) { + case NodeRole::kFollower: { + if (follower_count + 1 == follower_limit_) { + return Status::TooManyFollower(follower_limit_); + } + break; + } + case NodeRole::kLearner: { + if (learner_count + 1 == std::numeric_limits::max()) { + return Status::TooManyLearner(); + } + break; + } + default: { + return Status::InvalidNodeRole(fmt::format("Invalid node role: {}", ToString(other_node->node_role()))); + } + } } // Connect to follower/learner server. 
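The AddNodeInfo hunk above now rejects a node that reuses the leader's own name, tallies the registered followers and learners, and applies the follower limit before admitting the newcomer. A compact sketch of that admission logic, with simplified stand-ins for NodeRole and other_node_map_; note it deliberately mirrors the '==' comparisons used above rather than substituting a '>=' bound:

#include <cstdint>
#include <limits>
#include <map>
#include <string>

enum class NodeRole { kFollower, kLearner };

// Sketch of the leader-side admission check; 'others' stands in for
// other_node_map_ and follower_limit for the Atomic<SizeT> follower_limit_{4}.
bool AdmitNode(const std::map<std::string, NodeRole> &others, NodeRole incoming, uint32_t follower_limit) {
    uint32_t follower_count = 0;
    uint32_t learner_count = 0;
    for (const auto &[name, role] : others) {
        if (role == NodeRole::kFollower) {
            ++follower_count;
        } else {
            ++learner_count;
        }
    }
    if (incoming == NodeRole::kFollower && follower_count + 1 == follower_limit) {
        return false; // the TooManyFollower path
    }
    if (incoming == NodeRole::kLearner && learner_count + 1 == std::numeric_limits<uint32_t>::max()) {
        return false; // the TooManyLearner path; effectively unbounded
    }
    return true;
}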
@@ -434,11 +477,11 @@ Status ClusterManager::SetFollowerNumber(SizeT new_follower_number) { } // Check current follower count, if new count is less, leader will downgrade some followers to learner - follower_count_ = new_follower_number; + follower_limit_ = new_follower_number; return Status::OK(); } -SizeT ClusterManager::GetFollowerNumber() const { return follower_count_; } +SizeT ClusterManager::GetFollowerLimit() const { return follower_limit_; } Status ClusterManager::SendLogs(const String &node_name, const SharedPtr &peer_client, diff --git a/src/main/config.cpp b/src/main/config.cpp index 9671010dad..dd389bcfdb 100644 --- a/src/main/config.cpp +++ b/src/main/config.cpp @@ -271,7 +271,8 @@ Status Config::Init(const SharedPtr &config_path, DefaultConfig *default // Peer connect timeout i64 peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT; - UniquePtr peer_connect_timeout_option = MakeUnique(PEER_CONNECT_TIMEOUT_OPTION_NAME, peer_connect_timeout, 10000, 0); + UniquePtr peer_connect_timeout_option = + MakeUnique(PEER_CONNECT_TIMEOUT_OPTION_NAME, peer_connect_timeout, 10000, 0); status = global_options_.AddOption(std::move(peer_connect_timeout_option)); if (!status.ok()) { fmt::print("Fatal: {}", status.message()); @@ -482,6 +483,45 @@ Status Config::Init(const SharedPtr &config_path, DefaultConfig *default UnrecoverableError(status.message()); } + // Dense index building worker + i64 dense_index_building_worker = Thread::hardware_concurrency() / 2; + if (dense_index_building_worker < 2) { + dense_index_building_worker = 2; + } + UniquePtr dense_index_building_worker_option = + MakeUnique(DENSE_INDEX_BUILDING_WORKER_OPTION_NAME, dense_index_building_worker, Thread::hardware_concurrency(), 1); + status = global_options_.AddOption(std::move(dense_index_building_worker_option)); + if (!status.ok()) { + fmt::print("Fatal: {}", status.message()); + UnrecoverableError(status.message()); + } + + // Sparse index building worker + i64 sparse_index_building_worker = Thread::hardware_concurrency() / 2; + if (sparse_index_building_worker < 2) { + sparse_index_building_worker = 2; + } + UniquePtr sparse_index_building_worker_option = + MakeUnique(SPARSE_INDEX_BUILDING_WORKER_OPTION_NAME, sparse_index_building_worker, Thread::hardware_concurrency(), 1); + status = global_options_.AddOption(std::move(sparse_index_building_worker_option)); + if (!status.ok()) { + fmt::print("Fatal: {}", status.message()); + UnrecoverableError(status.message()); + } + + // Fulltext index building worker + i64 fulltext_index_building_worker = Thread::hardware_concurrency() / 2; + if (fulltext_index_building_worker < 2) { + fulltext_index_building_worker = 2; + } + UniquePtr fulltext_index_building_worker_option = + MakeUnique(FULLTEXT_INDEX_BUILDING_WORKER_OPTION_NAME, fulltext_index_building_worker, Thread::hardware_concurrency(), 1); + status = global_options_.AddOption(std::move(fulltext_index_building_worker_option)); + if (!status.ok()) { + fmt::print("Fatal: {}", status.message()); + UnrecoverableError(status.message()); + } + // Result Cache String result_cache(DEFAULT_RESULT_CACHE); auto result_cache_option = MakeUnique(RESULT_CACHE_OPTION_NAME, result_cache); @@ -492,7 +532,8 @@ Status Config::Init(const SharedPtr &config_path, DefaultConfig *default } i64 cache_result_num = DEFAULT_CACHE_RESULT_CAPACITY; - auto cache_result_num_option = MakeUnique(CACHE_RESULT_CAPACITY_OPTION_NAME, cache_result_num, std::numeric_limits::max(), 0); + auto cache_result_num_option = + 
MakeUnique<IntegerOption>(CACHE_RESULT_CAPACITY_OPTION_NAME, cache_result_num, std::numeric_limits<i64>::max(), 0);
     status = global_options_.AddOption(std::move(cache_result_num_option));
     if (!status.ok()) {
         fmt::print("Fatal: {}", status.message());
@@ -656,7 +697,7 @@ Status Config::Init(const SharedPtr<String> &config_path, DefaultConfig *default_config) {
     }
     ToLower(server_mode);
-    if (server_mode == "standalone" or server_mode == "cluster") {
+    if (server_mode == "standalone" or server_mode == "admin") {
         UniquePtr<StringOption> server_mode_option = MakeUnique<StringOption>(SERVER_MODE_OPTION_NAME, server_mode);
         Status status = global_options_.AddOption(std::move(server_mode_option));
         if (!status.ok()) {
@@ -1744,6 +1785,61 @@ Status Config::Init(const SharedPtr<String> &config_path, DefaultConfig *default_config) {
                     }
                     break;
                 }
+
+                case GlobalOptionIndex::kDenseIndexBuildingWorker: {
+                    i64 dense_index_building_worker = Thread::hardware_concurrency() / 2;
+                    if (elem.second.is_integer()) {
+                        dense_index_building_worker = elem.second.value_or(dense_index_building_worker);
+                    } else {
+                        return Status::InvalidConfig("'dense_index_building_worker' field isn't integer.");
+                    }
+                    UniquePtr<IntegerOption> dense_index_building_worker_option =
+                        MakeUnique<IntegerOption>(DENSE_INDEX_BUILDING_WORKER_OPTION_NAME,
+                                                  dense_index_building_worker,
+                                                  Thread::hardware_concurrency(),
+                                                  1);
+                    if (!dense_index_building_worker_option->Validate()) {
+                        return Status::InvalidConfig(fmt::format("Invalid dense vector index building worker count: {}", dense_index_building_worker));
+                    }
+                    global_options_.AddOption(std::move(dense_index_building_worker_option));
+                    break;
+                }
+                case GlobalOptionIndex::kSparseIndexBuildingWorker: {
+                    i64 sparse_index_building_worker = Thread::hardware_concurrency() / 2;
+                    if (elem.second.is_integer()) {
+                        sparse_index_building_worker = elem.second.value_or(sparse_index_building_worker);
+                    } else {
+                        return Status::InvalidConfig("'sparse_index_building_worker' field isn't integer.");
+                    }
+                    UniquePtr<IntegerOption> sparse_index_building_worker_option =
+                        MakeUnique<IntegerOption>(SPARSE_INDEX_BUILDING_WORKER_OPTION_NAME,
+                                                  sparse_index_building_worker,
+                                                  Thread::hardware_concurrency(),
+                                                  1);
+                    if (!sparse_index_building_worker_option->Validate()) {
+                        return Status::InvalidConfig(fmt::format("Invalid sparse vector index building worker count: {}", sparse_index_building_worker));
+                    }
+                    global_options_.AddOption(std::move(sparse_index_building_worker_option));
+                    break;
+                }
+                case GlobalOptionIndex::kFulltextIndexBuildingWorker: {
+                    i64 fulltext_index_building_worker = Thread::hardware_concurrency() / 2;
+                    if (elem.second.is_integer()) {
+                        fulltext_index_building_worker = elem.second.value_or(fulltext_index_building_worker);
+                    } else {
+                        return Status::InvalidConfig("'fulltext_index_building_worker' field isn't integer.");
+                    }
+                    UniquePtr<IntegerOption> fulltext_index_building_worker_option =
+                        MakeUnique<IntegerOption>(FULLTEXT_INDEX_BUILDING_WORKER_OPTION_NAME,
+                                                  fulltext_index_building_worker,
+                                                  Thread::hardware_concurrency(),
+                                                  1);
+                    if (!fulltext_index_building_worker_option->Validate()) {
+                        return Status::InvalidConfig(fmt::format("Invalid fulltext index building worker count: {}", fulltext_index_building_worker));
+                    }
+                    global_options_.AddOption(std::move(fulltext_index_building_worker_option));
+                    break;
+                }
                 default: {
                     return Status::InvalidConfig(fmt::format("Unrecognized config parameter: {} in 'storage' field", var_name));
                 }
@@ -1830,6 +1926,52 @@ Status Config::Init(const SharedPtr<String> &config_path, DefaultConfig *default_config) {
         }
     }
 
+    if (global_options_.GetOptionByIndex(GlobalOptionIndex::kDenseIndexBuildingWorker) == nullptr) {
+        // dense index building worker
+        i64 dense_index_building_worker = Thread::hardware_concurrency() / 2;
+        if (dense_index_building_worker < 2) {
+            dense_index_building_worker = 2;
+        }
+        UniquePtr<IntegerOption> dense_index_building_worker_option = MakeUnique<IntegerOption>(DENSE_INDEX_BUILDING_WORKER_OPTION_NAME,
+                                                                                                dense_index_building_worker,
+                                                                                                Thread::hardware_concurrency(),
+                                                                                                1);
+        Status status = global_options_.AddOption(std::move(dense_index_building_worker_option));
+        if (!status.ok()) {
+            UnrecoverableError(status.message());
+        }
+    }
+    if (global_options_.GetOptionByIndex(GlobalOptionIndex::kSparseIndexBuildingWorker) == nullptr) {
+        // sparse index building worker
+        i64 sparse_index_building_worker = Thread::hardware_concurrency() / 2;
+        if (sparse_index_building_worker < 2) {
+            sparse_index_building_worker = 2;
+        }
+        UniquePtr<IntegerOption> sparse_index_building_worker_option = MakeUnique<IntegerOption>(SPARSE_INDEX_BUILDING_WORKER_OPTION_NAME,
+                                                                                                 sparse_index_building_worker,
+                                                                                                 Thread::hardware_concurrency(),
+                                                                                                 1);
+        Status status = global_options_.AddOption(std::move(sparse_index_building_worker_option));
+        if (!status.ok()) {
+            UnrecoverableError(status.message());
+        }
+    }
+    if (global_options_.GetOptionByIndex(GlobalOptionIndex::kFulltextIndexBuildingWorker) == nullptr) {
+        // fulltext index building worker
+        i64 fulltext_index_building_worker = Thread::hardware_concurrency() / 2;
+        if (fulltext_index_building_worker < 2) {
+            fulltext_index_building_worker = 2;
+        }
+        UniquePtr<IntegerOption> fulltext_index_building_worker_option =
+            MakeUnique<IntegerOption>(FULLTEXT_INDEX_BUILDING_WORKER_OPTION_NAME,
+                                      fulltext_index_building_worker,
+                                      Thread::hardware_concurrency(),
+                                      1);
+        Status status = global_options_.AddOption(std::move(fulltext_index_building_worker_option));
+        if (!status.ok()) {
+            UnrecoverableError(status.message());
+        }
+    }
     } else {
         return Status::InvalidConfig("No 'storage' section in configure file.");
     }
@@ -2543,6 +2685,21 @@ i64 Config::MemIndexCapacity() {
     return global_options_.GetIntegerValue(GlobalOptionIndex::kMemIndexCapacity);
 }
 
+i64 Config::DenseIndexBuildingWorker() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    return global_options_.GetIntegerValue(GlobalOptionIndex::kDenseIndexBuildingWorker);
+}
+
+i64 Config::SparseIndexBuildingWorker() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    return global_options_.GetIntegerValue(GlobalOptionIndex::kSparseIndexBuildingWorker);
+}
+
+i64 Config::FulltextIndexBuildingWorker() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    return global_options_.GetIntegerValue(GlobalOptionIndex::kFulltextIndexBuildingWorker);
+}
+
 StorageType Config::StorageType() {
     std::lock_guard<std::mutex> guard(mutex_);
     String storage_type_str = global_options_.GetStringValue(GlobalOptionIndex::kStorageType);
@@ -2741,6 +2898,9 @@ void Config::PrintAll() {
     fmt::print(" - compact_interval: {}\n", Utility::FormatTimeInfo(CompactInterval()));
     fmt::print(" - optimize_index_interval: {}\n", Utility::FormatTimeInfo(OptimizeIndexInterval()));
     fmt::print(" - memindex_capacity: {}\n", MemIndexCapacity()); // mem index capacity is line number
+    fmt::print(" - dense_index_building_worker: {}\n", DenseIndexBuildingWorker());
+    fmt::print(" - sparse_index_building_worker: {}\n", SparseIndexBuildingWorker());
+    fmt::print(" - fulltext_index_building_worker: {}\n", FulltextIndexBuildingWorker());
     fmt::print(" - storage_type: {}\n", ToString(StorageType()));
     switch (StorageType()) {
         case StorageType::kLocal: {
diff --git a/src/main/config.cppm b/src/main/config.cppm
index 66154f2541..4836235a69 100644
--- a/src/main/config.cppm
+++ b/src/main/config.cppm
@@ -102,6 +102,9 @@ public:
     void SetOptimizeInterval(i64);
 
     i64 MemIndexCapacity();
+    i64 DenseIndexBuildingWorker();
+    i64 SparseIndexBuildingWorker();
+    i64 FulltextIndexBuildingWorker();
 
     StorageType StorageType();
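All three *_index_building_worker options registered above share one sizing rule: default to half the hardware threads, floored at 2, with the IntegerOption range spanning [1, hardware_concurrency]. A small illustrative sketch of that rule; DefaultIndexWorkerCount is a hypothetical helper, and the zero-thread guard is an assumption, since hardware_concurrency() may report 0:

#include <algorithm>
#include <cstdio>
#include <thread>

// Hypothetical helper reproducing the default used by the three
// *_index_building_worker options: hardware_concurrency()/2, floor 2.
long long DefaultIndexWorkerCount() {
    long long hw = std::thread::hardware_concurrency();
    if (hw <= 0) {
        hw = 2; // assumption: treat an unknown core count as 2
    }
    long long worker = std::max(2LL, hw / 2);
    return std::min(worker, hw); // IntegerOption bounds are [1, hardware_concurrency]
}

int main() {
    std::printf("index building workers: %lld\n", DefaultIndexWorkerCount());
    return 0;
}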
String ObjectStorageUrl(); diff --git a/src/main/infinity.cpp b/src/main/infinity.cpp index a0e1b22412..b45f49ed28 100644 --- a/src/main/infinity.cpp +++ b/src/main/infinity.cpp @@ -60,9 +60,9 @@ import extra_ddl_info; import drop_index_info; import drop_table_info; import third_party; +import defer_op; import infinity_exception; -import third_party; namespace infinity { @@ -93,10 +93,10 @@ std::variant, QueryResult> Infinity::GetQueryContext(boo return query_context_ptr; } -#define GET_QUERY_CONTEXT(result, query_context_ptr) \ - if (std::holds_alternative(result)) { \ - return std::get(result); \ - } \ +#define GET_QUERY_CONTEXT(result, query_context_ptr) \ + if (std::holds_alternative(result)) { \ + return std::get(result); \ + } \ query_context_ptr = std::move(std::get>(result)); u64 Infinity::GetSessionId() { return session_->session_id(); } @@ -247,6 +247,18 @@ QueryResult Infinity::Flush(const String &flush_type) { return result; } +QueryResult Infinity::Compact(const String &db_name, const String &table_name) { + UniquePtr query_context_ptr; + GET_QUERY_CONTEXT(GetQueryContext(), query_context_ptr); + auto compact_statement = MakeUnique(db_name, table_name); + + ToLower(compact_statement->schema_name_); + ToLower(compact_statement->table_name_); + + QueryResult result = query_context_ptr->QueryStatement(compact_statement.get()); + return result; +} + QueryResult Infinity::SetVariableOrConfig(const String &name, bool value, SetScope scope) { UniquePtr query_context_ptr; GET_QUERY_CONTEXT(GetQueryContext(), query_context_ptr); @@ -379,6 +391,17 @@ QueryResult Infinity::CreateTable(const String &db_name, Vector column_defs, Vector constraints, CreateTableOptions create_table_options) { + DeferFn free_create_table([&]() { + for (auto &column_def : column_defs) { + delete column_def; + column_def = nullptr; + } + for (auto &constraint : constraints) { + delete constraint; + constraint = nullptr; + } + }); + UniquePtr query_context_ptr; GET_QUERY_CONTEXT(GetQueryContext(), query_context_ptr); UniquePtr create_statement = MakeUnique(); @@ -871,6 +894,16 @@ QueryResult Infinity::ShowFunction(const String &function_name) { } QueryResult Infinity::Insert(const String &db_name, const String &table_name, Vector *insert_rows) { + DeferFn free_insert_rows([&]() { + if (insert_rows != nullptr) { + for (auto *insert_row : *insert_rows) { + delete insert_row; + insert_row = nullptr; + } + delete insert_rows; + insert_rows = nullptr; + } + }); UniquePtr query_context_ptr; GET_QUERY_CONTEXT(GetQueryContext(), query_context_ptr); UniquePtr insert_statement = MakeUnique(); @@ -886,8 +919,6 @@ QueryResult Infinity::Insert(const String &db_name, const String &table_name, Ve insert_statement->insert_rows_.emplace_back(insert_row_expr_ptr); insert_row_expr_ptr = nullptr; } - delete insert_rows; - insert_rows = nullptr; QueryResult result = query_context_ptr->QueryStatement(insert_statement.get()); return result; } @@ -917,6 +948,17 @@ QueryResult Infinity::Import(const String &db_name, const String &table_name, co QueryResult Infinity::Export(const String &db_name, const String &table_name, Vector *columns, const String &path, ExportOptions export_options) { + DeferFn free_column_expressions([&]() { + if (columns != nullptr) { + for (auto &column_expr : *columns) { + delete column_expr; + column_expr = nullptr; + } + delete columns; + columns = nullptr; + } + }); + UniquePtr query_context_ptr; GET_QUERY_CONTEXT(GetQueryContext(), query_context_ptr); UniquePtr export_statement = MakeUnique(); @@ -940,6 
+982,7 @@ Infinity::Export(const String &db_name, const String &table_name, Vector<ParsedExpr *> *columns, const String &path, ExportOptions export_options) {
     export_statement->row_limit_ = export_options.row_limit_;
     QueryResult result = query_context_ptr->QueryStatement(export_statement.get());
+    columns = nullptr;
     return result;
 }
 
@@ -961,6 +1004,17 @@ QueryResult Infinity::Delete(const String &db_name, const String &table_name, ParsedExpr *filter) {
 }
 
 QueryResult Infinity::Update(const String &db_name, const String &table_name, ParsedExpr *filter, Vector<UpdateExpr *> *update_list) {
+    DeferFn free_update_list([&]() {
+        if (update_list != nullptr) {
+            for (auto &update_expr : *update_list) {
+                delete update_expr;
+                update_expr = nullptr;
+            }
+            delete update_list;
+            update_list = nullptr;
+        }
+    });
+
     UniquePtr<QueryContext> query_context_ptr;
     GET_QUERY_CONTEXT(GetQueryContext(), query_context_ptr);
     UniquePtr<UpdateStatement> update_statement = MakeUnique<UpdateStatement>();
@@ -975,6 +1029,7 @@ QueryResult Infinity::Update(const String &db_name, const String &table_name, ParsedExpr *filter, Vector<UpdateExpr *> *update_list) {
         ToLower(update_expr_ptr->column_name);
     }
     QueryResult result = query_context_ptr->QueryStatement(update_statement.get());
+    update_list = nullptr;
     return result;
 }
 
@@ -989,6 +1044,46 @@ QueryResult Infinity::Explain(const String &db_name,
                               Vector<ParsedExpr *> *highlight_columns,
                               Vector<OrderByExpr *> *order_by_list,
                               Vector<ParsedExpr *> *group_by_list) {
+    DeferFn free_output_columns([&]() {
+        if (output_columns != nullptr) {
+            for (auto &output_column : *output_columns) {
+                delete output_column;
+                output_column = nullptr;
+            }
+            delete output_columns;
+            output_columns = nullptr;
+        }
+    });
+    DeferFn free_highlight_columns([&]() {
+        if (highlight_columns != nullptr) {
+            for (auto &highlight_column : *highlight_columns) {
+                delete highlight_column;
+                highlight_column = nullptr;
+            }
+            delete highlight_columns;
+            highlight_columns = nullptr;
+        }
+    });
+    DeferFn free_order_by_list([&]() {
+        if (order_by_list != nullptr) {
+            for (auto &order_by : *order_by_list) {
+                delete order_by;
+                order_by = nullptr;
+            }
+            delete order_by_list;
+            order_by_list = nullptr;
+        }
+    });
+    DeferFn free_group_by_list([&]() {
+        if (group_by_list != nullptr) {
+            for (auto &group_by : *group_by_list) {
+                delete group_by;
+                group_by = nullptr;
+            }
+            delete group_by_list;
+            group_by_list = nullptr;
+        }
+    });
     UniquePtr<QueryContext> query_context_ptr;
     GET_QUERY_CONTEXT(GetQueryContext(), query_context_ptr);
@@ -1014,11 +1109,16 @@ QueryResult Infinity::Explain(const String &db_name,
     select_statement->search_expr_ = search_expr;
     select_statement->limit_expr_ = limit;
     select_statement->offset_expr_ = offset;
-    select_statement->order_by_list = order_by_list;
+    select_statement->order_by_list_ = order_by_list;
+    select_statement->group_by_list_ = group_by_list;
     explain_statment->statement_ = select_statement;
 
     QueryResult result = query_context_ptr->QueryStatement(explain_statment.get());
+    output_columns = nullptr;
+    highlight_columns = nullptr;
+    order_by_list = nullptr;
+    group_by_list = nullptr;
     return result;
 }
 
@@ -1031,7 +1131,57 @@ QueryResult Infinity::Search(const String &db_name,
                              Vector<ParsedExpr *> *output_columns,
                              Vector<ParsedExpr *> *highlight_columns,
                              Vector<OrderByExpr *> *order_by_list,
-                             Vector<ParsedExpr *> *group_by_list) {
+                             Vector<ParsedExpr *> *group_by_list,
+                             bool total_hits_count_flag) {
+    if (total_hits_count_flag) {
+        if (limit == nullptr) {
+            QueryResult query_result;
+            query_result.result_table_ = nullptr;
+            query_result.status_ = Status::InvalidQueryOption("'total_hits_count' is only valid when limit keyword is set");
+            return query_result;
+        }
+    }
+
+    DeferFn free_output_columns([&]() {
+        if (output_columns != nullptr) {
+            for (auto &output_column : *output_columns) {
+                delete output_column;
+                output_column = nullptr;
+            }
+            delete output_columns;
+            output_columns = nullptr;
+        }
+    });
+    DeferFn free_highlight_columns([&]() {
+        if (highlight_columns != nullptr) {
+            for (auto &highlight_column : *highlight_columns) {
+                delete highlight_column;
+                highlight_column = nullptr;
+            }
+            delete highlight_columns;
+            highlight_columns = nullptr;
+        }
+    });
+    DeferFn free_order_by_list([&]() {
+        if (order_by_list != nullptr) {
+            for (auto &order_by : *order_by_list) {
+                delete order_by;
+                order_by = nullptr;
+            }
+            delete order_by_list;
+            order_by_list = nullptr;
+        }
+    });
+    DeferFn free_group_by_list([&]() {
+        if (group_by_list != nullptr) {
+            for (auto &group_by : *group_by_list) {
+                delete group_by;
+                group_by = nullptr;
+            }
+            delete group_by_list;
+            group_by_list = nullptr;
+        }
+    });
     UniquePtr<QueryContext> query_context_ptr;
     GET_QUERY_CONTEXT(GetQueryContext(), query_context_ptr);
     UniquePtr<SelectStatement> select_statement = MakeUnique<SelectStatement>();
@@ -1053,9 +1203,15 @@ QueryResult Infinity::Search(const String &db_name,
     select_statement->search_expr_ = search_expr;
     select_statement->limit_expr_ = limit;
     select_statement->offset_expr_ = offset;
-    select_statement->order_by_list = order_by_list;
+    select_statement->order_by_list_ = order_by_list;
+    select_statement->group_by_list_ = group_by_list;
+    select_statement->total_hits_count_flag_ = total_hits_count_flag;
     QueryResult result = query_context_ptr->QueryStatement(select_statement.get());
+    output_columns = nullptr;
+    highlight_columns = nullptr;
+    order_by_list = nullptr;
+    group_by_list = nullptr;
     return result;
 }
 
diff --git a/src/main/infinity.cppm b/src/main/infinity.cppm
index 7a319a306e..699e7ecc3f 100644
--- a/src/main/infinity.cppm
+++ b/src/main/infinity.cppm
@@ -74,6 +74,8 @@ public:
 
     QueryResult Flush(const String &flush_type = "");
 
+    QueryResult Compact(const String &db_name, const String &table_name);
+
     QueryResult SetVariableOrConfig(const String &name, bool value, SetScope scope);
 
     QueryResult SetVariableOrConfig(const String &name, i64 value, SetScope scope);
@@ -191,7 +193,8 @@ public:
                        Vector<ParsedExpr *> *output_columns,
                        Vector<ParsedExpr *> *highlight_columns,
                        Vector<OrderByExpr *> *order_by_list,
-                       Vector<ParsedExpr *> *group_by_list);
+                       Vector<ParsedExpr *> *group_by_list,
+                       bool total_hits_count_flag);
 
     QueryResult Optimize(const String &db_name, const String &table_name, OptimizeOptions optimize_options = OptimizeOptions{});
 
diff --git a/src/main/infinity_context.cpp b/src/main/infinity_context.cpp
index 942b773cb9..442c6b5243 100644
--- a/src/main/infinity_context.cpp
+++ b/src/main/infinity_context.cpp
@@ -35,6 +35,7 @@ import wal_manager;
 import global_resource_usage;
 import infinity_thrift_service;
 import defer_op;
+import virtual_store;
 
 namespace infinity {
 
@@ -63,6 +64,7 @@ void InfinityContext::InitPhase1(const SharedPtr<String> &config_path, bool admi
     config_ = MakeUnique<Config>();
     auto status = config_->Init(config_path, default_config);
     if (!status.ok()) {
+        fmt::print("Error: {}\n", status.message());
         std::exit(static_cast<int>(status.code()));
     }
     InfinityContext::instance().config()->PrintAll(); // Print all configs
@@ -92,10 +94,10 @@ void InfinityContext::InitPhase2() {
-    if (config_->ServerMode() == "cluster") {
+    if (config_->ServerMode() == "admin") {
         // Admin mode or cluster start phase
         infinity_context_inited_ = true;
-// fmt::print("Infinity is started as a cluster node.\n");
+        // fmt::print("Infinity is started as a cluster node.\n");
         return;
     }
 
@@ -153,6 +155,9 @@ Status InfinityContext::ChangeServerRole(NodeRole target_role, bool from_leader,
             break;
         }
         case NodeRole::kLeader: {
+            if
(config_->StorageType() == StorageType::kLocal) { + return Status::InvalidStorageType("shared storage", "local"); + } // No need to un-init cluster manager, since current is admin Status init_status = cluster_manager_->InitAsLeader(node_name); if (!init_status.ok()) { @@ -169,6 +174,10 @@ Status InfinityContext::ChangeServerRole(NodeRole target_role, bool from_leader, break; } case NodeRole::kFollower: { + if (config_->StorageType() == StorageType::kLocal) { + return Status::InvalidStorageType("shared storage", "local"); + } + Status set_storage_status = storage_->SetStorageMode(StorageMode::kReadable); if (!set_storage_status.ok()) { return set_storage_status; @@ -189,6 +198,10 @@ Status InfinityContext::ChangeServerRole(NodeRole target_role, bool from_leader, break; } case NodeRole::kLearner: { + if (config_->StorageType() == StorageType::kLocal) { + return Status::InvalidStorageType("shared storage", "local"); + } + Status set_storage_status = storage_->SetStorageMode(StorageMode::kReadable); if (!set_storage_status.ok()) { return set_storage_status; @@ -226,8 +239,7 @@ Status InfinityContext::ChangeServerRole(NodeRole target_role, bool from_leader, } task_scheduler_ = MakeUnique(config_.get()); - i64 cpu_limit = config_->CPULimit(); - SetIndexThreadPool(cpu_limit); + SetIndexThreadPool(); break; } case NodeRole::kStandalone: { @@ -355,9 +367,17 @@ Status InfinityContext::ChangeServerRole(NodeRole target_role, bool from_leader, } break; } - case NodeRole::kLearner: + case NodeRole::kLearner: { + if (target_role == NodeRole::kFollower) { + return Status::CantSwitchRole(fmt::format("Can't switch node role: from {} to {}", ToString(current_role), ToString(target_role))); + } + } case NodeRole::kFollower: { switch (target_role) { + case NodeRole::kLearner: { + return Status::CantSwitchRole( + fmt::format("Can't switch node role: from {} to {}", ToString(current_role), ToString(target_role))); + } case NodeRole::kAdmin: { if (cluster_manager_ == nullptr) { UnrecoverableError("cluster manager wasn't valid."); @@ -492,21 +512,18 @@ void InfinityContext::UnInit() { config_.reset(); } -void InfinityContext::SetIndexThreadPool(SizeT thread_num) { - thread_num = thread_num / 2; - if (thread_num < 2) - thread_num = 2; - LOG_TRACE(fmt::format("Set index thread pool size to {}", thread_num)); - inverting_thread_pool_.resize(thread_num); - commiting_thread_pool_.resize(thread_num); - hnsw_build_thread_pool_.resize(thread_num); +void InfinityContext::SetIndexThreadPool() { + LOG_TRACE("Set index thread pool."); + inverting_thread_pool_.resize(config_->DenseIndexBuildingWorker()); + commiting_thread_pool_.resize(config_->SparseIndexBuildingWorker()); + hnsw_build_thread_pool_.resize(config_->FulltextIndexBuildingWorker()); } void InfinityContext::RestoreIndexThreadPoolToDefault() { - LOG_TRACE("Restore index thread pool size to default"); - inverting_thread_pool_.resize(4); - commiting_thread_pool_.resize(2); - hnsw_build_thread_pool_.resize(4); + LOG_TRACE("Restore index thread pool size to default."); + inverting_thread_pool_.resize(config_->DenseIndexBuildingWorker()); + commiting_thread_pool_.resize(config_->SparseIndexBuildingWorker()); + hnsw_build_thread_pool_.resize(config_->FulltextIndexBuildingWorker()); } void InfinityContext::AddThriftServerFn(std::function start_func, std::function stop_func) { diff --git a/src/main/infinity_context.cppm b/src/main/infinity_context.cppm index 136cc6b69d..fd0e60722e 100644 --- a/src/main/infinity_context.cppm +++ b/src/main/infinity_context.cppm @@ -64,7 
+64,7 @@ public: void UnInit(); - void SetIndexThreadPool(SizeT thread_num); + void SetIndexThreadPool(); void RestoreIndexThreadPoolToDefault(); void AddThriftServerFn(std::function start_func, std::function stop_func); @@ -90,11 +90,11 @@ private: atomic_bool infinity_context_inited_{false}; // For fulltext index - ThreadPool inverting_thread_pool_{4}; + ThreadPool inverting_thread_pool_{2}; ThreadPool commiting_thread_pool_{2}; // For hnsw index - ThreadPool hnsw_build_thread_pool_{4}; + ThreadPool hnsw_build_thread_pool_{2}; mutable std::mutex mutex_; diff --git a/src/main/options.cpp b/src/main/options.cpp index d5b06c838c..113dc59ae9 100644 --- a/src/main/options.cpp +++ b/src/main/options.cpp @@ -78,6 +78,10 @@ GlobalOptions::GlobalOptions() { name2index_[String(TEMP_DIR_OPTION_NAME)] = GlobalOptionIndex::kTempDir; name2index_[String(MEMINDEX_MEMORY_QUOTA_OPTION_NAME)] = GlobalOptionIndex::kMemIndexMemoryQuota; + name2index_[String(DENSE_INDEX_BUILDING_WORKER_OPTION_NAME)] = GlobalOptionIndex::kDenseIndexBuildingWorker; + name2index_[String(SPARSE_INDEX_BUILDING_WORKER_OPTION_NAME)] = GlobalOptionIndex::kSparseIndexBuildingWorker; + name2index_[String(FULLTEXT_INDEX_BUILDING_WORKER_OPTION_NAME)] = GlobalOptionIndex::kFulltextIndexBuildingWorker; + name2index_[String(RESULT_CACHE_OPTION_NAME)] = GlobalOptionIndex::kResultCache; name2index_[String(CACHE_RESULT_CAPACITY_OPTION_NAME)] = GlobalOptionIndex::kCacheResultCapacity; diff --git a/src/main/options.cppm b/src/main/options.cppm index 33b78f3a64..2ac15c9f45 100644 --- a/src/main/options.cppm +++ b/src/main/options.cppm @@ -165,8 +165,10 @@ export enum class GlobalOptionIndex : i8 { kPeerConnectTimeout = 49, kPeerRecvTimeout = 50, kPeerSendTimeout = 51, - - kInvalid = 52, + kDenseIndexBuildingWorker = 53, + kSparseIndexBuildingWorker = 54, + kFulltextIndexBuildingWorker = 55, + kInvalid = 57, }; export struct GlobalOptions { diff --git a/src/main/variables.cpp b/src/main/variables.cpp index 65353c8b9c..c44e6018c6 100644 --- a/src/main/variables.cpp +++ b/src/main/variables.cpp @@ -44,10 +44,12 @@ void VarUtil::InitVariablesMap() { global_name_map_[CPU_USAGE_VAR_NAME.data()] = GlobalVariable::kCPUUsage; global_name_map_["jeprof"] = GlobalVariable::kJeProf; global_name_map_["cleanup_trace"] = GlobalVariable::kCleanupTrace; - global_name_map_[FOLLOWER_NUMBER.data()] = GlobalVariable::kFollowerNum; + global_name_map_[FOLLOWER_NUMBER_VAR_NAME.data()] = GlobalVariable::kFollowerNum; global_name_map_[RESULT_CACHE_OPTION_NAME.data()] = GlobalVariable::kResultCache; global_name_map_[CACHE_RESULT_CAPACITY_OPTION_NAME.data()] = GlobalVariable::kCacheResultCapacity; global_name_map_[CACHE_RESULT_NUM_VAR_NAME.data()] = GlobalVariable::kCacheResultNum; + global_name_map_[MEMORY_CACHE_MISS_VAR_NAME.data()] = GlobalVariable::kMemoryCacheMiss; + global_name_map_[DISK_CACHE_MISS_VAR_NAME.data()] = GlobalVariable::kDiskCacheMiss; session_name_map_[QUERY_COUNT_VAR_NAME.data()] = SessionVariable::kQueryCount; session_name_map_[TOTAL_COMMIT_COUNT_VAR_NAME.data()] = SessionVariable::kTotalCommitCount; diff --git a/src/main/variables.cppm b/src/main/variables.cppm index 1ea862061f..3a651c9b45 100644 --- a/src/main/variables.cppm +++ b/src/main/variables.cppm @@ -49,6 +49,8 @@ export enum class GlobalVariable { kResultCache, // global kCacheResultCapacity, // global kCacheResultNum, // global + kMemoryCacheMiss, // global + kDiskCacheMiss, // global kInvalid, }; diff --git a/src/network/http/http_search.cpp b/src/network/http/http_search.cpp index 
28b64bc407..8733cd3e2b 100644 --- a/src/network/http/http_search.cpp +++ b/src/network/http/http_search.cpp @@ -66,6 +66,7 @@ void HTTPSearch::Process(Infinity *infinity_ptr, Vector *output_columns{nullptr}; Vector *highlight_columns{nullptr}; Vector *order_by_list{nullptr}; + bool total_hits_count_flag{}; DeferFn defer_fn([&]() { if (output_columns != nullptr) { for (auto &expr : *output_columns) { @@ -189,6 +190,40 @@ void HTTPSearch::Process(Infinity *infinity_ptr, if (!search_expr) { return; } + } else if (IsEqual(key, "option")) { + auto &option_object = elem.value(); + if (!option_object.is_object()) { + response["error_code"] = ErrorCode::kInvalidExpression; + response["error_message"] = "Option field should be object"; + return; + } + + for (const auto &option : option_object.items()) { + String key = option.key(); + ToLower(key); + if (key == "total_hits_count") { + if(option.value().is_string()) { + String value = option.value(); + ToLower(value); + if (value == "true") { + total_hits_count_flag = true; + } else if (value == "false") { + total_hits_count_flag = false; + } else { + response["error_code"] = ErrorCode::kInvalidExpression; + response["error_message"] = fmt::format("Unknown search option: {}, value: {}", key, value); + return; + } + } else if(option.value().is_boolean()) { + total_hits_count_flag = option.value(); + } else { + response["error_code"] = ErrorCode::kInvalidExpression; + response["error_message"] = "Invalid total hits count type"; + return; + } + + } + } } else { response["error_code"] = ErrorCode::kInvalidExpression; response["error_message"] = "Unknown expression: " + key; @@ -205,7 +240,8 @@ void HTTPSearch::Process(Infinity *infinity_ptr, output_columns, highlight_columns, order_by_list, - nullptr); + nullptr, + total_hits_count_flag); output_columns = nullptr; highlight_columns = nullptr; @@ -229,6 +265,10 @@ void HTTPSearch::Process(Infinity *infinity_ptr, } } + if(result.result_table_->total_hits_count_flag_) { + response["total_hits_count"] = result.result_table_->total_hits_count_; + } + response["error_code"] = 0; http_status = HTTPStatus::CODE_200; } else { @@ -484,10 +524,11 @@ UniquePtr HTTPSearch::ParseFilter(const nlohmann::json &json_object, Vector *HTTPSearch::ParseOutput(const nlohmann::json &output_list, HTTPStatus &http_status, nlohmann::json &response) { Vector *output_columns = new Vector(); - DeferFn defer_fn([&]() { + DeferFn free_output_columns([&]() { if (output_columns != nullptr) { for (auto &expr : *output_columns) { delete expr; + expr = nullptr; } delete output_columns; output_columns = nullptr; diff --git a/src/network/http_server.cpp b/src/network/http_server.cpp index e83b0b326f..3c0dff0682 100644 --- a/src/network/http_server.cpp +++ b/src/network/http_server.cpp @@ -477,6 +477,8 @@ class CreateTableHandler final : public HttpRequestHandler { if (json_response["error_code"] == 0) { auto result = infinity->CreateTable(database_name, table_name, column_definitions, table_constraint, options); + column_definitions.clear(); + table_constraint.clear(); if (result.IsOk()) { json_response["error_code"] = 0; http_status = HTTPStatus::CODE_200; @@ -485,8 +487,6 @@ class CreateTableHandler final : public HttpRequestHandler { json_response["error_message"] = result.ErrorMsg(); http_status = HTTPStatus::CODE_500; } - column_definitions.clear(); - table_constraint.clear(); } return ResponseFactory::createResponse(http_status, json_response.dump()); } @@ -871,10 +871,11 @@ class InsertHandler final : public HttpRequestHandler { 
json_response["error_message"] = fmt::format("Invalid json format: {}", data_body); } auto *insert_rows = new Vector(); - DeferFn del_insert_rows([&]() { + DeferFn free_insert_rows([&]() { if (insert_rows != nullptr) { for (auto *insert_row : *insert_rows) { delete insert_row; + insert_row = nullptr; } delete insert_rows; insert_rows = nullptr; @@ -3511,7 +3512,7 @@ class AdminSetNodeRoleHandler final : public HttpRequestHandler { if (!http_body_json.contains("role") or !http_body_json["role"].is_string()) { http_status = HTTPStatus::CODE_500; json_response["error_code"] = ErrorCode::kInvalidCommand; - json_response["error_message"] = "field 'role' is required to be set to string!"; + json_response["error_message"] = "Field 'role' is required"; return ResponseFactory::createResponse(http_status, json_response.dump()); } @@ -3531,7 +3532,7 @@ class AdminSetNodeRoleHandler final : public HttpRequestHandler { } else { http_status = HTTPStatus::CODE_500; json_response["error_code"] = ErrorCode::kInvalidNodeRole; - json_response["error_code"] = "invalid node role"; + json_response["error_message"] = fmt::format("Invalid node role {}", role); return ResponseFactory::createResponse(http_status, json_response.dump()); } @@ -3541,7 +3542,7 @@ class AdminSetNodeRoleHandler final : public HttpRequestHandler { } else { http_status = HTTPStatus::CODE_500; json_response["error_code"] = result.ErrorCode(); - json_response["error_code"] = result.ErrorMsg(); + json_response["error_message"] = result.ErrorMsg(); } return ResponseFactory::createResponse(http_status, json_response.dump()); @@ -3662,10 +3663,11 @@ class AdminShowCatalogsHandler final : public HttpRequestHandler { auto result = infinity->AdminShowCatalogs(); if (result.IsOk()) { json_response["error_code"] = 0; - if(result.result_table_->data_blocks_.empty()) { + if (result.result_table_->data_blocks_.empty()) { ; } else { - DataBlock *data_block = result.result_table_->GetDataBlockById(0).get(); // Assume the config output data only included in one data block + DataBlock *data_block = + result.result_table_->GetDataBlockById(0).get(); // Assume the config output data only included in one data block auto row_count = data_block->row_count(); for (int row = 0; row < row_count; ++row) { nlohmann::json node_json; diff --git a/src/network/infinity_thrift/InfinityService.cpp b/src/network/infinity_thrift/InfinityService.cpp index d3fa99f290..0e8f32ff70 100644 --- a/src/network/infinity_thrift/InfinityService.cpp +++ b/src/network/infinity_thrift/InfinityService.cpp @@ -6927,6 +6927,193 @@ uint32_t InfinityService_Flush_presult::read(::apache::thrift::protocol::TProtoc return xfer; } + +InfinityService_Compact_args::~InfinityService_Compact_args() noexcept { +} + + +uint32_t InfinityService_Compact_args::read(::apache::thrift::protocol::TProtocol* iprot) { + + ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot); + uint32_t xfer = 0; + std::string fname; + ::apache::thrift::protocol::TType ftype; + int16_t fid; + + xfer += iprot->readStructBegin(fname); + + using ::apache::thrift::protocol::TProtocolException; + + + while (true) + { + xfer += iprot->readFieldBegin(fname, ftype, fid); + if (ftype == ::apache::thrift::protocol::T_STOP) { + break; + } + switch (fid) + { + case 1: + if (ftype == ::apache::thrift::protocol::T_STRUCT) { + xfer += this->request.read(iprot); + this->__isset.request = true; + } else { + xfer += iprot->skip(ftype); + } + break; + default: + xfer += iprot->skip(ftype); + break; + } + xfer += iprot->readFieldEnd(); 
+ } + + xfer += iprot->readStructEnd(); + + return xfer; +} + +uint32_t InfinityService_Compact_args::write(::apache::thrift::protocol::TProtocol* oprot) const { + uint32_t xfer = 0; + ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot); + xfer += oprot->writeStructBegin("InfinityService_Compact_args"); + + xfer += oprot->writeFieldBegin("request", ::apache::thrift::protocol::T_STRUCT, 1); + xfer += this->request.write(oprot); + xfer += oprot->writeFieldEnd(); + + xfer += oprot->writeFieldStop(); + xfer += oprot->writeStructEnd(); + return xfer; +} + + +InfinityService_Compact_pargs::~InfinityService_Compact_pargs() noexcept { +} + + +uint32_t InfinityService_Compact_pargs::write(::apache::thrift::protocol::TProtocol* oprot) const { + uint32_t xfer = 0; + ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot); + xfer += oprot->writeStructBegin("InfinityService_Compact_pargs"); + + xfer += oprot->writeFieldBegin("request", ::apache::thrift::protocol::T_STRUCT, 1); + xfer += (*(this->request)).write(oprot); + xfer += oprot->writeFieldEnd(); + + xfer += oprot->writeFieldStop(); + xfer += oprot->writeStructEnd(); + return xfer; +} + + +InfinityService_Compact_result::~InfinityService_Compact_result() noexcept { +} + + +uint32_t InfinityService_Compact_result::read(::apache::thrift::protocol::TProtocol* iprot) { + + ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot); + uint32_t xfer = 0; + std::string fname; + ::apache::thrift::protocol::TType ftype; + int16_t fid; + + xfer += iprot->readStructBegin(fname); + + using ::apache::thrift::protocol::TProtocolException; + + + while (true) + { + xfer += iprot->readFieldBegin(fname, ftype, fid); + if (ftype == ::apache::thrift::protocol::T_STOP) { + break; + } + switch (fid) + { + case 0: + if (ftype == ::apache::thrift::protocol::T_STRUCT) { + xfer += this->success.read(iprot); + this->__isset.success = true; + } else { + xfer += iprot->skip(ftype); + } + break; + default: + xfer += iprot->skip(ftype); + break; + } + xfer += iprot->readFieldEnd(); + } + + xfer += iprot->readStructEnd(); + + return xfer; +} + +uint32_t InfinityService_Compact_result::write(::apache::thrift::protocol::TProtocol* oprot) const { + + uint32_t xfer = 0; + + xfer += oprot->writeStructBegin("InfinityService_Compact_result"); + + if (this->__isset.success) { + xfer += oprot->writeFieldBegin("success", ::apache::thrift::protocol::T_STRUCT, 0); + xfer += this->success.write(oprot); + xfer += oprot->writeFieldEnd(); + } + xfer += oprot->writeFieldStop(); + xfer += oprot->writeStructEnd(); + return xfer; +} + + +InfinityService_Compact_presult::~InfinityService_Compact_presult() noexcept { +} + + +uint32_t InfinityService_Compact_presult::read(::apache::thrift::protocol::TProtocol* iprot) { + + ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot); + uint32_t xfer = 0; + std::string fname; + ::apache::thrift::protocol::TType ftype; + int16_t fid; + + xfer += iprot->readStructBegin(fname); + + using ::apache::thrift::protocol::TProtocolException; + + + while (true) + { + xfer += iprot->readFieldBegin(fname, ftype, fid); + if (ftype == ::apache::thrift::protocol::T_STOP) { + break; + } + switch (fid) + { + case 0: + if (ftype == ::apache::thrift::protocol::T_STRUCT) { + xfer += (*(this->success)).read(iprot); + this->__isset.success = true; + } else { + xfer += iprot->skip(ftype); + } + break; + default: + xfer += iprot->skip(ftype); + break; + } + xfer += iprot->readFieldEnd(); + } + + xfer += iprot->readStructEnd(); + + 
return xfer; +} + void InfinityServiceClient::Connect(CommonResponse& _return, const ConnectRequest& request) { send_Connect(request); @@ -9073,6 +9260,64 @@ void InfinityServiceClient::recv_Flush(CommonResponse& _return) throw ::apache::thrift::TApplicationException(::apache::thrift::TApplicationException::MISSING_RESULT, "Flush failed: unknown result"); } +void InfinityServiceClient::Compact(CommonResponse& _return, const CompactRequest& request) +{ + send_Compact(request); + recv_Compact(_return); +} + +void InfinityServiceClient::send_Compact(const CompactRequest& request) +{ + int32_t cseqid = 0; + oprot_->writeMessageBegin("Compact", ::apache::thrift::protocol::T_CALL, cseqid); + + InfinityService_Compact_pargs args; + args.request = &request; + args.write(oprot_); + + oprot_->writeMessageEnd(); + oprot_->getTransport()->writeEnd(); + oprot_->getTransport()->flush(); +} + +void InfinityServiceClient::recv_Compact(CommonResponse& _return) +{ + + int32_t rseqid = 0; + std::string fname; + ::apache::thrift::protocol::TMessageType mtype; + + iprot_->readMessageBegin(fname, mtype, rseqid); + if (mtype == ::apache::thrift::protocol::T_EXCEPTION) { + ::apache::thrift::TApplicationException x; + x.read(iprot_); + iprot_->readMessageEnd(); + iprot_->getTransport()->readEnd(); + throw x; + } + if (mtype != ::apache::thrift::protocol::T_REPLY) { + iprot_->skip(::apache::thrift::protocol::T_STRUCT); + iprot_->readMessageEnd(); + iprot_->getTransport()->readEnd(); + } + if (fname.compare("Compact") != 0) { + iprot_->skip(::apache::thrift::protocol::T_STRUCT); + iprot_->readMessageEnd(); + iprot_->getTransport()->readEnd(); + } + InfinityService_Compact_presult result; + result.success = &_return; + result.read(iprot_); + iprot_->readMessageEnd(); + iprot_->getTransport()->readEnd(); + + if (result.__isset.success) { + // _return pointer has now been filled + return; + } + throw ::apache::thrift::TApplicationException(::apache::thrift::TApplicationException::MISSING_RESULT, "Compact failed: unknown result"); +} + bool InfinityServiceProcessor::dispatchCall(::apache::thrift::protocol::TProtocol* iprot, ::apache::thrift::protocol::TProtocol* oprot, const std::string& fname, int32_t seqid, void* callContext) { ProcessMap::iterator pfn; pfn = processMap_.find(fname); @@ -11090,6 +11335,60 @@ void InfinityServiceProcessor::process_Flush(int32_t seqid, ::apache::thrift::pr } } +void InfinityServiceProcessor::process_Compact(int32_t seqid, ::apache::thrift::protocol::TProtocol* iprot, ::apache::thrift::protocol::TProtocol* oprot, void* callContext) +{ + void* ctx = nullptr; + if (this->eventHandler_.get() != nullptr) { + ctx = this->eventHandler_->getContext("InfinityService.Compact", callContext); + } + ::apache::thrift::TProcessorContextFreer freer(this->eventHandler_.get(), ctx, "InfinityService.Compact"); + + if (this->eventHandler_.get() != nullptr) { + this->eventHandler_->preRead(ctx, "InfinityService.Compact"); + } + + InfinityService_Compact_args args; + args.read(iprot); + iprot->readMessageEnd(); + uint32_t bytes = iprot->getTransport()->readEnd(); + + if (this->eventHandler_.get() != nullptr) { + this->eventHandler_->postRead(ctx, "InfinityService.Compact", bytes); + } + + InfinityService_Compact_result result; + try { + iface_->Compact(result.success, args.request); + result.__isset.success = true; + } catch (const std::exception& e) { + if (this->eventHandler_.get() != nullptr) { + this->eventHandler_->handlerError(ctx, "InfinityService.Compact"); + } + + 
::apache::thrift::TApplicationException x(e.what()); + oprot->writeMessageBegin("Compact", ::apache::thrift::protocol::T_EXCEPTION, seqid); + x.write(oprot); + oprot->writeMessageEnd(); + oprot->getTransport()->writeEnd(); + oprot->getTransport()->flush(); + return; + } + + if (this->eventHandler_.get() != nullptr) { + this->eventHandler_->preWrite(ctx, "InfinityService.Compact"); + } + + oprot->writeMessageBegin("Compact", ::apache::thrift::protocol::T_REPLY, seqid); + result.write(oprot); + oprot->writeMessageEnd(); + bytes = oprot->getTransport()->writeEnd(); + oprot->getTransport()->flush(); + + if (this->eventHandler_.get() != nullptr) { + this->eventHandler_->postWrite(ctx, "InfinityService.Compact", bytes); + } +} + ::std::shared_ptr< ::apache::thrift::TProcessor > InfinityServiceProcessorFactory::getProcessor(const ::apache::thrift::TConnectionInfo& connInfo) { ::apache::thrift::ReleaseHandler< InfinityServiceIfFactory > cleanup(handlerFactory_); ::std::shared_ptr< InfinityServiceIf > handler(handlerFactory_->getHandler(connInfo), cleanup); @@ -14205,5 +14504,89 @@ void InfinityServiceConcurrentClient::recv_Flush(CommonResponse& _return, const } // end while(true) } +void InfinityServiceConcurrentClient::Compact(CommonResponse& _return, const CompactRequest& request) +{ + int32_t seqid = send_Compact(request); + recv_Compact(_return, seqid); +} + +int32_t InfinityServiceConcurrentClient::send_Compact(const CompactRequest& request) +{ + int32_t cseqid = this->sync_->generateSeqId(); + ::apache::thrift::async::TConcurrentSendSentry sentry(this->sync_.get()); + oprot_->writeMessageBegin("Compact", ::apache::thrift::protocol::T_CALL, cseqid); + + InfinityService_Compact_pargs args; + args.request = &request; + args.write(oprot_); + + oprot_->writeMessageEnd(); + oprot_->getTransport()->writeEnd(); + oprot_->getTransport()->flush(); + + sentry.commit(); + return cseqid; +} + +void InfinityServiceConcurrentClient::recv_Compact(CommonResponse& _return, const int32_t seqid) +{ + + int32_t rseqid = 0; + std::string fname; + ::apache::thrift::protocol::TMessageType mtype; + + // the read mutex gets dropped and reacquired as part of waitForWork() + // The destructor of this sentry wakes up other clients + ::apache::thrift::async::TConcurrentRecvSentry sentry(this->sync_.get(), seqid); + + while(true) { + if(!this->sync_->getPending(fname, mtype, rseqid)) { + iprot_->readMessageBegin(fname, mtype, rseqid); + } + if(seqid == rseqid) { + if (mtype == ::apache::thrift::protocol::T_EXCEPTION) { + ::apache::thrift::TApplicationException x; + x.read(iprot_); + iprot_->readMessageEnd(); + iprot_->getTransport()->readEnd(); + sentry.commit(); + throw x; + } + if (mtype != ::apache::thrift::protocol::T_REPLY) { + iprot_->skip(::apache::thrift::protocol::T_STRUCT); + iprot_->readMessageEnd(); + iprot_->getTransport()->readEnd(); + } + if (fname.compare("Compact") != 0) { + iprot_->skip(::apache::thrift::protocol::T_STRUCT); + iprot_->readMessageEnd(); + iprot_->getTransport()->readEnd(); + + // in a bad state, don't commit + using ::apache::thrift::protocol::TProtocolException; + throw TProtocolException(TProtocolException::INVALID_DATA); + } + InfinityService_Compact_presult result; + result.success = &_return; + result.read(iprot_); + iprot_->readMessageEnd(); + iprot_->getTransport()->readEnd(); + + if (result.__isset.success) { + // _return pointer has now been filled + sentry.commit(); + return; + } + // in a bad state, don't commit + throw 
::apache::thrift::TApplicationException(::apache::thrift::TApplicationException::MISSING_RESULT, "Compact failed: unknown result"); + } + // seqid != rseqid + this->sync_->updatePending(fname, mtype, rseqid); + + // this will temporarily unlock the readMutex, and let other clients get work done + this->sync_->waitForWork(seqid); + } // end while(true) +} + } // namespace diff --git a/src/network/infinity_thrift/InfinityService.h b/src/network/infinity_thrift/InfinityService.h index abcf51f50d..3f06f162f4 100644 --- a/src/network/infinity_thrift/InfinityService.h +++ b/src/network/infinity_thrift/InfinityService.h @@ -59,6 +59,7 @@ class InfinityServiceIf { virtual void Cleanup(CommonResponse& _return, const CommonRequest& request) = 0; virtual void Command(CommonResponse& _return, const CommandRequest& request) = 0; virtual void Flush(CommonResponse& _return, const FlushRequest& request) = 0; + virtual void Compact(CommonResponse& _return, const CompactRequest& request) = 0; }; class InfinityServiceIfFactory { @@ -199,6 +200,9 @@ class InfinityServiceNull : virtual public InfinityServiceIf { void Flush(CommonResponse& /* _return */, const FlushRequest& /* request */) override { return; } + void Compact(CommonResponse& /* _return */, const CompactRequest& /* request */) override { + return; + } }; typedef struct _InfinityService_Connect_args__isset { @@ -4049,6 +4053,110 @@ class InfinityService_Flush_presult { }; +typedef struct _InfinityService_Compact_args__isset { + _InfinityService_Compact_args__isset() : request(false) {} + bool request :1; +} _InfinityService_Compact_args__isset; + +class InfinityService_Compact_args { + public: + + InfinityService_Compact_args(const InfinityService_Compact_args&); + InfinityService_Compact_args& operator=(const InfinityService_Compact_args&); + InfinityService_Compact_args() noexcept { + } + + virtual ~InfinityService_Compact_args() noexcept; + CompactRequest request; + + _InfinityService_Compact_args__isset __isset; + + void __set_request(const CompactRequest& val); + + bool operator == (const InfinityService_Compact_args & rhs) const + { + if (!(request == rhs.request)) + return false; + return true; + } + bool operator != (const InfinityService_Compact_args &rhs) const { + return !(*this == rhs); + } + + bool operator < (const InfinityService_Compact_args & ) const; + + uint32_t read(::apache::thrift::protocol::TProtocol* iprot); + uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const; + +}; + + +class InfinityService_Compact_pargs { + public: + + + virtual ~InfinityService_Compact_pargs() noexcept; + const CompactRequest* request; + + uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const; + +}; + +typedef struct _InfinityService_Compact_result__isset { + _InfinityService_Compact_result__isset() : success(false) {} + bool success :1; +} _InfinityService_Compact_result__isset; + +class InfinityService_Compact_result { + public: + + InfinityService_Compact_result(const InfinityService_Compact_result&); + InfinityService_Compact_result& operator=(const InfinityService_Compact_result&); + InfinityService_Compact_result() noexcept { + } + + virtual ~InfinityService_Compact_result() noexcept; + CommonResponse success; + + _InfinityService_Compact_result__isset __isset; + + void __set_success(const CommonResponse& val); + + bool operator == (const InfinityService_Compact_result & rhs) const + { + if (!(success == rhs.success)) + return false; + return true; + } + bool operator != (const InfinityService_Compact_result &rhs) 
const { + return !(*this == rhs); + } + + bool operator < (const InfinityService_Compact_result & ) const; + + uint32_t read(::apache::thrift::protocol::TProtocol* iprot); + uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const; + +}; + +typedef struct _InfinityService_Compact_presult__isset { + _InfinityService_Compact_presult__isset() : success(false) {} + bool success :1; +} _InfinityService_Compact_presult__isset; + +class InfinityService_Compact_presult { + public: + + + virtual ~InfinityService_Compact_presult() noexcept; + CommonResponse* success; + + _InfinityService_Compact_presult__isset __isset; + + uint32_t read(::apache::thrift::protocol::TProtocol* iprot); + +}; + class InfinityServiceClient : virtual public InfinityServiceIf { public: InfinityServiceClient(std::shared_ptr< ::apache::thrift::protocol::TProtocol> prot) { @@ -4185,6 +4293,9 @@ class InfinityServiceClient : virtual public InfinityServiceIf { void Flush(CommonResponse& _return, const FlushRequest& request) override; void send_Flush(const FlushRequest& request); void recv_Flush(CommonResponse& _return); + void Compact(CommonResponse& _return, const CompactRequest& request) override; + void send_Compact(const CompactRequest& request); + void recv_Compact(CommonResponse& _return); protected: std::shared_ptr< ::apache::thrift::protocol::TProtocol> piprot_; std::shared_ptr< ::apache::thrift::protocol::TProtocol> poprot_; @@ -4237,6 +4348,7 @@ class InfinityServiceProcessor : public ::apache::thrift::TDispatchProcessor { void process_Cleanup(int32_t seqid, ::apache::thrift::protocol::TProtocol* iprot, ::apache::thrift::protocol::TProtocol* oprot, void* callContext); void process_Command(int32_t seqid, ::apache::thrift::protocol::TProtocol* iprot, ::apache::thrift::protocol::TProtocol* oprot, void* callContext); void process_Flush(int32_t seqid, ::apache::thrift::protocol::TProtocol* iprot, ::apache::thrift::protocol::TProtocol* oprot, void* callContext); + void process_Compact(int32_t seqid, ::apache::thrift::protocol::TProtocol* iprot, ::apache::thrift::protocol::TProtocol* oprot, void* callContext); public: InfinityServiceProcessor(::std::shared_ptr iface) : iface_(iface) { @@ -4277,6 +4389,7 @@ class InfinityServiceProcessor : public ::apache::thrift::TDispatchProcessor { processMap_["Cleanup"] = &InfinityServiceProcessor::process_Cleanup; processMap_["Command"] = &InfinityServiceProcessor::process_Command; processMap_["Flush"] = &InfinityServiceProcessor::process_Flush; + processMap_["Compact"] = &InfinityServiceProcessor::process_Compact; } virtual ~InfinityServiceProcessor() {} @@ -4675,6 +4788,16 @@ class InfinityServiceMultiface : virtual public InfinityServiceIf { return; } + void Compact(CommonResponse& _return, const CompactRequest& request) override { + size_t sz = ifaces_.size(); + size_t i = 0; + for (; i < (sz - 1); ++i) { + ifaces_[i]->Compact(_return, request); + } + ifaces_[i]->Compact(_return, request); + return; + } + }; // The 'concurrent' client is a thread safe client that correctly handles @@ -4818,6 +4941,9 @@ class InfinityServiceConcurrentClient : virtual public InfinityServiceIf { void Flush(CommonResponse& _return, const FlushRequest& request) override; int32_t send_Flush(const FlushRequest& request); void recv_Flush(CommonResponse& _return, const int32_t seqid); + void Compact(CommonResponse& _return, const CompactRequest& request) override; + int32_t send_Compact(const CompactRequest& request); + void recv_Compact(CommonResponse& _return, const int32_t seqid); protected: 
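// The SelectRequest/SelectResponse extensions below thread the hit-count flag end to end.
// Client-side sketch (setter and field ids as declared in this diff; the count value in
// the comment is illustrative only):
SelectRequest select_req;
select_req.__set_total_hits_count(true);  // serialized as optional field 13 (T_BOOL)
// ... after Select() returns, the response may carry a small JSON blob:
// select_resp.extra_result == "{\"total_hits_count\":42}"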
std::shared_ptr< ::apache::thrift::protocol::TProtocol> piprot_; std::shared_ptr< ::apache::thrift::protocol::TProtocol> poprot_; diff --git a/src/network/infinity_thrift/infinity_types.cpp b/src/network/infinity_thrift/infinity_types.cpp index 9fe664d0da..58c2cf28fc 100644 --- a/src/network/infinity_thrift/infinity_types.cpp +++ b/src/network/infinity_thrift/infinity_types.cpp @@ -11625,6 +11625,11 @@ void SelectRequest::__set_order_by_list(const std::vector & val) { this->order_by_list = val; __isset.order_by_list = true; } + +void SelectRequest::__set_total_hits_count(const bool val) { + this->total_hits_count = val; +__isset.total_hits_count = true; +} std::ostream& operator<<(std::ostream& out, const SelectRequest& obj) { obj.printTo(out); @@ -11797,6 +11802,14 @@ uint32_t SelectRequest::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 13: + if (ftype == ::apache::thrift::protocol::T_BOOL) { + xfer += iprot->readBool(this->total_hits_count); + this->__isset.total_hits_count = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -11902,6 +11915,11 @@ uint32_t SelectRequest::write(::apache::thrift::protocol::TProtocol* oprot) cons } xfer += oprot->writeFieldEnd(); } + if (this->__isset.total_hits_count) { + xfer += oprot->writeFieldBegin("total_hits_count", ::apache::thrift::protocol::T_BOOL, 13); + xfer += oprot->writeBool(this->total_hits_count); + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -11921,6 +11939,7 @@ void swap(SelectRequest &a, SelectRequest &b) { swap(a.limit_expr, b.limit_expr); swap(a.offset_expr, b.offset_expr); swap(a.order_by_list, b.order_by_list); + swap(a.total_hits_count, b.total_hits_count); swap(a.__isset, b.__isset); } @@ -11937,6 +11956,7 @@ SelectRequest::SelectRequest(const SelectRequest& other474) { limit_expr = other474.limit_expr; offset_expr = other474.offset_expr; order_by_list = other474.order_by_list; + total_hits_count = other474.total_hits_count; __isset = other474.__isset; } SelectRequest& SelectRequest::operator=(const SelectRequest& other475) { @@ -11952,6 +11972,7 @@ SelectRequest& SelectRequest::operator=(const SelectRequest& other475) { limit_expr = other475.limit_expr; offset_expr = other475.offset_expr; order_by_list = other475.order_by_list; + total_hits_count = other475.total_hits_count; __isset = other475.__isset; return *this; } @@ -11970,6 +11991,7 @@ void SelectRequest::printTo(std::ostream& out) const { out << ", " << "limit_expr="; (__isset.limit_expr ? (out << to_string(limit_expr)) : (out << "")); out << ", " << "offset_expr="; (__isset.offset_expr ? (out << to_string(offset_expr)) : (out << "")); out << ", " << "order_by_list="; (__isset.order_by_list ? (out << to_string(order_by_list)) : (out << "")); + out << ", " << "total_hits_count="; (__isset.total_hits_count ? 
(out << to_string(total_hits_count)) : (out << "")); out << ")"; } @@ -11993,6 +12015,10 @@ void SelectResponse::__set_column_defs(const std::vector & val) { void SelectResponse::__set_column_fields(const std::vector & val) { this->column_fields = val; } + +void SelectResponse::__set_extra_result(const std::string& val) { + this->extra_result = val; +} std::ostream& operator<<(std::ostream& out, const SelectResponse& obj) { obj.printTo(out); @@ -12077,6 +12103,14 @@ uint32_t SelectResponse::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 5: + if (ftype == ::apache::thrift::protocol::T_STRING) { + xfer += iprot->readString(this->extra_result); + this->__isset.extra_result = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -12126,6 +12160,10 @@ uint32_t SelectResponse::write(::apache::thrift::protocol::TProtocol* oprot) con } xfer += oprot->writeFieldEnd(); + xfer += oprot->writeFieldBegin("extra_result", ::apache::thrift::protocol::T_STRING, 5); + xfer += oprot->writeString(this->extra_result); + xfer += oprot->writeFieldEnd(); + xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -12137,6 +12175,7 @@ void swap(SelectResponse &a, SelectResponse &b) { swap(a.error_msg, b.error_msg); swap(a.column_defs, b.column_defs); swap(a.column_fields, b.column_fields); + swap(a.extra_result, b.extra_result); swap(a.__isset, b.__isset); } @@ -12145,6 +12184,7 @@ SelectResponse::SelectResponse(const SelectResponse& other488) { error_msg = other488.error_msg; column_defs = other488.column_defs; column_fields = other488.column_fields; + extra_result = other488.extra_result; __isset = other488.__isset; } SelectResponse& SelectResponse::operator=(const SelectResponse& other489) { @@ -12152,6 +12192,7 @@ SelectResponse& SelectResponse::operator=(const SelectResponse& other489) { error_msg = other489.error_msg; column_defs = other489.column_defs; column_fields = other489.column_fields; + extra_result = other489.extra_result; __isset = other489.__isset; return *this; } @@ -12162,6 +12203,7 @@ void SelectResponse::printTo(std::ostream& out) const { out << ", " << "error_msg=" << to_string(error_msg); out << ", " << "column_defs=" << to_string(column_defs); out << ", " << "column_fields=" << to_string(column_fields); + out << ", " << "extra_result=" << to_string(extra_result); out << ")"; } @@ -15141,4 +15183,136 @@ void FlushRequest::printTo(std::ostream& out) const { out << ")"; } + +CompactRequest::~CompactRequest() noexcept { +} + + +void CompactRequest::__set_session_id(const int64_t val) { + this->session_id = val; +} + +void CompactRequest::__set_db_name(const std::string& val) { + this->db_name = val; +} + +void CompactRequest::__set_table_name(const std::string& val) { + this->table_name = val; +} +std::ostream& operator<<(std::ostream& out, const CompactRequest& obj) +{ + obj.printTo(out); + return out; +} + + +uint32_t CompactRequest::read(::apache::thrift::protocol::TProtocol* iprot) { + + ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot); + uint32_t xfer = 0; + std::string fname; + ::apache::thrift::protocol::TType ftype; + int16_t fid; + + xfer += iprot->readStructBegin(fname); + + using ::apache::thrift::protocol::TProtocolException; + + + while (true) + { + xfer += iprot->readFieldBegin(fname, ftype, fid); + if (ftype == ::apache::thrift::protocol::T_STOP) { + break; + } + switch (fid) + { + case 1: + if (ftype == 
::apache::thrift::protocol::T_I64) { + xfer += iprot->readI64(this->session_id); + this->__isset.session_id = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 2: + if (ftype == ::apache::thrift::protocol::T_STRING) { + xfer += iprot->readString(this->db_name); + this->__isset.db_name = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 3: + if (ftype == ::apache::thrift::protocol::T_STRING) { + xfer += iprot->readString(this->table_name); + this->__isset.table_name = true; + } else { + xfer += iprot->skip(ftype); + } + break; + default: + xfer += iprot->skip(ftype); + break; + } + xfer += iprot->readFieldEnd(); + } + + xfer += iprot->readStructEnd(); + + return xfer; +} + +uint32_t CompactRequest::write(::apache::thrift::protocol::TProtocol* oprot) const { + uint32_t xfer = 0; + ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot); + xfer += oprot->writeStructBegin("CompactRequest"); + + xfer += oprot->writeFieldBegin("session_id", ::apache::thrift::protocol::T_I64, 1); + xfer += oprot->writeI64(this->session_id); + xfer += oprot->writeFieldEnd(); + + xfer += oprot->writeFieldBegin("db_name", ::apache::thrift::protocol::T_STRING, 2); + xfer += oprot->writeString(this->db_name); + xfer += oprot->writeFieldEnd(); + + xfer += oprot->writeFieldBegin("table_name", ::apache::thrift::protocol::T_STRING, 3); + xfer += oprot->writeString(this->table_name); + xfer += oprot->writeFieldEnd(); + + xfer += oprot->writeFieldStop(); + xfer += oprot->writeStructEnd(); + return xfer; +} + +void swap(CompactRequest &a, CompactRequest &b) { + using ::std::swap; + swap(a.session_id, b.session_id); + swap(a.db_name, b.db_name); + swap(a.table_name, b.table_name); + swap(a.__isset, b.__isset); +} + +CompactRequest::CompactRequest(const CompactRequest& other544) { + session_id = other544.session_id; + db_name = other544.db_name; + table_name = other544.table_name; + __isset = other544.__isset; +} +CompactRequest& CompactRequest::operator=(const CompactRequest& other545) { + session_id = other545.session_id; + db_name = other545.db_name; + table_name = other545.table_name; + __isset = other545.__isset; + return *this; +} +void CompactRequest::printTo(std::ostream& out) const { + using ::apache::thrift::to_string; + out << "CompactRequest("; + out << "session_id=" << to_string(session_id); + out << ", " << "db_name=" << to_string(db_name); + out << ", " << "table_name=" << to_string(table_name); + out << ")"; +} + } // namespace diff --git a/src/network/infinity_thrift/infinity_types.h b/src/network/infinity_thrift/infinity_types.h index bbc3b1c905..3107238071 100644 --- a/src/network/infinity_thrift/infinity_types.h +++ b/src/network/infinity_thrift/infinity_types.h @@ -416,6 +416,8 @@ class CommandRequest; class FlushRequest; +class CompactRequest; + typedef struct _Property__isset { _Property__isset() : key(false), value(false) {} bool key :1; @@ -4597,7 +4599,7 @@ void swap(ExplainResponse &a, ExplainResponse &b); std::ostream& operator<<(std::ostream& out, const ExplainResponse& obj); typedef struct _SelectRequest__isset { - _SelectRequest__isset() : session_id(false), db_name(false), table_name(false), select_list(true), highlight_list(true), search_expr(false), where_expr(false), group_by_list(true), having_expr(false), limit_expr(false), offset_expr(false), order_by_list(true) {} + _SelectRequest__isset() : session_id(false), db_name(false), table_name(false), select_list(true), highlight_list(true), search_expr(false), where_expr(false), group_by_list(true), 
having_expr(false), limit_expr(false), offset_expr(false), order_by_list(true), total_hits_count(false) {} bool session_id :1; bool db_name :1; bool table_name :1; @@ -4610,6 +4612,7 @@ typedef struct _SelectRequest__isset { bool limit_expr :1; bool offset_expr :1; bool order_by_list :1; + bool total_hits_count :1; } _SelectRequest__isset; class SelectRequest : public virtual ::apache::thrift::TBase { @@ -4620,7 +4623,8 @@ class SelectRequest : public virtual ::apache::thrift::TBase { SelectRequest() noexcept : session_id(0), db_name(), - table_name() { + table_name(), + total_hits_count(0) { @@ -4640,6 +4644,7 @@ class SelectRequest : public virtual ::apache::thrift::TBase { ParsedExpr limit_expr; ParsedExpr offset_expr; std::vector order_by_list; + bool total_hits_count; _SelectRequest__isset __isset; @@ -4667,6 +4672,8 @@ class SelectRequest : public virtual ::apache::thrift::TBase { void __set_order_by_list(const std::vector & val); + void __set_total_hits_count(const bool val); + bool operator == (const SelectRequest & rhs) const { if (!(session_id == rhs.session_id)) @@ -4709,6 +4716,10 @@ class SelectRequest : public virtual ::apache::thrift::TBase { return false; else if (__isset.order_by_list && !(order_by_list == rhs.order_by_list)) return false; + if (__isset.total_hits_count != rhs.__isset.total_hits_count) + return false; + else if (__isset.total_hits_count && !(total_hits_count == rhs.total_hits_count)) + return false; return true; } bool operator != (const SelectRequest &rhs) const { @@ -4728,11 +4739,12 @@ void swap(SelectRequest &a, SelectRequest &b); std::ostream& operator<<(std::ostream& out, const SelectRequest& obj); typedef struct _SelectResponse__isset { - _SelectResponse__isset() : error_code(false), error_msg(false), column_defs(true), column_fields(true) {} + _SelectResponse__isset() : error_code(false), error_msg(false), column_defs(true), column_fields(true), extra_result(false) {} bool error_code :1; bool error_msg :1; bool column_defs :1; bool column_fields :1; + bool extra_result :1; } _SelectResponse__isset; class SelectResponse : public virtual ::apache::thrift::TBase { @@ -4742,7 +4754,8 @@ class SelectResponse : public virtual ::apache::thrift::TBase { SelectResponse& operator=(const SelectResponse&); SelectResponse() noexcept : error_code(0), - error_msg() { + error_msg(), + extra_result() { } @@ -4752,6 +4765,7 @@ class SelectResponse : public virtual ::apache::thrift::TBase { std::string error_msg; std::vector column_defs; std::vector column_fields; + std::string extra_result; _SelectResponse__isset __isset; @@ -4763,6 +4777,8 @@ class SelectResponse : public virtual ::apache::thrift::TBase { void __set_column_fields(const std::vector & val); + void __set_extra_result(const std::string& val); + bool operator == (const SelectResponse & rhs) const { if (!(error_code == rhs.error_code)) @@ -4773,6 +4789,8 @@ class SelectResponse : public virtual ::apache::thrift::TBase { return false; if (!(column_fields == rhs.column_fields)) return false; + if (!(extra_result == rhs.extra_result)) + return false; return true; } bool operator != (const SelectResponse &rhs) const { @@ -6004,6 +6022,63 @@ void swap(FlushRequest &a, FlushRequest &b); std::ostream& operator<<(std::ostream& out, const FlushRequest& obj); +typedef struct _CompactRequest__isset { + _CompactRequest__isset() : session_id(false), db_name(false), table_name(false) {} + bool session_id :1; + bool db_name :1; + bool table_name :1; +} _CompactRequest__isset; + +class CompactRequest : public virtual 
::apache::thrift::TBase { + public: + + CompactRequest(const CompactRequest&); + CompactRequest& operator=(const CompactRequest&); + CompactRequest() noexcept + : session_id(0), + db_name(), + table_name() { + } + + virtual ~CompactRequest() noexcept; + int64_t session_id; + std::string db_name; + std::string table_name; + + _CompactRequest__isset __isset; + + void __set_session_id(const int64_t val); + + void __set_db_name(const std::string& val); + + void __set_table_name(const std::string& val); + + bool operator == (const CompactRequest & rhs) const + { + if (!(session_id == rhs.session_id)) + return false; + if (!(db_name == rhs.db_name)) + return false; + if (!(table_name == rhs.table_name)) + return false; + return true; + } + bool operator != (const CompactRequest &rhs) const { + return !(*this == rhs); + } + + bool operator < (const CompactRequest & ) const; + + uint32_t read(::apache::thrift::protocol::TProtocol* iprot) override; + uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const override; + + virtual void printTo(std::ostream& out) const; +}; + +void swap(CompactRequest &a, CompactRequest &b); + +std::ostream& operator<<(std::ostream& out, const CompactRequest& obj); + } // namespace #endif diff --git a/src/network/infinity_thrift_service.cpp b/src/network/infinity_thrift_service.cpp index d3aec40d57..6d50d15ce1 100644 --- a/src/network/infinity_thrift_service.cpp +++ b/src/network/infinity_thrift_service.cpp @@ -93,6 +93,7 @@ ClientVersions::ClientVersions() { client_version_map_[24] = String("0.5.0.dev2"); client_version_map_[25] = String("0.5.0.dev3"); client_version_map_[26] = String("0.5.0.dev5"); + client_version_map_[27] = String("0.5.0.dev6"); } Pair ClientVersions::GetVersionByIndex(i64 version_index) { @@ -667,7 +668,8 @@ void InfinityThriftService::Select(infinity_thrift_rpc::SelectResponse &response output_columns, highlight_columns, order_by_list, - nullptr); + nullptr, + request.total_hits_count); output_columns = nullptr; highlight_columns = nullptr; filter = nullptr; @@ -1760,6 +1762,18 @@ void InfinityThriftService::Flush(infinity_thrift_rpc::CommonResponse &response, ProcessQueryResult(response, result); } +void InfinityThriftService::Compact(infinity_thrift_rpc::CommonResponse &response, const infinity_thrift_rpc::CompactRequest &request) { + auto [infinity, infinity_status] = GetInfinityBySessionID(request.session_id); + if (!infinity_status.ok()) { + ProcessStatus(response, infinity_status); + return; + } + LOG_TRACE(fmt::format("THRIFT: Compact Table: {}", request.table_name)); + + QueryResult result = infinity->Compact(request.db_name, request.table_name); + ProcessQueryResult(response, result); +} + Tuple InfinityThriftService::GetInfinityBySessionID(i64 session_id) { std::lock_guard lock(infinity_session_map_mutex_); auto iter = infinity_session_map_.find(session_id); @@ -2730,6 +2744,13 @@ void InfinityThriftService::ProcessDataBlocks(const QueryResult &result, return; } } + + if(result.result_table_->total_hits_count_flag_) { + nlohmann::json json_response; + json_response["total_hits_count"] = result.result_table_->total_hits_count_; + response.extra_result = json_response.dump(); + } + HandleColumnDef(response, result.result_table_->ColumnCount(), result.result_table_->definition_ptr_, columns); } diff --git a/src/network/infinity_thrift_service.cppm b/src/network/infinity_thrift_service.cppm index 3e438e1fda..1c26e848b5 100644 --- a/src/network/infinity_thrift_service.cppm +++ b/src/network/infinity_thrift_service.cppm @@ -60,7 
+60,7 @@ struct ClientVersions { export class InfinityThriftService final : public infinity_thrift_rpc::InfinityServiceIf { private: static constexpr std::string_view ErrorMsgHeader = "[THRIFT ERROR]"; - static constexpr i64 current_version_index_{26}; // 0.5.0.dev5 + static constexpr i64 current_version_index_{27}; // 0.5.0.dev6 static std::mutex infinity_session_map_mutex_; static HashMap<i64, SharedPtr<Infinity>> infinity_session_map_; @@ -158,6 +158,8 @@ public: void Flush(infinity_thrift_rpc::CommonResponse &response, const infinity_thrift_rpc::FlushRequest &request) final; + void Compact(infinity_thrift_rpc::CommonResponse &response, const infinity_thrift_rpc::CompactRequest &request) final; + private: Tuple<SharedPtr<Infinity>, Status> GetInfinityBySessionID(i64 session_id); diff --git a/src/network/infinity_thrift_types.cppm b/src/network/infinity_thrift_types.cppm index 751697cc33..be146719a2 100644 --- a/src/network/infinity_thrift_types.cppm +++ b/src/network/infinity_thrift_types.cppm @@ -59,6 +59,7 @@ export using infinity_thrift_rpc::DropColumnsRequest; export using infinity_thrift_rpc::ShowCurrentNodeRequest; export using infinity_thrift_rpc::CommandRequest; export using infinity_thrift_rpc::FlushRequest; +export using infinity_thrift_rpc::CompactRequest; export using infinity_thrift_rpc::CommonResponse; export using infinity_thrift_rpc::DeleteResponse; export using infinity_thrift_rpc::SelectResponse; diff --git a/src/network/node_info.cpp b/src/network/node_info.cpp index 176b76f560..57ed759041 100644 --- a/src/network/node_info.cpp +++ b/src/network/node_info.cpp @@ -60,6 +60,15 @@ String NodeInfo::node_ip() const { return ip_address_; } +bool NodeInfo::IsSameNode(const NodeInfo& other) { + std::unique_lock locker(node_mutex_); + if (node_name_ == other.node_name_ && ip_address_ == other.ip_address_ && port_ == other.port_) { + return true; + } else { + return false; + } +} + String ToString(NodeStatus status) { switch (status) { case NodeStatus::kAlive: diff --git a/src/network/node_info.cppm b/src/network/node_info.cppm index a9875ca7b5..729f11cf59 100644 --- a/src/network/node_info.cppm +++ b/src/network/node_info.cppm @@ -62,6 +62,8 @@ public: void heartbeat_count_increase() { ++heartbeat_count_; } u64 heartbeat_count() const { return heartbeat_count_; } + bool IsSameNode(const NodeInfo& other); + private: mutable std::mutex node_mutex_{}; Atomic<NodeRole> node_role_{NodeRole::kUnInitialized}; diff --git a/src/network/peer_server_thrift_service.cpp b/src/network/peer_server_thrift_service.cpp index 70a4afe317..7efa35180b 100644 --- a/src/network/peer_server_thrift_service.cpp +++ b/src/network/peer_server_thrift_service.cpp @@ -74,7 +74,7 @@ void PeerServerThriftService::Register(infinity_peer_server::RegisterResponse &r } } else { response.error_code = static_cast<i64>(ErrorCode::kInvalidNodeRole); - response.error_message = "Attempt to register a non-leader node"; + response.error_message = "Attempt to register with a non-leader node"; } return; diff --git a/src/network/peer_thrift_client.cpp b/src/network/peer_thrift_client.cpp index 5cdac62b21..8af4eacdc9 100644 --- a/src/network/peer_thrift_client.cpp +++ b/src/network/peer_thrift_client.cpp @@ -338,7 +338,7 @@ void PeerClient::HeartBeat(HeartBeatPeerTask *peer_task) { return; } default: { - String error_message = "Heartbeat: error in data transfer to leader"; + String error_message = "Heartbeat: error occurred during data transfer to leader"; UnrecoverableError(error_message); } } diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index e6479f67a2..362a64e80c 100644 
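// NodeInfo::IsSameNode above defines node identity as the (name, ip, port) triple,
// compared under the node mutex. A usage sketch (the surrounding cluster-management
// call sites are assumptions, not shown in this diff):
if (current_leader.IsSameNode(registering_node)) {
    // the same logical node is re-registering; treat it as a heartbeat, not a new member
}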
--- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -20,3 +20,4 @@ target_include_directories(sql_parser PUBLIC "${CMAKE_SOURCE_DIR}/src/storage/in target_include_directories(sql_parser PUBLIC "${CMAKE_SOURCE_DIR}/third_party/spdlog/include") target_include_directories(sql_parser PUBLIC "${CMAKE_SOURCE_DIR}/third_party/nlohmann") target_include_directories(sql_parser PUBLIC "${CMAKE_SOURCE_DIR}/third_party/parallel-hashmap") +target_include_directories(sql_parser PUBLIC "${CMAKE_SOURCE_DIR}/third_party/arrow/src") diff --git a/src/parser/expr_parser.cpp b/src/parser/expr_parser.cpp index f875405259..4e6e645736 100644 --- a/src/parser/expr_parser.cpp +++ b/src/parser/expr_parser.cpp @@ -34,7 +34,7 @@ void ExprParser::Parse(const std::string &expr_text, ExpressionParserResult *res state_ = expression_scan_string(expr_text.c_str(), scanner_); - // WARNNING: shall reset result to avoid polluting later Parse! + // WARNING: shall reset result to avoid polluting later Parse! result->Reset(); if (expressionparse(scanner_, result)) { std::cerr << "Parse expression error: " << expr_text << std::endl; diff --git a/src/parser/expression_lexer.cpp b/src/parser/expression_lexer.cpp index ecd6d74fed..62f363e292 100644 --- a/src/parser/expression_lexer.cpp +++ b/src/parser/expression_lexer.cpp @@ -1,6 +1,6 @@ -#line 1 "expression_lexer.cpp" +#line 2 "expression_lexer.cpp" -#line 3 "expression_lexer.cpp" +#line 4 "expression_lexer.cpp" #define YY_INT_ALIGNED short int @@ -810,10 +810,10 @@ static const flex_int16_t yy_rule_linenum[29] = static thread_local std::stringstream string_buffer; -#line 813 "expression_lexer.cpp" +#line 814 "expression_lexer.cpp" #define YY_NO_INPUT 1 -#line 816 "expression_lexer.cpp" +#line 817 "expression_lexer.cpp" #define INITIAL 0 #define SINGLE_QUOTED_STRING 1 @@ -1167,7 +1167,7 @@ YY_DECL #line 27 "expression_lexer.l" -#line 1170 "expression_lexer.cpp" +#line 1171 "expression_lexer.cpp" while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ { @@ -1412,7 +1412,7 @@ YY_RULE_SETUP #line 89 "expression_lexer.l" ECHO; YY_BREAK -#line 1415 "expression_lexer.cpp" +#line 1416 "expression_lexer.cpp" case YY_STATE_EOF(INITIAL): yyterminate(); diff --git a/src/parser/expression_lexer.h b/src/parser/expression_lexer.h index 3112d9da4d..533db5085f 100644 --- a/src/parser/expression_lexer.h +++ b/src/parser/expression_lexer.h @@ -2,9 +2,9 @@ #define expressionHEADER_H 1 #define expressionIN_HEADER 1 -#line 5 "expression_lexer.h" +#line 6 "expression_lexer.h" -#line 7 "expression_lexer.h" +#line 8 "expression_lexer.h" #define YY_INT_ALIGNED short int @@ -849,6 +849,6 @@ extern int yylex \ #line 89 "expression_lexer.l" -#line 852 "expression_lexer.h" +#line 853 "expression_lexer.h" #undef expressionIN_HEADER #endif /* expressionHEADER_H */ diff --git a/src/parser/generate_parser.sh b/src/parser/generate_parser.sh index f2b5a9f0b7..9d0323624a 100755 --- a/src/parser/generate_parser.sh +++ b/src/parser/generate_parser.sh @@ -5,5 +5,4 @@ flex -di --reentrant --bison-bridge --bison-location -Cem -oexpression_lexer.cpp bison -oexpression_parser.cpp --header=expression_parser.h expression_parser.y -Wcounterexamples -d -v flex -+dvB8 -Cem -osearch_lexer.cpp search_lexer.l -flex -+dvB8 -Cem -osearch_lexer_plain.cpp search_lexer_plain.l bison -osearch_parser.cpp --header=search_parser.h search_parser.y -Wcounterexamples -d -v diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp index 99252f204a..80b3da660b 100644 --- a/src/parser/lexer.cpp +++ b/src/parser/lexer.cpp @@ 
-1,6 +1,6 @@ -#line 1 "lexer.cpp" +#line 2 "lexer.cpp" -#line 3 "lexer.cpp" +#line 4 "lexer.cpp" #define YY_INT_ALIGNED short int @@ -1373,10 +1373,10 @@ static const flex_int16_t yy_rule_linenum[209] = static thread_local std::stringstream string_buffer; -#line 1376 "lexer.cpp" +#line 1377 "lexer.cpp" #define YY_NO_INPUT 1 -#line 1379 "lexer.cpp" +#line 1380 "lexer.cpp" #define INITIAL 0 #define SINGLE_QUOTED_STRING 1 @@ -1730,7 +1730,7 @@ YY_DECL #line 27 "lexer.l" -#line 1733 "lexer.cpp" +#line 1734 "lexer.cpp" while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ { @@ -2875,7 +2875,7 @@ YY_RULE_SETUP #line 269 "lexer.l" ECHO; YY_BREAK -#line 2878 "lexer.cpp" +#line 2879 "lexer.cpp" case YY_STATE_EOF(INITIAL): yyterminate(); diff --git a/src/parser/lexer.h b/src/parser/lexer.h index ee56d7289d..3a5ea4ec0b 100644 --- a/src/parser/lexer.h +++ b/src/parser/lexer.h @@ -2,9 +2,9 @@ #define sqlHEADER_H 1 #define sqlIN_HEADER 1 -#line 5 "lexer.h" +#line 6 "lexer.h" -#line 7 "lexer.h" +#line 8 "lexer.h" #define YY_INT_ALIGNED short int @@ -849,6 +849,6 @@ extern int yylex \ #line 269 "lexer.l" -#line 852 "lexer.h" +#line 853 "lexer.h" #undef sqlIN_HEADER #endif /* sqlHEADER_H */ diff --git a/src/parser/parser.cpp b/src/parser/parser.cpp index bff796d6d8..5ddce93af9 100644 --- a/src/parser/parser.cpp +++ b/src/parser/parser.cpp @@ -5418,7 +5418,7 @@ YYLTYPE yylloc = yyloc_default; yyerror(&yyloc, scanner, result, "Result modifier(ORDER BY) is conflict with SEARCH expression."); YYERROR; } - (yyvsp[-3].select_stmt)->order_by_list = (yyvsp[-2].order_by_expr_list_t); + (yyvsp[-3].select_stmt)->order_by_list_ = (yyvsp[-2].order_by_expr_list_t); (yyvsp[-3].select_stmt)->limit_expr_ = (yyvsp[-1].expr_t); (yyvsp[-3].select_stmt)->offset_expr_ = (yyvsp[0].expr_t); (yyval.select_stmt) = (yyvsp[-3].select_stmt); diff --git a/src/parser/parser.y b/src/parser/parser.y index 77983e9232..4a1c4938fd 100644 --- a/src/parser/parser.y +++ b/src/parser/parser.y @@ -1593,7 +1593,7 @@ select_clause_with_modifier: select_clause_without_modifier order_by_clause limi yyerror(&yyloc, scanner, result, "Result modifier(ORDER BY) is conflict with SEARCH expression."); YYERROR; } - $1->order_by_list = $2; + $1->order_by_list_ = $2; $1->limit_expr_ = $3; $1->offset_expr_ = $4; $$ = $1; diff --git a/src/parser/search_lexer.cpp b/src/parser/search_lexer.cpp index 61b9a6fd45..0ea9ecf996 100644 --- a/src/parser/search_lexer.cpp +++ b/src/parser/search_lexer.cpp @@ -1,6 +1,6 @@ -#line 1 "search_lexer.cpp" +#line 2 "search_lexer.cpp" -#line 3 "search_lexer.cpp" +#line 4 "search_lexer.cpp" #define YY_INT_ALIGNED short int @@ -411,8 +411,8 @@ int yyFlexLexer::yylex() /* %% [3.0] code to copy yytext_ptr to yytext[] goes here, if %array \ */\ (yy_c_buf_p) = yy_cp; /* %% [4.0] data tables for the DFA and the user's section 1 definitions go here */ -#define YY_NUM_RULES 27 -#define YY_END_OF_BUFFER 28 +#define YY_NUM_RULES 20 +#define YY_END_OF_BUFFER 21 /* This struct is not used in this scanner, but its presence is necessary. 
*/ struct yy_trans_info @@ -420,15 +420,14 @@ struct yy_trans_info flex_int32_t yy_verify; flex_int32_t yy_nxt; }; -static const flex_int16_t yy_accept[69] = +static const flex_int16_t yy_accept[56] = { 0, - 0, 0, 21, 21, 25, 25, 28, 27, 1, 8, - 23, 27, 19, 10, 11, 4, 9, 27, 16, 12, - 18, 18, 18, 18, 27, 27, 27, 27, 27, 27, - 27, 21, 22, 25, 26, 1, 3, 0, 16, 17, - 16, 16, 18, 0, 0, 0, 0, 18, 18, 5, - 0, 13, 6, 15, 0, 0, 21, 20, 25, 24, - 16, 2, 7, 14, 13, 0, 13, 0 + 0, 0, 0, 0, 0, 0, 21, 20, 1, 11, + 16, 12, 5, 6, 7, 11, 11, 11, 20, 20, + 20, 20, 20, 20, 13, 15, 14, 17, 19, 18, + 1, 11, 0, 0, 0, 0, 11, 11, 3, 0, + 8, 10, 0, 0, 13, 14, 17, 18, 2, 4, + 9, 8, 0, 8, 0 } ; static const YY_CHAR yy_ec[256] = @@ -436,109 +435,106 @@ static const YY_CHAR yy_ec[256] = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 3, 4, 5, 1, 1, 1, 6, 7, 8, - 9, 10, 11, 1, 12, 13, 10, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 15, 1, 1, - 10, 1, 10, 1, 16, 17, 17, 18, 17, 17, - 17, 17, 17, 17, 17, 17, 17, 19, 20, 17, - 17, 21, 17, 22, 17, 17, 17, 17, 17, 17, - 10, 23, 10, 24, 17, 1, 17, 17, 17, 17, - - 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 17, 17, 10, 25, 10, 26, 1, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 1, 1, 28, 28, 28, 28, 28, 28, 28, - - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, - 30, 30, 30, 30, 1, 1, 1, 1, 1, 1, + 1, 3, 4, 5, 4, 4, 4, 4, 6, 7, + 8, 9, 4, 4, 4, 10, 4, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 12, 4, 4, + 4, 4, 9, 4, 13, 4, 4, 14, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 15, 16, 4, + 4, 17, 4, 18, 4, 4, 4, 4, 4, 4, + 4, 19, 4, 20, 4, 4, 4, 4, 4, 4, + + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 21, 1, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 1, 1, 23, 23, 23, 23, 23, 23, 23, + + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, + 25, 25, 25, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } ; -static const YY_CHAR yy_meta[31] = +static const YY_CHAR yy_meta[26] = { 0, - 1, 1, 2, 2, 3, 2, 4, 2, 2, 2, - 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, - 1, 1, 2, 2, 2, 2, 1, 1, 1, 1 + 1, 1, 2, 3, 4, 5, 2, 2, 2, 3, + 3, 2, 3, 3, 3, 3, 3, 3, 6, 2, + 2, 1, 3, 3, 3 } ; -static const flex_int16_t yy_base[74] = +static const flex_int16_t yy_base[64] = { 0, - 0, 0, 109, 108, 109, 108, 112, 117, 29, 117, - 117, 105, 117, 117, 117, 117, 20, 96, 22, 117, - 34, 90, 35, 37, 0, 46, 83, 93, 79, 78, - 77, 0, 96, 0, 97, 59, 117, 87, 50, 86, - 85, 0, 79, 0, 70, 69, 68, 47, 27, 75, - 79, 54, 117, 78, 64, 63, 0, 117, 0, 117, - 75, 69, 68, 72, 70, 30, 23, 117, 68, 72, - 75, 78, 81 + 0, 0, 20, 21, 23, 24, 116, 117, 28, 25, + 117, 117, 117, 117, 117, 36, 17, 19, 0, 27, + 104, 92, 91, 90, 0, 117, 0, 0, 117, 0, + 43, 96, 0, 88, 87, 86, 38, 39, 92, 95, + 52, 87, 59, 56, 0, 117, 0, 117, 62, 43, + 45, 36, 19, 24, 117, 63, 69, 73, 78, 84, + 88, 94, 99 } ; 
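/* The shrunken yy_accept/yy_ec/yy_meta/yy_base tables above, and the yy_def/yy_nxt/yy_chk
   tables that follow, are flex's regenerated DFA for the reduced rule set (YY_NUM_RULES
   drops from 27 to 20 once the operator-alias rules are removed from search_lexer.l);
   they are machine output and not meant to be edited by hand. */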
-static const flex_int16_t yy_def[74] = +static const flex_int16_t yy_def[64] = { 0, - 68, 1, 69, 69, 70, 70, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, - 19, 21, 21, 21, 71, 68, 68, 68, 68, 68, - 68, 72, 68, 73, 68, 68, 68, 68, 68, 68, - 68, 19, 21, 71, 68, 68, 68, 21, 21, 21, - 68, 68, 68, 68, 68, 68, 72, 68, 73, 68, - 68, 21, 21, 68, 68, 68, 68, 0, 68, 68, - 68, 68, 68 + 55, 1, 56, 56, 57, 57, 55, 55, 55, 58, + 55, 55, 55, 55, 55, 58, 16, 16, 59, 55, + 55, 55, 55, 55, 60, 55, 61, 62, 55, 63, + 55, 16, 59, 55, 55, 55, 16, 16, 16, 55, + 55, 55, 55, 55, 60, 55, 62, 55, 16, 16, + 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, + 55, 55, 55 } ; -static const flex_int16_t yy_nxt[148] = +static const flex_int16_t yy_nxt[143] = { 0, - 8, 9, 9, 10, 11, 12, 13, 14, 15, 8, - 16, 17, 18, 19, 20, 21, 22, 22, 23, 24, - 22, 22, 25, 26, 27, 28, 8, 29, 30, 31, - 36, 36, 38, 39, 41, 42, 67, 43, 43, 43, - 43, 43, 43, 43, 44, 43, 68, 43, 63, 45, - 46, 47, 48, 43, 49, 43, 43, 50, 51, 52, - 36, 36, 41, 39, 62, 43, 65, 52, 32, 32, - 32, 32, 34, 34, 34, 34, 43, 43, 57, 57, - 57, 59, 59, 67, 59, 64, 43, 43, 61, 66, - 43, 54, 64, 43, 56, 55, 43, 43, 61, 40, - - 40, 60, 58, 56, 55, 43, 54, 53, 43, 40, - 37, 68, 35, 35, 33, 33, 7, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68 + 8, 9, 9, 10, 11, 12, 13, 14, 8, 10, + 10, 15, 16, 10, 17, 18, 10, 10, 19, 20, + 21, 8, 22, 23, 24, 26, 26, 29, 29, 31, + 31, 32, 38, 32, 54, 39, 40, 41, 27, 27, + 32, 30, 30, 33, 31, 31, 54, 34, 35, 36, + 37, 49, 32, 32, 33, 51, 50, 32, 34, 35, + 36, 52, 41, 25, 25, 25, 25, 25, 25, 28, + 28, 28, 28, 28, 28, 32, 32, 53, 32, 32, + 32, 32, 32, 32, 45, 45, 45, 45, 46, 46, + 46, 46, 46, 46, 47, 47, 47, 42, 47, 48, + + 48, 48, 48, 48, 48, 51, 32, 44, 43, 32, + 32, 44, 43, 32, 42, 55, 7, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55 } ; -static const flex_int16_t yy_chk[148] = +static const flex_int16_t yy_chk[143] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 9, 9, 17, 17, 19, 19, 67, 19, 19, 19, - 19, 19, 19, 19, 19, 49, 21, 21, 49, 19, - 19, 19, 21, 23, 23, 24, 66, 24, 26, 26, - 36, 36, 39, 39, 48, 48, 52, 52, 69, 69, - 69, 69, 70, 70, 70, 70, 71, 71, 72, 72, - 72, 73, 73, 65, 73, 64, 63, 62, 61, 56, - 55, 54, 51, 50, 47, 46, 45, 43, 41, 40, - - 38, 35, 33, 31, 30, 29, 28, 27, 22, 18, - 12, 7, 6, 5, 4, 3, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 68, 68 + 1, 1, 1, 1, 1, 3, 4, 5, 6, 9, + 9, 17, 17, 18, 54, 18, 20, 20, 3, 4, + 53, 5, 6, 10, 31, 31, 52, 10, 10, 10, + 16, 37, 37, 38, 16, 51, 38, 50, 16, 16, + 16, 41, 41, 56, 56, 56, 56, 56, 56, 57, + 57, 57, 57, 57, 57, 58, 49, 44, 58, 59, + 43, 59, 59, 59, 60, 60, 60, 60, 61, 61, + 61, 61, 61, 61, 62, 62, 62, 42, 62, 63, + + 63, 63, 63, 63, 63, 40, 39, 36, 35, 34, + 32, 24, 23, 22, 21, 7, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55 } ; -static const flex_int16_t yy_rule_linenum[27] = +static const flex_int16_t yy_rule_linenum[20] = { 0, - 62, 64, 65, 66, 68, 69, 71, 72, 73, 75, - 77, 79, 81, 82, 84, 86, 87, 88, 90, 91, - 92, 93, 96, 97, 98, 99 + 65, 67, 69, 71, 73, 75, 77, 79, 80, 82, + 84, 86, 87, 88, 89, 92, 93, 94, 95 } ; /* The intent behind this definition is that it'll catch @@ -577,10 +573,10 @@ using token = infinity::SearchParser::token; /* for 
temporary storage of quoted string */ static thread_local std::stringstream string_buffer; -#line 580 "search_lexer.cpp" +#line 577 "search_lexer.cpp" #define YY_NO_INPUT 1 -#line 583 "search_lexer.cpp" +#line 580 "search_lexer.cpp" #define INITIAL 0 #define SINGLE_QUOTED_STRING 1 @@ -774,16 +770,16 @@ YY_DECL { /* %% [7.0] user's declarations go here */ -#line 54 "search_lexer.l" +#line 57 "search_lexer.l" /** Code executed at the beginning of yylex **/ -#line 57 "search_lexer.l" +#line 60 "search_lexer.l" yylval = lval; - /* Note: special characters in pattern shall be double-quoted or escaped with backslash: " <^.+|/()[]{}" */ + /* Note: special characters in pattern shall be double-quoted or escaped with backslash: ' ()^"~*?:\\' */ -#line 786 "search_lexer.cpp" +#line 783 "search_lexer.cpp" while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ { @@ -812,13 +808,13 @@ YY_DECL while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 69 ) + if ( yy_current_state >= 56 ) yy_c = yy_meta[yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; ++yy_cp; } - while ( yy_current_state != 68 ); + while ( yy_current_state != 55 ); yy_cp = (yy_last_accepting_cpos); yy_current_state = (yy_last_accepting_state); @@ -837,12 +833,12 @@ YY_DECL { if ( yy_act == 0 ) std::cerr << "--scanner backing up\n"; - else if ( yy_act < 27 ) + else if ( yy_act < 20 ) std::cerr << "--accepting rule at line " << yy_rule_linenum[yy_act] << "(\"" << yytext << "\")\n"; - else if ( yy_act == 27 ) + else if ( yy_act == 20 ) std::cerr << "--accepting default rule (\"" << yytext << "\")\n"; - else if ( yy_act == 28 ) + else if ( yy_act == 21 ) std::cerr << "--(end of buffer or a NUL)\n"; else std::cerr << "--EOF (start condition " << YY_START << ")\n"; @@ -861,126 +857,114 @@ YY_DECL case 1: /* rule 1 can match eol */ YY_RULE_SETUP -#line 62 "search_lexer.l" +#line 65 "search_lexer.l" /* ignore \t\n and space */; YY_BREAK case 2: -#line 65 "search_lexer.l" -case 3: -#line 66 "search_lexer.l" -case 4: YY_RULE_SETUP -#line 66 "search_lexer.l" +#line 67 "search_lexer.l" { return token::AND; } YY_BREAK -case 5: -#line 69 "search_lexer.l" -case 6: +case 3: YY_RULE_SETUP #line 69 "search_lexer.l" { return token::OR; } YY_BREAK -case 7: -#line 72 "search_lexer.l" -case 8: -#line 73 "search_lexer.l" -case 9: +case 4: YY_RULE_SETUP -#line 73 "search_lexer.l" +#line 71 "search_lexer.l" { return token::NOT; } YY_BREAK -case 10: +case 5: YY_RULE_SETUP -#line 75 "search_lexer.l" +#line 73 "search_lexer.l" { return token::LPAREN; } YY_BREAK -case 11: +case 6: YY_RULE_SETUP -#line 77 "search_lexer.l" +#line 75 "search_lexer.l" { return token::RPAREN; } YY_BREAK -case 12: +case 7: YY_RULE_SETUP -#line 79 "search_lexer.l" +#line 77 "search_lexer.l" { return token::OP_COLON; } YY_BREAK -case 13: -#line 82 "search_lexer.l" -case 14: +case 8: +#line 80 "search_lexer.l" +case 9: YY_RULE_SETUP -#line 82 "search_lexer.l" +#line 80 "search_lexer.l" { yylval->build(std::strtof(yytext+1, NULL)); return token::CARAT; } YY_BREAK -case 15: +case 10: YY_RULE_SETUP -#line 84 "search_lexer.l" +#line 82 "search_lexer.l" { yylval->build(std::strtoul(yytext+1, NULL, 10)); return token::TILDE; } YY_BREAK -case 16: -#line 87 "search_lexer.l" -case 17: -#line 88 "search_lexer.l" -case 18: +case 11: YY_RULE_SETUP -#line 88 "search_lexer.l" +#line 84 "search_lexer.l" { yylval->build(InfString(yytext, false)); return token::STRING; } // 
https://stackoverflow.com/questions/9611682/flexlexer-support-for-unicode YY_BREAK -case 19: +case 12: YY_RULE_SETUP -#line 90 "search_lexer.l" +#line 86 "search_lexer.l" { BEGIN SINGLE_QUOTED_STRING; string_buffer.clear(); string_buffer.str(""); } // Clear strbuf manually, see #170 YY_BREAK -case 20: +case 13: +/* rule 13 can match eol */ YY_RULE_SETUP -#line 91 "search_lexer.l" -{ string_buffer << '\''; } +#line 87 "search_lexer.l" +{ string_buffer << yytext; } YY_BREAK -case 21: -/* rule 21 can match eol */ +case 14: +/* rule 14 can match eol */ YY_RULE_SETUP -#line 92 "search_lexer.l" +#line 88 "search_lexer.l" { string_buffer << yytext; } YY_BREAK -case 22: +case 15: YY_RULE_SETUP -#line 93 "search_lexer.l" -{ BEGIN INITIAL; yylval->build(InfString(string_buffer.str(), true)); return token::STRING; } +#line 89 "search_lexer.l" +{ BEGIN INITIAL; yylval->build(InfString(std::move(string_buffer).str(), true)); return token::STRING; } YY_BREAK case YY_STATE_EOF(SINGLE_QUOTED_STRING): -#line 94 "search_lexer.l" +#line 90 "search_lexer.l" { std::cerr << "[Lucene-Lexer-Error] Unterminated string" << std::endl; return 0; } YY_BREAK -case 23: +case 16: YY_RULE_SETUP -#line 96 "search_lexer.l" +#line 92 "search_lexer.l" { BEGIN DOUBLE_QUOTED_STRING; string_buffer.clear(); string_buffer.str(""); } // Clear strbuf manually, see #170 YY_BREAK -case 24: +case 17: +/* rule 17 can match eol */ YY_RULE_SETUP -#line 97 "search_lexer.l" -{ string_buffer << '\"'; } +#line 93 "search_lexer.l" +{ string_buffer << yytext; } YY_BREAK -case 25: -/* rule 25 can match eol */ +case 18: +/* rule 18 can match eol */ YY_RULE_SETUP -#line 98 "search_lexer.l" +#line 94 "search_lexer.l" { string_buffer << yytext; } YY_BREAK -case 26: +case 19: YY_RULE_SETUP -#line 99 "search_lexer.l" -{ BEGIN INITIAL; yylval->build(InfString(string_buffer.str(), true)); return token::STRING; } +#line 95 "search_lexer.l" +{ BEGIN INITIAL; yylval->build(InfString(std::move(string_buffer).str(), true)); return token::STRING; } YY_BREAK case YY_STATE_EOF(DOUBLE_QUOTED_STRING): -#line 100 "search_lexer.l" +#line 96 "search_lexer.l" { std::cerr << "[Lucene-Lexer-Error] Unterminated string" << std::endl; return 0; } YY_BREAK -case 27: +case 20: YY_RULE_SETUP -#line 102 "search_lexer.l" +#line 98 "search_lexer.l" ECHO; YY_BREAK -#line 983 "search_lexer.cpp" +#line 968 "search_lexer.cpp" case YY_STATE_EOF(INITIAL): yyterminate(); @@ -1420,7 +1404,7 @@ int yyFlexLexer::yy_get_next_buffer() while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 69 ) + if ( yy_current_state >= 56 ) yy_c = yy_meta[yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; @@ -1453,11 +1437,11 @@ int yyFlexLexer::yy_get_next_buffer() while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) { yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 69 ) + if ( yy_current_state >= 56 ) yy_c = yy_meta[yy_c]; } yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; - yy_is_jam = (yy_current_state == 68); + yy_is_jam = (yy_current_state == 55); return yy_is_jam ? 
0 : yy_current_state; } @@ -2099,6 +2083,6 @@ void yyfree (void * ptr ) /* %ok-for-header */ -#line 102 "search_lexer.l" +#line 98 "search_lexer.l" diff --git a/src/parser/search_lexer.l b/src/parser/search_lexer.l index 2818061cdf..b956457f94 100644 --- a/src/parser/search_lexer.l +++ b/src/parser/search_lexer.l @@ -45,8 +45,11 @@ UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} -ESCAPABLE [\x20+\-=&|!(){}\[\]^"~*?:\\/] -ESCAPED \\{ESCAPABLE} +ESCAPABLE [\x20()^"'~*?:\\] +ESCAPED \\{ESCAPABLE} +NOESCAPE [[:graph:]]{-}[\x20()^"'~*?:\\] +SQNOESCAPE [\x00-\xff]{-}['\\] +DQNOESCAPE [\x00-\xff]{-}["\\] %x SINGLE_QUOTED_STRING %x DOUBLE_QUOTED_STRING @@ -55,47 +58,40 @@ ESCAPED \\{ESCAPABLE} %{ /** Code executed at the beginning of yylex **/ yylval = lval; - /* Note: special characters in pattern shall be double-quoted or escaped with backslash: " <^.+|/()[]{}" */ + /* Note: special characters in pattern shall be double-quoted or escaped with backslash: ' ()^"~*?:\\' */ %} [ \t\n]+ /* ignore \t\n and space */; -AND | -&& | -"+" { return token::AND; } +AND { return token::AND; } -OR | -"||" { return token::OR; } +OR { return token::OR; } -NOT | -! | -- { return token::NOT; } +NOT { return token::NOT; } "(" { return token::LPAREN; } ")" { return token::RPAREN; } -: { return token::OP_COLON; } +: { return token::OP_COLON; } "^"[0-9]+("."[0-9]*)? | "^."[0-9]+ { yylval->build(std::strtof(yytext+1, NULL)); return token::CARAT; } "~"[0-9]+ { yylval->build(std::strtoul(yytext+1, NULL, 10)); return token::TILDE; } --?[0-9]+("."[0-9]*)? | --?"."[0-9]+ | -([a-zA-Z0-9_]|{UONLY}|{ESCAPED})+ { yylval->build(InfString(yytext, false)); return token::STRING; } // https://stackoverflow.com/questions/9611682/flexlexer-support-for-unicode - -\' { BEGIN SINGLE_QUOTED_STRING; string_buffer.clear(); string_buffer.str(""); } // Clear strbuf manually, see #170 -\'\' { string_buffer << '\''; } -[^']* { string_buffer << yytext; } -\' { BEGIN INITIAL; yylval->build(InfString(string_buffer.str(), true)); return token::STRING; } -<> { std::cerr << "[Lucene-Lexer-Error] Unterminated string" << std::endl; return 0; } - -\" { BEGIN DOUBLE_QUOTED_STRING; string_buffer.clear(); string_buffer.str(""); } // Clear strbuf manually, see #170 -\"\" { string_buffer << '\"'; } -[^"]* { string_buffer << yytext; } -\" { BEGIN INITIAL; yylval->build(InfString(string_buffer.str(), true)); return token::STRING; } -<> { std::cerr << "[Lucene-Lexer-Error] Unterminated string" << std::endl; return 0; } +({UONLY}|{ESCAPED}|{NOESCAPE})+ { yylval->build(InfString(yytext, false)); return token::STRING; } // https://stackoverflow.com/questions/9611682/flexlexer-support-for-unicode + +\' { BEGIN SINGLE_QUOTED_STRING; string_buffer.clear(); string_buffer.str(""); } // Clear strbuf manually, see #170 +{SQNOESCAPE}+ { string_buffer << yytext; } +\\([\x00-\xff]?) { string_buffer << yytext; } +\' { BEGIN INITIAL; yylval->build(InfString(std::move(string_buffer).str(), true)); return token::STRING; } +<> { std::cerr << "[Lucene-Lexer-Error] Unterminated string" << std::endl; return 0; } + +\" { BEGIN DOUBLE_QUOTED_STRING; string_buffer.clear(); string_buffer.str(""); } // Clear strbuf manually, see #170 +{DQNOESCAPE}+ { string_buffer << yytext; } +\\([\x00-\xff]?) 
{ string_buffer << yytext; } +\" { BEGIN INITIAL; yylval->build(InfString(std::move(string_buffer).str(), true)); return token::STRING; } +<> { std::cerr << "[Lucene-Lexer-Error] Unterminated string" << std::endl; return 0; } %% diff --git a/src/parser/search_lexer_plain.cpp b/src/parser/search_lexer_plain.cpp deleted file mode 100644 index 4df9e82f5f..0000000000 --- a/src/parser/search_lexer_plain.cpp +++ /dev/null @@ -1,1954 +0,0 @@ -#line 1 "search_lexer_plain.cpp" - -#line 3 "search_lexer_plain.cpp" - -#define YY_INT_ALIGNED short int - -/* A lexical scanner generated by flex */ - -/* %not-for-header */ -/* %if-c-only */ -/* %if-not-reentrant */ -/* %endif */ -/* %endif */ -/* %ok-for-header */ - -#define FLEX_SCANNER -#define YY_FLEX_MAJOR_VERSION 2 -#define YY_FLEX_MINOR_VERSION 6 -#define YY_FLEX_SUBMINOR_VERSION 4 -#if YY_FLEX_SUBMINOR_VERSION > 0 -#define FLEX_BETA -#endif - -/* %if-c++-only */ - /* The c++ scanner is a mess. The FlexLexer.h header file relies on the - * following macro. This is required in order to pass the c++-multiple-scanners - * test in the regression suite. We get reports that it breaks inheritance. - * We will address this in a future release of flex, or omit the C++ scanner - * altogether. - */ - #define yyFlexLexer SearchScannerPlainFlexLexer -/* %endif */ - -/* %if-c-only */ -/* %endif */ - -#ifdef yyalloc -#define SearchScannerPlainalloc_ALREADY_DEFINED -#else -#define yyalloc SearchScannerPlainalloc -#endif - -#ifdef yyrealloc -#define SearchScannerPlainrealloc_ALREADY_DEFINED -#else -#define yyrealloc SearchScannerPlainrealloc -#endif - -#ifdef yyfree -#define SearchScannerPlainfree_ALREADY_DEFINED -#else -#define yyfree SearchScannerPlainfree -#endif - -/* %if-c-only */ -/* %endif */ - -/* First, we deal with platform-specific or compiler-specific issues. */ - -/* begin standard C headers. */ -/* %if-c-only */ -/* %endif */ - -/* %if-tables-serialization */ -/* %endif */ -/* end standard C headers. */ - -/* %if-c-or-c++ */ -/* flex integer type definitions */ - -#ifndef FLEXINT_H -#define FLEXINT_H - -/* C99 systems have . Non-C99 systems may or may not. */ - -#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - -/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, - * if you want the limit (max/min) macros for int types. - */ -#ifndef __STDC_LIMIT_MACROS -#define __STDC_LIMIT_MACROS 1 -#endif - -#include -typedef int8_t flex_int8_t; -typedef uint8_t flex_uint8_t; -typedef int16_t flex_int16_t; -typedef uint16_t flex_uint16_t; -typedef int32_t flex_int32_t; -typedef uint32_t flex_uint32_t; -#else -typedef signed char flex_int8_t; -typedef short int flex_int16_t; -typedef int flex_int32_t; -typedef unsigned char flex_uint8_t; -typedef unsigned short int flex_uint16_t; -typedef unsigned int flex_uint32_t; - -/* Limits of integral types. */ -#ifndef INT8_MIN -#define INT8_MIN (-128) -#endif -#ifndef INT16_MIN -#define INT16_MIN (-32767-1) -#endif -#ifndef INT32_MIN -#define INT32_MIN (-2147483647-1) -#endif -#ifndef INT8_MAX -#define INT8_MAX (127) -#endif -#ifndef INT16_MAX -#define INT16_MAX (32767) -#endif -#ifndef INT32_MAX -#define INT32_MAX (2147483647) -#endif -#ifndef UINT8_MAX -#define UINT8_MAX (255U) -#endif -#ifndef UINT16_MAX -#define UINT16_MAX (65535U) -#endif -#ifndef UINT32_MAX -#define UINT32_MAX (4294967295U) -#endif - -#ifndef SIZE_MAX -#define SIZE_MAX (~(size_t)0) -#endif - -#endif /* ! C99 */ - -#endif /* ! FLEXINT_H */ - -/* %endif */ - -/* begin standard C++ headers. 
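A note on the `search_lexer.l` changes above: the word forms `AND`/`OR`/`NOT` are now the only operator spellings — `&&`, `||`, `+`, `!`, and `-` fall through to the `{NOESCAPE}` class and lex as ordinary term characters — and the escapable set shrinks to the characters the grammar actually treats as special, `\x20()^"'~*?:\\`. Inside quoted strings the doubled-quote escape (`''`, `""`) is gone; instead `{SQNOESCAPE}+`/`{DQNOESCAPE}+` copy plain runs and `\\([\x00-\xff]?)` copies a backslash pair verbatim, so a backslash only keeps the following quote from terminating the string and is not stripped from the token text. The sketch below is hypothetical (not part of the patch) and mirrors that behavior for the single-quoted case:

```cpp
// Hypothetical sketch of the new SINGLE_QUOTED_STRING semantics:
// both {SQNOESCAPE}+ and \\([\x00-\xff]?) append yytext verbatim,
// so escape pairs survive intact in the buffered token.
#include <cassert>
#include <string>

std::string lex_single_quoted(const char *p) {
    std::string out;                 // plays the role of string_buffer
    assert(*p == '\'');              // caller already saw the opening quote
    for (++p; *p != '\0';) {
        if (*p == '\\') {            // \\([\x00-\xff]?): copy the pair verbatim
            out += *p++;
            if (*p != '\0') out += *p++;
        } else if (*p == '\'') {     // unescaped quote closes the token
            return out;
        } else {                     // {SQNOESCAPE}+: plain run
            out += *p++;
        }
    }
    // real lexer prints "[Lucene-Lexer-Error] Unterminated string" here
    return out;
}
```

Separately, the `std::move(string_buffer).str()` calls in the string-closing rules rely on the C++20 rvalue-qualified `basic_stringstream::str()` overload, which moves the accumulated characters out of the thread-local stream instead of copying them; the buffer is reset with `clear()`/`str("")` at the next opening quote in any case.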
*/ -/* %if-c++-only */ -#include -#include -#include -#include -#include -/* end standard C++ headers. */ -/* %endif */ - -/* TODO: this is always defined, so inline it */ -#define yyconst const - -#if defined(__GNUC__) && __GNUC__ >= 3 -#define yynoreturn __attribute__((__noreturn__)) -#else -#define yynoreturn -#endif - -/* %not-for-header */ -/* Returned upon end-of-file. */ -#define YY_NULL 0 -/* %ok-for-header */ - -/* %not-for-header */ -/* Promotes a possibly negative, possibly signed char to an - * integer in range [0..255] for use as an array index. - */ -#define YY_SC_TO_UI(c) ((YY_CHAR) (c)) -/* %ok-for-header */ - -/* %if-reentrant */ -/* %endif */ - -/* %if-not-reentrant */ - -/* %endif */ - -/* Enter a start condition. This macro really ought to take a parameter, - * but we do it the disgusting crufty way forced on us by the ()-less - * definition of BEGIN. - */ -#define BEGIN (yy_start) = 1 + 2 * -/* Translate the current start state into a value that can be later handed - * to BEGIN to return to the state. The YYSTATE alias is for lex - * compatibility. - */ -#define YY_START (((yy_start) - 1) / 2) -#define YYSTATE YY_START -/* Action number for EOF rule of a given start state. */ -#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) -/* Special action meaning "start processing a new file". */ -#define YY_NEW_FILE yyrestart( yyin ) -#define YY_END_OF_BUFFER_CHAR 0 - -/* Size of default input buffer. */ -#ifndef YY_BUF_SIZE -#ifdef __ia64__ -/* On IA-64, the buffer size is 16k, not 8k. - * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. - * Ditto for the __ia64__ case accordingly. - */ -#define YY_BUF_SIZE 32768 -#else -#define YY_BUF_SIZE 16384 -#endif /* __ia64__ */ -#endif - -/* The state buf must be large enough to hold one state per character in the main buffer. - */ -#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) - -#ifndef YY_TYPEDEF_YY_BUFFER_STATE -#define YY_TYPEDEF_YY_BUFFER_STATE -typedef struct yy_buffer_state *YY_BUFFER_STATE; -#endif - -#ifndef YY_TYPEDEF_YY_SIZE_T -#define YY_TYPEDEF_YY_SIZE_T -typedef size_t yy_size_t; -#endif - -/* %if-not-reentrant */ -extern int yyleng; -/* %endif */ - -/* %if-c-only */ -/* %if-not-reentrant */ -/* %endif */ -/* %endif */ - -#define EOB_ACT_CONTINUE_SCAN 0 -#define EOB_ACT_END_OF_FILE 1 -#define EOB_ACT_LAST_MATCH 2 - - #define YY_LESS_LINENO(n) - #define YY_LINENO_REWIND_TO(ptr) - -/* Return all but the first "n" matched characters back to the input stream. */ -#define yyless(n) \ - do \ - { \ - /* Undo effects of setting up yytext. */ \ - int yyless_macro_arg = (n); \ - YY_LESS_LINENO(yyless_macro_arg);\ - *yy_cp = (yy_hold_char); \ - YY_RESTORE_YY_MORE_OFFSET \ - (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ - YY_DO_BEFORE_ACTION; /* set up yytext again */ \ - } \ - while ( 0 ) -#define unput(c) yyunput( c, (yytext_ptr) ) - -#ifndef YY_STRUCT_YY_BUFFER_STATE -#define YY_STRUCT_YY_BUFFER_STATE -struct yy_buffer_state - { -/* %if-c-only */ -/* %endif */ - -/* %if-c++-only */ - std::streambuf* yy_input_file; -/* %endif */ - - char *yy_ch_buf; /* input buffer */ - char *yy_buf_pos; /* current position in input buffer */ - - /* Size of input buffer in bytes, not including room for EOB - * characters. - */ - int yy_buf_size; - - /* Number of characters read into yy_ch_buf, not including EOB - * characters. 
- */ - int yy_n_chars; - - /* Whether we "own" the buffer - i.e., we know we created it, - * and can realloc() it to grow it, and should free() it to - * delete it. - */ - int yy_is_our_buffer; - - /* Whether this is an "interactive" input source; if so, and - * if we're using stdio for input, then we want to use getc() - * instead of fread(), to make sure we stop fetching input after - * each newline. - */ - int yy_is_interactive; - - /* Whether we're considered to be at the beginning of a line. - * If so, '^' rules will be active on the next match, otherwise - * not. - */ - int yy_at_bol; - - int yy_bs_lineno; /**< The line count. */ - int yy_bs_column; /**< The column count. */ - - /* Whether to try to fill the input buffer when we reach the - * end of it. - */ - int yy_fill_buffer; - - int yy_buffer_status; - -#define YY_BUFFER_NEW 0 -#define YY_BUFFER_NORMAL 1 - /* When an EOF's been seen but there's still some text to process - * then we mark the buffer as YY_EOF_PENDING, to indicate that we - * shouldn't try reading from the input source any more. We might - * still have a bunch of tokens to match, though, because of - * possible backing-up. - * - * When we actually see the EOF, we change the status to "new" - * (via yyrestart()), so that the user can continue scanning by - * just pointing yyin at a new input file. - */ -#define YY_BUFFER_EOF_PENDING 2 - - }; -#endif /* !YY_STRUCT_YY_BUFFER_STATE */ - -/* %if-c-only Standard (non-C++) definition */ -/* %not-for-header */ -/* %if-not-reentrant */ -/* %endif */ -/* %ok-for-header */ - -/* %endif */ - -/* We provide macros for accessing buffer states in case in the - * future we want to put the buffer states in a more general - * "scanner state". - * - * Returns the top of the stack, or NULL. - */ -#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ - ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ - : NULL) -/* Same as previous macro, but useful when we know that the buffer stack is not - * NULL or when we need an lvalue. For internal use only. - */ -#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] - -/* %if-c-only Standard (non-C++) definition */ -/* %if-not-reentrant */ -/* %not-for-header */ -/* %ok-for-header */ - -/* %endif */ -/* %endif */ - -void *yyalloc ( yy_size_t ); -void *yyrealloc ( void *, yy_size_t ); -void yyfree ( void * ); - -#define yy_new_buffer yy_create_buffer -#define yy_set_interactive(is_interactive) \ - { \ - if ( ! YY_CURRENT_BUFFER ){ \ - yyensure_buffer_stack (); \ - YY_CURRENT_BUFFER_LVALUE = \ - yy_create_buffer( yyin, YY_BUF_SIZE ); \ - } \ - YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ - } -#define yy_set_bol(at_bol) \ - { \ - if ( ! YY_CURRENT_BUFFER ){\ - yyensure_buffer_stack (); \ - YY_CURRENT_BUFFER_LVALUE = \ - yy_create_buffer( yyin, YY_BUF_SIZE ); \ - } \ - YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ - } -#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) - -/* %% [1.0] yytext/yyin/yyout/yy_state_type/yylineno etc. 
def's & init go here */ -/* Begin user sect3 */ -#define YY_SKIP_YYWRAP - -#define FLEX_DEBUG -typedef flex_uint8_t YY_CHAR; - -#define yytext_ptr yytext - -#include - -int yyFlexLexer::yywrap() { return 1; } -int yyFlexLexer::yylex() - { - LexerError( "yyFlexLexer::yylex invoked but %option yyclass used" ); - return 0; - } - -#define YY_DECL int infinity::SearchScannerPlain::yylex() - -/* %% [1.5] DFA */ - -/* %if-c-only Standard (non-C++) definition */ -/* %endif */ - -/* Done after the current pattern has been matched and before the - * corresponding action - sets up yytext. - */ -#define YY_DO_BEFORE_ACTION \ - (yytext_ptr) = yy_bp; \ -/* %% [2.0] code to fiddle yytext and yyleng for yymore() goes here \ */\ - yyleng = (int) (yy_cp - yy_bp); \ - (yy_hold_char) = *yy_cp; \ - *yy_cp = '\0'; \ -/* %% [3.0] code to copy yytext_ptr to yytext[] goes here, if %array \ */\ - (yy_c_buf_p) = yy_cp; -/* %% [4.0] data tables for the DFA and the user's section 1 definitions go here */ -#define YY_NUM_RULES 5 -#define YY_END_OF_BUFFER 6 -/* This struct is not used in this scanner, - but its presence is necessary. */ -struct yy_trans_info - { - flex_int32_t yy_verify; - flex_int32_t yy_nxt; - }; -static const flex_int16_t yy_accept[26] = - { 0, - 0, 0, 6, 4, 4, 4, 1, 3, 4, 4, - 4, 0, 1, 2, 1, 1, 3, 0, 0, 0, - 0, 0, 1, 0, 0 - } ; - -static const YY_CHAR yy_ec[256] = - { 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 3, 4, 1, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, - 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 1, 1, 1, 1, 6, 1, 6, 6, 6, 6, - - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 1, 1, 1, 1, 1, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 1, 1, 8, 8, 8, 8, 8, 8, 8, - - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, - 10, 10, 10, 10, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1 - } ; - -static const YY_CHAR yy_meta[11] = - { 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - } ; - -static const flex_int16_t yy_base[26] = - { 0, - 0, 0, 36, 37, 7, 30, 9, 16, 27, 26, - 25, 26, 26, 24, 23, 0, 0, 20, 19, 18, - 17, 16, 17, 9, 37 - } ; - -static const flex_int16_t yy_def[26] = - { 0, - 25, 1, 25, 25, 25, 25, 25, 7, 25, 25, - 25, 25, 5, 25, 25, 7, 8, 25, 25, 25, - 25, 25, 25, 25, 0 - } ; - -static const flex_int16_t yy_nxt[48] = - { 0, - 4, 4, 5, 6, 7, 8, 4, 9, 10, 11, - 12, 13, 15, 16, 17, 17, 18, 19, 20, 25, - 17, 23, 24, 17, 22, 21, 17, 23, 14, 15, - 14, 22, 21, 17, 14, 25, 3, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25 - } ; - -static const flex_int16_t yy_chk[48] = - { 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 5, 5, 7, 7, 7, 24, 7, 7, 7, 8, - 8, 23, 22, 21, 20, 19, 18, 15, 14, 13, - 12, 11, 10, 9, 6, 3, 25, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25 - } ; - -static const flex_int16_t yy_rule_linenum[5] = - { 0, - 51, 52, 53, 55 - } ; - -/* The intent behind this definition is that it'll catch - * any uses of REJECT which flex missed. 
- */ -#define REJECT reject_used_but_not_detected -#define yymore() yymore_used_but_not_detected -#define YY_MORE_ADJ 0 -#define YY_RESTORE_YY_MORE_OFFSET -#line 1 "search_lexer_plain.l" -#line 2 "search_lexer_plain.l" -#include -#include -#include -#include - -/* Implementation of yyFlexScanner */ -#define SearchScannerDerived SearchScannerPlain -#include "search_scanner_derived.h" -#undef SearchScannerDerived -#undef YY_DECL -#define YY_DECL int infinity::SearchScannerPlain::yylex(infinity::SearchParser::semantic_type * const lval, infinity::SearchParser::location_type *loc) - -/* typedef to make the returns for the tokens shorter */ -using token = infinity::SearchParser::token; - -/* define yyterminate as this instead of NULL */ -#define yyterminate() return( token::END ) - -/* msvc2010 requires that we exclude this header file. */ -#define YY_NO_UNISTD_H - -/* update location on matching */ -#define YY_USER_ACTION loc->step(); loc->columns(yyleng); - -#line 537 "search_lexer_plain.cpp" -#define YY_NO_INPUT 1 -#line 539 "search_lexer_plain.cpp" - -#define INITIAL 0 - -#ifndef YY_NO_UNISTD_H -/* Special case for "unistd.h", since it is non-ANSI. We include it way - * down here because we want the user's section 1 to have been scanned first. - * The user has a chance to override it with an option. - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ -#include -/* %endif */ -#endif - -#ifndef YY_EXTRA_TYPE -#define YY_EXTRA_TYPE void * -#endif - -/* %if-c-only Reentrant structure and macros (non-C++). */ -/* %if-reentrant */ -/* %if-c-only */ -/* %endif */ -/* %if-reentrant */ -/* %endif */ -/* %endif End reentrant structures and macros. */ -/* %if-bison-bridge */ -/* %endif */ -/* %not-for-header */ -/* %ok-for-header */ - -/* %endif */ - -#ifndef yytext_ptr -static void yy_flex_strncpy ( char *, const char *, int ); -#endif - -#ifdef YY_NEED_STRLEN -static int yy_flex_strlen ( const char * ); -#endif - -#ifndef YY_NO_INPUT -/* %if-c-only Standard (non-C++) definition */ -/* %not-for-header */ -/* %ok-for-header */ - -/* %endif */ -#endif - -/* %if-c-only */ -/* %endif */ - -/* Amount of stuff to slurp up with each read. */ -#ifndef YY_READ_BUF_SIZE -#ifdef __ia64__ -/* On IA-64, the buffer size is 16k, not 8k */ -#define YY_READ_BUF_SIZE 16384 -#else -#define YY_READ_BUF_SIZE 8192 -#endif /* __ia64__ */ -#endif - -/* Copy whatever the last rule matched to the standard output. */ -#ifndef ECHO -/* %if-c-only Standard (non-C++) definition */ -/* %endif */ -/* %if-c++-only C++ definition */ -#define ECHO LexerOutput( yytext, yyleng ) -/* %endif */ -#endif - -/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, - * is returned in "result". - */ -#ifndef YY_INPUT -#define YY_INPUT(buf,result,max_size) \ -/* %% [5.0] fread()/read() definition of YY_INPUT goes here unless we're doing C++ \ */\ -\ -/* %if-c++-only C++ definition \ */\ - if ( (int)(result = LexerInput( (char *) buf, max_size )) < 0 ) \ - YY_FATAL_ERROR( "input in flex scanner failed" ); -/* %endif */ - -#endif - -/* No semi-colon after return; correct usage is to write "yyterminate();" - - * we don't want an extra ';' after the "return" because that will cause - * some compilers to complain about unreachable statements. - */ -#ifndef yyterminate -#define yyterminate() return YY_NULL -#endif - -/* Number of entries by which start-condition stack grows. */ -#ifndef YY_START_STACK_INCR -#define YY_START_STACK_INCR 25 -#endif - -/* Report a fatal error. 
*/ -#ifndef YY_FATAL_ERROR -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ -#define YY_FATAL_ERROR(msg) LexerError( msg ) -/* %endif */ -#endif - -/* %if-tables-serialization structures and prototypes */ -/* %not-for-header */ -/* %ok-for-header */ - -/* %not-for-header */ -/* %tables-yydmap generated elements */ -/* %endif */ -/* end tables serialization structures and prototypes */ - -/* %ok-for-header */ - -/* Default declaration of generated scanner - a define so the user can - * easily add parameters. - */ -#ifndef YY_DECL -#define YY_DECL_IS_OURS 1 -/* %if-c-only Standard (non-C++) definition */ -/* %endif */ -/* %if-c++-only C++ definition */ -#define YY_DECL int yyFlexLexer::yylex() -/* %endif */ -#endif /* !YY_DECL */ - -/* Code executed at the beginning of each rule, after yytext and yyleng - * have been set up. - */ -#ifndef YY_USER_ACTION -#define YY_USER_ACTION -#endif - -/* Code executed at the end of each rule. */ -#ifndef YY_BREAK -#define YY_BREAK /*LINTED*/break; -#endif - -/* %% [6.0] YY_RULE_SETUP definition goes here */ -#define YY_RULE_SETUP \ - YY_USER_ACTION - -/* %not-for-header */ -/** The main scanner function which does all the work. - */ -YY_DECL -{ - yy_state_type yy_current_state; - char *yy_cp, *yy_bp; - int yy_act; - - if ( !(yy_init) ) - { - (yy_init) = 1; - -#ifdef YY_USER_INIT - YY_USER_INIT; -#endif - - if ( ! (yy_start) ) - (yy_start) = 1; /* first start state */ - - if ( ! yyin ) -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - yyin.rdbuf(std::cin.rdbuf()); -/* %endif */ - - if ( ! yyout ) -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - yyout.rdbuf(std::cout.rdbuf()); -/* %endif */ - - if ( ! YY_CURRENT_BUFFER ) { - yyensure_buffer_stack (); - YY_CURRENT_BUFFER_LVALUE = - yy_create_buffer( yyin, YY_BUF_SIZE ); - } - - yy_load_buffer_state( ); - } - - { -/* %% [7.0] user's declarations go here */ -#line 45 "search_lexer_plain.l" - - /** Code executed at the beginning of yylex **/ -#line 48 "search_lexer_plain.l" - yylval = lval; - - -#line 738 "search_lexer_plain.cpp" - - while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ - { -/* %% [8.0] yymore()-related code goes here */ - yy_cp = (yy_c_buf_p); - - /* Support of yytext. */ - *yy_cp = (yy_hold_char); - - /* yy_bp points to the position in yy_ch_buf of the start of - * the current run. - */ - yy_bp = yy_cp; - -/* %% [9.0] code to set up and find next match goes here */ - yy_current_state = (yy_start); -yy_match: - do - { - YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; - if ( yy_accept[yy_current_state] ) - { - (yy_last_accepting_state) = yy_current_state; - (yy_last_accepting_cpos) = yy_cp; - } - while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) - { - yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 26 ) - yy_c = yy_meta[yy_c]; - } - yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; - ++yy_cp; - } - while ( yy_current_state != 25 ); - yy_cp = (yy_last_accepting_cpos); - yy_current_state = (yy_last_accepting_state); - -yy_find_action: -/* %% [10.0] code to find the action number goes here */ - yy_act = yy_accept[yy_current_state]; - - YY_DO_BEFORE_ACTION; - -/* %% [11.0] code for yylineno update goes here */ - -do_action: /* This label is used only to access EOF actions. 
*/ - -/* %% [12.0] debug code goes here */ - if ( yy_flex_debug ) - { - if ( yy_act == 0 ) - std::cerr << "--scanner backing up\n"; - else if ( yy_act < 5 ) - std::cerr << "--accepting rule at line " << yy_rule_linenum[yy_act] << - "(\"" << yytext << "\")\n"; - else if ( yy_act == 5 ) - std::cerr << "--accepting default rule (\"" << yytext << "\")\n"; - else if ( yy_act == 6 ) - std::cerr << "--(end of buffer or a NUL)\n"; - else - std::cerr << "--EOF (start condition " << YY_START << ")\n"; - } - - switch ( yy_act ) - { /* beginning of action switch */ -/* %% [13.0] actions go here */ - case 0: /* must back up */ - /* undo the effects of YY_DO_BEFORE_ACTION */ - *yy_cp = (yy_hold_char); - yy_cp = (yy_last_accepting_cpos); - yy_current_state = (yy_last_accepting_state); - goto yy_find_action; - -case 1: -#line 52 "search_lexer_plain.l" -case 2: -#line 53 "search_lexer_plain.l" -case 3: -YY_RULE_SETUP -#line 53 "search_lexer_plain.l" -{ yylval->build(InfString(yytext, false)); return token::STRING; } // https://stackoverflow.com/questions/9611682/flexlexer-support-for-unicode - YY_BREAK -case 4: -/* rule 4 can match eol */ -YY_RULE_SETUP -#line 55 "search_lexer_plain.l" -/* ignore any other character */; - YY_BREAK -case 5: -YY_RULE_SETUP -#line 57 "search_lexer_plain.l" -ECHO; - YY_BREAK -#line 833 "search_lexer_plain.cpp" -case YY_STATE_EOF(INITIAL): - yyterminate(); - - case YY_END_OF_BUFFER: - { - /* Amount of text matched not including the EOB char. */ - int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; - - /* Undo the effects of YY_DO_BEFORE_ACTION. */ - *yy_cp = (yy_hold_char); - YY_RESTORE_YY_MORE_OFFSET - - if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) - { - /* We're scanning a new file or input source. It's - * possible that this happened because the user - * just pointed yyin at a new source and called - * yylex(). If so, then we have to assure - * consistency between YY_CURRENT_BUFFER and our - * globals. Here is the right place to do so, because - * this is the first action (other than possibly a - * back-up) that will match for the new input source. - */ - (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin.rdbuf(); -/* %endif */ - YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; - } - - /* Note that here we test for yy_c_buf_p "<=" to the position - * of the first EOB in the buffer, since yy_c_buf_p will - * already have been incremented past the NUL character - * (since all states make transitions on EOB to the - * end-of-buffer state). Contrast this with the test - * in input(). - */ - if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) - { /* This was really a NUL. */ - yy_state_type yy_next_state; - - (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; - - yy_current_state = yy_get_previous_state( ); - - /* Okay, we're now positioned to make the NUL - * transition. We couldn't have - * yy_get_previous_state() go ahead and do it - * for us because it doesn't know how to deal - * with the possibility of jamming (and we don't - * want to build jamming into it because then it - * will run more slowly). - */ - - yy_next_state = yy_try_NUL_trans( yy_current_state ); - - yy_bp = (yytext_ptr) + YY_MORE_ADJ; - - if ( yy_next_state ) - { - /* Consume the NUL. 
*/ - yy_cp = ++(yy_c_buf_p); - yy_current_state = yy_next_state; - goto yy_match; - } - - else - { -/* %% [14.0] code to do back-up for compressed tables and set up yy_cp goes here */ - yy_cp = (yy_last_accepting_cpos); - yy_current_state = (yy_last_accepting_state); - goto yy_find_action; - } - } - - else switch ( yy_get_next_buffer( ) ) - { - case EOB_ACT_END_OF_FILE: - { - (yy_did_buffer_switch_on_eof) = 0; - - if ( yywrap( ) ) - { - /* Note: because we've taken care in - * yy_get_next_buffer() to have set up - * yytext, we can now set up - * yy_c_buf_p so that if some total - * hoser (like flex itself) wants to - * call the scanner after we return the - * YY_NULL, it'll still work - another - * YY_NULL will get returned. - */ - (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; - - yy_act = YY_STATE_EOF(YY_START); - goto do_action; - } - - else - { - if ( ! (yy_did_buffer_switch_on_eof) ) - YY_NEW_FILE; - } - break; - } - - case EOB_ACT_CONTINUE_SCAN: - (yy_c_buf_p) = - (yytext_ptr) + yy_amount_of_matched_text; - - yy_current_state = yy_get_previous_state( ); - - yy_cp = (yy_c_buf_p); - yy_bp = (yytext_ptr) + YY_MORE_ADJ; - goto yy_match; - - case EOB_ACT_LAST_MATCH: - (yy_c_buf_p) = - &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; - - yy_current_state = yy_get_previous_state( ); - - yy_cp = (yy_c_buf_p); - yy_bp = (yytext_ptr) + YY_MORE_ADJ; - goto yy_find_action; - } - break; - } - - default: - YY_FATAL_ERROR( - "fatal flex scanner internal error--no action found" ); - } /* end of action switch */ - } /* end of scanning one token */ - } /* end of user's declarations */ -} /* end of yylex */ -/* %ok-for-header */ - -/* %if-c++-only */ -/* %not-for-header */ -/* The contents of this function are C++ specific, so the () macro is not used. - * This constructor simply maintains backward compatibility. - * DEPRECATED - */ -yyFlexLexer::yyFlexLexer( std::istream* arg_yyin, std::ostream* arg_yyout ): - yyin(arg_yyin ? arg_yyin->rdbuf() : std::cin.rdbuf()), - yyout(arg_yyout ? arg_yyout->rdbuf() : std::cout.rdbuf()) -{ - ctor_common(); -} - -/* The contents of this function are C++ specific, so the () macro is not used. - */ -yyFlexLexer::yyFlexLexer( std::istream& arg_yyin, std::ostream& arg_yyout ): - yyin(arg_yyin.rdbuf()), - yyout(arg_yyout.rdbuf()) -{ - ctor_common(); -} - -/* The contents of this function are C++ specific, so the () macro is not used. - */ -void yyFlexLexer::ctor_common() -{ - yy_c_buf_p = 0; - yy_init = 0; - yy_start = 0; - yy_flex_debug = 0; - yylineno = 1; // this will only get updated if %option yylineno - - yy_did_buffer_switch_on_eof = 0; - - yy_looking_for_trail_begin = 0; - yy_more_flag = 0; - yy_more_len = 0; - yy_more_offset = yy_prev_more_offset = 0; - - yy_start_stack_ptr = yy_start_stack_depth = 0; - yy_start_stack = NULL; - - yy_buffer_stack = NULL; - yy_buffer_stack_top = 0; - yy_buffer_stack_max = 0; - - yy_state_buf = 0; - -} - -/* The contents of this function are C++ specific, so the () macro is not used. - */ -yyFlexLexer::~yyFlexLexer() -{ - delete [] yy_state_buf; - yyfree( yy_start_stack ); - yy_delete_buffer( YY_CURRENT_BUFFER ); - yyfree( yy_buffer_stack ); -} - -/* The contents of this function are C++ specific, so the () macro is not used. 
- */ -void yyFlexLexer::switch_streams( std::istream& new_in, std::ostream& new_out ) -{ - // was if( new_in ) - yy_delete_buffer( YY_CURRENT_BUFFER ); - yy_switch_to_buffer( yy_create_buffer( new_in, YY_BUF_SIZE ) ); - - // was if( new_out ) - yyout.rdbuf(new_out.rdbuf()); -} - -/* The contents of this function are C++ specific, so the () macro is not used. - */ -void yyFlexLexer::switch_streams( std::istream* new_in, std::ostream* new_out ) -{ - if( ! new_in ) { - new_in = &yyin; - } - - if ( ! new_out ) { - new_out = &yyout; - } - - switch_streams(*new_in, *new_out); -} - -#ifdef YY_INTERACTIVE -int yyFlexLexer::LexerInput( char* buf, int /* max_size */ ) -#else -int yyFlexLexer::LexerInput( char* buf, int max_size ) -#endif -{ - if ( yyin.eof() || yyin.fail() ) - return 0; - -#ifdef YY_INTERACTIVE - yyin.get( buf[0] ); - - if ( yyin.eof() ) - return 0; - - if ( yyin.bad() ) - return -1; - - return 1; - -#else - (void) yyin.read( buf, max_size ); - - if ( yyin.bad() ) - return -1; - else - return yyin.gcount(); -#endif -} - -void yyFlexLexer::LexerOutput( const char* buf, int size ) -{ - (void) yyout.write( buf, size ); -} -/* %ok-for-header */ - -/* %endif */ - -/* yy_get_next_buffer - try to read in a new buffer - * - * Returns a code representing an action: - * EOB_ACT_LAST_MATCH - - * EOB_ACT_CONTINUE_SCAN - continue scanning from current position - * EOB_ACT_END_OF_FILE - end of file - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ -int yyFlexLexer::yy_get_next_buffer() -/* %endif */ -{ - char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; - char *source = (yytext_ptr); - int number_to_move, i; - int ret_val; - - if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) - YY_FATAL_ERROR( - "fatal flex scanner internal error--end of buffer missed" ); - - if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) - { /* Don't try to fill the buffer, so this is an EOF. */ - if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) - { - /* We matched a single character, the EOB, so - * treat this as a final EOF. - */ - return EOB_ACT_END_OF_FILE; - } - - else - { - /* We matched some text prior to the EOB, first - * process it. - */ - return EOB_ACT_LAST_MATCH; - } - } - - /* Try to read more data. */ - - /* First move last chars to start of buffer. */ - number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr) - 1); - - for ( i = 0; i < number_to_move; ++i ) - *(dest++) = *(source++); - - if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) - /* don't do the read, it's not guaranteed to return an EOF, - * just force an EOF - */ - YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; - - else - { - int num_to_read = - YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; - - while ( num_to_read <= 0 ) - { /* Not enough room in the buffer - grow it. */ - - /* just a shorter name for the current buffer */ - YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; - - int yy_c_buf_p_offset = - (int) ((yy_c_buf_p) - b->yy_ch_buf); - - if ( b->yy_is_our_buffer ) - { - int new_size = b->yy_buf_size * 2; - - if ( new_size <= 0 ) - b->yy_buf_size += b->yy_buf_size / 8; - else - b->yy_buf_size *= 2; - - b->yy_ch_buf = (char *) - /* Include room in for 2 EOB chars. */ - yyrealloc( (void *) b->yy_ch_buf, - (yy_size_t) (b->yy_buf_size + 2) ); - } - else - /* Can't grow it, we don't own it. */ - b->yy_ch_buf = NULL; - - if ( ! 
b->yy_ch_buf ) - YY_FATAL_ERROR( - "fatal error - scanner input buffer overflow" ); - - (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; - - num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - - number_to_move - 1; - - } - - if ( num_to_read > YY_READ_BUF_SIZE ) - num_to_read = YY_READ_BUF_SIZE; - - /* Read in more data. */ - YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), - (yy_n_chars), num_to_read ); - - YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); - } - - if ( (yy_n_chars) == 0 ) - { - if ( number_to_move == YY_MORE_ADJ ) - { - ret_val = EOB_ACT_END_OF_FILE; - yyrestart( yyin ); - } - - else - { - ret_val = EOB_ACT_LAST_MATCH; - YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = - YY_BUFFER_EOF_PENDING; - } - } - - else - ret_val = EOB_ACT_CONTINUE_SCAN; - - if (((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { - /* Extend the array by 50%, plus the number we really need. */ - int new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); - YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc( - (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size ); - if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) - YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); - /* "- 2" to take care of EOB's */ - YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2); - } - - (yy_n_chars) += number_to_move; - YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; - YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; - - (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; - - return ret_val; -} - -/* yy_get_previous_state - get the state just before the EOB char was reached */ - -/* %if-c-only */ -/* %not-for-header */ -/* %endif */ -/* %if-c++-only */ - yy_state_type yyFlexLexer::yy_get_previous_state() -/* %endif */ -{ - yy_state_type yy_current_state; - char *yy_cp; - -/* %% [15.0] code to get the start state into yy_current_state goes here */ - yy_current_state = (yy_start); - - for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) - { -/* %% [16.0] code to find the next state goes here */ - YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); - if ( yy_accept[yy_current_state] ) - { - (yy_last_accepting_state) = yy_current_state; - (yy_last_accepting_cpos) = yy_cp; - } - while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) - { - yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 26 ) - yy_c = yy_meta[yy_c]; - } - yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; - } - - return yy_current_state; -} - -/* yy_try_NUL_trans - try to make a transition on the NUL character - * - * synopsis - * next_state = yy_try_NUL_trans( current_state ); - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - yy_state_type yyFlexLexer::yy_try_NUL_trans( yy_state_type yy_current_state ) -/* %endif */ -{ - int yy_is_jam; - /* %% [17.0] code to find the next state, and perhaps do backing up, goes here */ - char *yy_cp = (yy_c_buf_p); - - YY_CHAR yy_c = 1; - if ( yy_accept[yy_current_state] ) - { - (yy_last_accepting_state) = yy_current_state; - (yy_last_accepting_cpos) = yy_cp; - } - while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) - { - yy_current_state = (int) yy_def[yy_current_state]; - if ( yy_current_state >= 26 ) - yy_c = yy_meta[yy_c]; - } - yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; - yy_is_jam = (yy_current_state == 25); - - return yy_is_jam ? 
0 : yy_current_state; -} - -#ifndef YY_NO_UNPUT -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yyunput( int c, char* yy_bp) -/* %endif */ -{ - char *yy_cp; - - yy_cp = (yy_c_buf_p); - - /* undo effects of setting up yytext */ - *yy_cp = (yy_hold_char); - - if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) - { /* need to shift things up to make room */ - /* +2 for EOB chars. */ - int number_to_move = (yy_n_chars) + 2; - char *dest = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[ - YY_CURRENT_BUFFER_LVALUE->yy_buf_size + 2]; - char *source = - &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]; - - while ( source > YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) - *--dest = *--source; - - yy_cp += (int) (dest - source); - yy_bp += (int) (dest - source); - YY_CURRENT_BUFFER_LVALUE->yy_n_chars = - (yy_n_chars) = (int) YY_CURRENT_BUFFER_LVALUE->yy_buf_size; - - if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) - YY_FATAL_ERROR( "flex scanner push-back overflow" ); - } - - *--yy_cp = (char) c; - -/* %% [18.0] update yylineno here */ - - (yytext_ptr) = yy_bp; - (yy_hold_char) = *yy_cp; - (yy_c_buf_p) = yy_cp; -} -/* %if-c-only */ -/* %endif */ -#endif - -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - int yyFlexLexer::yyinput() -/* %endif */ -{ - int c; - - *(yy_c_buf_p) = (yy_hold_char); - - if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) - { - /* yy_c_buf_p now points to the character we want to return. - * If this occurs *before* the EOB characters, then it's a - * valid NUL; if not, then we've hit the end of the buffer. - */ - if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) - /* This was really a NUL. */ - *(yy_c_buf_p) = '\0'; - - else - { /* need more input */ - int offset = (int) ((yy_c_buf_p) - (yytext_ptr)); - ++(yy_c_buf_p); - - switch ( yy_get_next_buffer( ) ) - { - case EOB_ACT_LAST_MATCH: - /* This happens because yy_g_n_b() - * sees that we've accumulated a - * token and flags that we need to - * try matching the token before - * proceeding. But for input(), - * there's no matching to consider. - * So convert the EOB_ACT_LAST_MATCH - * to EOB_ACT_END_OF_FILE. - */ - - /* Reset buffer status. */ - yyrestart( yyin ); - - /*FALLTHROUGH*/ - - case EOB_ACT_END_OF_FILE: - { - if ( yywrap( ) ) - return 0; - - if ( ! (yy_did_buffer_switch_on_eof) ) - YY_NEW_FILE; -#ifdef __cplusplus - return yyinput(); -#else - return input(); -#endif - } - - case EOB_ACT_CONTINUE_SCAN: - (yy_c_buf_p) = (yytext_ptr) + offset; - break; - } - } - } - - c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ - *(yy_c_buf_p) = '\0'; /* preserve yytext */ - (yy_hold_char) = *++(yy_c_buf_p); - -/* %% [19.0] update BOL and yylineno */ - - return c; -} -/* %if-c-only */ -/* %endif */ - -/** Immediately switch to a different input stream. - * @param input_file A readable stream. - * - * @note This function does not reset the start condition to @c INITIAL . - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yyrestart( std::istream& input_file ) -/* %endif */ -{ - - if ( ! YY_CURRENT_BUFFER ){ - yyensure_buffer_stack (); - YY_CURRENT_BUFFER_LVALUE = - yy_create_buffer( yyin, YY_BUF_SIZE ); - } - - yy_init_buffer( YY_CURRENT_BUFFER, input_file ); - yy_load_buffer_state( ); -} - -/* %if-c++-only */ -/** Delegate to the new version that takes an istream reference. - * @param input_file A readable stream. - * - * @note This function does not reset the start condition to @c INITIAL . 
- */ -void yyFlexLexer::yyrestart( std::istream* input_file ) -{ - if( ! input_file ) { - input_file = &yyin; - } - yyrestart( *input_file ); -} -/* %endif */ - -/** Switch to a different input buffer. - * @param new_buffer The new input buffer. - * - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yy_switch_to_buffer( YY_BUFFER_STATE new_buffer ) -/* %endif */ -{ - - /* TODO. We should be able to replace this entire function body - * with - * yypop_buffer_state(); - * yypush_buffer_state(new_buffer); - */ - yyensure_buffer_stack (); - if ( YY_CURRENT_BUFFER == new_buffer ) - return; - - if ( YY_CURRENT_BUFFER ) - { - /* Flush out information for old buffer. */ - *(yy_c_buf_p) = (yy_hold_char); - YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); - YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); - } - - YY_CURRENT_BUFFER_LVALUE = new_buffer; - yy_load_buffer_state( ); - - /* We don't actually know whether we did this switch during - * EOF (yywrap()) processing, but the only time this flag - * is looked at is after yywrap() is called, so it's safe - * to go ahead and always set it. - */ - (yy_did_buffer_switch_on_eof) = 1; -} - -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yy_load_buffer_state() -/* %endif */ -{ - (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; - (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - yyin.rdbuf(YY_CURRENT_BUFFER_LVALUE->yy_input_file); -/* %endif */ - (yy_hold_char) = *(yy_c_buf_p); -} - -/** Allocate and initialize an input buffer state. - * @param file A readable stream. - * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. - * - * @return the allocated buffer state. - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - YY_BUFFER_STATE yyFlexLexer::yy_create_buffer( std::istream& file, int size ) -/* %endif */ -{ - YY_BUFFER_STATE b; - - b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) ); - if ( ! b ) - YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); - - b->yy_buf_size = size; - - /* yy_ch_buf has to be 2 characters longer than the size given because - * we need to put in 2 end-of-buffer characters. - */ - b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) ); - if ( ! b->yy_ch_buf ) - YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); - - b->yy_is_our_buffer = 1; - - yy_init_buffer( b, file ); - - return b; -} - -/* %if-c++-only */ -/** Delegate creation of buffers to the new version that takes an istream reference. - * @param file A readable stream. - * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. - * - * @return the allocated buffer state. - */ - YY_BUFFER_STATE yyFlexLexer::yy_create_buffer( std::istream* file, int size ) -{ - return yy_create_buffer( *file, size ); -} -/* %endif */ - -/** Destroy the buffer. - * @param b a buffer created with yy_create_buffer() - * - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yy_delete_buffer( YY_BUFFER_STATE b ) -/* %endif */ -{ - - if ( ! b ) - return; - - if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ - YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; - - if ( b->yy_is_our_buffer ) - yyfree( (void *) b->yy_ch_buf ); - - yyfree( (void *) b ); -} - -/* Initializes or reinitializes a buffer. 
- * This function is sometimes called more than once on the same buffer, - * such as during a yyrestart() or at EOF. - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yy_init_buffer( YY_BUFFER_STATE b, std::istream& file ) -/* %endif */ - -{ - int oerrno = errno; - - yy_flush_buffer( b ); - -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - b->yy_input_file = file.rdbuf(); -/* %endif */ - b->yy_fill_buffer = 1; - - /* If b is the current buffer, then yy_init_buffer was _probably_ - * called from yyrestart() or through yy_get_next_buffer. - * In that case, we don't want to reset the lineno or column. - */ - if (b != YY_CURRENT_BUFFER){ - b->yy_bs_lineno = 1; - b->yy_bs_column = 0; - } - -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - b->yy_is_interactive = 0; -/* %endif */ - errno = oerrno; -} - -/** Discard all buffered characters. On the next scan, YY_INPUT will be called. - * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. - * - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yy_flush_buffer( YY_BUFFER_STATE b ) -/* %endif */ -{ - if ( ! b ) - return; - - b->yy_n_chars = 0; - - /* We always need two end-of-buffer characters. The first causes - * a transition to the end-of-buffer state. The second causes - * a jam in that state. - */ - b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; - b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; - - b->yy_buf_pos = &b->yy_ch_buf[0]; - - b->yy_at_bol = 1; - b->yy_buffer_status = YY_BUFFER_NEW; - - if ( b == YY_CURRENT_BUFFER ) - yy_load_buffer_state( ); -} - -/* %if-c-or-c++ */ -/** Pushes the new state onto the stack. The new state becomes - * the current state. This function will allocate the stack - * if necessary. - * @param new_buffer The new state. - * - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ -void yyFlexLexer::yypush_buffer_state (YY_BUFFER_STATE new_buffer) -/* %endif */ -{ - if (new_buffer == NULL) - return; - - yyensure_buffer_stack(); - - /* This block is copied from yy_switch_to_buffer. */ - if ( YY_CURRENT_BUFFER ) - { - /* Flush out information for old buffer. */ - *(yy_c_buf_p) = (yy_hold_char); - YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); - YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); - } - - /* Only push if top exists. Otherwise, replace top. */ - if (YY_CURRENT_BUFFER) - (yy_buffer_stack_top)++; - YY_CURRENT_BUFFER_LVALUE = new_buffer; - - /* copied from yy_switch_to_buffer. */ - yy_load_buffer_state( ); - (yy_did_buffer_switch_on_eof) = 1; -} -/* %endif */ - -/* %if-c-or-c++ */ -/** Removes and deletes the top of the stack, if present. - * The next element becomes the new top. - * - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ -void yyFlexLexer::yypop_buffer_state (void) -/* %endif */ -{ - if (!YY_CURRENT_BUFFER) - return; - - yy_delete_buffer(YY_CURRENT_BUFFER ); - YY_CURRENT_BUFFER_LVALUE = NULL; - if ((yy_buffer_stack_top) > 0) - --(yy_buffer_stack_top); - - if (YY_CURRENT_BUFFER) { - yy_load_buffer_state( ); - (yy_did_buffer_switch_on_eof) = 1; - } -} -/* %endif */ - -/* %if-c-or-c++ */ -/* Allocates the stack if it does not exist. - * Guarantees space for at least one push. - */ -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ -void yyFlexLexer::yyensure_buffer_stack(void) -/* %endif */ -{ - yy_size_t num_to_alloc; - - if (!(yy_buffer_stack)) { - - /* First allocation is just for 2 elements, since we don't know if this - * scanner will even need a stack. 
We use 2 instead of 1 to avoid an - * immediate realloc on the next call. - */ - num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... */ - (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc - (num_to_alloc * sizeof(struct yy_buffer_state*) - ); - if ( ! (yy_buffer_stack) ) - YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); - - memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); - - (yy_buffer_stack_max) = num_to_alloc; - (yy_buffer_stack_top) = 0; - return; - } - - if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ - - /* Increase the buffer to prepare for a possible push. */ - yy_size_t grow_size = 8 /* arbitrary grow size */; - - num_to_alloc = (yy_buffer_stack_max) + grow_size; - (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc - ((yy_buffer_stack), - num_to_alloc * sizeof(struct yy_buffer_state*) - ); - if ( ! (yy_buffer_stack) ) - YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); - - /* zero only the new slots.*/ - memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); - (yy_buffer_stack_max) = num_to_alloc; - } -} -/* %endif */ - -/* %if-c-only */ -/* %endif */ - -/* %if-c-only */ -/* %endif */ - -/* %if-c-only */ -/* %endif */ - -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yy_push_state( int _new_state ) -/* %endif */ -{ - if ( (yy_start_stack_ptr) >= (yy_start_stack_depth) ) - { - yy_size_t new_size; - - (yy_start_stack_depth) += YY_START_STACK_INCR; - new_size = (yy_size_t) (yy_start_stack_depth) * sizeof( int ); - - if ( ! (yy_start_stack) ) - (yy_start_stack) = (int *) yyalloc( new_size ); - - else - (yy_start_stack) = (int *) yyrealloc( - (void *) (yy_start_stack), new_size ); - - if ( ! (yy_start_stack) ) - YY_FATAL_ERROR( "out of memory expanding start-condition stack" ); - } - - (yy_start_stack)[(yy_start_stack_ptr)++] = YY_START; - - BEGIN(_new_state); -} - -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - void yyFlexLexer::yy_pop_state() -/* %endif */ -{ - if ( --(yy_start_stack_ptr) < 0 ) - YY_FATAL_ERROR( "start-condition stack underflow" ); - - BEGIN((yy_start_stack)[(yy_start_stack_ptr)]); -} - -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ - int yyFlexLexer::yy_top_state() -/* %endif */ -{ - return (yy_start_stack)[(yy_start_stack_ptr) - 1]; -} - -#ifndef YY_EXIT_FAILURE -#define YY_EXIT_FAILURE 2 -#endif - -/* %if-c-only */ -/* %endif */ -/* %if-c++-only */ -void yyFlexLexer::LexerError( const char* msg ) -{ - std::cerr << msg << std::endl; - exit( YY_EXIT_FAILURE ); -} -/* %endif */ - -/* Redefine yyless() so it works in section 3 code. */ - -#undef yyless -#define yyless(n) \ - do \ - { \ - /* Undo effects of setting up yytext. */ \ - int yyless_macro_arg = (n); \ - YY_LESS_LINENO(yyless_macro_arg);\ - yytext[yyleng] = (yy_hold_char); \ - (yy_c_buf_p) = yytext + yyless_macro_arg; \ - (yy_hold_char) = *(yy_c_buf_p); \ - *(yy_c_buf_p) = '\0'; \ - yyleng = yyless_macro_arg; \ - } \ - while ( 0 ) - -/* Accessor methods (get/set functions) to struct members. */ - -/* %if-c-only */ -/* %if-reentrant */ -/* %endif */ -/* %if-reentrant */ -/* %endif */ -/* %endif */ - -/* %if-reentrant */ -/* %if-bison-bridge */ -/* %endif */ -/* %endif if-c-only */ - -/* %if-c-only */ -/* %endif */ - -/* %if-c-only SNIP! this currently causes conflicts with the c++ scanner */ -/* %if-reentrant */ -/* %endif */ -/* %endif */ - -/* - * Internal utility routines. 
- */ - -#ifndef yytext_ptr -static void yy_flex_strncpy (char* s1, const char * s2, int n ) -{ - - int i; - for ( i = 0; i < n; ++i ) - s1[i] = s2[i]; -} -#endif - -#ifdef YY_NEED_STRLEN -static int yy_flex_strlen (const char * s ) -{ - int n; - for ( n = 0; s[n]; ++n ) - ; - - return n; -} -#endif - -void *yyalloc (yy_size_t size ) -{ - return malloc(size); -} - -void *yyrealloc (void * ptr, yy_size_t size ) -{ - - /* The cast to (char *) in the following accommodates both - * implementations that use char* generic pointers, and those - * that use void* generic pointers. It works with the latter - * because both ANSI C and C++ allow castless assignment from - * any pointer type to void*, and deal with argument conversions - * as though doing an assignment. - */ - return realloc(ptr, size); -} - -void yyfree (void * ptr ) -{ - free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ -} - -/* %if-tables-serialization definitions */ -/* %define-yytables The name for this specific scanner's tables. */ -#define YYTABLES_NAME "yytables" -/* %endif */ - -/* %ok-for-header */ - -#line 57 "search_lexer_plain.l" - - diff --git a/src/parser/search_lexer_plain.l b/src/parser/search_lexer_plain.l deleted file mode 100644 index cac468ee94..0000000000 --- a/src/parser/search_lexer_plain.l +++ /dev/null @@ -1,56 +0,0 @@ -%{ -#include -#include -#include -#include - -/* Implementation of yyFlexScanner */ -#define SearchScannerDerived SearchScannerPlain -#include "search_scanner_derived.h" -#undef SearchScannerDerived -#undef YY_DECL -#define YY_DECL int infinity::SearchScannerPlain::yylex(infinity::SearchParser::semantic_type * const lval, infinity::SearchParser::location_type *loc) - -/* typedef to make the returns for the tokens shorter */ -using token = infinity::SearchParser::token; - -/* define yyterminate as this instead of NULL */ -#define yyterminate() return( token::END ) - -/* msvc2010 requires that we exclude this header file. */ -#define YY_NO_UNISTD_H - -/* update location on matching */ -#define YY_USER_ACTION loc->step(); loc->columns(yyleng); - -%} - -%option c++ -%option yyclass="infinity::SearchScannerPlain" -%option noyywrap nounput batch debug noinput -%option prefix="SearchScannerPlain" -%option warn -%option never-interactive - -ASC [\x00-\x7f] -ASCN [\x00-\t\v-\x7f] -U [\x80-\xbf] -U2 [\xc2-\xdf] -U3 [\xe0-\xef] -U4 [\xf0-\xf4] -UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} -UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} -UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} - -%% -%{ /** Code executed at the beginning of yylex **/ - yylval = lval; -%} - --?[0-9]+("."[0-9]*)? 
| --?"."[0-9]+ | -([a-zA-Z0-9_]|{UONLY})+ { yylval->build(InfString(yytext, false)); return token::STRING; } // https://stackoverflow.com/questions/9611682/flexlexer-support-for-unicode - -.|\n /* ignore any other character */; - -%% diff --git a/src/parser/statement/select_statement.cpp b/src/parser/statement/select_statement.cpp index 762688e88b..fd6e09616a 100644 --- a/src/parser/statement/select_statement.cpp +++ b/src/parser/statement/select_statement.cpp @@ -62,12 +62,12 @@ SelectStatement::~SelectStatement() { having_expr_ = nullptr; } - if (order_by_list != nullptr) { - for (auto &expr_ptr : *order_by_list) { + if (order_by_list_ != nullptr) { + for (auto &expr_ptr : *order_by_list_) { delete expr_ptr; } - delete order_by_list; - order_by_list = nullptr; + delete order_by_list_; + order_by_list_ = nullptr; } if (limit_expr_ != nullptr) { diff --git a/src/parser/statement/select_statement.h b/src/parser/statement/select_statement.h index a4d06d19b7..6118779e28 100644 --- a/src/parser/statement/select_statement.h +++ b/src/parser/statement/select_statement.h @@ -60,13 +60,15 @@ class SelectStatement final : public BaseStatement { ParsedExpr *where_expr_{nullptr}; std::vector *group_by_list_{nullptr}; ParsedExpr *having_expr_{nullptr}; - std::vector *order_by_list{nullptr}; + std::vector *order_by_list_{nullptr}; ParsedExpr *limit_expr_{nullptr}; ParsedExpr *offset_expr_{nullptr}; std::vector *with_exprs_{nullptr}; SetOperatorType set_op_{SetOperatorType::kUnion}; SelectStatement *nested_select_{nullptr}; + + bool total_hits_count_flag_{false}; }; } // namespace infinity diff --git a/src/parser/type/complex/embedding_type.cpp b/src/parser/type/complex/embedding_type.cpp index ef8fe386c4..d734b32e60 100644 --- a/src/parser/type/complex/embedding_type.cpp +++ b/src/parser/type/complex/embedding_type.cpp @@ -14,9 +14,37 @@ #include "embedding_type.h" #include +#include namespace infinity { +bool operator==(const EmbeddingDataType &type, const arrow::Type::type &arrow_type) { + switch (type) { + case EmbeddingDataType::kElemBit: + return arrow_type == arrow::Type::BOOL; + case EmbeddingDataType::kElemUInt8: + return arrow_type == arrow::Type::UINT8; + case EmbeddingDataType::kElemInt8: + return arrow_type == arrow::Type::INT8; + case EmbeddingDataType::kElemInt16: + return arrow_type == arrow::Type::INT16; + case EmbeddingDataType::kElemInt32: + return arrow_type == arrow::Type::INT32; + case EmbeddingDataType::kElemInt64: + return arrow_type == arrow::Type::INT64; + case EmbeddingDataType::kElemFloat: + return arrow_type == arrow::Type::FLOAT; + case EmbeddingDataType::kElemFloat16: + return arrow_type == arrow::Type::HALF_FLOAT; + case EmbeddingDataType::kElemDouble: + return arrow_type == arrow::Type::DOUBLE; + default: + return false; + } +} + +bool operator!=(const EmbeddingDataType &type, const arrow::Type::type &arrow_type) { return !(type == arrow_type); } + size_t EmbeddingType::embedding_type_width[] = { 0, // bit 1, // int8 diff --git a/src/parser/type/complex/embedding_type.h b/src/parser/type/complex/embedding_type.h index fef13a1194..d447956dfd 100644 --- a/src/parser/type/complex/embedding_type.h +++ b/src/parser/type/complex/embedding_type.h @@ -24,6 +24,7 @@ #include #include #include +#include namespace infinity { @@ -41,6 +42,10 @@ enum class EmbeddingDataType : int8_t { kElemInvalid, }; +bool operator==(const EmbeddingDataType &type, const arrow::Type::type &arrow_type); + +bool operator!=(const EmbeddingDataType &type, const arrow::Type::type &arrow_type); + constexpr 
auto to_underlying_val(EmbeddingDataType type) { return static_cast>(type); } template diff --git a/src/parser/type/data_type.cpp b/src/parser/type/data_type.cpp index c45381ba28..df8e15366b 100644 --- a/src/parser/type/data_type.cpp +++ b/src/parser/type/data_type.cpp @@ -26,6 +26,7 @@ #include #include #include +#include namespace infinity { @@ -114,8 +115,101 @@ bool DataType::operator==(const DataType &other) const { } } +bool DataType::operator==(const arrow::DataType &other) const { + switch (type_) { + case LogicalType::kBoolean: + return other.id() == arrow::Type::BOOL; + case LogicalType::kTinyInt: + return other.id() == arrow::Type::INT8; + case LogicalType::kSmallInt: + return other.id() == arrow::Type::INT16; + case LogicalType::kInteger: + return other.id() == arrow::Type::INT32; + case LogicalType::kBigInt: + return other.id() == arrow::Type::INT64; + case LogicalType::kFloat16: + return other.id() == arrow::Type::HALF_FLOAT; + case LogicalType::kBFloat16: + return other.id() == arrow::Type::FLOAT; + case LogicalType::kFloat: + return other.id() == arrow::Type::FLOAT; + case LogicalType::kDouble: + return other.id() == arrow::Type::DOUBLE; + case LogicalType::kDate: + return other.id() == arrow::Type::DATE32; + case LogicalType::kTime: + return other.id() == arrow::Type::TIME32; + case LogicalType::kDateTime: + return other.id() == arrow::Type::TIMESTAMP; + case LogicalType::kTimestamp: + return other.id() == arrow::Type::TIMESTAMP; + case LogicalType::kVarchar: + return other.id() == arrow::Type::STRING; + case LogicalType::kEmbedding: { + auto *embedding_info = static_cast(type_info_.get()); + if (other.id() == arrow::Type::FIXED_SIZE_LIST) { + const auto &list_type = static_cast(other); + return *embedding_info == list_type; + } else if (other.id() == arrow::Type::LIST) { + const auto &list_type = static_cast(other); + return *embedding_info == list_type; + } else { + return false; + } + } + case LogicalType::kSparse: { + const auto *sparse_info = static_cast(type_info_.get()); + if (other.id() != arrow::Type::STRUCT) { + return false; + } + const auto &struct_type = static_cast(other); + return *sparse_info == struct_type; + } + case LogicalType::kMultiVector: + case LogicalType::kTensor: { + auto *embedding_info = static_cast(type_info_.get()); + if (other.id() != arrow::Type::LIST) { + return false; + } + const auto &tensor_type = static_cast(other); + if (tensor_type.value_type()->id() == arrow::Type::FIXED_SIZE_LIST) { + const auto &inner_type = static_cast(*tensor_type.value_field()->type()); + return *embedding_info == inner_type; + } else if (tensor_type.value_type()->id() == arrow::Type::LIST) { + const auto &inner_type = static_cast(*tensor_type.value_field()->type()); + return *embedding_info == inner_type; + } + return false; + } + case LogicalType::kTensorArray: { + auto *embedding_info = static_cast(type_info_.get()); + if (other.id() != arrow::Type::LIST) { + return false; + } + const auto &tensor_array_type = static_cast(other); + if (tensor_array_type.value_type()->id() != arrow::Type::LIST) { + return false; + } + const auto &tensor_type = static_cast(*tensor_array_type.value_field()->type()); + if (tensor_type.value_type()->id() == arrow::Type::FIXED_SIZE_LIST) { + const auto &inner_type = static_cast(*tensor_type.value_field()->type()); + return *embedding_info == inner_type; + } else if (tensor_type.value_type()->id() == arrow::Type::LIST) { + const auto &inner_type = static_cast(*tensor_type.value_field()->type()); + return *embedding_info == 
inner_type; + } + return false; + } + default: { + return false; + } + } +} + bool DataType::operator!=(const DataType &other) const { return !operator==(other); } +bool DataType::operator!=(const arrow::DataType &other) const { return !operator==(other); } + size_t DataType::Size() const { switch (type_) { case LogicalType::kEmbedding: diff --git a/src/parser/type/data_type.h b/src/parser/type/data_type.h index bd5cfad7aa..c089e71429 100644 --- a/src/parser/type/data_type.h +++ b/src/parser/type/data_type.h @@ -21,6 +21,12 @@ #include #include +namespace arrow { + +class DataType; + +} + namespace infinity { class DataType { @@ -54,8 +60,12 @@ class DataType { bool operator==(const DataType &other) const; + bool operator==(const arrow::DataType &other) const; + bool operator!=(const DataType &other) const; + bool operator!=(const arrow::DataType &other) const; + [[nodiscard]] std::string ToString() const; [[nodiscard]] size_t Size() const; diff --git a/src/parser/type/info/embedding_info.cpp b/src/parser/type/info/embedding_info.cpp index 2467292a14..7805f4cf93 100644 --- a/src/parser/type/info/embedding_info.cpp +++ b/src/parser/type/info/embedding_info.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "embedding_info.h" +#include namespace infinity { @@ -24,6 +25,15 @@ bool EmbeddingInfo::operator==(const TypeInfo &other) const { return this->dimension_ == embedding_info_ptr->dimension_ && this->embedding_data_type_ == embedding_info_ptr->embedding_data_type_; } +bool EmbeddingInfo::operator==(const arrow::FixedSizeListType &other) const { + if (static_cast(dimension_) != other.list_size()) { + return false; + } + return embedding_data_type_ == other.value_type()->id(); +} + +bool EmbeddingInfo::operator==(const arrow::ListType &other) const { return embedding_data_type_ == other.value_type()->id(); } + nlohmann::json EmbeddingInfo::Serialize() const { nlohmann::json res; res["dimension"] = dimension_; diff --git a/src/parser/type/info/embedding_info.h b/src/parser/type/info/embedding_info.h index 61f475fe65..2bc102b237 100644 --- a/src/parser/type/info/embedding_info.h +++ b/src/parser/type/info/embedding_info.h @@ -20,6 +20,11 @@ #include +namespace arrow { +class FixedSizeListType; +class ListType; +} + namespace infinity { class EmbeddingInfo : public TypeInfo { @@ -36,6 +41,10 @@ class EmbeddingInfo : public TypeInfo { bool operator==(const TypeInfo &other) const override; + bool operator==(const arrow::FixedSizeListType &other) const; + + bool operator==(const arrow::ListType &other) const; + [[nodiscard]] inline size_t Size() const override { return EmbeddingType::EmbeddingSize(embedding_data_type_, dimension_); } [[nodiscard]] inline EmbeddingDataType Type() const noexcept { return embedding_data_type_; } diff --git a/src/parser/type/info/sparse_info.cpp b/src/parser/type/info/sparse_info.cpp index 34458ee3ce..2a580eb622 100644 --- a/src/parser/type/info/sparse_info.cpp +++ b/src/parser/type/info/sparse_info.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
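
Note on the Arrow type-equivalence operators introduced above: the matching rule is that a FIXED_SIZE_LIST column must agree with the embedding on both element type and width, while a plain LIST only pins the element type (per-row lengths are validated elsewhere). The following self-contained sketch shows the same rule in isolation against the Arrow C++ API; EmbeddingMatches is an illustrative stand-in name, not an API from this patch.

#include <arrow/api.h>
#include <iostream>

// Mirrors the FIXED_SIZE_LIST / LIST matching rule from the patch:
// fixed-size lists must match element type and list size, variable-size
// lists only constrain the element type.
static bool EmbeddingMatches(int32_t dimension, arrow::Type::type elem_id, const arrow::DataType &arrow_type) {
    if (arrow_type.id() == arrow::Type::FIXED_SIZE_LIST) {
        const auto &fsl = static_cast<const arrow::FixedSizeListType &>(arrow_type);
        return fsl.list_size() == dimension && fsl.value_type()->id() == elem_id;
    }
    if (arrow_type.id() == arrow::Type::LIST) {
        const auto &list = static_cast<const arrow::ListType &>(arrow_type);
        return list.value_type()->id() == elem_id;
    }
    return false;
}

int main() {
    auto fixed = arrow::fixed_size_list(arrow::float32(), 4);
    auto var = arrow::list(arrow::float32());
    std::cout << EmbeddingMatches(4, arrow::Type::FLOAT, *fixed) << '\n'; // 1: element type and width match
    std::cout << EmbeddingMatches(8, arrow::Type::FLOAT, *fixed) << '\n'; // 0: width mismatch
    std::cout << EmbeddingMatches(8, arrow::Type::FLOAT, *var) << '\n';   // 1: LIST ignores the width
    return 0;
}
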
#include "sparse_info.h" +#include #include namespace infinity { @@ -65,6 +66,32 @@ bool SparseInfo::operator==(const TypeInfo &other) const { dimension_ == other_sparse_info->dimension_; } +bool SparseInfo::operator==(const arrow::StructType &other) const { + std::shared_ptr index_field = other.GetFieldByName("index"); + if (!index_field) { + return false; + } + auto index_type = std::dynamic_pointer_cast(index_field->type()); + if (!index_type) { + return false; + } + if (index_type_ != index_type->value_type()->id()) { + return false; + } + std::shared_ptr value_field = other.GetFieldByName("value"); + if (!value_field) { + return data_type_ == EmbeddingDataType::kElemBit; + } + if (data_type_ == EmbeddingDataType::kElemBit) { + return false; + } + auto value_type = std::dynamic_pointer_cast(value_field->type()); + if (!value_type) { + return false; + } + return data_type_ == value_type->value_type()->id(); +} + nlohmann::json SparseInfo::Serialize() const { nlohmann::json res; res["data_type"] = data_type_; diff --git a/src/parser/type/info/sparse_info.h b/src/parser/type/info/sparse_info.h index f4163fec1d..0a4bf61659 100644 --- a/src/parser/type/info/sparse_info.h +++ b/src/parser/type/info/sparse_info.h @@ -20,6 +20,10 @@ #include "type/complex/sparse_type.h" #include "type/type_info.h" +namespace arrow { +class StructType; +} + namespace infinity { enum class SparseStoreType : int8_t { @@ -46,6 +50,8 @@ class SparseInfo : public TypeInfo { bool operator==(const TypeInfo &other) const override; + bool operator==(const arrow::StructType &other) const; + [[nodiscard]] inline size_t Size() const override { return sizeof(SparseType); } inline size_t SparseSize(size_t nnz) const { return IndiceSize(nnz) + DataSize(nnz); } diff --git a/src/planner/bound_select_statement.cpp b/src/planner/bound_select_statement.cpp index 2b13a340f7..112540076d 100644 --- a/src/planner/bound_select_statement.cpp +++ b/src/planner/bound_select_statement.cpp @@ -137,12 +137,13 @@ SharedPtr BoundSelectStatement::BuildPlan(QueryContext *query_conte limit_expression_, offset_expression_, order_by_expressions_, - order_by_types_); + order_by_types_, + total_hits_count_flag_); top->set_left_node(root); root = top; } } else if (limit_expression_.get() != nullptr) { - auto limit = MakeShared(bind_context->GetNewLogicalNodeId(), limit_expression_, offset_expression_); + auto limit = MakeShared(bind_context->GetNewLogicalNodeId(), limit_expression_, offset_expression_, total_hits_count_flag_); limit->set_left_node(root); root = limit; } @@ -218,7 +219,9 @@ SharedPtr BoundSelectStatement::BuildPlan(QueryContext *query_conte // option: block max iter = search_ops.options_.find("block_max"); - if (iter == search_ops.options_.end() or iter->second == "true" or iter->second == "bmw") { + if (iter == search_ops.options_.end() || iter->second == "auto") { + match_node->early_term_algo_ = EarlyTermAlgo::kAuto; + } else if (iter->second == "true" || iter->second == "bmw") { match_node->early_term_algo_ = EarlyTermAlgo::kBMW; } else if (iter->second == "batch") { match_node->early_term_algo_ = EarlyTermAlgo::kBatch; @@ -227,8 +230,7 @@ SharedPtr BoundSelectStatement::BuildPlan(QueryContext *query_conte } else if (iter->second == "compare") { match_node->early_term_algo_ = EarlyTermAlgo::kCompare; } else { - Status status = Status::SyntaxError("block_max option must be empty, true, false or compare"); - RecoverableError(status); + RecoverableError(Status::SyntaxError("block_max option must be empty, auto, bmw, true, batch, 
false, or compare")); } // option: top n @@ -387,7 +389,7 @@ SharedPtr BoundSelectStatement::BuildPlan(QueryContext *query_conte } if (limit_expression_.get() != nullptr) { - auto limit = MakeShared(bind_context->GetNewLogicalNodeId(), limit_expression_, offset_expression_); + auto limit = MakeShared(bind_context->GetNewLogicalNodeId(), limit_expression_, offset_expression_, total_hits_count_flag_); limit->set_left_node(root); root = limit; } diff --git a/src/planner/bound_select_statement.cppm b/src/planner/bound_select_statement.cppm index ae8532ec47..e75b524595 100644 --- a/src/planner/bound_select_statement.cppm +++ b/src/planner/bound_select_statement.cppm @@ -100,6 +100,7 @@ public: // Project expression list Vector> projection_expressions_{}; + bool total_hits_count_flag_{false}; // Highlight info Map> highlight_columns_{}; diff --git a/src/planner/explain_ast.cpp b/src/planner/explain_ast.cpp index 949f3d0aac..5a2b89e32c 100644 --- a/src/planner/explain_ast.cpp +++ b/src/planner/explain_ast.cpp @@ -363,11 +363,11 @@ Status ExplainAST::BuildSelect(const SelectStatement *select_statement, result->emplace_back(MakeShared(having_str)); } - if (select_statement->order_by_list != nullptr) { + if (select_statement->order_by_list_ != nullptr) { String order_str = String(intent_size, ' ') + "groupby: "; - SizeT order_count = select_statement->order_by_list->size(); + SizeT order_count = select_statement->order_by_list_->size(); for (SizeT idx = 0; idx < order_count - 1; ++idx) { - OrderByExpr *order_expr = select_statement->order_by_list->at(idx); + OrderByExpr *order_expr = select_statement->order_by_list_->at(idx); order_str += order_expr->expr_->ToString(); if (order_expr->type_ == OrderType::kAsc) { order_str += " Ascending, "; @@ -375,7 +375,7 @@ Status ExplainAST::BuildSelect(const SelectStatement *select_statement, order_str += " Descending, "; } } - OrderByExpr *order_expr = select_statement->order_by_list->back(); + OrderByExpr *order_expr = select_statement->order_by_list_->back(); order_str += order_expr->expr_->ToString(); if (order_expr->type_ == OrderType::kAsc) { order_str += " Ascending"; diff --git a/src/planner/expression_binder.cpp b/src/planner/expression_binder.cpp index 78d01bd0f1..0e5a06136a 100644 --- a/src/planner/expression_binder.cpp +++ b/src/planner/expression_binder.cpp @@ -981,23 +981,26 @@ Optional> ExpressionBinder::TryBuildSpecialFuncExpr(co auto [special_function_ptr, status] = Catalog::GetSpecialFunctionByNameNoExcept(query_context_->storage()->catalog(), expr.func_name_); if (status.ok()) { switch (special_function_ptr->special_type()) { + case SpecialType::kDistanceFactors: case SpecialType::kDistance: { if (!bind_context_ptr->allow_distance) { RecoverableError( - Status::SyntaxError("DISTANCE() needs to be allowed only when there is only MATCH VECTOR with distance metrics, like L2")); + Status::SyntaxError("DISTANCE() / DISTANCE_FACTORS() needs to be allowed only when there is only MATCH VECTOR with distance metrics, like L2")); } break; } + case SpecialType::kSimilarityFactors: case SpecialType::kSimilarity: { if (!bind_context_ptr->allow_similarity) { RecoverableError(Status::SyntaxError( - "SIMILARITY() needs to be allowed only when there is only MATCH VECTOR with similarity metrics, like Inner product")); + "SIMILARITY() / SIMILARITY_FACTORS() needs to be allowed only when there is only MATCH VECTOR with similarity metrics, like Inner product")); } break; } + case SpecialType::kScoreFactors: case SpecialType::kScore: { if 
(!bind_context_ptr->allow_score) { - RecoverableError(Status::SyntaxError("SCORE() requires Fusion or MATCH TEXT or MATCH TENSOR")); + RecoverableError(Status::SyntaxError("SCORE() / SCORE_FACTORS() requires Fusion or MATCH TEXT or MATCH TENSOR")); } break; } diff --git a/src/planner/logical_planner.cpp b/src/planner/logical_planner.cpp index 8f4d30f347..f927e384c3 100644 --- a/src/planner/logical_planner.cpp +++ b/src/planner/logical_planner.cpp @@ -105,6 +105,7 @@ import special_function; import utility; import wal_manager; import infinity_context; +import table_entry; namespace infinity { @@ -732,7 +733,14 @@ Status LogicalPlanner::BuildCreateIndex(const CreateStatement *statement, Shared SharedPtr index_name = MakeShared(std::move(create_index_info->index_name_)); UniquePtr query_binder_ptr = MakeUnique(this->query_context_ptr_, bind_context_ptr); auto base_table_ref = query_binder_ptr->GetTableRef(*schema_name, *table_name); - auto status = base_table_ref->table_entry_ptr_->AddWriteTxnNum(txn); + TableEntry *table_entry = base_table_ref->table_entry_ptr_; + { + TableEntry::TableStatus status; + if (!table_entry->SetCreatingIndex(status, txn)) { + RecoverableError(Status::NotSupport(fmt::format("Cannot create index when table {} status is {}", table_entry->encode(), u8(status)))); + } + } + auto status = table_entry->AddWriteTxnNum(txn); if (!status.ok()) { RecoverableError(status); } @@ -949,7 +957,7 @@ Status LogicalPlanner::BuildCopy(CopyStatement *statement, SharedPtrschema_name_); if (statement->copy_from_) { StorageMode storage_mode = InfinityContext::instance().storage()->GetStorageMode(); - if (storage_mode == StorageMode::kUnInitialized) { + if (storage_mode == StorageMode::kUnInitialized) { UnrecoverableError("Uninitialized storage mode"); } diff --git a/src/planner/node/logical_limit.cppm b/src/planner/node/logical_limit.cppm index 36887e1363..82cde1ff0e 100644 --- a/src/planner/node/logical_limit.cppm +++ b/src/planner/node/logical_limit.cppm @@ -28,9 +28,12 @@ namespace infinity { export class LogicalLimit : public LogicalNode { public: - inline explicit LogicalLimit(u64 node_id, SharedPtr limit_expression, SharedPtr offset_expression) + inline explicit LogicalLimit(u64 node_id, + SharedPtr limit_expression, + SharedPtr offset_expression, + bool total_hits_count_flag) : LogicalNode(node_id, LogicalNodeType::kLimit), limit_expression_(std::move(limit_expression)), - offset_expression_(std::move(offset_expression)) {} + offset_expression_(std::move(offset_expression)), total_hits_count_flag_(total_hits_count_flag) {} [[nodiscard]] Vector GetColumnBindings() const final; @@ -44,6 +47,8 @@ public: SharedPtr limit_expression_{}; SharedPtr offset_expression_{}; + + bool total_hits_count_flag_{false}; }; } // namespace infinity diff --git a/src/planner/node/logical_match.cppm b/src/planner/node/logical_match.cppm index e55d40b65b..ff1705690a 100644 --- a/src/planner/node/logical_match.cppm +++ b/src/planner/node/logical_match.cppm @@ -61,7 +61,7 @@ public: IndexReader index_reader_; UniquePtr query_tree_; float begin_threshold_; - EarlyTermAlgo early_term_algo_{EarlyTermAlgo::kBMW}; + EarlyTermAlgo early_term_algo_{EarlyTermAlgo::kAuto}; u32 top_n_{1}; SharedPtr common_query_filter_{}; diff --git a/src/planner/node/logical_project.cppm b/src/planner/node/logical_project.cppm index 8254a0fdd3..e62d453f14 100644 --- a/src/planner/node/logical_project.cppm +++ b/src/planner/node/logical_project.cppm @@ -52,6 +52,8 @@ public: u64 table_index_{}; Map> highlight_columns_{}; + + bool 
total_hits_count_flag_{false}; }; } // namespace infinity diff --git a/src/planner/node/logical_top.cppm b/src/planner/node/logical_top.cppm index a475421f4b..a2d2fba10d 100644 --- a/src/planner/node/logical_top.cppm +++ b/src/planner/node/logical_top.cppm @@ -36,10 +36,11 @@ public: SharedPtr limit_expression, SharedPtr offset_expression, Vector> sort_expressions, - Vector order_by_types) + Vector order_by_types, + bool total_hits_count_flag) : LogicalNode(node_id, LogicalNodeType::kTop), base_table_ref_(std::move(base_table_ref)), limit_expression_(std::move(limit_expression)), offset_expression_(std::move(offset_expression)), sort_expressions_(std::move(sort_expressions)), - order_by_types_(std::move(order_by_types)) {} + order_by_types_(std::move(order_by_types)), total_hits_count_flag_(total_hits_count_flag) {} [[nodiscard]] Vector GetColumnBindings() const final; @@ -56,6 +57,7 @@ public: SharedPtr offset_expression_{}; Vector> sort_expressions_{}; Vector order_by_types_{}; + bool total_hits_count_flag_{}; }; } // namespace infinity diff --git a/src/planner/optimizer/column_remapper.cpp b/src/planner/optimizer/column_remapper.cpp index eb2dcd4abc..620a4f0703 100644 --- a/src/planner/optimizer/column_remapper.cpp +++ b/src/planner/optimizer/column_remapper.cpp @@ -104,6 +104,15 @@ SharedPtr BindingRemapper::VisitReplace(const SharedPtralias_, column_cnt_ - 2); } + case SpecialType::kScoreFactors: + case SpecialType::kSimilarityFactors: + case SpecialType::kDistanceFactors: { + return ReferenceExpression::Make(expression->Type(), + expression->table_name(), + expression->column_name(), + expression->alias_, + column_cnt_ - 3); + } case SpecialType::kCreateTs: case SpecialType::kDeleteTs: { break; diff --git a/src/planner/optimizer/index_scan/filter_expression_push_down_indexscanfilter.cpp b/src/planner/optimizer/index_scan/filter_expression_push_down_indexscanfilter.cpp index f31e374eb7..7dd49be290 100644 --- a/src/planner/optimizer/index_scan/filter_expression_push_down_indexscanfilter.cpp +++ b/src/planner/optimizer/index_scan/filter_expression_push_down_indexscanfilter.cpp @@ -456,7 +456,7 @@ class IndexScanFilterExpressionPushDownMethod { case Enum::kFilterFulltextExpr: { auto *filter_fulltext_expr = static_cast(index_filter_tree_node.src_ptr->get()); auto index_reader = table_entry_ptr_->GetFullTextIndexReader(query_context_->GetTxn()); - EarlyTermAlgo early_term_algo = EarlyTermAlgo::kNaive; + EarlyTermAlgo early_term_algo = EarlyTermAlgo::kAuto; UniquePtr query_tree; MinimumShouldMatchOption minimum_should_match_option; f32 score_threshold = {}; @@ -474,16 +474,18 @@ class IndexScanFilterExpressionPushDownMethod { // option: block max iter = search_ops.options_.find("block_max"); - if (iter == search_ops.options_.end() || iter->second == "false") { + if (iter == search_ops.options_.end() || iter->second == "auto") { + early_term_algo = EarlyTermAlgo::kAuto; + } else if (iter->second == "batch") { + early_term_algo = EarlyTermAlgo::kBatch; + } else if (iter->second == "false") { early_term_algo = EarlyTermAlgo::kNaive; } else if (iter->second == "true" || iter->second == "bmw") { early_term_algo = EarlyTermAlgo::kBMW; - } else if (iter->second == "batch") { - early_term_algo = EarlyTermAlgo::kBatch; } else if (iter->second == "compare") { early_term_algo = EarlyTermAlgo::kCompare; } else { - RecoverableError(Status::SyntaxError("block_max option must be empty, true, false or compare")); + RecoverableError(Status::SyntaxError("block_max option must be empty, auto, batch, false, 
true, bmw, or compare")); } // option: top n diff --git a/src/planner/optimizer/index_scan/index_filter_evaluators.cpp b/src/planner/optimizer/index_scan/index_filter_evaluators.cpp index 74cb9b5842..d490d6c6fe 100644 --- a/src/planner/optimizer/index_scan/index_filter_evaluators.cpp +++ b/src/planner/optimizer/index_scan/index_filter_evaluators.cpp @@ -481,7 +481,7 @@ Bitmask IndexFilterEvaluatorFulltext::Evaluate(const SegmentID segment_id, const result.SetAllFalse(); const RowID begin_rowid(segment_id, 0); const RowID end_rowid(segment_id, segment_row_count); - const CreateSearchParams params{table_entry_, &index_reader_, early_term_algo_, ft_similarity_, minimum_should_match_, index_names_}; + const CreateSearchParams params{table_entry_, &index_reader_, early_term_algo_, ft_similarity_, minimum_should_match_, 0u, index_names_}; auto ft_iter = query_tree_->CreateSearch(params); if (ft_iter && score_threshold_ > 0.0f) { auto new_ft_iter = MakeUnique(std::move(ft_iter), score_threshold_); @@ -525,6 +525,7 @@ Bitmask IndexFilterEvaluatorAND::Evaluate(const SegmentID segment_id, const Segm fulltext_evaluator_->early_term_algo_, fulltext_evaluator_->ft_similarity_, fulltext_evaluator_->minimum_should_match_, + 0u, fulltext_evaluator_->index_names_}; auto ft_iter = fulltext_evaluator_->query_tree_->CreateSearch(params); if (ft_iter && fulltext_evaluator_->score_threshold_ > 0.0f) { @@ -606,7 +607,6 @@ struct TrunkReader { template struct TrunkReaderT final : TrunkReader { using KeyType = typename TrunkReader::SecondaryIndexOrderedT; - static constexpr u32 data_pair_size = sizeof(KeyType) + sizeof(u32); const u32 segment_row_count_; SharedPtr chunk_index_entry_; u32 begin_pos_ = 0; @@ -614,9 +614,9 @@ struct TrunkReaderT final : TrunkReader { TrunkReaderT(const u32 segment_row_count, const SharedPtr &chunk_index_entry) : segment_row_count_(segment_row_count), chunk_index_entry_(chunk_index_entry) {} u32 GetResultCnt(const Pair interval_range) override { - BufferHandle index_handle_head = chunk_index_entry_->GetIndex(); - auto index = static_cast(index_handle_head.GetData()); - u32 index_data_num = index->GetChunkRowCount(); + const BufferHandle index_handle = chunk_index_entry_->GetIndex(); + const auto index = static_cast(index_handle.GetData()); + const u32 index_data_num = index->GetChunkRowCount(); const auto [begin_val, end_val] = interval_range; // 1. search PGM and get approximate search range // result: @@ -625,117 +625,61 @@ struct TrunkReaderT final : TrunkReader { // 3. size_t upper_bound_; ///< The upper bound of the range. // NOTICE: PGM return a range [lower_bound_, upper_bound_) which must include **one** key when the key exists // NOTICE: but the range may not include the complete [start, end] range - auto [begin_approx_pos, begin_lower, begin_upper] = index->SearchPGM(&begin_val); - auto [end_approx_pos, end_lower, end_upper] = index->SearchPGM(&end_val); + const auto [begin_approx_pos, begin_lower, begin_upper] = index->SearchPGM(&begin_val); + const auto [end_approx_pos, end_lower, end_upper] = index->SearchPGM(&end_val); u32 begin_pos = begin_lower; u32 end_pos = std::min(end_upper, index_data_num - 1); if (end_pos < begin_pos) { return 0; } - const auto column_data_type = chunk_index_entry_->segment_index_entry_->table_index_entry()->column_def()->type(); - const auto index_part_num = chunk_index_entry_->GetPartNum(); - // 2. 
find the exact range - // 2.1 find the exact begin_pos which is the first position that index_key >= begin_val - u32 begin_part_id = begin_pos / 8192; - u32 begin_part_offset = begin_pos % 8192; - auto index_handle_b = chunk_index_entry_->GetIndexPartAt(begin_part_id); - auto index_data_b = index_handle_b.GetData(); - auto index_key_b_ptr = [&index_data_b](u32 i) -> KeyType { - KeyType key = {}; - std::memcpy(&key, static_cast(index_data_b) + i * data_pair_size, sizeof(KeyType)); + const auto [key_ptr, offset_ptr] = index->GetKeyOffsetPointer(); + auto index_key_ptr = [key_ptr](const u32 i) -> KeyType { + KeyType key{}; + std::memcpy(&key, static_cast(key_ptr) + i * sizeof(KeyType), sizeof(KeyType)); return key; }; - auto begin_part_size = chunk_index_entry_->GetPartRowCount(begin_part_id); - if (index_key_b_ptr(begin_part_offset) < begin_val) { + // 2. find the exact range + // 2.1 find the exact begin_pos which is the first position that index_key >= begin_val + if (index_key_ptr(begin_pos) < begin_val) { // search forward - while (index_key_b_ptr(begin_part_offset) < begin_val) { - if (++begin_part_offset == begin_part_size) { - if (++begin_part_id >= index_part_num) { - // nothing found - return 0; - } - index_handle_b = chunk_index_entry_->GetIndexPartAt(begin_part_id); - index_data_b = index_handle_b.GetData(); - begin_part_size = chunk_index_entry_->GetPartRowCount(begin_part_id); - begin_part_offset = 0; + while (index_key_ptr(begin_pos) < begin_val) { + if (++begin_pos == index_data_num) { + // nothing found + return 0; } } } else { // search backward - auto test_begin_part_id = begin_part_id; - auto test_begin_part_offset = begin_part_offset; - while (index_key_b_ptr(test_begin_part_offset) >= begin_val) { + auto test_begin_pos = begin_pos; + while (index_key_ptr(test_begin_pos) >= begin_val) { // keep valid begin_pos - begin_part_id = test_begin_part_id; - begin_part_offset = test_begin_part_offset; - if (test_begin_part_offset-- == 0) { - if (test_begin_part_id-- == 0) { - // left bound is the leftmost - break; - } - index_handle_b = chunk_index_entry_->GetIndexPartAt(test_begin_part_id); - index_data_b = index_handle_b.GetData(); - begin_part_size = chunk_index_entry_->GetPartRowCount(test_begin_part_id); - test_begin_part_offset = begin_part_size - 1; + begin_pos = test_begin_pos; + if (test_begin_pos-- == 0) { + // left bound is the leftmost + break; } } - // recover valid pointers - index_handle_b = chunk_index_entry_->GetIndexPartAt(begin_part_id); - index_data_b = index_handle_b.GetData(); - begin_part_size = chunk_index_entry_->GetPartRowCount(begin_part_id); } - // update begin_pos - begin_pos = begin_part_id * 8192 + begin_part_offset; // 2.2 find the exact end_pos which is the first position that index_key > end_val (or the position past the end) - u32 end_part_id = end_pos / 8192; - u32 end_part_offset = end_pos % 8192; - auto index_handle_e = chunk_index_entry_->GetIndexPartAt(end_part_id); - auto index_data_e = index_handle_e.GetData(); - auto index_key_e_ptr = [&index_data_e](u32 i) -> KeyType { - KeyType key = {}; - std::memcpy(&key, static_cast(index_data_e) + i * data_pair_size, sizeof(KeyType)); - return key; - }; - auto end_part_size = chunk_index_entry_->GetPartRowCount(end_part_id); - if (index_key_e_ptr(end_part_offset) <= end_val) { + if (index_key_ptr(end_pos) <= end_val) { // search forward - while (index_key_e_ptr(end_part_offset) <= end_val) { - if (++end_part_offset == end_part_size) { - if (++end_part_id >= index_part_num) { - // right bound is the 
rightmost - // recover end_part_id and keep end_part_offset - // they will be used to calculate end_pos - --end_part_id; - break; - } - index_handle_e = chunk_index_entry_->GetIndexPartAt(end_part_id); - index_data_e = index_handle_e.GetData(); - end_part_size = chunk_index_entry_->GetPartRowCount(end_part_id); - end_part_offset = 0; + while (index_key_ptr(end_pos) <= end_val) { + if (++end_pos == index_data_num) { + // right bound is the rightmost + break; } } } else { // search backward - auto test_end_part_id = end_part_id; - auto test_end_part_offset = end_part_offset; - while (index_key_e_ptr(test_end_part_offset) > end_val) { - end_part_id = test_end_part_id; - end_part_offset = test_end_part_offset; - if (test_end_part_offset-- == 0) { - if (test_end_part_id-- == 0) { - // nothing found - return 0; - } - index_handle_e = chunk_index_entry_->GetIndexPartAt(test_end_part_id); - index_data_e = index_handle_e.GetData(); - // no need to update end_part_size - test_end_part_offset = chunk_index_entry_->GetPartRowCount(test_end_part_id) - 1; + auto test_end_pos = end_pos; + while (index_key_ptr(test_end_pos) > end_val) { + end_pos = test_end_pos; + if (test_end_pos-- == 0) { + // nothing found + return 0; } } - // does not need to recover valid values like index_handle_e, index_data_e, index_key_e_ptr, end_part_size } - // update end_pos - end_pos = end_part_id * 8192 + end_part_offset; // 3. now we know result size if (end_pos <= begin_pos) { // nothing found @@ -750,27 +694,12 @@ struct TrunkReaderT final : TrunkReader { void OutPut(Bitmask &selected_rows) override { const u32 begin_pos = begin_pos_; const u32 end_pos = end_pos_; - const u32 result_size = end_pos - begin_pos; - u32 begin_part_id = begin_pos / 8192; - u32 begin_part_offset = begin_pos % 8192; - auto index_handle_b = chunk_index_entry_->GetIndexPartAt(begin_part_id); - auto index_data_b = index_handle_b.GetData(); - auto index_offset_b_ptr = [&index_data_b](const u32 i) -> u32 { - u32 result = 0; - std::memcpy(&result, static_cast(index_data_b) + i * data_pair_size + sizeof(KeyType), sizeof(u32)); - return result; - }; - auto begin_part_size = chunk_index_entry_->GetPartRowCount(begin_part_id); + const auto index_handle = chunk_index_entry_->GetIndex(); + const auto index = static_cast(index_handle.GetData()); + const auto [key_ptr, offset_ptr] = index->GetKeyOffsetPointer(); // output result - for (u32 i = 0; i < result_size; ++i) { - if (begin_part_offset == begin_part_size) { - index_handle_b = chunk_index_entry_->GetIndexPartAt(++begin_part_id); - index_data_b = index_handle_b.GetData(); - begin_part_size = chunk_index_entry_->GetPartRowCount(begin_part_id); - begin_part_offset = 0; - } - selected_rows.SetTrue(index_offset_b_ptr(begin_part_offset)); - ++begin_part_offset; + for (u32 i = begin_pos; i < end_pos; ++i) { + selected_rows.SetTrue(offset_ptr[i]); } } }; diff --git a/src/planner/optimizer/lazy_load.cpp b/src/planner/optimizer/lazy_load.cpp index 2908fccd86..7665687372 100644 --- a/src/planner/optimizer/lazy_load.cpp +++ b/src/planner/optimizer/lazy_load.cpp @@ -95,7 +95,10 @@ SharedPtr RefencecColumnCollection::VisitReplace(const SharedPtr case SpecialType::kRowID: case SpecialType::kDistance: case SpecialType::kSimilarity: - case SpecialType::kScore: { + case SpecialType::kScore: + case SpecialType::kDistanceFactors: + case SpecialType::kSimilarityFactors: + case SpecialType::kScoreFactors: { return expression; } default: { diff --git a/src/planner/query_binder.cpp b/src/planner/query_binder.cpp index 
36248fb30d..69c74f9dd0 100644 --- a/src/planner/query_binder.cpp +++ b/src/planner/query_binder.cpp @@ -79,6 +79,7 @@ import txn; import logger; import defer_op; import highlighter; +import txn_store; namespace infinity { @@ -216,13 +217,14 @@ UniquePtr QueryBinder::BindSelect(const SelectStatement &s // TODO: Add projection before sort, limit? // // Push order by expression to projection - // if (statement.order_by_list != nullptr) { + // if (statement.order_by_list_ != nullptr) { // PushOrderByToProject(query_context_ptr_, statement); // } // 11. SELECT (not flatten subquery) BuildSelectList(query_context_ptr_, bound_select_statement); bound_select_statement->aggregate_expressions_ = bind_context_ptr_->aggregate_exprs_; + bound_select_statement->total_hits_count_flag_ = statement.total_hits_count_flag_; // 12. highlight list if (statement.highlight_list_ != nullptr) { @@ -238,7 +240,7 @@ UniquePtr QueryBinder::BindSelect(const SelectStatement &s } // 13. ORDER BY - if (statement.order_by_list != nullptr) { + if (statement.order_by_list_ != nullptr) { BuildOrderBy(query_context_ptr_, statement, bound_select_statement); } @@ -866,7 +868,7 @@ void QueryBinder::BuildHaving(QueryContext *query_context, } void QueryBinder::PushOrderByToProject(QueryContext *, const SelectStatement &statement) { - for (const OrderByExpr *order_by_expr : *statement.order_by_list) { + for (const OrderByExpr *order_by_expr : *statement.order_by_list_) { if (order_by_expr->expr_->type_ == ParsedExprType::kKnn) { continue; } else { @@ -909,7 +911,8 @@ void QueryBinder::BuildSelectList(QueryContext *, UniquePtrhaving_expressions_.empty() || !bound_select_statement->group_by_expressions_.empty() || !bind_context_ptr_->aggregate_exprs_.empty()) { if (!project_binder->BoundColumn().empty()) { - Status status = Status::SyntaxError(fmt::format("Column: {} must appear in the GROUP BY clause or be used in an aggregate function", project_binder->BoundColumn())); + Status status = Status::SyntaxError( + fmt::format("Column: {} must appear in the GROUP BY clause or be used in an aggregate function", project_binder->BoundColumn())); RecoverableError(status); } } @@ -919,10 +922,10 @@ void QueryBinder::BuildOrderBy(QueryContext *query_context, const SelectStatement &statement, UniquePtr &bound_statement) const { auto order_binder = MakeShared(query_context); - SizeT order_by_count = statement.order_by_list->size(); + SizeT order_by_count = statement.order_by_list_->size(); bound_statement->order_by_expressions_.reserve(order_by_count); bound_statement->order_by_types_.reserve(order_by_count); - for (const OrderByExpr *order_expr : *statement.order_by_list) { + for (const OrderByExpr *order_expr : *statement.order_by_list_) { auto bound_order_expr = order_binder->Bind(*order_expr->expr_, this->bind_context_ptr_.get(), 0, true); bound_statement->order_by_types_.emplace_back(order_expr->type_); bound_statement->order_by_expressions_.emplace_back(bound_order_expr); @@ -1011,7 +1014,7 @@ UniquePtr QueryBinder::BindDelete(const DeleteStatement &s if (statement.where_expr_ != nullptr) { auto where_binder = MakeShared(this->query_context_ptr_, bind_alias_proxy); SharedPtr where_expr = where_binder->Bind(*statement.where_expr_, this->bind_context_ptr_.get(), 0, true); - if(where_expr->Type().type() != LogicalType::kBoolean) { + if (where_expr->Type().type() != LogicalType::kBoolean) { Status status = Status::InvalidFilterExpression(where_expr->Type().ToString()); RecoverableError(status); } @@ -1041,7 +1044,7 @@ UniquePtr 
QueryBinder::BindUpdate(const UpdateStatement &s if (statement.where_expr_ != nullptr) { auto where_binder = MakeShared(this->query_context_ptr_, bind_alias_proxy); SharedPtr where_expr = where_binder->Bind(*statement.where_expr_, this->bind_context_ptr_.get(), 0, true); - if(where_expr->Type().type() != LogicalType::kBoolean) { + if (where_expr->Type().type() != LogicalType::kBoolean) { Status status = Status::InvalidFilterExpression(where_expr->Type().ToString()); RecoverableError(status); } @@ -1129,11 +1132,22 @@ UniquePtr QueryBinder::BindCompact(const CompactStatement } base_table_ref = MakeShared(compact_statement.table_entry_, std::move(block_index)); } - TableEntry *table_entry = base_table_ref->table_entry_ptr_;\ + TableEntry *table_entry = base_table_ref->table_entry_ptr_; + { + TxnTableStore *txn_table_store = txn->txn_store()->GetTxnTableStore(table_entry); + txn_table_store->SetCompactType(statement.compact_type_); + } + auto status = table_entry->AddWriteTxnNum(txn); if (!status.ok()) { RecoverableError(status); } + { + TableEntry::TableStatus status; + if (!table_entry->SetCompact(status, txn)) { + RecoverableError(Status::NotSupport(fmt::format("Cannot compact when table_status is {}", u8(status)))); + } + } base_table_ref->index_index_ = table_entry->GetIndexIndex(txn); return MakeUnique(bind_context_ptr_, base_table_ref, statement.compact_type_); diff --git a/src/planner/subquery/subquery_unnest.cpp b/src/planner/subquery/subquery_unnest.cpp index ebb8b3d79c..519fc0cf18 100644 --- a/src/planner/subquery/subquery_unnest.cpp +++ b/src/planner/subquery/subquery_unnest.cpp @@ -101,7 +101,7 @@ SharedPtr SubqueryUnnest::UnnestUncorrelated(SubqueryExpression // Step1 Generate limit operator on the subquery SharedPtr limit_expression = MakeShared(Value::MakeBigInt(1)); SharedPtr offset_expression = MakeShared(Value::MakeBigInt(0)); - SharedPtr limit_node = MakeShared(bind_context->GetNewLogicalNodeId(), limit_expression, offset_expression); + SharedPtr limit_node = MakeShared(bind_context->GetNewLogicalNodeId(), limit_expression, offset_expression, false); limit_node->set_left_node(subquery_plan); // Step2 Generate aggregate first operator on the limit operator diff --git a/src/scheduler/fragment_context.cpp b/src/scheduler/fragment_context.cpp index 8803a0efdf..303644b89d 100644 --- a/src/scheduler/fragment_context.cpp +++ b/src/scheduler/fragment_context.cpp @@ -1454,6 +1454,8 @@ SharedPtr SerialMaterializedFragmentCtx::GetResultInternal() { } SharedPtr result_table = DataTable::MakeResultTable(column_defs); + result_table->total_hits_count_flag_ = materialize_sink_state->total_hits_count_flag_; + result_table->total_hits_count_ = materialize_sink_state->total_hits_count_; for (auto &data_block : materialize_sink_state->data_block_array_) { result_table->UpdateRowCount(data_block->row_count()); result_table->data_blocks_.emplace_back(std::move(data_block)); @@ -1527,6 +1529,7 @@ SharedPtr ParallelMaterializedFragmentCtx::GetResultInternal() { std::set())); } + SizeT total_hits_count = 0; for (const auto &task : tasks_) { if (task->sink_state_->state_type() != SinkStateType::kMaterialize) { String error_message = "Parallel materialized fragment will only have common sink state"; @@ -1537,12 +1540,15 @@ SharedPtr ParallelMaterializedFragmentCtx::GetResultInternal() { if (result_table.get() == nullptr) { result_table = DataTable::MakeResultTable(column_defs); } + result_table->total_hits_count_flag_ = materialize_sink_state->total_hits_count_flag_; + total_hits_count += 
materialize_sink_state->total_hits_count_; for (auto &result_data_block : materialize_sink_state->data_block_array_) { result_table->Append(std::move(result_data_block)); } materialize_sink_state->data_block_array_.clear(); } + result_table->total_hits_count_ = total_hits_count; return result_table; } @@ -1583,10 +1589,11 @@ SharedPtr ParallelStreamFragmentCtx::GetResultInternal() { } auto *materialize_sink_state = static_cast(task->sink_state_.get()); - if (result_table.get() == nullptr) { result_table = DataTable::MakeResultTable(column_defs); } + result_table->total_hits_count_flag_ = materialize_sink_state->total_hits_count_flag_; + result_table->total_hits_count_ = materialize_sink_state->total_hits_count_; for (auto &result_data_block : materialize_sink_state->data_block_array_) { result_table->Append(std::move(result_data_block)); diff --git a/src/scheduler/fragment_data.cppm b/src/scheduler/fragment_data.cppm index 3f8d29e3a7..bdef438f54 100644 --- a/src/scheduler/fragment_data.cppm +++ b/src/scheduler/fragment_data.cppm @@ -37,11 +37,10 @@ export struct FragmentDataBase { }; export struct FragmentError : public FragmentDataBase { -// UniquePtr error_message_{}; + // UniquePtr error_message_{}; Status status_{}; - FragmentError(u64 fragment_id, Status status) - : FragmentDataBase(FragmentDataType::kError, fragment_id), status_(std::move(status)) {} + FragmentError(u64 fragment_id, Status status) : FragmentDataBase(FragmentDataType::kError, fragment_id), status_(std::move(status)) {} }; export struct FragmentData : public FragmentDataBase { @@ -50,10 +49,19 @@ export struct FragmentData : public FragmentDataBase { Optional data_idx_{}; SizeT data_count_{std::numeric_limits::max()}; bool is_last_{false}; + bool total_hits_count_flag_{false}; + SizeT total_hits_count_{}; - FragmentData(u64 fragment_id, UniquePtr data_block, i64 task_id, SizeT data_idx, SizeT data_count, bool is_last) + FragmentData(u64 fragment_id, + UniquePtr data_block, + i64 task_id, + SizeT data_idx, + SizeT data_count, + bool is_last, + bool total_hits_count_flag, + SizeT total_hits_count) : FragmentDataBase(FragmentDataType::kData, fragment_id), data_block_(std::move(data_block)), task_id_(task_id), data_idx_(data_idx), - data_count_(data_count), is_last_(is_last) {} + data_count_(data_count), is_last_(is_last), total_hits_count_flag_(total_hits_count_flag), total_hits_count_(total_hits_count) {} }; export struct FragmentNone : public FragmentDataBase { diff --git a/src/storage/buffer/buffer_manager.cppm b/src/storage/buffer/buffer_manager.cppm index 00102f4f38..007e3634c4 100644 --- a/src/storage/buffer/buffer_manager.cppm +++ b/src/storage/buffer/buffer_manager.cppm @@ -51,7 +51,7 @@ public: explicit BufferManager(u64 memory_limit, SharedPtr data_dir, SharedPtr temp_dir, - PersistenceManager* persistence_manager, + PersistenceManager *persistence_manager, SizeT lru_count = DEFAULT_BUFFER_MANAGER_LRU_COUNT); ~BufferManager(); @@ -85,9 +85,13 @@ public: Vector GetBufferObjectsInfo(); - inline PersistenceManager* persistence_manager() const { - return persistence_manager_; - } + inline PersistenceManager *persistence_manager() const { return persistence_manager_; } + + inline void AddRequestCount() { ++total_request_count_; } + inline void AddCacheMissCount() { ++cache_miss_count_; } + inline u64 TotalRequestCount() { return total_request_count_; } + inline u64 CacheMissCount() { return cache_miss_count_; } + private: friend class BufferObj; @@ -116,7 +120,7 @@ private: SharedPtr data_dir_; SharedPtr temp_dir_; 
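
The request/miss counters added to BufferManager here follow a simple pattern, driven by BufferObj::Load() in the buffer_obj.cpp hunk below: every Load() bumps the request counter, and only the kNew/kFreed paths (data not resident) bump the miss counter, so a hit ratio is derivable. A minimal standalone sketch of that pattern; the HitRatio() helper is hypothetical and not part of this patch.

#include <atomic>
#include <cstdint>
#include <iostream>

struct CacheStats {
    std::atomic<uint64_t> total_request_count_{0};
    std::atomic<uint64_t> cache_miss_count_{0};

    // Same shape as the BufferManager accessors added above.
    void AddRequestCount() { ++total_request_count_; }
    void AddCacheMissCount() { ++cache_miss_count_; }

    // Hypothetical monitoring helper (assumption, not in the patch).
    double HitRatio() const {
        const uint64_t total = total_request_count_.load();
        if (total == 0) return 1.0;
        return 1.0 - static_cast<double>(cache_miss_count_.load()) / static_cast<double>(total);
    }
};

int main() {
    CacheStats stats;
    for (int i = 0; i < 10; ++i) stats.AddRequestCount(); // ten Load() calls
    stats.AddCacheMissCount();                            // one of them hit BufferStatus::kNew
    std::cout << stats.HitRatio() << '\n';                // 0.9
    return 0;
}
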
const u64 memory_limit_{}; - PersistenceManager* persistence_manager_; + PersistenceManager *persistence_manager_; Atomic current_memory_size_{}; std::mutex w_locker_{}; @@ -133,6 +137,9 @@ private: std::mutex temp_locker_{}; HashSet temp_set_; HashSet clean_temp_set_; + + Atomic total_request_count_{0}; + Atomic cache_miss_count_{0}; }; } // namespace infinity diff --git a/src/storage/buffer/buffer_obj.cpp b/src/storage/buffer/buffer_obj.cpp index 3f2ff70739..559d8769f1 100644 --- a/src/storage/buffer/buffer_obj.cpp +++ b/src/storage/buffer/buffer_obj.cpp @@ -68,6 +68,7 @@ void BufferObj::UpdateFileWorkerInfo(UniquePtr new_file_worker) { } BufferHandle BufferObj::Load() { + buffer_mgr_->AddRequestCount(); std::unique_lock locker(w_locker_); switch (status_) { case BufferStatus::kLoaded: { @@ -81,6 +82,7 @@ BufferHandle BufferObj::Load() { break; } case BufferStatus::kFreed: { + buffer_mgr_->AddCacheMissCount(); bool free_success = buffer_mgr_->RequestSpace(GetBufferSize()); if (!free_success) { String error_message = "Out of memory."; @@ -95,6 +97,7 @@ BufferHandle BufferObj::Load() { break; } case BufferStatus::kNew: { + buffer_mgr_->AddCacheMissCount(); LOG_TRACE(fmt::format("Request memory {}", GetBufferSize())); bool free_success = buffer_mgr_->RequestSpace(GetBufferSize()); if (!free_success) { diff --git a/src/storage/buffer/buffer_obj.cppm b/src/storage/buffer/buffer_obj.cppm index 45c79a00de..909b1f2d9a 100644 --- a/src/storage/buffer/buffer_obj.cppm +++ b/src/storage/buffer/buffer_obj.cppm @@ -158,7 +158,7 @@ protected: private: u32 id_; - u32 obj_rc_ = 1; + u32 obj_rc_ = 0; }; } // namespace infinity \ No newline at end of file diff --git a/src/storage/buffer/file_worker/file_worker_type.cppm b/src/storage/buffer/file_worker/file_worker_type.cppm index 277b5b1e86..5c338fe7f8 100644 --- a/src/storage/buffer/file_worker/file_worker_type.cppm +++ b/src/storage/buffer/file_worker/file_worker_type.cppm @@ -29,7 +29,6 @@ export enum class FileWorkerType { kHNSWIndexFile, kRawFile, kSecondaryIndexFile, - kSecondaryIndexPartFile, kVersionDataFile, kIndexFile, kEMVBIndexFile, @@ -57,9 +56,6 @@ export String FileWorkerType2Str(FileWorkerType type) { case FileWorkerType::kSecondaryIndexFile: { return "secondary index"; } - case FileWorkerType::kSecondaryIndexPartFile: { - return "secondary index part"; - } case FileWorkerType::kVersionDataFile: { return "version data"; } diff --git a/src/storage/buffer/file_worker/secondary_index_file_worker.cpp b/src/storage/buffer/file_worker/secondary_index_file_worker.cpp index f32bec32a7..b0eb744073 100644 --- a/src/storage/buffer/file_worker/secondary_index_file_worker.cpp +++ b/src/storage/buffer/file_worker/secondary_index_file_worker.cpp @@ -39,14 +39,12 @@ SecondaryIndexFileWorker::~SecondaryIndexFileWorker() { void SecondaryIndexFileWorker::AllocateInMemory() { if (data_) [[unlikely]] { - String error_message = "AllocateInMemory: Already allocated."; - UnrecoverableError(error_message); + UnrecoverableError("AllocateInMemory: Already allocated."); } else if (auto &data_type = column_def_->type(); data_type->CanBuildSecondaryIndex()) [[likely]] { data_ = static_cast(GetSecondaryIndexData(data_type, row_count_, true)); LOG_TRACE("Finished AllocateInMemory()."); } else { - String error_message = fmt::format("Cannot build secondary index on data type: {}", data_type->ToString()); - UnrecoverableError(error_message); + UnrecoverableError(fmt::format("Cannot build secondary index on data type: {}", data_type->ToString())); } } @@ -57,8 +55,7 @@ void 
SecondaryIndexFileWorker::FreeInMemory() { data_ = nullptr; LOG_TRACE("Finished FreeInMemory(), deleted data_ ptr."); } else { - String error_message = "FreeInMemory: Data is not allocated."; - UnrecoverableError(error_message); + UnrecoverableError("FreeInMemory: Data is not allocated."); } } @@ -69,8 +66,7 @@ bool SecondaryIndexFileWorker::WriteToFileImpl(bool to_spill, bool &prepare_succ prepare_success = true; LOG_TRACE("Finished WriteToFileImpl(bool &prepare_success)."); } else { - String error_message = "WriteToFileImpl: data_ is nullptr"; - UnrecoverableError(error_message); + UnrecoverableError("WriteToFileImpl: data_ is nullptr"); } return true; } @@ -82,92 +78,8 @@ void SecondaryIndexFileWorker::ReadFromFileImpl(SizeT file_size) { data_ = static_cast(index); LOG_TRACE("Finished ReadFromFileImpl()."); } else { - String error_message = "ReadFromFileImpl: data_ is not nullptr"; - UnrecoverableError(error_message); + UnrecoverableError("ReadFromFileImpl: data_ is not nullptr"); } } -SecondaryIndexFileWorkerParts::SecondaryIndexFileWorkerParts(SharedPtr data_dir, - SharedPtr temp_dir, - SharedPtr file_dir, - SharedPtr file_name, - SharedPtr index_base, - SharedPtr column_def, - u32 row_count, - u32 part_id, - PersistenceManager* persistence_manager) - : IndexFileWorker(std::move(data_dir), - std::move(temp_dir), - std::move(file_dir), - std::move(file_name), - std::move(index_base), - column_def, - persistence_manager), - row_count_(row_count), part_id_(part_id) { - data_pair_size_ = GetSecondaryIndexDataPairSize(column_def_->type()); -} - -SecondaryIndexFileWorkerParts::~SecondaryIndexFileWorkerParts() { - if (data_ != nullptr) { - FreeInMemory(); - data_ = nullptr; - } -} - -void SecondaryIndexFileWorkerParts::AllocateInMemory() { - if (row_count_ < part_id_ * 8192) { - String error_message = fmt::format("AllocateInMemory: row_count_: {} < part_id_ * 8192: {}", row_count_, part_id_ * 8192); - UnrecoverableError(error_message); - } - if (data_) [[unlikely]] { - String error_message = "AllocateInMemory: Already allocated."; - UnrecoverableError(error_message); - } else if (auto &data_type = column_def_->type(); data_type->CanBuildSecondaryIndex()) [[likely]] { - data_ = static_cast(new char[part_row_count_ * data_pair_size_]); - LOG_TRACE("Finished AllocateInMemory()."); - } else { - String error_message = fmt::format("Cannot build secondary index on data type: {}", data_type->ToString()); - UnrecoverableError(error_message); - } -} - -void SecondaryIndexFileWorkerParts::FreeInMemory() { - if (data_) [[likely]] { - delete[] static_cast(data_); - data_ = nullptr; - LOG_TRACE("Finished FreeInMemory(), deleted data_ ptr."); - } else { - String error_message = "FreeInMemory: Data is not allocated."; - UnrecoverableError(error_message); - } -} - -bool SecondaryIndexFileWorkerParts::WriteToFileImpl(bool to_spill, bool &prepare_success, const FileWorkerSaveCtx &ctx) { - if (data_) [[likely]] { - file_handle_->Append(data_, part_row_count_ * data_pair_size_); - prepare_success = true; - LOG_TRACE("Finished WriteToFileImpl(bool &prepare_success)."); - } else { - String error_message = "WriteToFileImpl: data_ is nullptr"; - UnrecoverableError(error_message); - } - return true; -} - -void SecondaryIndexFileWorkerParts::ReadFromFileImpl(SizeT file_size) { - if (row_count_ < part_id_ * 8192) { - String error_message = fmt::format("ReadFromFileImpl: row_count_: {} < part_id_ * 8192: {}", row_count_, part_id_ * 8192); - UnrecoverableError(error_message); - } - if (!data_) [[likely]] { - const u32 
read_bytes = part_row_count_ * data_pair_size_; - data_ = static_cast(new char[read_bytes]); - file_handle_->Read(data_, read_bytes); - LOG_TRACE("Finished ReadFromFileImpl()."); - } else { - String error_message = "ReadFromFileImpl: data_ is not nullptr"; - UnrecoverableError(error_message); - } -} - -} // namespace infinity \ No newline at end of file +} // namespace infinity diff --git a/src/storage/buffer/file_worker/secondary_index_file_worker.cppm b/src/storage/buffer/file_worker/secondary_index_file_worker.cppm index 008b636d23..9584030760 100644 --- a/src/storage/buffer/file_worker/secondary_index_file_worker.cppm +++ b/src/storage/buffer/file_worker/secondary_index_file_worker.cppm @@ -65,36 +65,4 @@ protected: const u32 row_count_{}; }; -// row_count * pair -export class SecondaryIndexFileWorkerParts final : public IndexFileWorker { -public: - explicit SecondaryIndexFileWorkerParts(SharedPtr data_dir, - SharedPtr temp_dir, - SharedPtr file_dir, - SharedPtr file_name, - SharedPtr index_base, - SharedPtr column_def, - u32 row_count, - u32 part_id, - PersistenceManager* persistence_manager); - - ~SecondaryIndexFileWorkerParts() override; - - void AllocateInMemory() override; - - void FreeInMemory() override; - - FileWorkerType Type() const override { return FileWorkerType::kSecondaryIndexPartFile; } - -protected: - bool WriteToFileImpl(bool to_spill, bool &prepare_success, const FileWorkerSaveCtx &ctx) override; - - void ReadFromFileImpl(SizeT file_size) override; - - const u32 row_count_; - const u32 part_id_; - u32 part_row_count_ = std::min(8192, row_count_ - part_id_ * 8192); - u32 data_pair_size_ = 0; -}; - -} // namespace infinity \ No newline at end of file +} // namespace infinity diff --git a/src/storage/compaction/DBT_compaction_alg.cpp b/src/storage/compaction/DBT_compaction_alg.cpp index 798f44cb60..e0a5ec223a 100644 --- a/src/storage/compaction/DBT_compaction_alg.cpp +++ b/src/storage/compaction/DBT_compaction_alg.cpp @@ -133,6 +133,7 @@ Vector DBTCompactionAlg::CheckCompaction(TransactionID txn_id) { if (++running_task_n_ == 1) { status_ = CompactionStatus::kRunning; } + LOG_TRACE(fmt::format("CheckCompaction add running_task_n to {}, txn_id: {}", running_task_n_, txn_id)); txn_2_layer_.emplace(txn_id, layer); return compact_segments; @@ -185,8 +186,9 @@ void DBTCompactionAlg::CommitCompact(TransactionID commit_txn_id) { if (--running_task_n_ == 0) { status_ = CompactionStatus::kEnable; - cv_.notify_one(); + cv_.notify_all(); } + LOG_TRACE(fmt::format("CommitCompact subtract running_task_n to {}, txn_id: {}", running_task_n_, commit_txn_id)); } void DBTCompactionAlg::RollbackCompact(TransactionID rollback_txn_id) { @@ -205,16 +207,14 @@ void DBTCompactionAlg::RollbackCompact(TransactionID rollback_txn_id) { } if (--running_task_n_ == 0) { status_ = CompactionStatus::kEnable; + cv_.notify_all(); } + LOG_TRACE(fmt::format("RollbackCompact subtract running_task_n to {}, txn_id: {}", running_task_n_, rollback_txn_id)); } // Must be called when all segments are not compacting void DBTCompactionAlg::Enable(const Vector &segment_entries) { std::unique_lock lock(mtx_); - if (status_ != CompactionStatus::kDisable) { - String error_message = fmt::format("Enable compaction when compaction not disable, {}", (u8)status_); - UnrecoverableError(error_message); - } for (auto *segment_entry : segment_entries) { this->AddSegmentInner(segment_entry); } @@ -225,7 +225,7 @@ void DBTCompactionAlg::Enable(const Vector &segment_entries) { UnrecoverableError(error_message); } status_ =
CompactionStatus::kEnable; - cv_.notify_one(); + cv_.notify_all(); } void DBTCompactionAlg::Disable() { diff --git a/src/storage/data_table.cppm b/src/storage/data_table.cppm index 297ee5dcde..352660a8c5 100644 --- a/src/storage/data_table.cppm +++ b/src/storage/data_table.cppm @@ -103,6 +103,8 @@ public: TableType type_{TableType::kInvalid}; Vector> data_blocks_{}; SharedPtr result_msg_{}; + bool total_hits_count_flag_{false}; + SizeT total_hits_count_{}; }; } // namespace infinity diff --git a/src/storage/invertedindex/column_index_reader.cppm b/src/storage/invertedindex/column_index_reader.cppm index d70f319ff7..70c0d82c04 100644 --- a/src/storage/invertedindex/column_index_reader.cppm +++ b/src/storage/invertedindex/column_index_reader.cppm @@ -22,7 +22,7 @@ import segment_posting; import index_segment_reader; import posting_iterator; import index_defines; -import memory_indexer; +// import memory_indexer; import internal_types; import segment_index_entry; import chunk_index_entry; @@ -32,6 +32,7 @@ namespace infinity { struct TableEntry; class TermDocIterator; class Txn; +class MemoryIndexer; export class ColumnIndexReader { public: diff --git a/src/storage/invertedindex/column_inverter.cpp b/src/storage/invertedindex/column_inverter.cpp index f85a449d73..5654dc8526 100644 --- a/src/storage/invertedindex/column_inverter.cpp +++ b/src/storage/invertedindex/column_inverter.cpp @@ -52,7 +52,7 @@ ColumnInverter::ColumnInverter(PostingWriterProvider posting_writer_provider, Ve void ColumnInverter::InitAnalyzer(const String &analyzer_name) { auto [analyzer, status] = AnalyzerPool::instance().GetAnalyzer(analyzer_name); - if(!status.ok()) { + if (!status.ok()) { Status status = Status::UnexpectedError(fmt::format("Invalid analyzer: {}", analyzer_name)); RecoverableError(status); } @@ -203,11 +203,13 @@ void ColumnInverter::Sort() { 16); } -void ColumnInverter::GeneratePosting() { +MemUsageChange ColumnInverter::GeneratePosting() { u32 last_term_num = std::numeric_limits::max(); u32 last_doc_id = INVALID_DOCID; StringRef last_term, term; SharedPtr posting = nullptr; + MemUsageChange ret{true, 0}; + Map modified_writers; // printf("GeneratePosting() begin begin_doc_id_ %u, doc_count_ %u, merged_ %u", begin_doc_id_, doc_count_, merged_); for (auto &i : positions_) { if (last_term_num != i.term_num_) { @@ -218,6 +220,9 @@ void ColumnInverter::GeneratePosting() { } term = GetTermFromNum(i.term_num_); posting = posting_writer_provider_(String(term.data())); + if (modified_writers.find(term) == modified_writers.end()) { + modified_writers[term] = posting.get(); + } // printf("\nswitched-term-%d-<%s>\n", i.term_num_, term.data()); if (last_term_num != (u32)(-1)) { assert(last_term_num < i.term_num_); @@ -242,6 +247,12 @@ void ColumnInverter::GeneratePosting() { // printf(" EndDocument3-%u\n", last_doc_id); } // printf("GeneratePosting() end begin_doc_id_ %u, doc_count_ %u, merged_ %u", begin_doc_id_, doc_count_, merged_); + for (auto kv : modified_writers) { + PostingWriter *writer = kv.second; + ret.Add(writer->GetSizeChange()); + } + LOG_TRACE(fmt::format("MemUsageChange : {}, {}", ret.is_add_, ret.mem_)); + return ret; } void ColumnInverter::SortForOfflineDump() { @@ -258,7 +269,7 @@ void ColumnInverter::SortForOfflineDump() { // ----------------------------------------------------------------------------------------------------------------------------+ // Data within each group -void ColumnInverter::SpillSortResults(FILE *spill_file, u64 &tuple_count, UniquePtr& buf_writer) { +void 
 ColumnInverter::SpillSortResults(FILE *spill_file, u64 &tuple_count, UniquePtr &buf_writer) {
     // spill sort results for external merge sort
     // if (positions_.empty()) {
     //     return;
@@ -267,19 +278,19 @@ void ColumnInverter::SpillSortResults(FILE *spill_file, u64 &tuple_count, Unique
     // size of this Run in bytes
     u32 data_size = 0;
     u64 data_size_pos = spill_file_tell;
-    buf_writer->Write((const char*)&data_size, sizeof(u32));
+    buf_writer->Write((const char *)&data_size, sizeof(u32));
     spill_file_tell += sizeof(u32);

     // number of tuples
     u32 num_of_tuples = positions_.size();
     tuple_count += num_of_tuples;
-    buf_writer->Write((const char*)&num_of_tuples, sizeof(u32));
+    buf_writer->Write((const char *)&num_of_tuples, sizeof(u32));
     spill_file_tell += sizeof(u32);

     // start offset for next spill
     u64 next_start_offset = 0;
     u64 next_start_offset_pos = spill_file_tell;
-    buf_writer->Write((const char*)&next_start_offset, sizeof(u64));
+    buf_writer->Write((const char *)&next_start_offset, sizeof(u64));
     spill_file_tell += sizeof(u64);
     u64 data_start_offset = spill_file_tell;
@@ -295,11 +306,11 @@ void ColumnInverter::SpillSortResults(FILE *spill_file, u64 &tuple_count, Unique
         }
         record_length = term.size() + sizeof(docid_t) + sizeof(u32) + 1;
-        buf_writer->Write((const char*)&record_length, sizeof(u32));
+        buf_writer->Write((const char *)&record_length, sizeof(u32));
         buf_writer->Write(term.data(), term.size());
-        buf_writer->Write((const char*)&str_null, sizeof(char));
-        buf_writer->Write((const char*)&(i.doc_id_), sizeof(docid_t));
-        buf_writer->Write((const char*)&(i.term_pos_), sizeof(u32));
+        buf_writer->Write((const char *)&str_null, sizeof(char));
+        buf_writer->Write((const char *)&(i.doc_id_), sizeof(docid_t));
+        buf_writer->Write((const char *)&(i.term_pos_), sizeof(u32));
     }
     buf_writer->Flush();
     // update data size
@@ -312,4 +323,4 @@ void ColumnInverter::SpillSortResults(FILE *spill_file, u64 &tuple_count, Unique
     fseek(spill_file, next_start_offset, SEEK_SET);
 }

-} // namespace infinity
\ No newline at end of file
+} // namespace infinity
diff --git a/src/storage/invertedindex/column_inverter.cppm b/src/storage/invertedindex/column_inverter.cppm
index 253bdc46d2..377ac43887 100644
--- a/src/storage/invertedindex/column_inverter.cppm
+++ b/src/storage/invertedindex/column_inverter.cppm
@@ -28,6 +28,7 @@ import internal_types;
 import posting_writer;
 import vector_with_lock;
 import buf_writer;
+import mem_usage_change;

 namespace infinity {

@@ -52,7 +53,7 @@ public:

     void Sort();

-    void GeneratePosting();
+    MemUsageChange GeneratePosting();

     u32 GetDocCount() { return doc_count_; }

@@ -74,7 +75,7 @@ public:
         }
     };

-    void SpillSortResults(FILE *spill_file, u64 &tuple_count, UniquePtr& buf_writer);
+    void SpillSortResults(FILE *spill_file, u64 &tuple_count, UniquePtr &buf_writer);

 private:
     using TermBuffer = Vector;
diff --git a/src/storage/invertedindex/format/doc_list_encoder.cppm b/src/storage/invertedindex/format/doc_list_encoder.cppm
index bf0f36d109..b7d2652c03 100644
--- a/src/storage/invertedindex/format/doc_list_encoder.cppm
+++ b/src/storage/invertedindex/format/doc_list_encoder.cppm
@@ -51,6 +51,8 @@ public:

     PostingByteSlice *GetDocListBuffer() { return &doc_list_buffer_; }

+    inline SizeT GetSizeInBytes() const { return doc_list_buffer_.GetSizeInBytes() + doc_skiplist_writer_->GetSizeInBytes(); }
+
 private:
     void AddDocument(docid_t doc_id, docpayload_t doc_payload, tf_t tf, u32 doc_len);

@@ -78,4 +80,4 @@ private:
     friend class InMemDocListDecoderTest;
 };

-} // namespace infinity
\ No newline at end of file
+} // namespace infinity
diff --git a/src/storage/invertedindex/format/position_list_encoder.cppm b/src/storage/invertedindex/format/position_list_encoder.cppm
index e35484aa42..781509ef78 100644
--- a/src/storage/invertedindex/format/position_list_encoder.cppm
+++ b/src/storage/invertedindex/format/position_list_encoder.cppm
@@ -18,8 +18,7 @@ namespace infinity {

 export class PositionListEncoder {
 public:
-    PositionListEncoder(const PostingFormatOption &format_option,
-                        const PositionListFormat *pos_list_format = nullptr);
+    PositionListEncoder(const PostingFormatOption &format_option, const PositionListFormat *pos_list_format = nullptr);

     ~PositionListEncoder();

@@ -38,6 +37,8 @@ public:

     const PositionListFormat *GetPositionListFormat() const { return pos_list_format_; }

+    inline SizeT GetSizeInBytes() const { return pos_list_buffer_.GetSizeInBytes() + pos_skiplist_writer_->GetSizeInBytes(); }
+
 private:
     void CreatePosSkipListWriter();
     void AddPosSkipListItem(u32 total_pos_count, u32 compressed_pos_size, bool need_flush);
@@ -45,10 +46,10 @@ private:

 private:
     PostingByteSlice pos_list_buffer_;
-    pos_t last_pos_in_cur_doc_; // 4byte
-    u32 total_pos_count_;       // 4byte
+    pos_t last_pos_in_cur_doc_; // 4byte
+    u32 total_pos_count_;       // 4byte
     PostingFormatOption format_option_;
-    bool is_own_format_; // 1byte
+    bool is_own_format_;        // 1byte
     UniquePtr pos_skiplist_writer_;
     const PositionListFormat *pos_list_format_;
 };
diff --git a/src/storage/invertedindex/format/posting_buffer.cppm b/src/storage/invertedindex/format/posting_buffer.cppm
index 38d23ac3e9..0dcc17fbaf 100644
--- a/src/storage/invertedindex/format/posting_buffer.cppm
+++ b/src/storage/invertedindex/format/posting_buffer.cppm
@@ -37,6 +37,8 @@ public:

     u8 Size() const { return size_; }

+    inline SizeT GetSizeInBytes() const { return capacity_ * posting_fields_->GetTotalSize(); }
+
     u8 GetRowCount() const { return posting_fields_->GetSize(); }

     template
diff --git a/src/storage/invertedindex/format/posting_byte_slice.cppm b/src/storage/invertedindex/format/posting_byte_slice.cppm
index a0b42c7db4..b76bc696cd 100644
--- a/src/storage/invertedindex/format/posting_byte_slice.cppm
+++ b/src/storage/invertedindex/format/posting_byte_slice.cppm
@@ -57,6 +57,8 @@ public:

     SizeT EstimateDumpSize() const { return posting_writer_.GetSize(); }

+    inline SizeT GetSizeInBytes() const { return buffer_.GetSizeInBytes() + posting_writer_.GetSize(); }
+
 protected:
     SizeT DoFlush();

@@ -71,4 +73,4 @@ inline void PostingByteSlice::PushBack(u8 row, T value) {
     buffer_.PushBack(row, value);
 }

-} // namespace infinity
\ No newline at end of file
+} // namespace infinity
diff --git a/src/storage/invertedindex/memory_indexer.cpp b/src/storage/invertedindex/memory_indexer.cpp
index 184095bd62..47e7e5570d 100644
--- a/src/storage/invertedindex/memory_indexer.cpp
+++ b/src/storage/invertedindex/memory_indexer.cpp
@@ -66,6 +66,7 @@ import utility;
 import persist_result_handler;
 import virtual_store;
 import local_file_handle;
+import mem_usage_change;

 namespace infinity {
 constexpr int MAX_TUPLE_LENGTH = 1024; // we assume that analyzed term, together with docid/offset info, will never exceed such length
@@ -76,10 +77,16 @@ bool MemoryIndexer::KeyComp::operator()(const String &lhs, const String &rhs) co

 MemoryIndexer::PostingTable::PostingTable() {}

-MemoryIndexer::MemoryIndexer(const String &index_dir, const String &base_name, RowID base_row_id, optionflag_t flag, const String &analyzer)
+MemoryIndexer::MemoryIndexer(const String &index_dir,
+                             const String &base_name,
+                             RowID base_row_id,
+                             optionflag_t flag,
+                             const String &analyzer,
+                             SegmentIndexEntry *segment_index_entry)
     : index_dir_(index_dir), base_name_(base_name), base_row_id_(base_row_id), flag_(flag), posting_format_(PostingFormatOption(flag_)),
       analyzer_(analyzer), inverting_thread_pool_(infinity::InfinityContext::instance().GetFulltextInvertingThreadPool()),
-      commiting_thread_pool_(infinity::InfinityContext::instance().GetFulltextCommitingThreadPool()), ring_inverted_(15UL), ring_sorted_(13UL) {
+      commiting_thread_pool_(infinity::InfinityContext::instance().GetFulltextCommitingThreadPool()), ring_inverted_(15UL), ring_sorted_(13UL),
+      segment_index_entry_(segment_index_entry) {
     assert(std::filesystem::path(index_dir).is_absolute());
     posting_table_ = MakeShared();
     prepared_posting_ = MakeShared(posting_format_, column_lengths_);
@@ -138,6 +145,8 @@ void MemoryIndexer::Insert(SharedPtr column_vector, u32 row_offset
         }
         inverting_thread_pool_.push(std::move(func));
     } else {
+        // mem trace : the column_lengths_;
+        IncreaseMemoryUsage(sizeof(u32) * row_count);
         PostingWriterProvider provider = [this](const String &term) -> SharedPtr { return GetOrAddPosting(term); };
         auto inverter = MakeShared(provider, column_lengths_);
         inverter->InitAnalyzer(this->analyzer_);
@@ -221,6 +230,7 @@ SizeT MemoryIndexer::CommitSync(SizeT wait_if_empty_ms) {
         return 0;
     }

+    MemUsageChange mem_usage_change = {true, 0};
     while (1) {
         this->ring_sorted_.GetBatch(inverters, wait_if_empty_ms);
         // num_merged = inverters.size();
@@ -228,7 +238,7 @@ SizeT MemoryIndexer::CommitSync(SizeT wait_if_empty_ms) {
             break;
         }
         for (auto &inverter : inverters) {
-            inverter->GeneratePosting();
+            mem_usage_change.Add(inverter->GeneratePosting());
             num_generated += inverter->GetMerged();
         }
     }
@@ -239,6 +249,11 @@ SizeT MemoryIndexer::CommitSync(SizeT wait_if_empty_ms) {
             cv_.notify_all();
         }
     }
+    if (mem_usage_change.is_add_) {
+        IncreaseMemoryUsage(mem_usage_change.mem_);
+    } else {
+        DecreaseMemoryUsage(mem_usage_change.mem_);
+    }

     // LOG_INFO(fmt::format("MemoryIndexer::CommitSync sorted {} inverters, generated posting for {} inverters(merged to {}), inflight_tasks_ is {}",
     //                      num_sorted,
@@ -376,6 +391,8 @@ SharedPtr MemoryIndexer::GetOrAddPosting(const String &term) {
     PostingPtr posting;
     bool found = posting_store.GetOrAdd(term, posting, prepared_posting_);
     if (!found) {
+        // mem trace : add term's size
+        IncreaseMemoryUsage(term.size());
        prepared_posting_ = MakeShared(posting_format_, column_lengths_);
     }
     return posting;
@@ -386,6 +403,40 @@ void MemoryIndexer::Reset() {
         posting_table_->store_.Clear();
     }
     column_lengths_.Clear();
+    DecreaseMemoryUsage(mem_used_);
+}
+
+MemIndexTracerInfo MemoryIndexer::GetInfo() const {
+    auto *table_index_entry = segment_index_entry_->table_index_entry();
+    SharedPtr index_name = table_index_entry->GetIndexName();
+    auto *table_entry = table_index_entry->table_index_meta()->GetTableEntry();
+    SharedPtr table_name = table_entry->GetTableName();
+    SharedPtr db_name = table_entry->GetDBName();
+
+    return MemIndexTracerInfo(index_name, table_name, db_name, MemUsed(), doc_count_);
+}
+
+TableIndexEntry *MemoryIndexer::table_index_entry() const { return segment_index_entry_->table_index_entry(); }
+
+SizeT MemoryIndexer::MemUsed() const { return mem_used_; }
+
+void MemoryIndexer::ApplyMemUseChange(MemUsageChange mem_change) {
+    if (mem_change.is_add_) {
+        IncreaseMemoryUsage(mem_change.mem_);
+    } else {
+        DecreaseMemoryUsage(mem_change.mem_);
+    }
+}
+
+void MemoryIndexer::IncreaseMemoryUsage(SizeT mem) {
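    // NOTE: MemUsageChange comes from the mem_usage_change module, whose definition is not
    // part of this diff. From the members and calls visible here (is_add_, mem_, Add(), and
    // the {true, 0} initializer in CommitSync), a minimal sketch consistent with this patch
    // could look as follows; the Add() arithmetic is an assumption for illustration, not the
    // module's actual code:
    //
    //     export struct MemUsageChange {
    //         bool is_add_{true}; // sign of the accumulated delta
    //         SizeT mem_{0};      // magnitude of the delta in bytes
    //
    //         // Fold another signed delta into this one, keeping mem_ a non-negative magnitude.
    //         void Add(MemUsageChange other) {
    //             i64 cur = (is_add_ ? i64(mem_) : -i64(mem_)) + (other.is_add_ ? i64(other.mem_) : -i64(other.mem_));
    //             is_add_ = cur >= 0;
    //             mem_ = SizeT(cur >= 0 ? cur : -cur);
    //         }
    //     };
    //
    // With such a type, CommitSync() above folds one delta per inverter into a single
    // MemUsageChange and applies the net result to the tracer once, instead of touching
    // the tracer for every generated posting.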
+    mem_used_ += mem;
+    BaseMemIndex::IncreaseMemoryUsageBase(mem);
+}
+
+void MemoryIndexer::DecreaseMemoryUsage(SizeT mem) {
+    assert(mem_used_ >= mem);
+    mem_used_ -= mem;
+    BaseMemIndex::DecreaseMemoryUsageBase(mem);
 }

 void MemoryIndexer::TupleListToIndexFile(UniquePtr> &merger) {
@@ -546,4 +597,4 @@ void MemoryIndexer::PrepareSpillFile() {
     buf_writer_ = MakeUnique(spill_file_handle_, write_buf_size);
 }

-} // namespace infinity
\ No newline at end of file
+} // namespace infinity
diff --git a/src/storage/invertedindex/memory_indexer.cppm b/src/storage/invertedindex/memory_indexer.cppm
index 79ffc8755b..12f0a62096 100644
--- a/src/storage/invertedindex/memory_indexer.cppm
+++ b/src/storage/invertedindex/memory_indexer.cppm
@@ -34,11 +34,18 @@ import buf_writer;
 import posting_list_format;
 import external_sort_merger;
 import persistence_manager;
+import base_memindex;
+import memindex_tracer;
+import segment_index_entry;
+import table_index_entry;
+import mem_usage_change;

 namespace infinity {

-export class MemoryIndexer {
+export class MemoryIndexer final : public BaseMemIndex {
 public:
+    void ApplyMemUseChange(MemUsageChange mem_change);
+
     struct KeyComp {
         bool operator()(const String &lhs, const String &rhs) const;
     };
@@ -52,7 +59,12 @@ public:
         PostingTableStore store_;
     };

-    MemoryIndexer(const String &index_dir, const String &base_name, RowID base_row_id, optionflag_t flag, const String &analyzer);
+    MemoryIndexer(const String &index_dir,
+                  const String &base_name,
+                  RowID base_row_id,
+                  optionflag_t flag,
+                  const String &analyzer,
+                  SegmentIndexEntry *segment_index_entry);

     ~MemoryIndexer();

@@ -106,7 +118,19 @@ public:

     void Reset();

+    MemIndexTracerInfo GetInfo() const override;
+
+    TableIndexEntry *table_index_entry() const override;
+
+    SizeT MemUsed() const;
+
 private:
+    // call with write lock
+    void IncreaseMemoryUsage(SizeT mem);
+
+    // call with write lock
+    void DecreaseMemoryUsage(SizeT mem);
+
     // CommitOffline is for offline case. It spill a batch of ColumnInverter. Returns the size of the batch.
     SizeT CommitOffline(SizeT wait_if_empty_ms = 0);
@@ -157,5 +181,8 @@ private:
     UniquePtr spill_buffer_{};
     SizeT spill_buffer_size_{0};
     UniquePtr buf_writer_;
+
+    SegmentIndexEntry *segment_index_entry_{nullptr};
+    Atomic mem_used_{0};
 };
 } // namespace infinity
diff --git a/src/storage/invertedindex/posting_writer.cpp b/src/storage/invertedindex/posting_writer.cpp
index cddcba5927..11577f2960 100644
--- a/src/storage/invertedindex/posting_writer.cpp
+++ b/src/storage/invertedindex/posting_writer.cpp
@@ -13,6 +13,7 @@ import posting_list_format;
 import index_defines;
 import term_meta;
 import vector_with_lock;
+import mem_usage_change;

 module posting_writer;

@@ -106,4 +107,16 @@ InMemPostingDecoder *PostingWriter::CreateInMemPostingDecoder() const {
     return posting_decoder;
 }

-} // namespace infinity
\ No newline at end of file
+
+MemUsageChange PostingWriter::GetSizeChange() {
+    SizeT size = doc_list_encoder_->GetSizeInBytes() + position_list_encoder_->GetSizeInBytes();
+    SizeT last_size = last_size_;
+    last_size_ = size;
+    if (size >= last_size) {
+        return MemUsageChange{true, size - last_size};
+    } else {
+        return MemUsageChange{false, last_size - size};
+    }
+}
+
+} // namespace infinity
diff --git a/src/storage/invertedindex/posting_writer.cppm b/src/storage/invertedindex/posting_writer.cppm
index 7d46ffbd01..6981099b08 100644
--- a/src/storage/invertedindex/posting_writer.cppm
+++ b/src/storage/invertedindex/posting_writer.cppm
@@ -13,6 +13,7 @@ import posting_list_format;
 import index_defines;
 import term_meta;
 import vector_with_lock;
+import mem_usage_change;

 namespace infinity {

 export class PostingWriter {
@@ -47,7 +48,11 @@ public:

     u32 GetDocColumnLength(docid_t doc_id) { return column_lengths_.Get(doc_id); }

+    MemUsageChange GetSizeChange();
+
 private:
+    // for memory tracing
+    SizeT last_size_{0};
     const PostingFormat &posting_format_;
     DocListEncoder *doc_list_encoder_{nullptr};
     PositionListEncoder *position_list_encoder_{nullptr};
@@ -57,4 +62,4 @@ private:

 export using PostingWriterProvider = std::function(const String &)>;

-} // namespace infinity
\ No newline at end of file
+} // namespace infinity
diff --git a/src/storage/invertedindex/search/doc_iterator.cppm b/src/storage/invertedindex/search/doc_iterator.cppm
index e3c3043840..017fea1a0a 100644
--- a/src/storage/invertedindex/search/doc_iterator.cppm
+++ b/src/storage/invertedindex/search/doc_iterator.cppm
@@ -24,10 +24,11 @@ import internal_types;
 namespace infinity {

 export enum class EarlyTermAlgo {
-    kNaive,
-    kBatch,
-    kBMW,
-    kCompare,
+    kAuto,    // choose between kNaive, kBatch, kBMW
+    kNaive,   // naive or
+    kBatch,   // use batch_or if (sum_of_df > total_doc_num / 4) and term nodes under or node achieve a certain number
+    kBMW,     // use bmw if it is "or iterator" on the top level and has only term children
+    kCompare, // compare bmw, batch, naive
 };

 export enum class DocIteratorType : u8 {
diff --git a/src/storage/invertedindex/search/phrase_doc_iterator.cpp b/src/storage/invertedindex/search/phrase_doc_iterator.cpp
index 8a0b2208d9..dc131afc18 100644
--- a/src/storage/invertedindex/search/phrase_doc_iterator.cpp
+++ b/src/storage/invertedindex/search/phrase_doc_iterator.cpp
@@ -140,6 +140,7 @@ bool PhraseDocIterator::GetExactPhraseMatchData() {
             begin_positions.push_back(now_position0);
         }
     }
+#ifdef INFINITY_DEBUG
     if (SHOULD_LOG_DEBUG()) {
         std::ostringstream oss;
         oss << "Phrase \"" << terms_ptr_->at(0);
@@ -152,6 +153,7 @@ bool PhraseDocIterator::GetExactPhraseMatchData() {
         }
         LOG_DEBUG(oss.str());
     }
+#endif
     if (begin_positions.empty()) {
         return false;
     }
@@ -186,7 +188,7 @@ bool PhraseDocIterator::GetSloppyPhraseMatchData() {
         term_pos_i : term i's current position in document
         phrase_pos_i: term_pos_i - pos_i

-        For a solution (term_pos_0, term_pos_1, ..., term_pos_n), it's acceptable iff:
+        For a solution (term_pos_0, term_pos_1, ..., term_pos_n), it's acceptable if:
         for any i, j (0<=i<=n, 0<=j<=n), |phrase_pos_i - phrase_pos_j| <= slop

         For an acceptable solution, its matchLength is:
@@ -265,6 +267,7 @@ bool PhraseDocIterator::GetSloppyPhraseMatchData() {
     for (auto &solution : solutions) {
         tf_ += 1.0F / (1.0F + solution.matchLength);
     }
+#ifdef INFINITY_DEBUG
     if (SHOULD_LOG_DEBUG()) {
         std::ostringstream oss;
         oss << "Phrase \"" << terms_ptr_->at(0);
@@ -283,6 +286,7 @@ bool PhraseDocIterator::GetSloppyPhraseMatchData() {
         }
         LOG_DEBUG(oss.str());
     }
+#endif
     if (!solutions.empty()) {
         doc_freq_++;
         all_tf_.push_back(tf_);
diff --git a/src/storage/invertedindex/search/query_builder.cpp b/src/storage/invertedindex/search/query_builder.cpp
index 3f1e8665cd..af5addb95a 100644
--- a/src/storage/invertedindex/search/query_builder.cpp
+++ b/src/storage/invertedindex/search/query_builder.cpp
@@ -56,6 +56,7 @@ UniquePtr QueryBuilder::CreateSearch(FullTextQueryContext &context)
                               context.early_term_algo_,
                               context.ft_similarity_,
                               context.minimum_should_match_,
+                              context.topn_,
                               context.index_names_};
     auto result = context.optimized_query_tree_->CreateSearch(params);
 #ifdef INFINITY_DEBUG
diff --git a/src/storage/invertedindex/search/query_builder.cppm b/src/storage/invertedindex/search/query_builder.cppm
index cdd8c15199..1657d2cc1e 100644
--- a/src/storage/invertedindex/search/query_builder.cppm
+++ b/src/storage/invertedindex/search/query_builder.cppm
@@ -36,12 +36,14 @@ export struct FullTextQueryContext {
     const FulltextSimilarity ft_similarity_{};
     const MinimumShouldMatchOption minimum_should_match_option_{};
     u32 minimum_should_match_ = 0;
+    u32 topn_ = 0;
     EarlyTermAlgo early_term_algo_ = EarlyTermAlgo::kNaive;
     const Vector &index_names_;

     FullTextQueryContext(const FulltextSimilarity ft_similarity,
                          const MinimumShouldMatchOption &minimum_should_match_option,
+                         const u32 topn,
                          const Vector &index_names)
-        : ft_similarity_(ft_similarity), minimum_should_match_option_(minimum_should_match_option), index_names_(index_names) {}
+        : ft_similarity_(ft_similarity), minimum_should_match_option_(minimum_should_match_option), topn_(topn), index_names_(index_names) {}
 };

 export class QueryBuilder {
diff --git a/src/storage/invertedindex/search/query_node.cpp b/src/storage/invertedindex/search/query_node.cpp
index 079eab2374..c7f39fc4d7 100644
--- a/src/storage/invertedindex/search/query_node.cpp
+++ b/src/storage/invertedindex/search/query_node.cpp
@@ -1,6 +1,7 @@
 #include "query_node.h"
 #include
 #include
+#include

 import stl;
 import third_party;
@@ -528,20 +529,23 @@ std::unique_ptr OrQueryNode::CreateSearch(const CreateSearchParams
         }
     }
     if (sub_doc_iters.size() < 2) {
+        // 0 or 1
         // no need for WAND
         all_are_term = false;
         all_are_term_or_phrase = false;
     }
     const u32 msm_bar = keyword_iters.empty() ? 1u : 0u;
-    if (sub_doc_iters.empty() && keyword_iters.empty()) {
-        return nullptr;
-    } else if (sub_doc_iters.size() + keyword_iters.size() == 1) {
-        return only_child->CreateSearch(params, is_top_level);
-    } else if (is_top_level && all_are_term && params.ft_similarity == FulltextSimilarity::kBM25 && params.early_term_algo == EarlyTermAlgo::kBMW) {
+    auto GetIterResultT = [&]() -> std::unique_ptr {
         if (params.minimum_should_match > sub_doc_iters.size()) {
             return nullptr;
         } else if (params.minimum_should_match <= msm_bar) {
-            auto msm_iter = MakeUnique(std::move(sub_doc_iters));
+            if constexpr (std::is_same_v) {
+                sub_doc_iters.insert(sub_doc_iters.end(),
+                                     std::make_move_iterator(keyword_iters.begin()),
+                                     std::make_move_iterator(keyword_iters.end()));
+                return MakeUnique(std::move(sub_doc_iters));
+            }
+            auto msm_iter = MakeUnique(std::move(sub_doc_iters));
             if (keyword_iters.empty()) {
                 return msm_iter;
             } else {
@@ -552,9 +556,10 @@ std::unique_ptr OrQueryNode::CreateSearch(const CreateSearchParams
             // must use minimum_should_match
             UniquePtr msm_iter;
             if (params.minimum_should_match <= 1) {
-                msm_iter = MakeUnique(std::move(sub_doc_iters));
+                msm_iter = MakeUnique(std::move(sub_doc_iters));
             } else if (params.minimum_should_match < sub_doc_iters.size()) {
-                msm_iter = MakeUnique>(std::move(sub_doc_iters), params.minimum_should_match);
+                using MSM_T = std::conditional_t, MinimumShouldMatchIterator, MinimumShouldMatchWrapper>;
+                msm_iter = MakeUnique<MSM_T>(std::move(sub_doc_iters), params.minimum_should_match);
             } else {
                 msm_iter = MakeUnique(std::move(sub_doc_iters));
             }
@@ -565,64 +570,105 @@ std::unique_ptr OrQueryNode::CreateSearch(const CreateSearchParams
                 return MakeUnique(std::move(keyword_iters));
             }
         }
-    } else if (is_top_level && all_are_term && params.ft_similarity == FulltextSimilarity::kBM25 && params.early_term_algo == EarlyTermAlgo::kBatch) {
-        if (params.minimum_should_match > sub_doc_iters.size()) {
-            return nullptr;
-        } else if (params.minimum_should_match <= msm_bar) {
-            auto msm_iter = MakeUnique(std::move(sub_doc_iters));
-            if (keyword_iters.empty()) {
-                return msm_iter;
-            } else {
-                keyword_iters.emplace_back(std::move(msm_iter));
-                return MakeUnique(std::move(keyword_iters));
+    };
+    auto term_num_threshold = [](const u32 topn) -> u32 {
+        if (topn < 5u) {
+            return std::numeric_limits::max();
+        }
+        if (topn <= 10u) {
+            return 500u / topn;
+        }
+        return 50u / std::log10f(topn);
+    };
+    if (sub_doc_iters.empty() && keyword_iters.empty()) {
+        return nullptr;
+    }
+    if (sub_doc_iters.size() + keyword_iters.size() == 1) {
+        return only_child->CreateSearch(params, is_top_level);
+    }
+    if (is_top_level && all_are_term && params.ft_similarity == FulltextSimilarity::kBM25) {
+        auto choose_algo = EarlyTermAlgo::kNaive;
+        switch (params.early_term_algo) {
+            case EarlyTermAlgo::kAuto: {
+                if (params.topn > 0u && sub_doc_iters.size() <= term_num_threshold(params.topn)) {
+                    choose_algo = EarlyTermAlgo::kBMW;
+                } else {
+                    // check df
+                    const auto total_df = static_cast(sub_doc_iters.front().get())->GetTotalDF();
+                    u64 df_sum = 0u;
+                    for (const auto &iter : sub_doc_iters) {
+                        df_sum += static_cast(iter.get())->GetDocFreq();
+                    }
+                    if (df_sum * 5ull < total_df) {
+                        choose_algo = EarlyTermAlgo::kBMW;
+                    } else {
+                        choose_algo = EarlyTermAlgo::kBatch;
+                    }
+                }
+                break;
             }
-        } else {
-            // must use minimum_should_match
-            UniquePtr msm_iter;
-            if (params.minimum_should_match <= 1) {
-                msm_iter = MakeUnique(std::move(sub_doc_iters));
-            } else if (params.minimum_should_match < sub_doc_iters.size()) {
-                msm_iter = MakeUnique>(std::move(sub_doc_iters), params.minimum_should_match);
-            } else {
-                msm_iter = MakeUnique(std::move(sub_doc_iters));
+            case EarlyTermAlgo::kBMW:
+            case EarlyTermAlgo::kBatch:
+            case EarlyTermAlgo::kNaive: {
+                choose_algo = params.early_term_algo;
+                break;
             }
-            if (keyword_iters.empty()) {
-                return msm_iter;
-            } else {
-                keyword_iters.insert(keyword_iters.begin(), std::move(msm_iter));
-                return MakeUnique(std::move(keyword_iters));
+            case EarlyTermAlgo::kCompare: {
+                UnrecoverableError("OrQueryNode: EarlyTermAlgo::kCompare is not allowed here");
+                break;
             }
         }
-    } else if (all_are_term_or_phrase) {
-        if (params.minimum_should_match > sub_doc_iters.size()) {
-            return nullptr;
-        } else if (params.minimum_should_match <= msm_bar) {
-            sub_doc_iters.insert(sub_doc_iters.end(), std::make_move_iterator(keyword_iters.begin()), std::make_move_iterator(keyword_iters.end()));
-            return MakeUnique(std::move(sub_doc_iters));
-        } else {
-            // must use minimum_should_match
-            UniquePtr msm_iter;
-            if (params.minimum_should_match <= 1) {
-                msm_iter = MakeUnique(std::move(sub_doc_iters));
-            } else if (params.minimum_should_match < sub_doc_iters.size()) {
-                msm_iter = MakeUnique(std::move(sub_doc_iters), params.minimum_should_match);
-            } else {
-                msm_iter = MakeUnique(std::move(sub_doc_iters));
+        if (choose_algo == EarlyTermAlgo::kBMW) {
+            return GetIterResultT.template operator()();
+        } else if (choose_algo == EarlyTermAlgo::kBatch) {
+            return GetIterResultT.template operator()();
+        } else if (choose_algo == EarlyTermAlgo::kNaive) {
+            return GetIterResultT.template operator()();
+        }
+        UnrecoverableError("Unreachable code");
+        return nullptr;
+    }
+    if ((params.early_term_algo == EarlyTermAlgo::kAuto || params.early_term_algo == EarlyTermAlgo::kBatch) &&
+        params.ft_similarity == FulltextSimilarity::kBM25) {
+        // try to apply batch when possible
+        // collect all term children info
+        u64 total_df = 0u;
+        u64 df_sum = 0u;
+        for (const auto &iter : sub_doc_iters) {
+            if (iter->GetType() == DocIteratorType::kTermDocIterator) {
+                const auto tdi = static_cast(iter.get());
+                total_df = tdi->GetTotalDF();
+                df_sum += tdi->GetDocFreq();
             }
-            if (keyword_iters.empty()) {
-                return msm_iter;
+        }
+        if (df_sum && (df_sum * 5ull >= total_df)) {
+            // must have child other than term
+            Vector> term_iters;
+            Vector> not_term_iters = std::move(keyword_iters);
+            for (auto &iter : sub_doc_iters) {
+                if (iter->GetType() == DocIteratorType::kTermDocIterator) {
+                    term_iters.emplace_back(std::move(iter));
+                } else {
+                    not_term_iters.emplace_back(std::move(iter));
+                }
+            }
+            auto batch_or_iter = MakeUnique(std::move(term_iters));
+            not_term_iters.emplace_back(std::move(batch_or_iter));
+            if (params.minimum_should_match <= 0) {
+                return MakeUnique(std::move(not_term_iters));
             } else {
-                keyword_iters.insert(keyword_iters.begin(), std::move(msm_iter));
-                return MakeUnique(std::move(keyword_iters));
+                return MakeUnique>(std::move(not_term_iters), params.minimum_should_match);
             }
         }
+    }
+    if (all_are_term_or_phrase) {
+        return GetIterResultT.template operator()();
+    }
+    sub_doc_iters.insert(sub_doc_iters.end(), std::make_move_iterator(keyword_iters.begin()), std::make_move_iterator(keyword_iters.end()));
+    if (params.minimum_should_match <= 0) {
+        return MakeUnique(std::move(sub_doc_iters));
     } else {
-        sub_doc_iters.insert(sub_doc_iters.end(), std::make_move_iterator(keyword_iters.begin()), std::make_move_iterator(keyword_iters.end()));
-        if (params.minimum_should_match <= msm_bar) {
-            return MakeUnique(std::move(sub_doc_iters));
-        } else {
-            return MakeUnique>(std::move(sub_doc_iters), params.minimum_should_match);
-        }
+        return MakeUnique>(std::move(sub_doc_iters), params.minimum_should_match);
     }
 }
diff --git a/src/storage/invertedindex/search/query_node.h b/src/storage/invertedindex/search/query_node.h
index 81ad61761f..4a26d9f5c6 100644
--- a/src/storage/invertedindex/search/query_node.h
+++ b/src/storage/invertedindex/search/query_node.h
@@ -60,8 +60,9 @@ struct CreateSearchParams {
     EarlyTermAlgo early_term_algo;
     FulltextSimilarity ft_similarity;
     uint32_t minimum_should_match;
+    uint32_t topn;
     const std::vector &index_names_;

-    [[nodiscard]] CreateSearchParams RemoveMSM() const { return {table_entry, index_reader, early_term_algo, ft_similarity, 0, index_names_}; }
+    [[nodiscard]] CreateSearchParams RemoveMSM() const { return {table_entry, index_reader, early_term_algo, ft_similarity, 0, topn, index_names_}; }
 };

 // step 1. get the query tree from parser
diff --git a/src/storage/invertedindex/search/search_driver.cpp b/src/storage/invertedindex/search/search_driver.cpp
index 397839d9e1..f346928c23 100644
--- a/src/storage/invertedindex/search/search_driver.cpp
+++ b/src/storage/invertedindex/search/search_driver.cpp
@@ -23,9 +23,6 @@
 #define SearchScannerSuffix InfinitySyntax
 #include "search_scanner_derived_helper.h"
 #undef SearchScannerSuffix
-// #define SearchScannerSuffix Plain
-// #include "search_scanner_derived_helper.h"
-// #undef SearchScannerSuffix

 import stl;
 import term;
@@ -275,7 +272,7 @@ SearchDriver::AnalyzeAndBuildQueryNode(const std::string &field, const std::stri

 // Unescape reserved characters per https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
 // Shall keep sync with ESCAPEABLE in search_lexer.l
-// [\x20+\-=&|!(){}\[\]^"~*?:\\/]
+// [\x20()^"'~*?:\\]
 std::string SearchDriver::Unescape(const std::string &text) {
     std::string result;
     result.reserve(text.size());
@@ -283,26 +280,16 @@ std::string SearchDriver::Unescape(const std::string &text) {
         if (text[i] == '\\' && i + 1 < text.size()) {
             switch (text[i + 1]) {
                 case ' ':
-                case '+':
-                case '-':
-                case '=':
-                case '&':
-                case '|':
-                case '!':
                 case '(':
                 case ')':
-                case '{':
-                case '}':
-                case '[':
-                case ']':
                 case '^':
                 case '"':
+                case '\'':
                 case '~':
                 case '*':
                 case '?':
                 case ':':
                 case '\\':
-                case '/':
                     result.push_back(text[i + 1]);
                     ++i;
                     break;
diff --git a/src/storage/invertedindex/search/term_doc_iterator.cpp b/src/storage/invertedindex/search/term_doc_iterator.cpp
index 9c58fdc892..915734af1a 100644
--- a/src/storage/invertedindex/search/term_doc_iterator.cpp
+++ b/src/storage/invertedindex/search/term_doc_iterator.cpp
@@ -52,6 +52,7 @@ void TermDocIterator::InitBM25Info(UniquePtr &&colum
     column_length_reader_ = std::move(column_length_reader);
     avg_column_len_ = column_length_reader_->GetAvgColumnLength();

+    total_df_ = column_length_reader_->GetTotalDF();
     const float smooth_idf = std::log(1.0F + (column_length_reader_->GetTotalDF() - doc_freq_ + 0.5F) / (doc_freq_ + 0.5F));
     bm25_common_score_ = weight_ * smooth_idf * (k1 + 1.0F);
     bm25_score_upper_bound_ = bm25_common_score_ / (1.0F + k1 * b / avg_column_len_);
diff --git a/src/storage/invertedindex/search/term_doc_iterator.cppm b/src/storage/invertedindex/search/term_doc_iterator.cppm
index 98348bb76e..18bf99d6fe 100644
--- a/src/storage/invertedindex/search/term_doc_iterator.cppm
+++ b/src/storage/invertedindex/search/term_doc_iterator.cppm
@@ -42,6 +42,8 @@ public:

     inline u32 GetDocFreq() const { return doc_freq_; }

+    inline auto GetTotalDF() const { return total_df_; }
+
     u64 GetTermFreq() const { return term_freq_; }

     void InitBM25Info(UniquePtr &&column_length_reader);
@@ -96,6 +98,7 @@ public:

 private:
     u32 doc_freq_ = 0;
+    u64 total_df_ = 0;
     u64 column_id_;
     UniquePtr iter_;
diff --git a/src/storage/io/virtual_store.cpp b/src/storage/io/virtual_store.cpp
index 907af76ecf..03e12c702a 100644
--- a/src/storage/io/virtual_store.cpp
+++ b/src/storage/io/virtual_store.cpp
@@ -450,6 +450,8 @@ i32 VirtualStore::MunmapFile(const String &file_path) {
 StorageType VirtualStore::storage_type_ = StorageType::kInvalid;
 String VirtualStore::bucket_ = "infinity";
 UniquePtr VirtualStore::s3_client_ = nullptr;
+Atomic VirtualStore::total_request_count_ = 0;
+Atomic VirtualStore::cache_miss_count_ = 0;

 Status VirtualStore::InitRemoteStore(StorageType storage_type,
                                      const String &URL,
diff --git a/src/storage/io/virtual_store.cppm b/src/storage/io/virtual_store.cppm
index e27972ccd9..643a14fa6b 100644
--- a/src/storage/io/virtual_store.cppm
+++ b/src/storage/io/virtual_store.cppm
@@ -90,6 +90,11 @@ public:
     //
     static Status BucketExists();

+    static void AddRequestCount() { ++total_request_count_; }
+    static void AddCacheMissCount() { ++cache_miss_count_; }
+    static u64 TotalRequestCount() { return total_request_count_; }
+    static u64 CacheMissCount() { return cache_miss_count_; }
+
 private:
     static std::mutex mtx_;
     static HashMap mapped_files_;
@@ -98,6 +103,9 @@ private:
     static String bucket_;
     static UniquePtr s3_client_;

+    static Atomic total_request_count_;
+    static Atomic cache_miss_count_;
+
     friend class ObjectStorageProcess;
 };
diff --git a/src/storage/knn_index/knn_hnsw/abstract_hnsw.cpp b/src/storage/knn_index/knn_hnsw/abstract_hnsw.cpp
index 6b41fa4611..5a6f4d2882 100644
--- a/src/storage/knn_index/knn_hnsw/abstract_hnsw.cpp
+++ b/src/storage/knn_index/knn_hnsw/abstract_hnsw.cpp
@@ -47,7 +47,7 @@ UniquePtr HnswIndexInMem::Make(RowID begin_row_id,
             using T = std::decay_t;
             if constexpr (!std::is_same_v) {
                 if (index != nullptr) {
-                    memindex_tracer->AddMemUsed(index->mem_usage());
+                    memindex_tracer->IncreaseMemoryUsage(index->mem_usage());
                 }
             }
         },
@@ -166,7 +166,7 @@ void HnswIndexInMem::InsertVecs(SizeT block_offset,
                         break;
                     }
                 }
-                this->AddMemUsed(mem_usage);
+                this->IncreaseMemoryUsageBase(mem_usage);
             }
         },
         hnsw_);
@@ -214,7 +214,7 @@ void HnswIndexInMem::InsertVecs(const SegmentEntry *segment_entry,
                         break;
                     }
                 }
-                this->AddMemUsed(mem_usage);
+                this->IncreaseMemoryUsageBase(mem_usage);
             }
         },
         hnsw_);
diff --git a/src/storage/knn_index/knn_ivf/ivf_index_data_in_mem.cpp b/src/storage/knn_index/knn_ivf/ivf_index_data_in_mem.cpp
index d6aefee162..7328a637c1 100644
--- a/src/storage/knn_index/knn_ivf/ivf_index_data_in_mem.cpp
+++ b/src/storage/knn_index/knn_ivf/ivf_index_data_in_mem.cpp
@@ -72,9 +72,7 @@ u32 IVFIndexInMem::GetInputRowCount() const {
 }

 template
-struct InMemStorage {
-    SizeT MemoryUsed() const { UnrecoverableError("only embedding and multi-vector is supported!"); }
-};
+struct InMemStorage;

 template
 struct InMemStorage {
@@ -112,6 +110,12 @@ class IVFIndexInMemT final : public IVFIndexInMem {
         }
     }

+    ~IVFIndexInMemT() {
+        if (own_ivf_index_storage_) {
+            DecreaseMemoryUsageBase(MemoryUsed());
+        }
+    }
+
     MemIndexTracerInfo GetInfo() const override {
         auto *table_index_entry = segment_index_entry_->table_index_entry();
         SharedPtr index_name = table_index_entry->GetIndexName();
@@ -189,7 +193,7 @@ class IVFIndexInMemT final : public IVFIndexInMem {
         SizeT mem2 = MemoryUsed();
         LOG_TRACE(fmt::format("ivf mem usage = {}", mem2));
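        // The accounting pattern here, and again in BMPIndexInMem::AddDocs below, is the same:
        // sample the structure's memory-usage counter before the mutation, sample it again
        // afterwards, and report only the growth to the tracer. A hedged sketch of the idea
        // (MemoryUsed()/mutate() are stand-ins for this class's members, not a general API):
        //
        //   SizeT mem1 = MemoryUsed();   // before inserting rows
        //   mutate();                    // e.g. append embeddings to the part storage
        //   SizeT mem2 = MemoryUsed();   // after
        //   IncreaseMemoryUsageBase(mem2 > mem1 ? mem2 - mem1 : 0);
        //
        // The `mem2 > mem1 ? ... : 0` clamp matters because SizeT is unsigned: if the
        // structure ever shrank, the bare subtraction would wrap to a huge positive value.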
LOG_TRACE(fmt::format("ivf mem added = {}", mem2 - mem1)); - AddMemUsed(mem2 > mem1 ? mem2 - mem1 : 0); + IncreaseMemoryUsageBase(mem2 > mem1 ? mem2 - mem1 : 0); } void BuildIndex() { diff --git a/src/storage/knn_index/knn_ivf/ivf_index_storage.cppm b/src/storage/knn_index/knn_ivf/ivf_index_storage.cppm index f16e8b64d3..6cb366088f 100644 --- a/src/storage/knn_index/knn_ivf/ivf_index_storage.cppm +++ b/src/storage/knn_index/knn_ivf/ivf_index_storage.cppm @@ -58,7 +58,7 @@ class IVF_Parts_Storage { SizeT memory_used_ = 0; protected: - void AddMemUsed(SizeT mem_usage) { memory_used_ += mem_usage; } + void IncreaseMemoryUsage(SizeT mem_usage) { memory_used_ += mem_usage; } void DecMemUsed(SizeT mem_decreased) { memory_used_ -= mem_decreased; } SizeT row_memory_cost_ = 0; explicit IVF_Parts_Storage(const u32 embedding_dimension, const u32 centroids_num) @@ -81,7 +81,7 @@ public: const void *embedding_ptr, SegmentOffset segment_offset, const IVF_Centroids_Storage *ivf_centroids_storage) { - AddMemUsed(row_memory_cost_); + IncreaseMemoryUsage(row_memory_cost_); AppendOneEmbedding(part_id, embedding_ptr, segment_offset, ivf_centroids_storage); } diff --git a/src/storage/knn_index/knn_ivf/ivf_index_storage_parts.cpp b/src/storage/knn_index/knn_ivf/ivf_index_storage_parts.cpp index 8d843ba1f6..81f32bb4e8 100644 --- a/src/storage/knn_index/knn_ivf/ivf_index_storage_parts.cpp +++ b/src/storage/knn_index/knn_ivf/ivf_index_storage_parts.cpp @@ -231,7 +231,7 @@ class IVF_Parts_Storage_Info : const IndexIVFStorageOption &ivf_storage_option) : IVF_Parts_Storage(embedding_dimension, centroids_num), sq_bits_(ivf_storage_option.scalar_quantization_bits_), common_vec_a_(embedding_dimension), common_vec_b_(embedding_dimension) { - AddMemUsed(sizeof(f32) * 2 * embedding_dimension); + IncreaseMemoryUsage(sizeof(f32) * 2 * embedding_dimension); assert(sq_bits_ == 4 || sq_bits_ == 8); // see IVF_Part_Storage row_memory_cost_ = sizeof(SegmentOffset) + sizeof(f32); @@ -347,7 +347,7 @@ class IVF_Parts_Storage_Info } else { row_memory_cost_ += subspace_num_ * 2; } - AddMemUsed(sizeof(f32) * (expect_subspace_centroid_num_ * embedding_dimension() + expect_subspace_centroid_num_ * subspace_num_)); + IncreaseMemoryUsage(sizeof(f32) * (expect_subspace_centroid_num_ * embedding_dimension() + expect_subspace_centroid_num_ * subspace_num_)); } ~IVF_Parts_Storage_Info() override = default; diff --git a/src/storage/knn_index/sparse/abstract_bmp.cpp b/src/storage/knn_index/sparse/abstract_bmp.cpp index 5c3be67630..52c434c6cb 100644 --- a/src/storage/knn_index/sparse/abstract_bmp.cpp +++ b/src/storage/knn_index/sparse/abstract_bmp.cpp @@ -25,11 +25,35 @@ import sparse_util; import segment_iter; import segment_entry; import infinity_exception; +import third_party; +import logger; namespace infinity { -BMPIndexInMem::BMPIndexInMem(RowID begin_row_id, const IndexBase *index_base, const ColumnDef *column_def) - : begin_row_id_(begin_row_id), bmp_(InitAbstractIndex(index_base, column_def)) { +MemIndexTracerInfo BMPIndexInMem::GetInfo() const { + auto *table_index_entry = segment_index_entry_->table_index_entry(); + SharedPtr index_name = table_index_entry->GetIndexName(); + auto *table_entry = table_index_entry->table_index_meta()->GetTableEntry(); + SharedPtr table_name = table_entry->GetTableName(); + SharedPtr db_name = table_entry->GetDBName(); + + auto [mem_used, row_cnt] = std::visit( + [](auto &&index) -> Pair { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return {}; + } else { + return {index->MemoryUsage(), 
index->DocNum()}; + } + }, + bmp_); + return MemIndexTracerInfo(index_name, table_name, db_name, mem_used, row_cnt); +} + +TableIndexEntry *BMPIndexInMem::table_index_entry() const { return segment_index_entry_->table_index_entry(); } + +BMPIndexInMem::BMPIndexInMem(RowID begin_row_id, const IndexBase *index_base, const ColumnDef *column_def, SegmentIndexEntry *segment_index_entry) + : begin_row_id_(begin_row_id), bmp_(InitAbstractIndex(index_base, column_def)), segment_index_entry_(segment_index_entry) { const auto *index_bmp = static_cast(index_base); const auto *sparse_info = static_cast(column_def->type()->type_info().get()); SizeT term_num = sparse_info->Dimension(); @@ -68,14 +92,16 @@ BMPIndexInMem::~BMPIndexInMem() { return; } std::visit( - [](auto &&index) { + [&](auto &&index) { using T = std::decay_t; if constexpr (std::is_same_v) { return; } else { + SizeT mem_used = index->MemoryUsage(); if (index != nullptr) { delete index; } + DecreaseMemoryUsageBase(mem_used); } }, bmp_); @@ -94,6 +120,7 @@ SizeT BMPIndexInMem::GetRowCount() const { bmp_); } +// realtime insert, trace this void BMPIndexInMem::AddDocs(SizeT block_offset, BlockColumnEntry *block_column_entry, BufferManager *buffer_mgr, SizeT row_offset, SizeT row_count) { std::visit( [&](auto &&index) { @@ -103,9 +130,12 @@ void BMPIndexInMem::AddDocs(SizeT block_offset, BlockColumnEntry *block_column_e } else { using IndexT = std::decay_t; using SparseRefT = SparseVecRef; - + SizeT mem_before = index->MemoryUsage(); MemIndexInserterIter iter(block_offset, block_column_entry, buffer_mgr, row_offset, row_count); index->AddDocs(std::move(iter)); + SizeT mem_after = index->MemoryUsage(); + IncreaseMemoryUsageBase(mem_after - mem_before); + LOG_INFO(fmt::format("before : {} -> after : {}, add mem_used : {}", mem_before, mem_after, mem_after - mem_before)); } }, bmp_); @@ -133,7 +163,7 @@ void BMPIndexInMem::AddDocs(const SegmentEntry *segment_entry, BufferManager *bu bmp_); } -SharedPtr BMPIndexInMem::Dump(SegmentIndexEntry *segment_index_entry, BufferManager *buffer_mgr) const { +SharedPtr BMPIndexInMem::Dump(SegmentIndexEntry *segment_index_entry, BufferManager *buffer_mgr, SizeT *dump_size) { if (!own_memory_) { UnrecoverableError("BMPIndexInMem::Dump() called with own_memory_ = false."); } @@ -147,6 +177,9 @@ SharedPtr BMPIndexInMem::Dump(SegmentIndexEntry *segment_index_ } else { row_count = index->DocNum(); index_size = index->GetSizeInBytes(); + if (dump_size != nullptr) { + *dump_size = index->MemoryUsage(); + } } }, bmp_); @@ -160,4 +193,4 @@ SharedPtr BMPIndexInMem::Dump(SegmentIndexEntry *segment_index_ return new_chunk_index_entry; } -} // namespace infinity \ No newline at end of file +} // namespace infinity diff --git a/src/storage/knn_index/sparse/abstract_bmp.cppm b/src/storage/knn_index/sparse/abstract_bmp.cppm index 9beecc76db..25aa696d1b 100644 --- a/src/storage/knn_index/sparse/abstract_bmp.cppm +++ b/src/storage/knn_index/sparse/abstract_bmp.cppm @@ -30,6 +30,9 @@ import index_bmp; import sparse_info; import internal_types; import buffer_handle; +import base_memindex; +import memindex_tracer; +import table_index_entry; namespace infinity { @@ -52,11 +55,15 @@ export using AbstractBMP = std::variant *, std::nullptr_t>; -export struct BMPIndexInMem { +export struct BMPIndexInMem final : public BaseMemIndex { public: BMPIndexInMem() : bmp_(nullptr) {} - BMPIndexInMem(RowID begin_row_id, const IndexBase *index_base, const ColumnDef *column_def); + BMPIndexInMem(RowID begin_row_id, const IndexBase *index_base, 
const ColumnDef *column_def, SegmentIndexEntry *segment_index_entry); + + MemIndexTracerInfo GetInfo() const override; + + TableIndexEntry *table_index_entry() const override; private: template @@ -112,13 +119,14 @@ public: AbstractBMP &get_ref() { return bmp_; } - SharedPtr Dump(SegmentIndexEntry *segment_index_entry, BufferManager *buffer_mgr) const ; + SharedPtr Dump(SegmentIndexEntry *segment_index_entry, BufferManager *buffer_mgr, SizeT *dump_size = nullptr); private: RowID begin_row_id_ = {}; AbstractBMP bmp_ = nullptr; mutable bool own_memory_ = true; mutable BufferHandle chunk_handle_{}; + SegmentIndexEntry *segment_index_entry_; }; -} // namespace infinity \ No newline at end of file +} // namespace infinity diff --git a/src/storage/knn_index/sparse/bmp_alg.cpp b/src/storage/knn_index/sparse/bmp_alg.cpp index a02dd36b7d..edec13d340 100644 --- a/src/storage/knn_index/sparse/bmp_alg.cpp +++ b/src/storage/knn_index/sparse/bmp_alg.cpp @@ -14,9 +14,9 @@ module; +#include "common/simd/simd_common_intrin_include.h" #include #include -#include "common/simd/simd_common_intrin_include.h" module bmp_alg; @@ -25,12 +25,15 @@ import third_party; import serialize; import segment_iter; import bp_reordering; +import bmp_blockterms; namespace infinity { template template -void BMPIvt::AddBlock(BMPBlockID block_id, const Vector, Vector>> &tail_terms) { +void BMPIvt::AddBlock(BMPBlockID block_id, + const Vector, Vector>> &tail_terms, + SizeT &mem_usage) { HashMap max_scores; for (const auto &[indices, data] : tail_terms) { SizeT block_size = indices.size(); @@ -41,7 +44,7 @@ void BMPIvt::AddBlock(BMPBlockID block_id, const Vector< } } for (const auto &[term_id, score] : max_scores) { - postings_[term_id].data_.AddBlock(block_id, score); + postings_[term_id].data_.AddBlock(block_id, score, mem_usage); } } @@ -65,7 +68,7 @@ template class BMPIvt; template class BMPIvt; template -SizeT TailFwd::AddDoc(const SparseVecRef &doc) { +SizeT TailFwd::AddDoc(const SparseVecRef &doc, SizeT &mem_usage) { Vector indices; Vector data; indices.reserve(doc.nnz_); @@ -75,6 +78,7 @@ SizeT TailFwd::AddDoc(const SparseVecRef & data.push_back(doc.data_[i]); } tail_terms_.emplace_back(std::move(indices), std::move(data)); + mem_usage += doc.nnz_ * (sizeof(IdxType) + sizeof(DataType)); return tail_terms_.size(); } @@ -144,8 +148,8 @@ template class TailFwd; template class TailFwd; template -Optional> BlockFwd::AddDoc(const SparseVecRef &doc) { - SizeT tail_size = tail_fwd_.AddDoc(doc); +Optional> BlockFwd::AddDoc(const SparseVecRef &doc, SizeT &mem_usage) { + SizeT tail_size = tail_fwd_.AddDoc(doc, mem_usage); if (tail_size < block_size_) { return None; } @@ -154,6 +158,7 @@ Optional> BlockFwd::AddDoc(const S Vector, Vector>> block_terms = tail_fwd1.ToBlockFwd(); block_terms_list_.emplace_back(block_terms); + mem_usage += block_terms_list_.back().GetSizeInBytes(); return tail_fwd1; } @@ -242,14 +247,17 @@ void BMPAlg::AddDoc(const SparseVecRef> tail_fwd = block_fwd_.AddDoc(doc); + Optional> tail_fwd = block_fwd_.AddDoc(doc, mem_usage); if (!tail_fwd.has_value()) { + mem_usage_.fetch_add(sizeof(BMPDocID) + mem_usage); return; } BMPBlockID block_id = block_fwd_.block_num() - 1; const auto &tail_terms = tail_fwd->GetTailTerms(); - bm_ivt_.AddBlock(block_id, tail_terms); + bm_ivt_.AddBlock(block_id, tail_terms, mem_usage); + mem_usage_.fetch_add(sizeof(BMPDocID) + mem_usage); } template diff --git a/src/storage/knn_index/sparse/bmp_alg.cppm b/src/storage/knn_index/sparse/bmp_alg.cppm index f98892a49a..2626acf0ff 100644 --- 
a/src/storage/knn_index/sparse/bmp_alg.cppm +++ b/src/storage/knn_index/sparse/bmp_alg.cppm @@ -36,7 +36,7 @@ public: BMPIvt(SizeT term_num) : postings_(term_num) {} template - void AddBlock(BMPBlockID block_id, const Vector, Vector>> &tail_terms); + void AddBlock(BMPBlockID block_id, const Vector, Vector>> &tail_terms, SizeT &mem_usage); void Optimize(i32 topk, Vector> ivt_scores); @@ -62,7 +62,7 @@ private: public: TailFwd(SizeT block_size) { tail_terms_.reserve(block_size); } - SizeT AddDoc(const SparseVecRef &doc); + SizeT AddDoc(const SparseVecRef &doc, SizeT &mem_usage); const Vector, Vector>> &GetTailTerms() const { return tail_terms_; } @@ -86,7 +86,7 @@ public: BlockFwd(SizeT block_size) : block_size_(block_size), tail_fwd_(block_size) {} - Optional> AddDoc(const SparseVecRef &doc); + Optional> AddDoc(const SparseVecRef &doc, SizeT &mem_usage); Vector, Vector>> GetFwd(SizeT doc_num, SizeT term_num) const; @@ -151,6 +151,8 @@ public: SizeT GetSizeInBytes() const; + inline SizeT MemoryUsage() const { return mem_usage_.load(); } + private: void WriteAdv(char *&p) const; @@ -160,6 +162,7 @@ private: BMPIvt bm_ivt_; BlockFwd block_fwd_; Vector doc_ids_; + Atomic mem_usage_ = 0; mutable std::shared_mutex mtx_; }; @@ -262,7 +265,7 @@ Pair, Vector> BMPAlg DataType score = scores[block_off]; add_result(score, doc_id); } - if (ub_score * options.alpha_ < result_handler.GetDistance0(0 /*query_id*/)) { + if (result_handler.GetSize(0) == u32(topk) && ub_score * options.alpha_ < result_handler.GetDistance0(0 /*query_id*/)) { break; } } diff --git a/src/storage/knn_index/sparse/bmp_posting.cpp b/src/storage/knn_index/sparse/bmp_posting.cpp index bfab145e67..7b1e6b02c3 100644 --- a/src/storage/knn_index/sparse/bmp_posting.cpp +++ b/src/storage/knn_index/sparse/bmp_posting.cpp @@ -34,15 +34,16 @@ void BlockData::Calculate(Vector -void BlockData::AddBlock(BMPBlockID block_id, DataType max_score) { +void BlockData::AddBlock(BMPBlockID block_id, DataType max_score, SizeT &mem_usage) { block_ids_.push_back(block_id); max_scores_.push_back(max_score); + mem_usage += (sizeof(BMPBlockID) + sizeof(DataType)); } template void BlockData::Prefetch() const { - _mm_prefetch((const char*)block_ids_.data(), _MM_HINT_T0); - _mm_prefetch((const char*)max_scores_.data(), _MM_HINT_T0); + _mm_prefetch((const char *)block_ids_.data(), _MM_HINT_T0); + _mm_prefetch((const char *)max_scores_.data(), _MM_HINT_T0); } template struct BlockData; @@ -58,8 +59,9 @@ void BlockData::Calculate(Vector &upp } template -void BlockData::AddBlock(BMPBlockID block_id, DataType max_score) { +void BlockData::AddBlock(BMPBlockID block_id, DataType max_score, SizeT &mem_usage) { if (block_id >= (BMPBlockID)max_scores_.size()) { + mem_usage += sizeof(BMPBlockID); max_scores_.resize(block_id + 1, 0.0); } max_scores_[block_id] = max_score; @@ -67,10 +69,10 @@ void BlockData::AddBlock(BMPBlockID block_id, D template void BlockData::Prefetch() const { - _mm_prefetch((const char*)max_scores_.data(), _MM_HINT_T0); + _mm_prefetch((const char *)max_scores_.data(), _MM_HINT_T0); } template struct BlockData; template struct BlockData; -} // namespace infinity \ No newline at end of file +} // namespace infinity diff --git a/src/storage/knn_index/sparse/bmp_posting.cppm b/src/storage/knn_index/sparse/bmp_posting.cppm index d56bd2a403..0d176a5818 100644 --- a/src/storage/knn_index/sparse/bmp_posting.cppm +++ b/src/storage/knn_index/sparse/bmp_posting.cppm @@ -29,7 +29,7 @@ struct BlockData { public: void Calculate(Vector &upper_bounds, DataType 
query_score) const; - void AddBlock(BMPBlockID block_id, DataType max_score); + void AddBlock(BMPBlockID block_id, DataType max_score, SizeT &mem_usage); void Prefetch() const; @@ -48,7 +48,7 @@ public: // template void Calculate(Vector &upper_bounds, DataType query_score) const; - void AddBlock(BMPBlockID block_id, DataType max_score); + void AddBlock(BMPBlockID block_id, DataType max_score, SizeT &mem_usage); void Prefetch() const; @@ -77,4 +77,4 @@ public: BlockData data_; }; -} // namespace infinity \ No newline at end of file +} // namespace infinity diff --git a/src/storage/meta/catalog.cpp b/src/storage/meta/catalog.cpp index 7b35406bba..71d7347897 100644 --- a/src/storage/meta/catalog.cpp +++ b/src/storage/meta/catalog.cpp @@ -495,8 +495,11 @@ void Catalog::AddSpecialFunction(Catalog *catalog, const SharedPtrspecial_type()) { case SpecialType::kRowID: case SpecialType::kDistance: + case SpecialType::kDistanceFactors: case SpecialType::kSimilarity: + case SpecialType::kSimilarityFactors: case SpecialType::kScore: + case SpecialType::kScoreFactors: case SpecialType::kFilterFullText: { return; } @@ -576,10 +579,12 @@ void Catalog::AttachDeltaCheckpoint(const String &file_name) { // called by Replay UniquePtr Catalog::LoadFromFileDelta(const String &catalog_path) { + VirtualStore::AddRequestCount(); if (!VirtualStore::Exists(catalog_path)) { std::filesystem::path filePath = catalog_path; String dst_file_name = filePath.filename(); VirtualStore::DownloadObject(catalog_path, dst_file_name); + VirtualStore::AddCacheMissCount(); } auto [catalog_file_handle, status] = VirtualStore::Open(catalog_path, FileAccessMode::kRead); @@ -905,6 +910,7 @@ void Catalog::LoadFromEntryDelta(UniquePtr delta_entry, Buffe auto min_ts = add_segment_index_entry_op->min_ts_; auto max_ts = add_segment_index_entry_op->max_ts_; auto next_chunk_id = add_segment_index_entry_op->next_chunk_id_; + auto deprecate_ts = add_segment_index_entry_op->deprecate_ts_; auto *db_entry = this->GetDatabaseReplay(db_name, txn_id, begin_ts); auto *table_entry = db_entry->GetTableReplay(table_name, txn_id, begin_ts); @@ -912,7 +918,7 @@ void Catalog::LoadFromEntryDelta(UniquePtr delta_entry, Buffe if (auto iter = table_entry->segment_map_.find(segment_id); iter != table_entry->segment_map_.end()) { auto *table_index_entry = table_entry->GetIndexReplay(index_name, txn_id, begin_ts); auto *segment_entry = iter->second.get(); - if (segment_entry->status() == SegmentStatus::kDeprecated) { + if (merge_flag != MergeFlag::kDelete && segment_entry->status() == SegmentStatus::kDeprecated) { String error_message = fmt::format("Segment {} is deprecated", segment_id); UnrecoverableError(error_message); } @@ -925,14 +931,15 @@ void Catalog::LoadFromEntryDelta(UniquePtr delta_entry, Buffe next_chunk_id, txn_id, begin_ts, - commit_ts); + commit_ts, + deprecate_ts); if (merge_flag == MergeFlag::kNew) { bool insert_ok = table_index_entry->index_by_segment().insert({segment_id, std::move(segment_index_entry)}).second; if (!insert_ok) { String error_message = fmt::format("Segment index {} is already in the catalog", segment_id); UnrecoverableError(error_message); } - } else if (merge_flag == MergeFlag::kUpdate) { + } else if (merge_flag == MergeFlag::kUpdate || merge_flag == MergeFlag::kDelete) { auto iter = table_index_entry->index_by_segment().find(segment_id); if (iter == table_index_entry->index_by_segment().end()) { String error_message = fmt::format("Segment index {} is not found", segment_id); @@ -997,8 +1004,10 @@ UniquePtr 
Catalog::LoadFullCheckpoint(const String &file_name) { VirtualStore::MakeDirectory(dst_dir); } + VirtualStore::AddRequestCount(); if (!VirtualStore::Exists(catalog_path)) { VirtualStore::DownloadObject(catalog_path, dst_file_name); + VirtualStore::AddCacheMissCount(); } auto [catalog_file_handle, status] = VirtualStore::Open(catalog_path, FileAccessMode::kRead); diff --git a/src/storage/meta/entry/block_column_entry.cpp b/src/storage/meta/entry/block_column_entry.cpp index 389630d766..cb00683947 100644 --- a/src/storage/meta/entry/block_column_entry.cpp +++ b/src/storage/meta/entry/block_column_entry.cpp @@ -67,14 +67,16 @@ BlockColumnEntry::BlockColumnEntry(const BlockColumnEntry &other) std::shared_lock lock(other.mutex_); outline_buffers_ = other.outline_buffers_; last_chunk_offset_ = other.last_chunk_offset_; -} -UniquePtr BlockColumnEntry::Clone(BlockEntry *block_entry) const { - auto ret = UniquePtr(new BlockColumnEntry(*this)); buffer_->AddObjRc(); for (auto *outline_buffer : outline_buffers_) { outline_buffer->AddObjRc(); } +} + +UniquePtr BlockColumnEntry::Clone(BlockEntry *block_entry) const { + auto ret = UniquePtr(new BlockColumnEntry(*this)); + ret->block_entry_ = block_entry; return ret; } @@ -105,6 +107,7 @@ UniquePtr BlockColumnEntry::NewBlockColumnEntry(const BlockEnt buffer_mgr->persistence_manager()); block_column_entry->buffer_ = buffer_mgr->AllocateBufferObject(std::move(file_worker)); + block_column_entry->buffer_->AddObjRc(); return block_column_entry; } @@ -132,6 +135,7 @@ UniquePtr BlockColumnEntry::NewReplayBlockColumnEntry(const Bl buffer_manager->persistence_manager()); column_entry->buffer_ = buffer_manager->GetBufferObject(std::move(file_worker), true /*restart*/); + column_entry->buffer_->AddObjRc(); if (next_outline_idx > 0) { SizeT buffer_size = last_chunk_offset; @@ -142,6 +146,7 @@ UniquePtr BlockColumnEntry::NewReplayBlockColumnEntry(const Bl buffer_size, buffer_manager->persistence_manager()); auto *buffer_obj = buffer_manager->GetBufferObject(std::move(outline_buffer_file_worker), true /*restart*/); + buffer_obj->AddObjRc(); column_entry->outline_buffers_.push_back(buffer_obj); } column_entry->last_chunk_offset_ = last_chunk_offset; @@ -171,6 +176,7 @@ ColumnVector BlockColumnEntry::GetColumnVectorInner(BufferManager *buffer_mgr, c 0, buffer_mgr->persistence_manager()); this->buffer_ = buffer_mgr->GetBufferObject(std::move(file_worker)); + buffer_->AddObjRc(); } ColumnVector column_vector(column_type_); @@ -293,6 +299,7 @@ void BlockColumnEntry::Cleanup(CleanupInfoTracer *info_tracer, [[maybe_unused]] String file_path = outline_buffer->GetFilename(); info_tracer->AddCleanupInfo(std::move(file_path)); } + outline_buffer = nullptr; } } } diff --git a/src/storage/meta/entry/block_column_entry.cppm b/src/storage/meta/entry/block_column_entry.cppm index 6f03c4177f..a1d94d96ab 100644 --- a/src/storage/meta/entry/block_column_entry.cppm +++ b/src/storage/meta/entry/block_column_entry.cppm @@ -100,6 +100,7 @@ public: void AppendOutlineBuffer(BufferObj *buffer) { std::unique_lock lock(mutex_); outline_buffers_.emplace_back(buffer); + buffer->AddObjRc(); } BufferObj *GetOutlineBuffer(SizeT idx) const { diff --git a/src/storage/meta/entry/block_entry.cpp b/src/storage/meta/entry/block_entry.cpp index 9889a1067c..4d44c9a68b 100644 --- a/src/storage/meta/entry/block_entry.cpp +++ b/src/storage/meta/entry/block_entry.cpp @@ -75,11 +75,12 @@ BlockEntry::BlockEntry(const BlockEntry &other) checkpoint_ts_ = other.checkpoint_ts_; using_txn_id_ = other.using_txn_id_; 
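    // The AddObjRc() convention this patch converges on (in BlockColumnEntry above, here in
    // BlockEntry, and in ChunkIndexEntry below): every owner that stores a BufferObj* takes
    // one reference in its own constructor or factory, instead of Clone() bumping it on the
    // original's behalf. A sketch of the assumed invariant, with hypothetical names:
    //
    //   entry->buffer_ = buffer_mgr->AllocateBufferObject(...); // or GetBufferObject(...)
    //   entry->buffer_->AddObjRc();        // one reference per owning entry
    //   auto copy = entry->Clone(parent);  // the copy constructor takes its own reference
    //   // Cleanup()/PickForCleanup() later releases exactly the reference each owner took
    //
    // Keeping the take-reference next to the store-pointer makes the replay and fresh-create
    // paths symmetric, which is why the Clone()-side AddObjRc() calls are removed below.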
     checkpoint_row_count_ = other.checkpoint_row_count_;
+
+    version_buffer_object_->AddObjRc();
 }

 UniquePtr BlockEntry::Clone(SegmentEntry *segment_entry) const {
     auto ret = UniquePtr(new BlockEntry(*this));
-    version_buffer_object_->AddObjRc();
     ret->segment_entry_ = segment_entry;
     for (auto &column : columns_) {
         ret->columns_.emplace_back(column->Clone(ret.get()));
@@ -112,6 +113,7 @@ BlockEntry::NewBlockEntry(const SegmentEntry *segment_entry, BlockID block_id, T
                                                              block_entry->row_capacity_,
                                                              buffer_mgr->persistence_manager());
     block_entry->version_buffer_object_ = buffer_mgr->AllocateBufferObject(std::move(version_file_worker));
+    block_entry->version_buffer_object_->AddObjRc();

     return block_entry;
 }
@@ -147,6 +149,7 @@ UniquePtr BlockEntry::NewReplayBlockEntry(const SegmentEntry *segmen
                                                              row_capacity,
                                                              buffer_mgr->persistence_manager());
     block_entry->version_buffer_object_ = buffer_mgr->GetBufferObject(std::move(version_file_worker));
+    block_entry->version_buffer_object_->AddObjRc();

     block_entry->checkpoint_ts_ = check_point_ts;
     block_entry->checkpoint_row_count_ = checkpoint_row_count;
@@ -182,7 +185,7 @@ ColumnVector BlockEntry::GetConstColumnVector(BufferManager *buffer_mgr, ColumnI
         std::shared_lock lock(rw_locker_);
         row_count = block_row_count_;
     }
-    return block_column_entry->GetColumnVector(buffer_mgr, row_count);
+    return block_column_entry->GetConstColumnVector(buffer_mgr, row_count);
 }

 SizeT BlockEntry::row_count(TxnTimeStamp check_ts) const {
diff --git a/src/storage/meta/entry/chunk_index_entry.cpp b/src/storage/meta/entry/chunk_index_entry.cpp
index e2155795ec..6c12c170e5 100644
--- a/src/storage/meta/entry/chunk_index_entry.cpp
+++ b/src/storage/meta/entry/chunk_index_entry.cpp
@@ -73,19 +73,14 @@ ChunkIndexEntry::~ChunkIndexEntry() {}

 ChunkIndexEntry::ChunkIndexEntry(const ChunkIndexEntry &other)
     : BaseEntry(other), chunk_id_(other.chunk_id_), segment_index_entry_(other.segment_index_entry_), base_name_(other.base_name_),
-      base_rowid_(other.base_rowid_), row_count_(other.row_count_), deprecate_ts_(other.deprecate_ts_.load()), buffer_obj_(other.buffer_obj_),
-      part_buffer_objs_(other.part_buffer_objs_) {}
+      base_rowid_(other.base_rowid_), row_count_(other.row_count_), deprecate_ts_(other.deprecate_ts_.load()), buffer_obj_(other.buffer_obj_) {
+    if (buffer_obj_) {
+        buffer_obj_->AddObjRc();
+    }
+}

 UniquePtr ChunkIndexEntry::Clone(SegmentIndexEntry *segment_index_entry) const {
     auto ret = UniquePtr(new ChunkIndexEntry(*this));
-    if (buffer_obj_ != nullptr) {
-        buffer_obj_->AddObjRc();
-    }
-    for (auto *part_buffer_obj : part_buffer_objs_) {
-        if (part_buffer_obj != nullptr) {
-            part_buffer_obj->AddObjRc();
-        }
-    }
     ret->segment_index_entry_ = segment_index_entry;
     return ret;
 }
@@ -116,6 +111,7 @@ SharedPtr ChunkIndexEntry::NewHnswIndexChunkIndexEntry(ChunkID
                                                             buffer_mgr->persistence_manager(),
                                                             index_size);
         chunk_index_entry->buffer_obj_ = buffer_mgr->AllocateBufferObject(std::move(file_worker));
+        chunk_index_entry->buffer_obj_->AddObjRc();
     }
     return chunk_index_entry;
 }
@@ -138,6 +134,7 @@ SharedPtr ChunkIndexEntry::NewFtChunkIndexEntry(SegmentIndexEnt
                                                                 row_count * sizeof(u32),
                                                                 buffer_mgr->persistence_manager());
         chunk_index_entry->buffer_obj_ = buffer_mgr->GetBufferObject(std::move(file_worker));
+        chunk_index_entry->buffer_obj_->AddObjRc();
     }
     return chunk_index_entry;
 }
@@ -165,21 +162,7 @@ SharedPtr ChunkIndexEntry::NewSecondaryIndexChunkIndexEntry(Chu
                                                                          row_count,
                                                                          buffer_mgr->persistence_manager());
         chunk_index_entry->buffer_obj_ = buffer_mgr->AllocateBufferObject(std::move(file_worker));
-        const u32 part_cnt = (row_count + 8191) / 8192;
-        for (u32 i = 0; i < part_cnt; ++i) {
-            auto part_name = MakeShared(fmt::format("{}_part{}", *secondary_index_file_name, i));
-            auto part_file_worker = MakeUnique(MakeShared(InfinityContext::instance().config()->DataDir()),
-                                               MakeShared(InfinityContext::instance().config()->TempDir()),
-                                               index_dir,
-                                               std::move(part_name),
-                                               index_base,
-                                               column_def,
-                                               row_count,
-                                               i,
-                                               buffer_mgr->persistence_manager());
-            BufferObj *part_ptr = buffer_mgr->AllocateBufferObject(std::move(part_file_worker));
-            chunk_index_entry->part_buffer_objs_.push_back(part_ptr);
-        }
+        chunk_index_entry->buffer_obj_->AddObjRc();
     }
     return chunk_index_entry;
 }
@@ -206,6 +189,7 @@ SharedPtr ChunkIndexEntry::NewIVFIndexChunkIndexEntry(ChunkID c
                                                              column_def,
                                                              buffer_mgr->persistence_manager());
         chunk_index_entry->buffer_obj_ = buffer_mgr->AllocateBufferObject(std::move(file_worker));
+        chunk_index_entry->buffer_obj_->AddObjRc();
     }
     return chunk_index_entry;
 }
@@ -234,6 +218,7 @@ SharedPtr ChunkIndexEntry::NewEMVBIndexChunkIndexEntry(ChunkID
                                                               segment_start_offset,
                                                               buffer_mgr->persistence_manager());
         chunk_index_entry->buffer_obj_ = buffer_mgr->AllocateBufferObject(std::move(file_worker));
+        chunk_index_entry->buffer_obj_->AddObjRc();
     }
     return chunk_index_entry;
 }
@@ -262,6 +247,7 @@ SharedPtr ChunkIndexEntry::NewBMPIndexChunkIndexEntry(ChunkID c
                                                             buffer_mgr->persistence_manager(),
                                                             index_size);
         chunk_index_entry->buffer_obj_ = buffer_mgr->AllocateBufferObject(std::move(file_worker));
+        chunk_index_entry->buffer_obj_->AddObjRc();
     }
     return chunk_index_entry;
 }
@@ -315,7 +301,6 @@ SharedPtr ChunkIndexEntry::NewReplayChunkIndexEntry(ChunkID chu
                                                                          row_count,
                                                                          buffer_mgr->persistence_manager());
             chunk_index_entry->buffer_obj_ = buffer_mgr->GetBufferObject(std::move(file_worker));
-            chunk_index_entry->LoadPartsReader(buffer_mgr);
             break;
         }
         case IndexType::kIVF: {
@@ -363,6 +348,9 @@ SharedPtr ChunkIndexEntry::NewReplayChunkIndexEntry(ChunkID chu
             UnrecoverableError(fmt::format("Unsupported index type: {}", index_base->ToString()));
         }
     }
+    if (chunk_index_entry->buffer_obj_) {
+        chunk_index_entry->buffer_obj_->AddObjRc();
+    }
     chunk_index_entry->commit_ts_ = commit_ts;
     chunk_index_entry->deprecate_ts_ = deprecate_ts;
     return chunk_index_entry;
@@ -412,12 +400,6 @@ void ChunkIndexEntry::Cleanup(CleanupInfoTracer *info_tracer, bool dropped) {
             info_tracer->AddCleanupInfo(buffer_obj_->GetFilename());
         }
     }
-    for (auto &part_buffer_obj : part_buffer_objs_) {
-        part_buffer_obj->PickForCleanup();
-        if (info_tracer) {
-            info_tracer->AddCleanupInfo(part_buffer_obj->GetFilename());
-        }
-    }
     if (!dropped) {
         return;
     }
@@ -460,11 +442,7 @@ void ChunkIndexEntry::Cleanup(CleanupInfoTracer *info_tracer, bool dropped) {

 Vector ChunkIndexEntry::GetFilePath(TransactionID txn_id, TxnTimeStamp begin_ts) const {
     Vector res;
-    res.reserve(part_buffer_objs_.size() + 1);
     res.emplace_back(buffer_obj_->GetFilename());
-    for (auto *buffer_obj : part_buffer_objs_) {
-        res.emplace_back(buffer_obj->GetFilename());
-    }
     return res;
 }

@@ -473,34 +451,6 @@ void ChunkIndexEntry::SaveIndexFile() {
         return;
     }
     buffer_obj_->Save();
-    for (auto *part_buffer_obj : part_buffer_objs_) {
-        part_buffer_obj->Save();
-    }
-}
-
-void ChunkIndexEntry::LoadPartsReader(BufferManager *buffer_mgr) {
-    const auto &index_dir = segment_index_entry_->index_dir();
-    SegmentID segment_id = segment_index_entry_->segment_id();
-    String secondary_index_file_name = IndexFileName(segment_id, chunk_id_);
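    // Context for the deletions in this file: secondary-index chunks were previously split
    // into fixed 8192-row part files, sized by part_cnt = (row_count + 8191) / 8192 — for
    // example, row_count = 20000 gives ceil(20000 / 8192) = 3 parts — each with its own
    // BufferObj and file worker. This patch drops the per-part readers and writers entirely:
    // the single buffer_obj_ now covers the whole chunk, so GetPartNum(), GetPartRowCount(),
    // GetIndexPartAt(), LoadPartsReader(), and part_buffer_objs_ are removed along with it.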
const auto &index_base = segment_index_entry_->table_index_entry()->table_index_def(); - const auto &column_def = segment_index_entry_->table_index_entry()->column_def(); - const u32 part_cnt = (row_count_ + 8191) / 8192; - part_buffer_objs_.clear(); - part_buffer_objs_.reserve(part_cnt); - for (u32 i = 0; i < part_cnt; ++i) { - auto part_name = MakeShared(fmt::format("{}_part{}", secondary_index_file_name, i)); - auto part_file_worker = MakeUnique(MakeShared(InfinityContext::instance().config()->DataDir()), - MakeShared(InfinityContext::instance().config()->TempDir()), - index_dir, - std::move(part_name), - index_base, - column_def, - row_count_, - i, - buffer_mgr->persistence_manager()); - BufferObj *part_ptr = buffer_mgr->GetBufferObject(std::move(part_file_worker)); - part_buffer_objs_.push_back(part_ptr); - } } void ChunkIndexEntry::DeprecateChunk(TxnTimeStamp commit_ts) { @@ -509,8 +459,6 @@ void ChunkIndexEntry::DeprecateChunk(TxnTimeStamp commit_ts) { LOG_INFO(fmt::format("Deprecate chunk {}, ts: {}", encode(), commit_ts)); } -BufferHandle ChunkIndexEntry::GetIndexPartAt(u32 i) { return part_buffer_objs_.at(i)->Load(); } - bool ChunkIndexEntry::CheckVisible(Txn *txn) const { if (txn == nullptr) { return deprecate_ts_.load() == UNCOMMIT_TS; @@ -519,10 +467,4 @@ bool ChunkIndexEntry::CheckVisible(Txn *txn) const { return begin_ts < deprecate_ts_.load() && BaseEntry::CheckVisible(txn); } -void ChunkIndexEntry::Save() { - if (buffer_obj_) { - buffer_obj_->Save(); - } -} - } // namespace infinity diff --git a/src/storage/meta/entry/chunk_index_entry.cppm b/src/storage/meta/entry/chunk_index_entry.cppm index 969abe90ad..c4f706c8cc 100644 --- a/src/storage/meta/entry/chunk_index_entry.cppm +++ b/src/storage/meta/entry/chunk_index_entry.cppm @@ -14,8 +14,6 @@ module; -#include - export module chunk_index_entry; import stl; @@ -116,14 +114,8 @@ public: // Only for fulltext u64 GetColumnLengthSum() const; - inline u32 GetPartNum() const { return (row_count_ + 8191) / 8192; } - - inline u32 GetPartRowCount(const u32 part_id) const { return std::min(8192, row_count_ - part_id * 8192); } - BufferHandle GetIndex(); - BufferHandle GetIndexPartAt(u32 i); - nlohmann::json Serialize(); static SharedPtr @@ -133,12 +125,10 @@ public: virtual void PickCleanup(CleanupScanner *scanner) override {} - Vector GetFilePath(TransactionID txn_id, TxnTimeStamp begin_ts) const final; + Vector GetFilePath(TransactionID txn_id, TxnTimeStamp begin_ts) const override; void SaveIndexFile(); - void LoadPartsReader(BufferManager *buffer_mgr); - BufferObj *GetBufferObj() { return buffer_obj_; } void DeprecateChunk(TxnTimeStamp commit_ts); @@ -150,8 +140,6 @@ public: return ts >= deprecate_ts; } - void Save(); - public: ChunkID chunk_id_; @@ -163,7 +151,6 @@ public: private: BufferObj *buffer_obj_{}; - Vector part_buffer_objs_; }; } // namespace infinity diff --git a/src/storage/meta/entry/segment_index_entry.cpp b/src/storage/meta/entry/segment_index_entry.cpp index 51ff72a5fd..38686defe0 100644 --- a/src/storage/meta/entry/segment_index_entry.cpp +++ b/src/storage/meta/entry/segment_index_entry.cpp @@ -67,6 +67,7 @@ import hnsw_util; import wal_entry; import infinity_context; import defer_op; +import memory_indexer; namespace infinity { @@ -153,7 +154,8 @@ SharedPtr SegmentIndexEntry::NewReplaySegmentIndexEntry(Table u32 next_chunk_id, TransactionID txn_id, TxnTimeStamp begin_ts, - TxnTimeStamp commit_ts) { + TxnTimeStamp commit_ts, + TxnTimeStamp deprecate_ts) { auto [segment_row_count, status] = 
table_entry->GetSegmentRowCountBySegmentID(segment_id); if (!status.ok()) { UnrecoverableError(status.message()); @@ -170,6 +172,7 @@ SharedPtr SegmentIndexEntry::NewReplaySegmentIndexEntry(Table segment_index_entry->commit_ts_.store(commit_ts); segment_index_entry->buffer_manager_ = buffer_manager; + segment_index_entry->deprecate_ts_.store(deprecate_ts); return segment_index_entry; } @@ -180,6 +183,7 @@ void SegmentIndexEntry::UpdateSegmentIndexReplay(SharedPtr ne min_ts_ = new_entry->min_ts_; max_ts_ = new_entry->max_ts_; next_chunk_id_ = new_entry->next_chunk_id_.load(); + deprecate_ts_ = new_entry->deprecate_ts_.load(); } // String SegmentIndexEntry::IndexFileName(SegmentID segment_id) { return fmt::format("seg{}.idx", segment_id); } @@ -208,7 +212,8 @@ void SegmentIndexEntry::MemIndexInsert(SharedPtr block_entry, { std::unique_lock lck(rw_locker_); String full_path = Path(InfinityContext::instance().config()->DataDir()) / *table_index_entry_->index_dir(); - memory_indexer_ = MakeUnique(full_path, base_name, begin_row_id, index_fulltext->flag_, index_fulltext->analyzer_); + memory_indexer_ = + MakeUnique(full_path, base_name, begin_row_id, index_fulltext->flag_, index_fulltext->analyzer_, this); } table_index_entry_->UpdateFulltextSegmentTs(commit_ts); } else { @@ -264,7 +269,7 @@ void SegmentIndexEntry::MemIndexInsert(SharedPtr block_entry, case IndexType::kBMP: { if (memory_bmp_index_.get() == nullptr) { std::unique_lock lck(rw_locker_); - memory_bmp_index_ = MakeShared(begin_row_id, index_base.get(), column_def.get()); + memory_bmp_index_ = MakeShared(begin_row_id, index_base.get(), column_def.get(), this); } BlockColumnEntry *block_column_entry = block_entry->GetColumnBlockEntry(column_idx); memory_bmp_index_->AddDocs(block_offset, block_column_entry, buffer_manager, row_offset, row_count); @@ -377,7 +382,7 @@ SharedPtr SegmentIndexEntry::MemIndexDump(bool spill, SizeT *du if (memory_bmp_index_.get() == nullptr) { return nullptr; } - chunk_index_entry = memory_bmp_index_->Dump(this, buffer_manager_); + chunk_index_entry = memory_bmp_index_->Dump(this, buffer_manager_, dump_size); chunk_index_entry->SaveIndexFile(); std::unique_lock lck(rw_locker_); chunk_index_entries_.push_back(chunk_index_entry); @@ -459,7 +464,7 @@ void SegmentIndexEntry::PopulateEntirely(const SegmentEntry *segment_entry, Txn const IndexFullText *index_fulltext = static_cast(index_base); String base_name = fmt::format("ft_{:016x}", base_row_id.ToUint64()); String full_path = Path(InfinityContext::instance().config()->DataDir()) / *table_index_entry_->index_dir(); - memory_indexer_ = MakeUnique(full_path, base_name, base_row_id, index_fulltext->flag_, index_fulltext->analyzer_); + memory_indexer_ = MakeUnique(full_path, base_name, base_row_id, index_fulltext->flag_, index_fulltext->analyzer_, this); u64 column_id = column_def->id(); SizeT column_idx = table_entry->GetColumnIdxByID(column_id); auto block_entry_iter = BlockEntryIter(segment_entry); @@ -537,7 +542,7 @@ void SegmentIndexEntry::PopulateEntirely(const SegmentEntry *segment_entry, Txn break; } case IndexType::kBMP: { - memory_bmp_index_ = MakeShared(base_row_id, index_base, column_def.get()); + memory_bmp_index_ = MakeShared(base_row_id, index_base, column_def.get(), this); memory_bmp_index_->AddDocs(segment_entry, buffer_mgr, column_def->id(), begin_ts, config.check_ts_); dumped_memindex_entry = MemIndexDump(); @@ -921,7 +926,7 @@ ChunkIndexEntry *SegmentIndexEntry::RebuildChunkIndexEntries(TxnTableStore *txn_ break; } case IndexType::kBMP: { - auto 
memory_bmp_index = MakeShared(base_rowid, index_base, column_def.get()); + auto memory_bmp_index = MakeShared(base_rowid, index_base, column_def.get(), this); AbstractBMP abstract_bmp = memory_bmp_index->get(); std::visit( @@ -945,7 +950,7 @@ ChunkIndexEntry *SegmentIndexEntry::RebuildChunkIndexEntries(TxnTableStore *txn_ merged_chunk_index_entry = CreateSecondaryIndexChunkIndexEntry(base_rowid, row_count, buffer_mgr); BufferHandle handle = merged_chunk_index_entry->GetIndex(); auto data_ptr = static_cast(handle.GetDataMut()); - data_ptr->InsertMergeData(old_chunks, merged_chunk_index_entry); + data_ptr->InsertMergeData(old_chunks); break; } case IndexType::kEMVB: { @@ -979,11 +984,14 @@ ChunkIndexEntry *SegmentIndexEntry::RebuildChunkIndexEntries(TxnTableStore *txn_ } BaseMemIndex *SegmentIndexEntry::GetMemIndex() const { - // only support hnsw and ivf index now. if (memory_hnsw_index_.get() != nullptr) { return static_cast(memory_hnsw_index_.get()); } else if (memory_ivf_index_.get() != nullptr) { return static_cast(memory_ivf_index_.get()); + } else if (memory_indexer_.get() != nullptr) { + return static_cast(memory_indexer_.get()); + } else if (memory_bmp_index_.get() != nullptr) { + return static_cast(memory_bmp_index_.get()); } return nullptr; } @@ -1170,6 +1178,7 @@ nlohmann::json SegmentIndexEntry::Serialize(TxnTimeStamp max_commit_ts) { } index_entry_json["ft_column_len_sum"] = this->ft_column_len_sum_; index_entry_json["ft_column_len_cnt"] = this->ft_column_len_cnt_; + index_entry_json["deprecate_ts"] = this->deprecate_ts_.load(); } return index_entry_json; @@ -1205,6 +1214,13 @@ UniquePtr SegmentIndexEntry::Deserialize(const nlohmann::json segment_index_entry->ft_column_len_sum_ = index_entry_json["ft_column_len_sum"]; segment_index_entry->ft_column_len_cnt_ = index_entry_json["ft_column_len_cnt"]; + if (!index_entry_json.contains("deprecate_ts") || index_entry_json["deprecate_ts"] == UNCOMMIT_TS) { + segment_index_entry->deleted_ = false; + } else { + segment_index_entry->deleted_ = true; + segment_index_entry->deprecate_ts_ = index_entry_json["deprecate_ts"]; + } + return segment_index_entry; } @@ -1218,4 +1234,23 @@ void SegmentIndexEntry::ResetOptimizing() { optimizing_.compare_exchange_strong(expected, false); } +Pair SegmentIndexEntry::GetFulltextColumnLenInfo() { + std::shared_lock lock(rw_locker_); + if (ft_column_len_sum_ == 0 && memory_indexer_.get() != nullptr) { + return {memory_indexer_->GetColumnLengthSum(), memory_indexer_->GetDocCount()}; + } + return {ft_column_len_sum_, ft_column_len_cnt_}; +} + +void SegmentIndexEntry::SetMemoryIndexer(UniquePtr &&memory_indexer) { memory_indexer_ = std::move(memory_indexer); } + +void SegmentIndexEntry::SetDeprecated(TxnTimeStamp deprecate_ts) { + std::unique_lock lock(rw_locker_); + for (auto &chunk_index_entry : chunk_index_entries_) { + chunk_index_entry->DeprecateChunk(deprecate_ts); + } + this->deleted_ = true; + this->deprecate_ts_ = deprecate_ts; +} + } // namespace infinity diff --git a/src/storage/meta/entry/segment_index_entry.cppm b/src/storage/meta/entry/segment_index_entry.cppm index 9e3bc057e8..7d3cd87894 100644 --- a/src/storage/meta/entry/segment_index_entry.cppm +++ b/src/storage/meta/entry/segment_index_entry.cppm @@ -30,7 +30,6 @@ import index_base; import column_def; import cleanup_scanner; import chunk_index_entry; -import memory_indexer; import default_values; import statement_common; import txn; @@ -50,6 +49,7 @@ class SecondaryIndexInMem; class EMVBIndexInMem; class BMPIndexInMem; class BaseMemIndex; 
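+// Editor's note: the forward declaration below is deliberate — MemoryIndexer
+// now keeps a pointer back to its owning SegmentIndexEntry (its constructors
+// take `this` above), so `import memory_indexer;` from this interface unit
+// would presumably create a module cycle; the implementation unit imports the
+// full module instead (see `+import memory_indexer;` in segment_index_entry.cpp).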
+class MemoryIndexer; export struct PopulateEntireConfig { bool prepare_; @@ -73,7 +73,8 @@ public: u32 next_chunk_id, TransactionID txn_id, TxnTimeStamp begin_ts, - TxnTimeStamp commit_ts); + TxnTimeStamp commit_ts, + TxnTimeStamp deprecate_ts); void UpdateSegmentIndexReplay(SharedPtr new_entry); @@ -110,6 +111,7 @@ public: inline TxnTimeStamp min_ts() const { return min_ts_; } inline TxnTimeStamp max_ts() const { return max_ts_; } inline ChunkID next_chunk_id() const { return next_chunk_id_; } + inline TxnTimeStamp deprecate_ts() const { return deprecate_ts_; } SharedPtr index_dir() const { return index_dir_; } // MemIndexInsert is non-blocking. Caller must ensure there's no RowID gap between each call. @@ -180,13 +182,8 @@ public: return {chunk_index_entries_, memory_emvb_index_}; } - Pair GetFulltextColumnLenInfo() { - std::shared_lock lock(rw_locker_); - if (ft_column_len_sum_ == 0 && memory_indexer_.get() != nullptr) { - return {memory_indexer_->GetColumnLengthSum(), memory_indexer_->GetDocCount()}; - } - return {ft_column_len_sum_, ft_column_len_cnt_}; - } + Pair GetFulltextColumnLenInfo(); + void UpdateFulltextColumnLenInfo(u64 column_len_sum, u32 column_len_cnt) { std::unique_lock lock(rw_locker_); ft_column_len_sum_ += column_len_sum; @@ -230,11 +227,18 @@ public: // only for unittest MemoryIndexer *GetMemoryIndexer() { return memory_indexer_.get(); } - void SetMemoryIndexer(UniquePtr &&memory_indexer) { memory_indexer_ = std::move(memory_indexer); } + void SetMemoryIndexer(UniquePtr &&memory_indexer); static SharedPtr CreateFakeEntry(const String &index_dir); ChunkID GetNextChunkID() { return next_chunk_id_++; } + void SetDeprecated(TxnTimeStamp deprecate_ts); + + bool CheckDeprecate(TxnTimeStamp ts) { + TxnTimeStamp deprecate_ts = deprecate_ts_.load(); + return ts >= deprecate_ts; + } + private: explicit SegmentIndexEntry(TableIndexEntry *table_index_entry, SegmentID segment_id); @@ -270,6 +274,7 @@ private: u64 ft_column_len_sum_{}; // increase only u32 ft_column_len_cnt_{}; // increase only + Atomic deprecate_ts_ = UNCOMMIT_TS; public: bool TrySetOptimizing(); diff --git a/src/storage/meta/entry/table_entry.cpp b/src/storage/meta/entry/table_entry.cpp index 003f0eaabb..faddca5f30 100644 --- a/src/storage/meta/entry/table_entry.cpp +++ b/src/storage/meta/entry/table_entry.cpp @@ -573,6 +573,7 @@ Status TableEntry::CommitCompact(TransactionID txn_id, TxnTimeStamp commit_ts, T auto [table_index_entry, status] = table_index_meta->GetEntryNolock(txn_id, commit_ts); if (!status.ok()) continue; + table_index_entry->CommitCompact(txn_id, commit_ts, compact_store); const IndexBase *index_base = table_index_entry->index_base(); switch (index_base->index_type_) { case IndexType::kFullText: { @@ -601,17 +602,12 @@ Status TableEntry::CommitCompact(TransactionID txn_id, TxnTimeStamp commit_ts, T break; } default: { - String error_message = "Invalid compact task type"; - UnrecoverableError(error_message); } } return Status::OK(); } Status TableEntry::RollbackCompact(TransactionID txn_id, TxnTimeStamp commit_ts, const TxnCompactStore &compact_store) { - if (compact_store.compact_data_.empty()) { - return Status::OK(); - } { for (const auto &[segment_store, old_segments] : compact_store.compact_data_) { SharedPtr segment; @@ -654,8 +650,6 @@ Status TableEntry::RollbackCompact(TransactionID txn_id, TxnTimeStamp commit_ts, break; } default: { - String error_message = "Invalid compact task type"; - UnrecoverableError(error_message); } } } @@ -820,6 +814,9 @@ void 
TableEntry::MemIndexRecover(BufferManager *buffer_manager, TxnTimeStamp ts) if (!status.ok()) continue; for (const auto &[segment_id, segment_entry] : segment_map_) { + if (segment_entry->CheckDeprecate(ts)) { + continue; + } auto iter = table_index_entry->index_by_segment().find(segment_id); SharedPtr segment_index_entry = nullptr; if (iter == table_index_entry->index_by_segment().end()) { @@ -832,7 +829,8 @@ void TableEntry::MemIndexRecover(BufferManager *buffer_manager, TxnTimeStamp ts) 0 /*next_chunk_id*/, 0 /*txn_id*/, ts /*begin_ts*/, - ts /*commit_ts*/); + ts /*commit_ts*/, + UNCOMMIT_TS /*deprecate_ts*/); table_index_entry->index_by_segment().emplace(segment_id, segment_index_entry); } else { segment_index_entry = iter->second; @@ -1554,6 +1552,49 @@ void TableEntry::SetUnlock() { locked_ = false; } +bool TableEntry::SetCompact(TableStatus &status, Txn *txn) { + std::unique_lock lock(rw_locker_); + if (table_status_ == TableStatus::kCreatingIndex) { + status = table_status_; + LOG_TRACE(fmt::format("SetCompact fail. Table {} is in status: {}", encode(), u8(table_status_))); + return false; + } + table_status_ = TableStatus::kCompacting; + txn->txn_store()->SetCompacting(this); + LOG_TRACE(fmt::format("SetCompact success. Table {} is in status: {}", encode(), u8(table_status_))); + return true; +} + +bool TableEntry::SetCreatingIndex(TableStatus &status, Txn *txn) { + std::unique_lock lock(rw_locker_); + if (table_status_ == TableStatus::kCompacting) { + status = table_status_; + LOG_TRACE(fmt::format("SetCreatingIndex fail. Table {} is in status: {}", encode(), u8(table_status_))); + return false; + } + table_status_ = TableStatus::kCreatingIndex; + txn->txn_store()->SetCreatingIndex(this); + LOG_TRACE(fmt::format("SetCreatingIndex success. Table {} is in status: {}", encode(), u8(table_status_))); + return true; +} + +void TableEntry::SetCompactDone() { + std::unique_lock lock(rw_locker_); + if (table_status_ == TableStatus::kCreatingIndex) { + UnrecoverableError(fmt::format("Cannot set table {} to None, status: {}", encode(), u8(table_status_))); + } + table_status_ = TableStatus::kNone; +} + +void TableEntry::SetCreateIndexDone() { + std::unique_lock lock(rw_locker_); + if (table_status_ == TableStatus::kCompacting) { + UnrecoverableError(fmt::format("Cannot set table {} to None, status: {}", encode(), u8(table_status_))); + } + table_status_ = TableStatus::kNone; +} + + void TableEntry::AddColumns(const Vector> &column_defs, TxnTableStore *txn_table_store) { ExpressionBinder tmp_binder(nullptr); Vector default_values; diff --git a/src/storage/meta/entry/table_entry.cppm b/src/storage/meta/entry/table_entry.cppm index 868b2642a0..e7d7bd1fc0 100644 --- a/src/storage/meta/entry/table_entry.cppm +++ b/src/storage/meta/entry/table_entry.cppm @@ -43,6 +43,7 @@ import meta_info; import block_entry; import column_index_reader; import value; +import infinity_exception; namespace infinity { @@ -368,7 +369,23 @@ public: void SetUnlock(); + enum struct TableStatus: u8 { + kNone = 0, + kCreatingIndex, + kCompacting, + }; + + bool SetCompact(TableStatus &status, Txn *txn); + + bool SetCreatingIndex(TableStatus &status, Txn *txn); + + void SetCompactDone(); + + void SetCreateIndexDone(); + private: + TableStatus table_status_ = TableStatus::kNone; + std::mutex mtx_; // when table is locked, write is not allowed. 
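+    // Editor's note — usage sketch (assumed caller pattern, not part of this patch):
+    //   TableEntry::TableStatus blocker;
+    //   if (!table_entry->SetCompact(blocker, txn)) {
+    //       // blocker == kCreatingIndex: an index build holds the table; retry or abort
+    //   }
+    //   // ... compact ...; the status is reverted on commit/rollback through
+    //   // TxnStore::RevertTableStatus() -> TxnTableStore::TryRevert() -> SetCompactDone().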
std::condition_variable cv_;
     bool locked_ = false;
 
diff --git a/src/storage/meta/entry/table_index_entry.cpp b/src/storage/meta/entry/table_index_entry.cpp
index 366668d74b..03891d8709 100644
--- a/src/storage/meta/entry/table_index_entry.cpp
+++ b/src/storage/meta/entry/table_index_entry.cpp
@@ -206,6 +206,28 @@ void TableIndexEntry::CommitCreateIndex(TxnIndexStore *txn_index_store, TxnTimeS
 // }
 // }
 
+void TableIndexEntry::CommitCompact([[maybe_unused]] TransactionID txn_id, TxnTimeStamp commit_ts, TxnCompactStore &compact_store) {
+    std::unique_lock w_lock(rw_locker_);
+    for (const auto &[segment_store, old_segments] : compact_store.compact_data_) {
+        auto *segment_entry = segment_store.segment_entry_;
+
+        // Skip compacted segments that carry no index entry of this index.
+        auto iter = index_by_segment_.find(segment_entry->segment_id());
+        if (iter == index_by_segment_.end()) {
+            continue;
+        }
+
+        for (auto *old_segment : old_segments) {
+            auto old_iter = index_by_segment_.find(old_segment->segment_id());
+            if (old_iter == index_by_segment_.end()) {
+                continue;
+            }
+            auto *old_segment_index_entry = old_iter->second.get();
+            old_segment_index_entry->SetDeprecated(commit_ts);
+        }
+    }
+}
+
 nlohmann::json TableIndexEntry::Serialize(TxnTimeStamp max_commit_ts) {
     nlohmann::json json;
 
@@ -225,7 +247,7 @@ nlohmann::json TableIndexEntry::Serialize(TxnTimeStamp max_commit_ts) {
         std::shared_lock r_lock(rw_locker_);
         for (const auto &[segment_id, index_entry] : this->index_by_segment_) {
-            if (index_entry->commit_ts_ <= max_commit_ts) {
+            if (index_entry->commit_ts_ <= max_commit_ts && !index_entry->deleted_) {
                 segment_index_entry_candidates.push_back(index_entry);
             }
         }
@@ -455,9 +477,18 @@ Vector<String> TableIndexEntry::GetFilePath(TransactionID txn_id, TxnTimeStamp b
 }
 
 void TableIndexEntry::PickCleanup(CleanupScanner *scanner) {
-    std::shared_lock r_lock(rw_locker_);
-    for (auto &[segment_id, segment_index_entry] : index_by_segment_) {
-        segment_index_entry->PickCleanup(scanner);
+    TxnTimeStamp visible_ts = scanner->visible_ts();
+    // Deprecated segment indexes are erased below, so take the writer lock.
+    std::unique_lock w_lock(rw_locker_);
+    for (auto iter = index_by_segment_.begin(); iter != index_by_segment_.end();) {
+        auto &[segment_id, segment_index_entry] = *iter;
+        if (segment_index_entry->CheckDeprecate(visible_ts)) {
+            scanner->AddEntry(std::move(segment_index_entry));
+            iter = index_by_segment_.erase(iter);
+        } else {
+            segment_index_entry->PickCleanup(scanner);
+            ++iter;
+        }
     }
 }
 
diff --git a/src/storage/meta/entry/table_index_entry.cppm b/src/storage/meta/entry/table_index_entry.cppm
index 729b001b8e..e3ae5e20e4 100644
--- a/src/storage/meta/entry/table_index_entry.cppm
+++ b/src/storage/meta/entry/table_index_entry.cppm
@@ -145,6 +145,8 @@ public:
 
     void CommitCreateIndex(TxnIndexStore *txn_index_store, TxnTimeStamp commit_ts, bool is_replay = false);
 
+    void CommitCompact(TransactionID txn_id, TxnTimeStamp commit_ts, TxnCompactStore &compact_store);
+
     // void RollbackPopulateIndex(TxnIndexStore *txn_index_store, Txn *txn);
 
     // replay
diff --git a/src/storage/persistence/persist_result_handler.cpp b/src/storage/persistence/persist_result_handler.cpp
index 76368708df..ae44078bfa 100644
--- a/src/storage/persistence/persist_result_handler.cpp
+++ b/src/storage/persistence/persist_result_handler.cpp
@@ -56,16 +56,20 @@ ObjAddr PersistResultHandler::HandleReadResult(const PersistReadResult &result)
         ObjCached expect = ObjCached::kNotCached;
         Atomic<ObjCached> &cached = result.obj_stat_->cached_;
         if (cached.compare_exchange_strong(expect, ObjCached::kDownloading)) {
+            VirtualStore::AddRequestCount();
            String read_path = InfinityContext::instance().persistence_manager()->GetObjPath(result.obj_addr_.obj_key_);
             LOG_TRACE(fmt::format("GetObjCache download object {}.", read_path));
             VirtualStore::DownloadObject(read_path, result.obj_addr_.obj_key_);
             LOG_TRACE(fmt::format("GetObjCache download object {} done.", read_path));
             cached.store(ObjCached::kCached);
             cached.notify_all();
+            VirtualStore::AddCacheMissCount();
         } else if (expect == ObjCached::kDownloading) {
             LOG_TRACE(fmt::format("GetObjCache waiting downloading object {}", result.obj_addr_.obj_key_));
             cached.wait(ObjCached::kDownloading);
             LOG_TRACE(fmt::format("GetObjCache finish waiting object {}", result.obj_addr_.obj_key_));
+        } else {
+            VirtualStore::AddRequestCount();
         }
     }
     return result.obj_addr_;
diff --git a/src/storage/secondary_index/common_query_filter.cpp b/src/storage/secondary_index/common_query_filter.cpp
index 5df54f8a26..ec4e032ffa 100644
--- a/src/storage/secondary_index/common_query_filter.cpp
+++ b/src/storage/secondary_index/common_query_filter.cpp
@@ -40,10 +40,13 @@ import vector_buffer;
 import data_type;
 import logical_type;
 import expression_state;
+import expression_type;
+import reference_expression;
+import in_expression;
 import infinity_exception;
 import third_party;
 import logger;
 import index_defines;
 
 namespace infinity {
 
@@ -51,22 +54,50 @@ void ReadDataBlock(DataBlock *output,
                    BufferManager *buffer_mgr,
                    const SizeT row_count,
                    BlockEntry *current_block_entry,
-                   const Vector<SizeT> &column_ids) {
-    auto block_id = current_block_entry->block_id();
-    auto segment_id = current_block_entry->segment_id();
+                   const Vector<SizeT> &column_ids,
+                   const Vector<bool> &column_should_load) {
+    const auto block_id = current_block_entry->block_id();
+    const auto segment_id = current_block_entry->segment_id();
     for (SizeT i = 0; i < column_ids.size(); ++i) {
-        SizeT column_id = column_ids[i];
-        if (column_id == COLUMN_IDENTIFIER_ROW_ID) {
-            u32 segment_offset = block_id * DEFAULT_BLOCK_CAPACITY;
+        if (const SizeT column_id = column_ids[i]; column_id == COLUMN_IDENTIFIER_ROW_ID) {
+            const u32 segment_offset = block_id * DEFAULT_BLOCK_CAPACITY;
             output->column_vectors[i]->AppendWith(RowID(segment_id, segment_offset), row_count);
-        } else {
+        } else if (column_should_load[i]) {
             ColumnVector column_vector = current_block_entry->GetConstColumnVector(buffer_mgr, column_id);
             output->column_vectors[i]->AppendWith(column_vector, 0, row_count);
+        } else {
+            // no need to load this column
+            output->column_vectors[i]->Finalize(row_count);
         }
     }
     output->Finalize();
 }
 
+void CollectUsedColumnRef(BaseExpression *expr, Vector<bool> &column_should_load) {
+    switch (expr->type()) {
+        case ExpressionType::kColumn: {
+            LOG_ERROR(fmt::format("{}: ColumnExpression should not be in the leftover_filter after optimizer.", __func__));
+            break;
+        }
+        case ExpressionType::kReference: {
+            const auto *ref_expr = static_cast<const ReferenceExpression *>(expr);
+            column_should_load[ref_expr->column_index()] = true;
+            break;
+        }
+        case ExpressionType::kIn: {
+            auto *in_expr = static_cast<InExpression *>(expr);
+            CollectUsedColumnRef(in_expr->left_operand().get(), column_should_load);
+            break;
+        }
+        default: {
+            break;
+        }
+    }
+    for (const auto &child : expr->arguments()) {
+        CollectUsedColumnRef(child.get(), column_should_load);
+    }
+}
+
 void MergeFalseIntoBitmask(const VectorBuffer *input_bool_column_buffer,
                            const SharedPtr<Bitmask> &input_null_mask,
                            const SizeT count,
@@ -130,12 +161,13 @@ void CommonQueryFilter::BuildFilter(u32 task_id, Txn *txn) {
     auto db_for_filter = db_for_filter_p.get();
     Vector<SharedPtr<DataType>> read_column_types = 
*(base_table_ref_->column_types_); Vector column_ids = base_table_ref_->column_ids_; - { - if (read_column_types.empty() || read_column_types.back()->type() != LogicalType::kRowID) { - read_column_types.push_back(MakeShared(LogicalType::kRowID)); - column_ids.push_back(COLUMN_IDENTIFIER_ROW_ID); - } + if (read_column_types.empty() || read_column_types.back()->type() != LogicalType::kRowID) { + read_column_types.push_back(MakeShared(LogicalType::kRowID)); + column_ids.push_back(COLUMN_IDENTIFIER_ROW_ID); } + // collect the base_table_ref columns used in filter + Vector column_should_load(column_ids.size(), false); + CollectUsedColumnRef(leftover_filter_.get(), column_should_load); db_for_filter->Init(read_column_types); auto bool_column = ColumnVector::Make(MakeShared(LogicalType::kBoolean)); // filter and build bitmask, if filter_expression_ != nullptr @@ -146,7 +179,7 @@ void CommonQueryFilter::BuildFilter(u32 task_id, Txn *txn) { const auto block_row_count = block_entry->row_count(); const auto row_count = std::min(segment_row_count - segment_row_count_read, block_row_count); db_for_filter->Reset(row_count); - ReadDataBlock(db_for_filter, buffer_mgr, row_count, block_entry, column_ids); + ReadDataBlock(db_for_filter, buffer_mgr, row_count, block_entry, column_ids, column_should_load); bool_column->Initialize(ColumnVectorType::kCompactBit, row_count); expr_evaluator.Init(db_for_filter); expr_evaluator.Execute(leftover_filter_, filter_state, bool_column); diff --git a/src/storage/secondary_index/secondary_index_data.cpp b/src/storage/secondary_index/secondary_index_data.cpp index c042341a76..c03cb78bd1 100644 --- a/src/storage/secondary_index/secondary_index_data.cpp +++ b/src/storage/secondary_index/secondary_index_data.cpp @@ -14,6 +14,7 @@ module; +#include #include #include @@ -36,33 +37,26 @@ namespace infinity { template struct SecondaryIndexChunkDataReader { using OrderedKeyType = ConvertToOrderedType; - static constexpr u32 PairSize = sizeof(OrderedKeyType) + sizeof(SegmentOffset); ChunkIndexEntry *chunk_index_; - BufferHandle current_handle_; - u32 part_count_ = 0; - u32 current_offset_ = 0; - u32 current_part_id_ = 0; - u32 current_part_size_ = 0; + BufferHandle handle_; + u32 row_count_ = 0; + u32 next_offset_ = 0; + const void *key_ptr_ = nullptr; + const SegmentOffset *offset_ptr_ = nullptr; SecondaryIndexChunkDataReader(ChunkIndexEntry *chunk_index) : chunk_index_(chunk_index) { - part_count_ = chunk_index_->GetPartNum(); - current_part_size_ = chunk_index_->GetPartRowCount(current_part_id_); + handle_ = chunk_index_->GetIndex(); + row_count_ = chunk_index_->GetRowCount(); + auto *index = static_cast(handle_.GetData()); + std::tie(key_ptr_, offset_ptr_) = index->GetKeyOffsetPointer(); + assert(index->GetChunkRowCount() == row_count_); } bool GetNextDataPair(OrderedKeyType &key, u32 &offset) { - if (current_offset_ == 0) { - if (current_part_id_ >= part_count_) { - return false; - } - current_handle_ = chunk_index_->GetIndexPartAt(current_part_id_); - } - const auto *data_ptr = static_cast(current_handle_.GetData()); - std::memcpy(&key, data_ptr + current_offset_ * PairSize, sizeof(OrderedKeyType)); - std::memcpy(&offset, data_ptr + current_offset_ * PairSize + sizeof(OrderedKeyType), sizeof(SegmentOffset)); - if (++current_offset_ == current_part_size_) { - current_offset_ = 0; - if (++current_part_id_ < part_count_) { - current_part_size_ = chunk_index_->GetPartRowCount(current_part_id_); - } + if (next_offset_ >= row_count_) { + return false; } + std::memcpy(&key, 
static_cast(key_ptr_) + next_offset_ * sizeof(OrderedKeyType), sizeof(key)); + std::memcpy(&offset, offset_ptr_ + next_offset_, sizeof(offset)); + ++next_offset_; return true; } }; @@ -105,49 +99,37 @@ struct SecondaryIndexChunkMerger { template class SecondaryIndexDataT final : public SecondaryIndexData { using OrderedKeyType = ConvertToOrderedType; - // sorted values in chunk - // only for build and save - // should not be loaded from file - bool need_save_ = false; UniquePtr key_; UniquePtr offset_; public: - static constexpr u32 PairSize = sizeof(OrderedKeyType) + sizeof(SegmentOffset); - SecondaryIndexDataT(const u32 chunk_row_count, const bool allocate) : SecondaryIndexData(chunk_row_count) { pgm_index_ = GenerateSecondaryPGMIndex(); - if (allocate) { - need_save_ = true; - LOG_TRACE(fmt::format("SecondaryIndexDataT(): Allocate space for chunk_row_count_: {}", chunk_row_count_)); - key_ = MakeUnique(chunk_row_count_); - offset_ = MakeUnique(chunk_row_count_); - } + key_ = MakeUnique(chunk_row_count_); + offset_ = MakeUnique(chunk_row_count_); + key_ptr_ = key_.get(); + offset_ptr_ = offset_.get(); } void SaveIndexInner(LocalFileHandle &file_handle) const override { - if (!need_save_) { - String error_message = "SaveIndexInner(): error: SecondaryIndexDataT is not allocated."; - UnrecoverableError(error_message); - } + file_handle.Append(key_ptr_, chunk_row_count_ * sizeof(OrderedKeyType)); + file_handle.Append(offset_ptr_, chunk_row_count_ * sizeof(SegmentOffset)); pgm_index_->SaveIndex(file_handle); } - void ReadIndexInner(LocalFileHandle &file_handle) override { pgm_index_->LoadIndex(file_handle); } + void ReadIndexInner(LocalFileHandle &file_handle) override { + file_handle.Read(key_ptr_, chunk_row_count_ * sizeof(OrderedKeyType)); + file_handle.Read(offset_ptr_, chunk_row_count_ * sizeof(SegmentOffset)); + pgm_index_->LoadIndex(file_handle); + } - void InsertData(const void *ptr, SharedPtr &chunk_index) override { - if (!need_save_) { - String error_message = "InsertData(): error: SecondaryIndexDataT is not allocated."; - UnrecoverableError(error_message); - } + void InsertData(const void *ptr) override { auto map_ptr = static_cast *>(ptr); if (!map_ptr) { - String error_message = "InsertData(): error: map_ptr type error."; - UnrecoverableError(error_message); + UnrecoverableError("InsertData(): error: map_ptr type error."); } if (map_ptr->size() != chunk_row_count_) { - String error_message = fmt::format("InsertData(): error: map size: {} != chunk_row_count_: {}", map_ptr->size(), chunk_row_count_); - UnrecoverableError(error_message); + UnrecoverableError(fmt::format("InsertData(): error: map size: {} != chunk_row_count_: {}", map_ptr->size(), chunk_row_count_)); } u32 i = 0; for (const auto &[key, offset] : *map_ptr) { @@ -156,17 +138,12 @@ class SecondaryIndexDataT final : public SecondaryIndexData { ++i; } if (i != chunk_row_count_) { - String error_message = fmt::format("InsertData(): error: i: {} != chunk_row_count_: {}", i, chunk_row_count_); - UnrecoverableError(error_message); + UnrecoverableError(fmt::format("InsertData(): error: i: {} != chunk_row_count_: {}", i, chunk_row_count_)); } - OutputAndBuild(chunk_index); + pgm_index_->BuildIndex(chunk_row_count_, key_.get()); } - void InsertMergeData(Vector &old_chunks, SharedPtr &merged_chunk_index_entry) override { - if (!need_save_) { - String error_message = "InsertMergeData(): error: SecondaryIndexDataT is not allocated."; - UnrecoverableError(error_message); - } + void InsertMergeData(Vector &old_chunks) override { 
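+        // Editor's note: each deprecated chunk now serves its sorted (key, offset)
+        // pairs directly out of the single index file via GetKeyOffsetPointer()
+        // (see SecondaryIndexChunkDataReader above); the merger is expected to
+        // replay them in ascending key order, so key_/offset_ stay sorted and the
+        // PGM index can be rebuilt in one pass at the end of this function.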
SecondaryIndexChunkMerger merger(old_chunks); OrderedKeyType key = {}; u32 offset = 0; @@ -177,24 +154,7 @@ class SecondaryIndexDataT final : public SecondaryIndexData { ++i; } if (i != chunk_row_count_) { - String error_message = fmt::format("InsertMergeData(): error: i: {} != chunk_row_count_: {}", i, chunk_row_count_); - UnrecoverableError(error_message); - } - OutputAndBuild(merged_chunk_index_entry); - } - - void OutputAndBuild(SharedPtr &chunk_index) { - const u32 part_num = chunk_index->GetPartNum(); - for (u32 part_id = 0; part_id < part_num; ++part_id) { - const u32 part_row_count = chunk_index->GetPartRowCount(part_id); - const u32 part_offset = part_id * 8192; - BufferHandle handle = chunk_index->GetIndexPartAt(part_id); - auto data_ptr = static_cast(handle.GetDataMut()); - for (u32 j = 0; j < part_row_count; ++j) { - const u32 index = part_offset + j; - std::memcpy(data_ptr + j * PairSize, key_.get() + index, sizeof(OrderedKeyType)); - std::memcpy(data_ptr + j * PairSize + sizeof(OrderedKeyType), offset_.get() + index, sizeof(SegmentOffset)); - } + UnrecoverableError(fmt::format("InsertMergeData(): error: i: {} != chunk_row_count_: {}", i, chunk_row_count_)); } pgm_index_->BuildIndex(chunk_row_count_, key_.get()); } @@ -202,8 +162,7 @@ class SecondaryIndexDataT final : public SecondaryIndexData { SecondaryIndexData *GetSecondaryIndexData(const SharedPtr &data_type, const u32 chunk_row_count, const bool allocate) { if (!(data_type->CanBuildSecondaryIndex())) { - String error_message = fmt::format("Cannot build secondary index on data type: {}", data_type->ToString()); - UnrecoverableError(error_message); + UnrecoverableError(fmt::format("Cannot build secondary index on data type: {}", data_type->ToString())); return nullptr; } switch (data_type->type()) { @@ -241,59 +200,10 @@ SecondaryIndexData *GetSecondaryIndexData(const SharedPtr &data_type, return new SecondaryIndexDataT(chunk_row_count, allocate); } default: { - String error_message = fmt::format("Need to add secondary index support for data type: {}", data_type->ToString()); - UnrecoverableError(error_message); + UnrecoverableError(fmt::format("Need to add secondary index support for data type: {}", data_type->ToString())); return nullptr; } } } -u32 GetSecondaryIndexDataPairSize(const SharedPtr &data_type) { - if (!(data_type->CanBuildSecondaryIndex())) { - String error_message = fmt::format("Cannot build secondary index on data type: {}", data_type->ToString()); - UnrecoverableError(error_message); - return 0; - } - switch (data_type->type()) { - case LogicalType::kTinyInt: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kSmallInt: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kInteger: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kBigInt: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kFloat: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kDouble: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kDate: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kTime: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kDateTime: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kTimestamp: { - return SecondaryIndexDataT::PairSize; - } - case LogicalType::kVarchar: { - return SecondaryIndexDataT::PairSize; - } - default: { - String error_message = fmt::format("Need to add secondary index support for data type: {}", data_type->ToString()); - 
UnrecoverableError(error_message); - return 0; - } - } -} - -} // namespace infinity \ No newline at end of file +} // namespace infinity diff --git a/src/storage/secondary_index/secondary_index_data.cppm b/src/storage/secondary_index/secondary_index_data.cppm index 1e670f0794..20bb9a7f05 100644 --- a/src/storage/secondary_index/secondary_index_data.cppm +++ b/src/storage/secondary_index/secondary_index_data.cppm @@ -46,9 +46,7 @@ template concept ConvertToHashU64 = IsAnyOf; template -struct ConvertToOrdered { - static_assert(false, "type not supported"); -}; +struct ConvertToOrdered; template struct ConvertToOrdered { @@ -72,7 +70,7 @@ struct ConvertToOrdered { export template requires KeepOrderedSelf or ConvertToOrderedI32 or ConvertToOrderedI64 or ConvertToHashU64 -using ConvertToOrderedType = ConvertToOrdered::type; +using ConvertToOrderedType = typename ConvertToOrdered::type; export template ConvertToOrderedType ConvertToOrderedKeyValue(RawValueType value) { @@ -142,18 +140,21 @@ export class SecondaryIndexData { protected: u32 chunk_row_count_ = 0; // pgm index - // will always be loaded UniquePtr pgm_index_; + // k-v data + void *key_ptr_ = nullptr; + SegmentOffset *offset_ptr_ = nullptr; public: explicit SecondaryIndexData(u32 chunk_row_count) : chunk_row_count_(chunk_row_count) {} virtual ~SecondaryIndexData() = default; + [[nodiscard]] Pair GetKeyOffsetPointer() const { return {key_ptr_, offset_ptr_}; } + [[nodiscard]] inline auto SearchPGM(const void *val_ptr) const { if (!pgm_index_) { - String error_message = "Not initialized yet."; - UnrecoverableError(error_message); + UnrecoverableError("Not initialized yet."); } return pgm_index_->SearchIndex(val_ptr); } @@ -164,13 +165,11 @@ public: virtual void ReadIndexInner(LocalFileHandle &file_handle) = 0; - virtual void InsertData(const void *ptr, SharedPtr &chunk_index) = 0; + virtual void InsertData(const void *ptr) = 0; - virtual void InsertMergeData(Vector &old_chunks, SharedPtr &merged_chunk_index_entry) = 0; + virtual void InsertMergeData(Vector &old_chunks) = 0; }; export SecondaryIndexData *GetSecondaryIndexData(const SharedPtr &data_type, u32 chunk_row_count, bool allocate); -export u32 GetSecondaryIndexDataPairSize(const SharedPtr &data_type); - -} // namespace infinity \ No newline at end of file +} // namespace infinity diff --git a/src/storage/secondary_index/secondary_index_in_mem.cpp b/src/storage/secondary_index/secondary_index_in_mem.cpp index a079faf608..e66018a4ca 100644 --- a/src/storage/secondary_index/secondary_index_in_mem.cpp +++ b/src/storage/secondary_index/secondary_index_in_mem.cpp @@ -61,7 +61,7 @@ class SecondaryIndexInMemT final : public SecondaryIndexInMem { auto new_chunk_index_entry = segment_index_entry->CreateSecondaryIndexChunkIndexEntry(begin_row_id_, row_count, buffer_mgr); BufferHandle handle = new_chunk_index_entry->GetIndex(); auto data_ptr = static_cast(handle.GetDataMut()); - data_ptr->InsertData(&in_mem_secondary_index_, new_chunk_index_entry); + data_ptr->InsertData(&in_mem_secondary_index_); return new_chunk_index_entry; } Pair RangeQuery(const void *input) const override { diff --git a/src/storage/secondary_index/secondary_index_pgm.cppm b/src/storage/secondary_index/secondary_index_pgm.cppm index 1f004b8300..709988ef15 100644 --- a/src/storage/secondary_index/secondary_index_pgm.cppm +++ b/src/storage/secondary_index/secondary_index_pgm.cppm @@ -145,40 +145,36 @@ class SecondaryPGMIndexTemplate final : public SecondaryPGMIndex { public: SecondaryPGMIndexTemplate() = default; - 
~SecondaryPGMIndexTemplate() final = default;
+    ~SecondaryPGMIndexTemplate() override = default;
 
-    void SaveIndex(LocalFileHandle &file_handle) const final {
+    void SaveIndex(LocalFileHandle &file_handle) const override {
         if (!initialized_) {
-            String error_message = "Not initialized yet.";
-            UnrecoverableError(error_message);
+            UnrecoverableError("Not initialized yet.");
         }
         pgm_index_->Save(file_handle);
     }
 
-    void LoadIndex(LocalFileHandle &file_handle) final {
+    void LoadIndex(LocalFileHandle &file_handle) override {
         if (initialized_) {
-            String error_message = "Already initialized.";
-            UnrecoverableError(error_message);
+            UnrecoverableError("Already initialized.");
         }
         pgm_index_ = MakeUnique>();
         pgm_index_->Load(file_handle);
         initialized_ = true;
     }
 
-    void BuildIndex(SizeT data_cnt, const void *data_ptr) final {
+    void BuildIndex(SizeT data_cnt, const void *data_ptr) override {
         if (initialized_) {
-            String error_message = "Already initialized.";
-            UnrecoverableError(error_message);
+            UnrecoverableError("Already initialized.");
         }
         auto typed_data_ptr = static_cast(data_ptr);
         pgm_index_ = MakeUnique>(typed_data_ptr, typed_data_ptr + data_cnt);
         initialized_ = true;
     }
 
-    SecondaryIndexApproxPos SearchIndex(const void *val_ptr) const final {
+    SecondaryIndexApproxPos SearchIndex(const void *val_ptr) const override {
         if (!initialized_) {
-            String error_message = "Not initialized yet.";
-            UnrecoverableError(error_message);
+            UnrecoverableError("Not initialized yet.");
         }
         auto val = *(static_cast(val_ptr));
         auto [pos, lo, hi] = pgm_index_->search(val);
@@ -191,4 +187,4 @@ inline UniquePtr<SecondaryPGMIndex> GenerateSecondaryPGMIndex() {
     return MakeUnique>();
 }
 
-} // namespace infinity
\ No newline at end of file
+} // namespace infinity
diff --git a/src/storage/tracer/base_memindex.cpp b/src/storage/tracer/base_memindex.cpp
new file mode 100644
index 0000000000..fc30187efe
--- /dev/null
+++ b/src/storage/tracer/base_memindex.cpp
@@ -0,0 +1,35 @@
+// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+module;
+
+module base_memindex;
+
+import stl;
+import memindex_tracer;
+import infinity_context;
+
+namespace infinity {
+
+void BaseMemIndex::IncreaseMemoryUsageBase(SizeT mem) {
+    auto *memindex_tracer = InfinityContext::instance().storage()->memindex_tracer();
+    memindex_tracer->IncreaseMemoryUsage(mem);
+}
+
+void BaseMemIndex::DecreaseMemoryUsageBase(SizeT mem) {
+    auto *memindex_tracer = InfinityContext::instance().storage()->memindex_tracer();
+    memindex_tracer->DecreaseMemUsed(mem);
+}
+
+} // namespace infinity
diff --git a/src/storage/tracer/base_memindex.cppm b/src/storage/tracer/base_memindex.cppm
index 1a77a66e1a..49d5da5be4 100644
--- a/src/storage/tracer/base_memindex.cppm
+++ b/src/storage/tracer/base_memindex.cppm
@@ -1,4 +1,4 @@
-// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
+// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,7 +18,6 @@ export module base_memindex; import stl; import memindex_tracer; -import infinity_context; namespace infinity { @@ -31,10 +30,8 @@ public: virtual TableIndexEntry *table_index_entry() const = 0; protected: - void AddMemUsed(SizeT mem) { - auto *memindex_tracer = InfinityContext::instance().storage()->memindex_tracer(); - memindex_tracer->AddMemUsed(mem); - } + void IncreaseMemoryUsageBase(SizeT mem); + void DecreaseMemoryUsageBase(SizeT mem); }; } // namespace infinity diff --git a/src/storage/tracer/mem_usage_change.cppm b/src/storage/tracer/mem_usage_change.cppm new file mode 100644 index 0000000000..9f844b2a92 --- /dev/null +++ b/src/storage/tracer/mem_usage_change.cppm @@ -0,0 +1,44 @@ +module; + +#include + +export module mem_usage_change; + +import stl; + +namespace infinity { + +export struct MemUsageChange { + bool is_add_{true}; + SizeT mem_{0}; + + void Add(const MemUsageChange &other) { + if (this->mem_ == 0) { + this->mem_ = other.mem_; + this->is_add_ = other.is_add_; + return; + } + + if (this->is_add_ == other.is_add_) { + this->mem_ += other.mem_; + } else { + if (other.mem_ > this->mem_) { + this->mem_ = other.mem_ - this->mem_; + this->is_add_ = !this->is_add_; + } else { + this->mem_ -= other.mem_; + } + } + } + + SizeT Apply(SizeT original) { + if (is_add_) { + return original + mem_; + } else { + assert(mem_ <= original); + return original - mem_; + } + } +}; + +} // namespace infinity diff --git a/src/storage/tracer/memindex_tracer.cppm b/src/storage/tracer/memindex_tracer.cppm index c91a8b851a..9f0b355170 100644 --- a/src/storage/tracer/memindex_tracer.cppm +++ b/src/storage/tracer/memindex_tracer.cppm @@ -49,7 +49,7 @@ public: void DecreaseMemUsed(SizeT mem_used); - void AddMemUsed(SizeT usage); + void IncreaseMemoryUsage(SizeT usage); void DumpDone(SizeT actual_dump_size, BaseMemIndex *mem_index); @@ -84,8 +84,8 @@ protected: Atomic acc_proposed_dump_ = 0; }; -inline void MemIndexTracer::AddMemUsed(SizeT add) { - LOG_TRACE(fmt::format("Add mem used: {}, mem index limit: {}", add, index_memory_limit_)); +inline void MemIndexTracer::IncreaseMemoryUsage(SizeT add) { + // LOG_TRACE(fmt::format("Add mem used: {}, mem index limit: {}", add, index_memory_limit_)); if (add == 0 || index_memory_limit_ == 0) { return; } diff --git a/src/storage/txn/txn.cpp b/src/storage/txn/txn.cpp index 75edf1ff49..c152b1a174 100644 --- a/src/storage/txn/txn.cpp +++ b/src/storage/txn/txn.cpp @@ -55,6 +55,7 @@ import infinity_context; import admin_statement; import global_resource_usage; import wal_manager; +import defer_op; namespace infinity { @@ -534,6 +535,7 @@ WalEntry *Txn::GetWALEntry() const { return wal_entry_.get(); } // } TxnTimeStamp Txn::Commit() { + DeferFn defer_op([&] { txn_store_.RevertTableStatus(); }); if (wal_entry_->cmds_.empty() && txn_store_.ReadOnly()) { // Don't need to write empty WalEntry (read-only transactions). 
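+        // (Editor's note: the DeferFn added above runs on every exit path of
+        // Commit(), including this read-only early return, so a table's status
+        // cannot be left stuck at kCompacting / kCreatingIndex.)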
TxnTimeStamp commit_ts = txn_mgr_->GetReadCommitTS(this); @@ -623,6 +625,7 @@ void Txn::CancelCommitBottom() { } void Txn::Rollback() { + DeferFn defer_op([&] { txn_store_.RevertTableStatus(); }); auto state = this->GetTxnState(); TxnTimeStamp abort_ts = 0; if (state == TxnState::kStarted) { diff --git a/src/storage/txn/txn_store.cpp b/src/storage/txn/txn_store.cpp index b2d57ef3ec..4f921876c4 100644 --- a/src/storage/txn/txn_store.cpp +++ b/src/storage/txn/txn_store.cpp @@ -271,13 +271,14 @@ Tuple, Status> TxnTableStore::Delete(const Vector &row_ return {nullptr, Status::OK()}; } +void TxnTableStore::SetCompactType(CompactStatementType type) { compact_state_.type_ = type; } + Tuple, Status> TxnTableStore::Compact(Vector, Vector>> &&segment_data, CompactStatementType type) { - if (compact_state_.type_ != CompactStatementType::kInvalid) { - String error_message = "Attempt to compact table store twice"; + if (compact_state_.type_ != type) { + String error_message = fmt::format("Compact type mismatch: {} vs {}", static_cast(compact_state_.type_), static_cast(type)); UnrecoverableError(error_message); } - compact_state_ = TxnCompactStore(type); for (auto &[new_segment, old_segments] : segment_data) { auto txn_segment_store = TxnSegmentStore::AddSegmentStore(new_segment.get()); compact_state_.compact_data_.emplace_back(std::move(txn_segment_store), old_segments); @@ -295,9 +296,6 @@ Tuple, Status> TxnTableStore::Compact(VectorDecWriteTxnNum(); - } if (append_state_.get() != nullptr) { // Rollback the data already been appended. Catalog::RollbackAppend(table_entry_, txn_id, abort_ts, this); @@ -452,9 +450,6 @@ void TxnTableStore::Commit(TransactionID txn_id, TxnTimeStamp commit_ts) { } } } - if (added_txn_num_) { - table_entry_->DecWriteTxnNum(); - } } void TxnTableStore::MaintainCompactionAlg() { @@ -525,6 +520,20 @@ void TxnTableStore::AddDeltaOp(CatalogDeltaEntry *local_delta_ops, TxnManager *t } } +void TxnTableStore::TryRevert() { + if (table_status_ == TxnStoreStatus::kCompacting) { + table_status_ = TxnStoreStatus::kNone; + table_entry_->SetCompactDone(); + } else if (table_status_ == TxnStoreStatus::kCreatingIndex) { + table_status_ = TxnStoreStatus::kNone; + table_entry_->SetCreateIndexDone(); + } + if (added_txn_num_) { + added_txn_num_ = false; + table_entry_->DecWriteTxnNum(); + } +} + TxnStore::TxnStore(Txn *txn, Catalog *catalog) : txn_(txn), catalog_(catalog) {} void TxnStore::AddDBStore(DBEntry *db_entry) { txn_dbs_.emplace(db_entry, ptr_seq_n_++); } @@ -721,4 +730,14 @@ bool TxnStore::ReadOnly() const { return read_only; } +void TxnStore::RevertTableStatus() { + for (const auto &[table_name, table_store] : txn_tables_store_) { + table_store->TryRevert(); + } +} + +void TxnStore::SetCompacting(TableEntry *table_entry) { GetTxnTableStore(table_entry)->SetCompacting(); } + +void TxnStore::SetCreatingIndex(TableEntry *table_entry) { GetTxnTableStore(table_entry)->SetCreatingIndex(); } + } // namespace infinity diff --git a/src/storage/txn/txn_store.cppm b/src/storage/txn/txn_store.cppm index 8bc02a1325..ff4efe2612 100644 --- a/src/storage/txn/txn_store.cppm +++ b/src/storage/txn/txn_store.cppm @@ -109,6 +109,8 @@ public: Tuple, Status> Delete(const Vector &row_ids); + void SetCompactType(CompactStatementType type); + Tuple, Status> Compact(Vector, Vector>> &&segment_data, CompactStatementType type); void AddSegmentStore(SegmentEntry *segment_entry); @@ -163,6 +165,8 @@ public: // Setter, Getter void AddWriteTxnNum() { added_txn_num_ = true; } + bool AddedTxnNum() const { return 
added_txn_num_; } + private: std::mutex mtx_{}; @@ -188,6 +192,21 @@ private: bool added_txn_num_{false}; bool has_update_{false}; + +public: + void SetCompacting() { table_status_ = TxnStoreStatus::kCompacting; } + + void SetCreatingIndex() { table_status_ = TxnStoreStatus::kCreatingIndex; } + + void TryRevert(); + +private: + enum struct TxnStoreStatus { + kNone = 0, + kCreatingIndex, + kCompacting, + }; + TxnStoreStatus table_status_{TxnStoreStatus::kNone}; }; export class TxnStore { @@ -228,6 +247,12 @@ public: std::mutex mtx_{}; + void RevertTableStatus(); + + void SetCompacting(TableEntry *table_entry); + + void SetCreatingIndex(TableEntry *table_entry); + private: // Txn store Txn *txn_{}; // TODO: remove this diff --git a/src/storage/wal/catalog_delta_entry.cpp b/src/storage/wal/catalog_delta_entry.cpp index b9d7eba314..2f3c3dc9e5 100644 --- a/src/storage/wal/catalog_delta_entry.cpp +++ b/src/storage/wal/catalog_delta_entry.cpp @@ -362,6 +362,24 @@ MergeFlag CatalogDeltaOperation::NextDeleteFlag(MergeFlag new_merge_flag) const return MergeFlag::kInvalid; }; +void CatalogDeltaOperation::CheckDelete() { + if (type_ == CatalogDeltaOpType::ADD_SEGMENT_ENTRY) { + auto *add_segment_op = static_cast(this); + if (add_segment_op->status_ == SegmentStatus::kDeprecated) { + add_segment_op->merge_flag_ = MergeFlag::kDelete; + } + } else if (type_ == CatalogDeltaOpType::ADD_CHUNK_INDEX_ENTRY) { + auto *add_chunk_index_op = static_cast(this); + if (add_chunk_index_op->deprecate_ts_ != UNCOMMIT_TS) { + add_chunk_index_op->merge_flag_ = MergeFlag::kDelete; + LOG_DEBUG(fmt::format("Delete chunk: {} at {}", *encode_, add_chunk_index_op->deprecate_ts_)); + } + } else if (type_ == CatalogDeltaOpType::ADD_SEGMENT_INDEX_ENTRY) { + [[maybe_unused]] auto *add_segment_index_op = static_cast(this); + } +} + + AddDBEntryOp::AddDBEntryOp(DBEntry *db_entry, TxnTimeStamp commit_ts) : CatalogDeltaOperation(CatalogDeltaOpType::ADD_DATABASE_ENTRY, db_entry, commit_ts), db_entry_dir_(db_entry->db_entry_dir()), comment_(db_entry->db_comment_ptr()) {} @@ -402,7 +420,8 @@ AddTableIndexEntryOp::AddTableIndexEntryOp(TableIndexEntry *table_index_entry, T AddSegmentIndexEntryOp::AddSegmentIndexEntryOp(SegmentIndexEntry *segment_index_entry, TxnTimeStamp commit_ts) : CatalogDeltaOperation(CatalogDeltaOpType::ADD_SEGMENT_INDEX_ENTRY, segment_index_entry, commit_ts), segment_index_entry_(segment_index_entry), - min_ts_(segment_index_entry->min_ts()), max_ts_(segment_index_entry->max_ts()), next_chunk_id_(segment_index_entry->next_chunk_id()) {} + min_ts_(segment_index_entry->min_ts()), max_ts_(segment_index_entry->max_ts()), next_chunk_id_(segment_index_entry->next_chunk_id()), + deprecate_ts_(segment_index_entry->deprecate_ts()) {} AddChunkIndexEntryOp::AddChunkIndexEntryOp(ChunkIndexEntry *chunk_index_entry, TxnTimeStamp commit_ts) : CatalogDeltaOperation(CatalogDeltaOpType::ADD_CHUNK_INDEX_ENTRY, chunk_index_entry, commit_ts), base_name_(chunk_index_entry->base_name_), @@ -539,6 +558,7 @@ UniquePtr AddSegmentIndexEntryOp::ReadAdv(const char *&p add_segment_index_op->min_ts_ = ReadBufAdv(ptr); add_segment_index_op->max_ts_ = ReadBufAdv(ptr); add_segment_index_op->next_chunk_id_ = ReadBufAdv(ptr); + add_segment_index_op->deprecate_ts_ = ReadBufAdv(ptr); return add_segment_index_op; } @@ -624,6 +644,7 @@ SizeT AddSegmentIndexEntryOp::GetSizeInBytes() const { auto total_size = sizeof(CatalogDeltaOpType) + GetBaseSizeInBytes(); total_size += sizeof(TxnTimeStamp) + sizeof(TxnTimeStamp); total_size += sizeof(ChunkID); + 
total_size += sizeof(TxnTimeStamp); return total_size; } @@ -719,6 +740,7 @@ void AddSegmentIndexEntryOp::WriteAdv(char *&buf) const { WriteBufAdv(buf, this->min_ts_); WriteBufAdv(buf, this->max_ts_); WriteBufAdv(buf, this->next_chunk_id_); + WriteBufAdv(buf, this->deprecate_ts_); } void AddChunkIndexEntryOp::WriteAdv(char *&buf) const { @@ -799,11 +821,12 @@ const String AddTableIndexEntryOp::ToString() const { } const String AddSegmentIndexEntryOp::ToString() const { - return fmt::format("AddSegmentIndexEntryOp {} min_ts: {} max_ts: {}, next_chunk_id: {}", + return fmt::format("AddSegmentIndexEntryOp {} min_ts: {} max_ts: {}, next_chunk_id: {}, deprecate_ts: {}", CatalogDeltaOperation::ToString(), min_ts_, max_ts_, - next_chunk_id_); + next_chunk_id_, + deprecate_ts_); } const String AddChunkIndexEntryOp::ToString() const { @@ -868,7 +891,7 @@ bool AddTableIndexEntryOp::operator==(const CatalogDeltaOperation &rhs) const { bool AddSegmentIndexEntryOp::operator==(const CatalogDeltaOperation &rhs) const { auto *rhs_op = dynamic_cast(&rhs); return rhs_op != nullptr && CatalogDeltaOperation::operator==(rhs) && min_ts_ == rhs_op->min_ts_ && max_ts_ == rhs_op->max_ts_ && - next_chunk_id_ == rhs_op->next_chunk_id_; + next_chunk_id_ == rhs_op->next_chunk_id_ && deprecate_ts_ == rhs_op->deprecate_ts_; } bool AddChunkIndexEntryOp::operator==(const CatalogDeltaOperation &rhs) const { @@ -1242,18 +1265,8 @@ void GlobalCatalogDeltaEntry::AddDeltaEntryInner(CatalogDeltaEntry *delta_entry) max_commit_ts_ = std::max(max_commit_ts_, max_commit_ts); for (auto &new_op : delta_entry->operations()) { - if (new_op->type_ == CatalogDeltaOpType::ADD_SEGMENT_ENTRY) { - auto *add_segment_op = static_cast(new_op.get()); - if (add_segment_op->status_ == SegmentStatus::kDeprecated) { - add_segment_op->merge_flag_ = MergeFlag::kDelete; - } - } else if (new_op->type_ == CatalogDeltaOpType::ADD_CHUNK_INDEX_ENTRY) { - auto *add_chunk_index_op = static_cast(new_op.get()); - if (add_chunk_index_op->deprecate_ts_ != UNCOMMIT_TS) { - add_chunk_index_op->merge_flag_ = MergeFlag::kDelete; - LOG_DEBUG(fmt::format("Delete chunk: {} at {}", *new_op->encode_, add_chunk_index_op->deprecate_ts_)); - } - } + new_op->CheckDelete(); + const String &encode = *new_op->encode_; if (encode.empty()) { String error_message = "encode is empty"; @@ -1303,6 +1316,38 @@ void GlobalCatalogDeltaEntry::PruneOpWithSamePrefix(const String &encode1) { auto iter = delta_ops_.lower_bound(encode1); while (iter != delta_ops_.end()) { const auto &[encode2, delta_op2] = *iter; +#if 1 + SizeT encode1_len = encode1.size(); + SizeT encode2_len = encode2.size(); + if(encode1_len > encode2_len) { + // encode1 isn't prefix of encode2 + break; + } + // encode1_len <= encode2_len + SizeT idx = 0; + while(idx < encode1_len) { + if(encode1[idx] != encode2[idx]) { + break; + } + ++ idx; + } + + if(idx != encode1_len) { + break; // encode1 is not prefix of encode2 + } + + if(idx == encode2_len) { + ++iter; + continue; // encode1 == encode2; + } + + if(encode2[idx] != '#') { + ++iter; + continue; // encode1 is not prefix of encode2 + } + + iter = delta_ops_.erase(iter); // encode1 is prefix of encode2 +#else auto [iter1, iter2] = std::mismatch(encode1.begin(), encode1.end(), encode2.begin()); if (iter1 != encode1.end()) { break; // encode1 is not prefix of encode2 @@ -1316,9 +1361,11 @@ void GlobalCatalogDeltaEntry::PruneOpWithSamePrefix(const String &encode1) { continue; // not prefix } iter = delta_ops_.erase(iter); // is prefix +#endif } } - +// 
#default_db#test_cleanup_index#idx1_todrop +// #default_db#test_cleanup_index#idx2 void GlobalCatalogDeltaEntry::RemoveDeltaOp(TxnTimeStamp max_commit_ts) { std::lock_guard lock(catalog_delta_locker_); for (auto iter = delta_ops_.begin(); iter != delta_ops_.end();) { diff --git a/src/storage/wal/catalog_delta_entry.cppm b/src/storage/wal/catalog_delta_entry.cppm index dd06550778..798c5a5201 100644 --- a/src/storage/wal/catalog_delta_entry.cppm +++ b/src/storage/wal/catalog_delta_entry.cppm @@ -112,6 +112,8 @@ public: MergeFlag NextDeleteFlag(MergeFlag new_merge_flag) const; + void CheckDelete(); + public: TxnTimeStamp begin_ts_{0}; TransactionID txn_id_{0}; @@ -306,6 +308,7 @@ public: TxnTimeStamp min_ts_{0}; TxnTimeStamp max_ts_{0}; ChunkID next_chunk_id_{0}; + TxnTimeStamp deprecate_ts_{0}; }; /// class AddSegmentColumnEntryOperation diff --git a/src/storage/wal/wal_manager.cpp b/src/storage/wal/wal_manager.cpp index d49d8a1c56..59a52289fa 100644 --- a/src/storage/wal/wal_manager.cpp +++ b/src/storage/wal/wal_manager.cpp @@ -1330,7 +1330,8 @@ void WalManager::WalCmdDumpIndexReplay(WalCmdDumpIndex &cmd, TransactionID txn_i 0 /*next_chunk_id*/, txn_id /*txn_id*/, commit_ts /*begin_ts*/, - commit_ts); + commit_ts, + UNCOMMIT_TS /*deprecate_ts*/); index_by_segment.emplace(cmd.segment_id_, segment_index_entry_ptr); segment_index_entry = segment_index_entry_ptr.get(); } diff --git a/src/unit_test/common/analyzer/standard_analyzer.cpp b/src/unit_test/common/analyzer/standard_analyzer.cpp index 6c531250c8..bf85480c55 100644 --- a/src/unit_test/common/analyzer/standard_analyzer.cpp +++ b/src/unit_test/common/analyzer/standard_analyzer.cpp @@ -19,6 +19,7 @@ import stl; import term; import stemmer; import standard_analyzer; +import tokenizer; using namespace infinity; namespace fs = std::filesystem; @@ -126,22 +127,91 @@ TEST_F(StandardAnalyzerTest, test5) { } TEST_F(StandardAnalyzerTest, test6) { + StandardAnalyzer analyzer; + TermList term_list; + String input("2012-01-02 unit tests."); + TokenizeConfig token_config; + String allow_str("-"); + String divide_str("@#$"); + String unite_str("/"); + token_config.AddAllows(allow_str); + token_config.AddDivides(divide_str); + token_config.AddUnites(unite_str); + analyzer.SetTokenizerConfig(token_config); + analyzer.Analyze(input, term_list); + + ASSERT_EQ(term_list.size(), 3U); + ASSERT_EQ(term_list[0].text_, String("2012-01-02")); + ASSERT_EQ(term_list[0].word_offset_, 0U); + ASSERT_EQ(term_list[1].text_, String("unit")); + ASSERT_EQ(term_list[1].word_offset_, 1U); + ASSERT_EQ(term_list[2].text_, String("tests")); + ASSERT_EQ(term_list[2].word_offset_, 2U); + // ASSERT_EQ(term_list[3].text_, PLACE_HOLDER); + // ASSERT_EQ(term_list[3].word_offset_, 3U); +} + +TEST_F(StandardAnalyzerTest, test7) { StandardAnalyzer analyzer; analyzer.InitStemmer(STEM_LANG_ENGLISH); analyzer.SetExtractEngStem(true); analyzer.SetCharOffset(true); + TokenizeConfig token_config; + String divide_str("@#$"); + String unite_str("/"); + token_config.AddAllows("-"); + token_config.AddDivides(divide_str); + token_config.AddUnites(unite_str); + analyzer.SetTokenizerConfig(token_config); TermList term_list; Vector queries = { - R"#({{Redirect|Anarchist|the fictional character|Anarchist (comics)}} {{Redirect|Anarchists}} {{Anarchism sidebar}} {{Libertarianism sidebar}} '''Anarchism''' is generally defined as the [[political philosophy]] which holds the [[state (polity)|state]] to be undesirable, unnecessary, and harmful, {{Cite journal|last=Malatesta|first=Errico|title=Towards 
Anarchism|journal=MAN!|publisher=International Group of San Francisco|location=Los Angeles|oclc=3930443|url=http://www.marxists.org/archive/malatesta/1930s/xx/toanarchy.htm|authorlink=Errico Malatesta}} {{Cite journal|url=http://www.theglobeandmail.com/servlet/story/RTGAM.20070514.wxlanarchist14/BNStory/lifeWork/home/ |title=Working for The Man |journal=[[The Globe and Mail]] |accessdate=2008-04-14 |last=Agrell |first=Siri |date=2007-05-14}} {{cite web|url=http://www.britannica.com/eb/article-9117285|title=Anarchism|year=2006|work=Encyclopædia Britannica|publisher=Encyclopædia Britannica Premium Service|accessdate=2006-08-29| archiveurl=)#", - R"#(http://web.archive.org/web/20061214085638/http://www.britannica.com/eb/article-9117285| archivedate= 14 December 2006}} {{Cite journal|year=2005|title=Anarchism|journal=The Shorter [[Routledge Encyclopedia of Philosophy]]|page=14|quote=Anarchism is the view that a society without the state, or government, is both possible and desirable.}} The following sources cite anarchism as a political philosophy: {{Cite book| last = Mclaughlin | first = Paul | title = Anarchism and Authority | publisher = Ashgate | location = Aldershot | year = 2007 | isbn = 0-7546-6196-2 |page=59}} {{Cite book| last = Johnston | first = R. | title = The Dictionary of Human Geography | publisher = Blackwell Publishers | location = Cambridge | year = 2000 | isbn = 0-631-20561-6 |page=24}}Slevin, Carl. "Anarchism." ''The Concise Oxford Dictionary of Politics''. Ed. Iain McLean and Alistair McMillan. Oxford University Press, 2003. or alternatively as opposing [[authority]] and)#", - R"#([[hierarchical organization]] in the conduct of human relations."The [[International of Anarchist Federations|IAF - IFA]] fights for : the abolition of all forms of authority whether economical, political, social, religious, cultural or sexual."[http://www.iaf-ifa.org/principles/english.html "Principles of The [[International of Anarchist Federations]]"]"Anarchism, then, really stands for the liberation of the human mind from the dominion of religion; the liberation of the human body from the dominion of property; liberation from the shackles and restraint of government. Anarchism stands for a social order based on the free grouping of individuals for the purpose of producing real social wealth; an order that will guarantee to every human being free access to the earth and full enjoyment of the necessities of life, according to individual desires, tastes, and inclinations." [[Emma Goldman]]. "What it Really Stands for Anarchy" in ''[[Anarchism and Other)#", - R"#(Essays]]''.Individualist anarchist Benjamin Tucker defined anarchism as opposition to authority as follows "They found that they must turn either to the right or to the left, — follow either the path of Authority or the path of Liberty. Marx went one way; Warren and Proudhon the other. Thus were born State Socialism and Anarchism...Authority, takes many shapes, but, broadly speaking, her enemies divide themselves into three classes: first, those who abhor her both as a means and as an end of progress, opposing her openly, avowedly, sincerely, consistently, universally; second, those who profess to believe in her as a means of progress, but who accept her only so far as they think she will subserve their own selfish interests, denying her and her blessings to the rest of the world; third, those who distrust her as a means of progress, believing in her only as an end to be obtained by first trampling upon, violating, and outraging her. 
These three phases of opposition to Liberty are met in almost)#", - R"#(every sphere of thought and human activity. Good representatives of the first are seen in the Catholic Church and the Russian autocracy; of the second, in the Protestant Church and the Manchester school of politics and political economy; of the third, in the atheism of Gambetta and the socialism of the socialism off Karl Marg." [[Benjamin Tucker]]. [http://www.theanarchistlibrary.org/HTML/Benjamin_Tucker__Individual_Liberty.html ''Individual Liberty.'']{{cite web|url=http://www.panarchy.org/ward/organization.1966.html|last=Ward|first=Colin|year=1966|title=Anarchism as a Theory of Organization|accessdate=1 March 2010| archiveurl= http://web.archive.org/web/20100325081119/http://www.panarchy.org/ward/organization.1966.html| archivedate= 25 March 2010}}Anarchist historian [[George Woodcock]] report of [[Mikhail Bakunin]]'s anti-authoritarianism and shows opposition to both state and non-state forms of authority as follows: "All anarchists deny)#"}; + R"#({{Redirect|Anarchist|the fictional character|Anarchist (comics)}} {{Redirect|Anarchists}} {{Anarchism sidebar}} {{Libertarianism + sidebar}} '''Anarchism''' is generally defined as the [[political philosophy]] which holds the [[state (polity)|state]] to be undesirable, + unnecessary, and harmful, {{Cite journal|last=Malatesta|first=Errico|title=Towards + Anarchism|journal=MAN!|publisher=International Group of San Francisco|location=Los + Angeles|oclc=3930443|url=http://www.marxists.org/archive/malatesta/1930s/xx/toanarchy.htm|authorlink=Errico Malatesta}} {{Cite + journal|url=http://www.theglobeandmail.com/servlet/story/RTGAM.20070514.wxlanarchist14/BNStory/lifeWork/home/ |title=Working for The Man + |journal=[[The Globe and Mail]] |accessdate=2008-04-14 |last=Agrell |first=Siri |date=2007-05-14}} {{cite + web|url=http://www.britannica.com/eb/article-9117285|title=Anarchism|year=2006|work=Encyclopædia Britannica|publisher=Encyclopædia + Britannica Premium Service|accessdate=2006-08-29| archiveurl=)#", + R"#(http://web.archive.org/web/20061214085638/http://www.britannica.com/eb/article-9117285| archivedate= 14 December 2006}} {{Cite journal|year=2005|title=Anarchism|journal=The Shorter [[Routledge Encyclopedia of Philosophy]]|page=14|quote=Anarchism + is the view that a society without the state, or government, is both possible and desirable.}} The following sources cite anarchism as a + political philosophy: {{Cite book| last = Mclaughlin | first = Paul | title = Anarchism and Authority | publisher = Ashgate | location = + Aldershot | year = 2007 | isbn = 0-7546-6196-2 |page=59}} {{Cite book| last = Johnston | first = R. | title = The Dictionary of Human + Geography | publisher = Blackwell Publishers | location = Cambridge | year = 2000 | isbn = 0-631-20561-6 |page=24}}Slevin, Carl. "Anarchism." ''The Concise Oxford Dictionary of Politics''. Ed. Iain McLean and Alistair McMillan. Oxford + University Press, 2003. 
or alternatively as opposing [[authority]] and)#", + R"#([[hierarchical organization]] in the conduct of human relations."The [[International of Anarchist + Federations|IAF - IFA]] fights for : the abolition of all forms of authority whether economical, political, social, religious, cultural or + sexual."[http://www.iaf-ifa.org/principles/english.html "Principles of The [[International of Anarchist + Federations]]"]"Anarchism, then, really stands for the liberation of the human mind from the dominion of religion; the + liberation of the human body from the dominion of property; liberation from the shackles and restraint of government. Anarchism stands for + a social order based on the free grouping of individuals for the purpose of producing real social wealth; an order that will guarantee to + every human being free access to the earth and full enjoyment of the necessities of life, according to individual desires, tastes, and + inclinations." [[Emma Goldman]]. "What it Really Stands for Anarchy" in ''[[Anarchism and Other)#", + R"#(Essays]]''.Individualist anarchist Benjamin Tucker defined anarchism as opposition to authority as follows "They found that + they must turn either to the right or to the left, — follow either the path of Authority or the path of Liberty. Marx went one way; Warren + and Proudhon the other. Thus were born State Socialism and Anarchism...Authority, takes many shapes, but, broadly speaking, her enemies + divide themselves into three classes: first, those who abhor her both as a means and as an end of progress, opposing her openly, avowedly, + sincerely, consistently, universally; second, those who profess to believe in her as a means of progress, but who accept her only so far as + they think she will subserve their own selfish interests, denying her and her blessings to the rest of the world; third, those who distrust + her as a means of progress, believing in her only as an end to be obtained by first trampling upon, violating, and outraging her. These + three phases of opposition to Liberty are met in almost)#", + R"#(every sphere of thought and human activity. Good representatives of the first are seen in the Catholic Church and the Russian + autocracy; of the second, in the Protestant Church and the Manchester school of politics and political economy; of the third, in the + atheism of Gambetta and the socialism of the socialism off Karl Marg." [[Benjamin Tucker]]. 
+ [http://www.theanarchistlibrary.org/HTML/Benjamin_Tucker__Individual_Liberty.html ''Individual Liberty.'']{{cite web|url=http://www.panarchy.org/ward/organization.1966.html|last=Ward|first=Colin|year=1966|title=Anarchism as a Theory of + Organization|accessdate=1 March 2010| archiveurl= + http://web.archive.org/web/20100325081119/http://www.panarchy.org/ward/organization.1966.html| archivedate= 25 March 2010}}Anarchist historian [[George Woodcock]] report of [[Mikhail Bakunin]]'s anti-authoritarianism and shows opposition + to both state and non-state forms of authority as follows: "All anarchists deny)#", + }; for (auto &query : queries) { TermList term_list; analyzer.Analyze(query, term_list); - // std::cout << "Text #" << query << "# parsed as:" << std::endl; // for (unsigned i = 0; i < term_list.size(); ++i) { // std::cout << "\t" << i << "#" << term_list[i].text_ << "@" << term_list[i].word_offset_ << "#"; // } diff --git a/src/unit_test/main/infinity.cpp b/src/unit_test/main/infinity.cpp index d043ccef5b..1fd72d5745 100644 --- a/src/unit_test/main/infinity.cpp +++ b/src/unit_test/main/infinity.cpp @@ -201,9 +201,9 @@ TEST_F(InfinityTest, test1) { col2->names_.emplace_back(col2_name); output_columns->emplace_back(col2); - SearchExpr * search_expr = nullptr; + SearchExpr *search_expr = nullptr; - result = infinity->Search("default_db", "table1", search_expr, nullptr, nullptr, nullptr, output_columns, nullptr, nullptr, nullptr); + result = infinity->Search("default_db", "table1", search_expr, nullptr, nullptr, nullptr, output_columns, nullptr, nullptr, nullptr, false); SharedPtr<DataBlock> data_block = result.result_table_->GetDataBlockById(0); EXPECT_EQ(data_block->row_count(), 1); Value value = data_block->GetValue(0, 0); diff --git a/src/unit_test/parser/search_driver.cpp b/src/unit_test/parser/search_driver.cpp index 486e5bce51..e30e0d064c 100644 --- a/src/unit_test/parser/search_driver.cpp +++ b/src/unit_test/parser/search_driver.cpp @@ -272,3 +272,40 @@ graphic cards } } } + +TEST_F(SearchDriverTest, whitespace_analyzer_test) { + using namespace infinity; + std::string row_queries = R"##( +#basic_filter_boost with explicit field +name:芯片^1.2 +name:dune^1.2 +num:123.456^7.8 +label:DS-K3AJ303/Dm140^1.2 +date:2025-01-01^1.2 + +#clause +DS-K3AJ303/Dm140^1.2 OR 2025-01-01^1.2 +(邓肯 上帝) AND (foo bar) +_exists_:"author" AND page_count:xxx AND name:star^1.3 + +#quote +"TO BE OR NOT TO BE" +"nanjing吉祥物\"羽宝\"头部head" "DS-K3AJ303/Dm140" +"吉祥物nanjing\"DS-K3AJ303/Dm140\"头部" + )##"; + + Map<String, String> column2analyzer; + for (auto v : std::array{"name", "num", "label", "date", "_exists_", "body"}) { + column2analyzer[v] = "whitespace"; + } + String default_field("body"); + SearchDriver driver(column2analyzer, default_field); + IStringStream iss(row_queries); + try { + int rc = ParseStream(driver, iss); + EXPECT_EQ(rc, 0); + } catch (RecoverableException &e) { + // catch because dict resource file does not exist in CI environment + // std::cerr << fmt::format("RecoverableException: {}\n", e.what()); + } +} diff --git a/src/unit_test/parser/sql_select_statement.cpp b/src/unit_test/parser/sql_select_statement.cpp index 76b2654685..e72d4e3d5f 100644 --- a/src/unit_test/parser/sql_select_statement.cpp +++ b/src/unit_test/parser/sql_select_statement.cpp @@ -486,18 +486,18 @@ TEST_F(SelectStatementParsingTest, good_test2) { EXPECT_EQ(col_expr->names_[0], "b"); } - EXPECT_NE(select_statement->order_by_list, nullptr); - EXPECT_EQ(select_statement->order_by_list->size(), 2u); + EXPECT_NE(select_statement->order_by_list_,
nullptr); + EXPECT_EQ(select_statement->order_by_list_->size(), 2u); { - EXPECT_EQ((*select_statement->order_by_list)[0]->type_, OrderType::kAsc); - EXPECT_EQ((*select_statement->order_by_list)[0]->expr_->type_, ParsedExprType::kColumn); - auto *col_expr = (ColumnExpr *)((*select_statement->order_by_list)[0]->expr_); + EXPECT_EQ((*select_statement->order_by_list_)[0]->type_, OrderType::kAsc); + EXPECT_EQ((*select_statement->order_by_list_)[0]->expr_->type_, ParsedExprType::kColumn); + auto *col_expr = (ColumnExpr *)((*select_statement->order_by_list_)[0]->expr_); EXPECT_EQ(col_expr->names_[0], "a"); } { - EXPECT_EQ((*select_statement->order_by_list)[1]->type_, OrderType::kDesc); - EXPECT_EQ((*select_statement->order_by_list)[1]->expr_->type_, ParsedExprType::kColumn); - auto *col_expr = (ColumnExpr *)((*select_statement->order_by_list)[1]->expr_); + EXPECT_EQ((*select_statement->order_by_list_)[1]->type_, OrderType::kDesc); + EXPECT_EQ((*select_statement->order_by_list_)[1]->expr_->type_, ParsedExprType::kColumn); + auto *col_expr = (ColumnExpr *)((*select_statement->order_by_list_)[1]->expr_); EXPECT_EQ(col_expr->names_[0], "b"); } } diff --git a/src/unit_test/storage/buffer/buffer_manager.cpp b/src/unit_test/storage/buffer/buffer_manager.cpp index 0809454274..a7d58b5b72 100644 --- a/src/unit_test/storage/buffer/buffer_manager.cpp +++ b/src/unit_test/storage/buffer/buffer_manager.cpp @@ -116,6 +116,7 @@ TEST_F(BufferManagerTest, cleanup_test) { auto file_name = MakeShared(fmt::format("file_{}", i)); auto file_worker = MakeUnique(data_dir_, temp_dir_, MakeShared(""), file_name, file_size, buffer_mgr.persistence_manager()); auto *buffer_obj = buffer_mgr.AllocateBufferObject(std::move(file_worker)); + buffer_obj->AddObjRc(); buffer_objs.push_back(buffer_obj); { auto buffer_handle = buffer_obj->Load(); @@ -374,6 +375,7 @@ class Test1Obj : public TestObj { file_info.file_size_ = file_size; auto file_worker = MakeUnique(data_dir_, temp_dir_, MakeShared(""), file_name, file_size, nullptr); file_info.buffer_obj_ = buffer_mgr_->AllocateBufferObject(std::move(file_worker)); + file_info.buffer_obj_->AddObjRc(); } else { auto file_worker = MakeUnique(data_dir_, temp_dir_, MakeShared(""), file_name, file_info.file_size_, nullptr); file_info.buffer_obj_ = buffer_mgr_->GetBufferObject(std::move(file_worker)); @@ -449,6 +451,7 @@ class Test2Obj : public TestObj { if (alloc_new) { auto file_worker = MakeUnique(data_dir_, temp_dir_, MakeShared(""), file_name, 0, nullptr); file_info.buffer_obj_ = buffer_mgr_->AllocateBufferObject(std::move(file_worker)); + file_info.buffer_obj_->AddObjRc(); } else { auto file_worker = MakeUnique(data_dir_, temp_dir_, MakeShared(""), file_name, file_info.file_size_, nullptr); file_info.buffer_obj_ = buffer_mgr_->GetBufferObject(std::move(file_worker)); diff --git a/src/unit_test/storage/invertedindex/column_index_merger.cpp b/src/unit_test/storage/invertedindex/column_index_merger.cpp index d08fd55ca6..93cc1122d1 100644 --- a/src/unit_test/storage/invertedindex/column_index_merger.cpp +++ b/src/unit_test/storage/invertedindex/column_index_merger.cpp @@ -94,7 +94,7 @@ void ColumnIndexMergerTest::CreateIndex(const Vector& paragraphs, column->AppendValue(v); } for (SizeT i = 0; i < chunk_names.size(); ++i) { - MemoryIndexer indexer(index_dir, chunk_names[i], base_row_ids[i], flag_, "standard"); + MemoryIndexer indexer(index_dir, chunk_names[i], base_row_ids[i], flag_, "standard", nullptr); indexer.Insert(column, row_offsets[i], row_counts[i]); indexer.Dump(); } diff --git 
a/src/unit_test/storage/invertedindex/memory_indexer.cpp b/src/unit_test/storage/invertedindex/memory_indexer.cpp index eb90698129..c5c069ff60 100644 --- a/src/unit_test/storage/invertedindex/memory_indexer.cpp +++ b/src/unit_test/storage/invertedindex/memory_indexer.cpp @@ -127,12 +127,12 @@ INSTANTIATE_TEST_SUITE_P(TestWithDifferentParams, TEST_P(MemoryIndexerTest, Insert) { // prepare fake segment index entry auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetFullDataDir()); - MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard", fake_segment_index_entry_1.get()); indexer1.Insert(column_, 0, 1); indexer1.Insert(column_, 1, 3); indexer1.Dump(); - auto indexer2 = MakeUnique(GetFullDataDir(), "chunk2", RowID(0U, 4U), flag_, "standard"); + auto indexer2 = MakeUnique(GetFullDataDir(), "chunk2", RowID(0U, 4U), flag_, "standard", fake_segment_index_entry_1.get()); indexer2->Insert(column_, 4, 1); while (indexer2->GetInflightTasks() > 0) { sleep(1); @@ -149,7 +149,7 @@ TEST_P(MemoryIndexerTest, Insert) { TEST_P(MemoryIndexerTest, test2) { auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetFullDataDir()); - MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard", fake_segment_index_entry_1.get()); indexer1.Insert(column_, 0, 2, true); indexer1.Insert(column_, 2, 2, true); indexer1.Insert(column_, 4, 1, true); @@ -165,7 +165,7 @@ TEST_P(MemoryIndexerTest, test2) { TEST_P(MemoryIndexerTest, test3) { auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetFullDataDir()); - MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard", fake_segment_index_entry_1.get()); indexer1.Insert(empty_column_, 0, 10, true); indexer1.Dump(true); fake_segment_index_entry_1->AddFtChunkIndexEntry("chunk1", RowID(0U, 0U).ToUint64(), 5U); @@ -182,7 +182,7 @@ TEST_P(MemoryIndexerTest, test3) { TEST_P(MemoryIndexerTest, test4) { auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetFullDataDir()); - MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard", fake_segment_index_entry_1.get()); indexer1.Insert(empty_column_, 0, 5, true); indexer1.Insert(column_, 0, 5, true); indexer1.Dump(true); @@ -201,7 +201,7 @@ TEST_P(MemoryIndexerTest, test4) { TEST_P(MemoryIndexerTest, SpillLoadTest) { auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetFullDataDir()); - auto indexer1 = MakeUnique(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + auto indexer1 = MakeUnique(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard", fake_segment_index_entry_1.get()); indexer1->Insert(column_, 0, 2); indexer1->Insert(column_, 2, 2); indexer1->Insert(column_, 4, 1); @@ -211,7 +211,8 @@ TEST_P(MemoryIndexerTest, SpillLoadTest) { } indexer1->Dump(false, true); - UniquePtr loaded_indexer = MakeUnique(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + UniquePtr loaded_indexer = + MakeUnique(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard", fake_segment_index_entry_1.get()); loaded_indexer->Load(); SegmentID segment_id = 
fake_segment_index_entry_1->segment_id(); @@ -251,7 +252,7 @@ TEST_P(MemoryIndexerTest, SeekPosition) { } auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetFullDataDir()); - MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard", fake_segment_index_entry_1.get()); indexer1.Insert(column, 0, 8192); while (indexer1.GetInflightTasks() > 0) { sleep(1); diff --git a/src/unit_test/storage/invertedindex/posting_merger.cpp b/src/unit_test/storage/invertedindex/posting_merger.cpp index 48a6922ae1..c9c1748fc4 100644 --- a/src/unit_test/storage/invertedindex/posting_merger.cpp +++ b/src/unit_test/storage/invertedindex/posting_merger.cpp @@ -75,12 +75,12 @@ void PostingMergerTest::CreateIndex() { } auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetFullDataDir()); - MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + MemoryIndexer indexer1(GetFullDataDir(), "chunk1", RowID(0U, 0U), flag_, "standard", fake_segment_index_entry_1.get()); indexer1.Insert(column, 0, 1); indexer1.Dump(); fake_segment_index_entry_1->AddFtChunkIndexEntry("chunk1", RowID(0U, 0U).ToUint64(), 1U); - auto indexer2 = MakeUnique(GetFullDataDir(), "chunk2", RowID(0U, 1U), flag_, "standard"); + auto indexer2 = MakeUnique(GetFullDataDir(), "chunk2", RowID(0U, 1U), flag_, "standard", fake_segment_index_entry_1.get()); indexer2->Insert(column, 1, 1); indexer2->Dump(); } @@ -185,4 +185,4 @@ TEST_P(PostingMergerTest, Basic) { for (auto segment_term_posting : segment_term_postings) { delete segment_term_posting; } -} \ No newline at end of file +} diff --git a/src/unit_test/storage/invertedindex/search/query_builder.cpp b/src/unit_test/storage/invertedindex/search/query_builder.cpp index 24e4823cb2..f8a3b547e2 100644 --- a/src/unit_test/storage/invertedindex/search/query_builder.cpp +++ b/src/unit_test/storage/invertedindex/search/query_builder.cpp @@ -201,7 +201,7 @@ TEST_F(QueryBuilderTest, test_and) { LOG_INFO(oss.str()); // apply query builder Vector hints; - FullTextQueryContext context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, hints); + FullTextQueryContext context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, 10, hints); context.early_term_algo_ = EarlyTermAlgo::kNaive; context.query_tree_ = std::move(and_root); FakeQueryBuilder fake_query_builder; @@ -273,7 +273,7 @@ TEST_F(QueryBuilderTest, test_or) { LOG_INFO(oss.str()); // apply query builder Vector hints; - FullTextQueryContext context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, hints); + FullTextQueryContext context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, 10, hints); context.early_term_algo_ = EarlyTermAlgo::kNaive; context.query_tree_ = std::move(or_root); FakeQueryBuilder fake_query_builder; @@ -351,7 +351,7 @@ TEST_F(QueryBuilderTest, test_and_not) { LOG_INFO(oss.str()); // apply query builder Vector hints; - FullTextQueryContext context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, hints); + FullTextQueryContext context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, 10, hints); context.early_term_algo_ = EarlyTermAlgo::kNaive; context.query_tree_ = std::move(and_not_root); FakeQueryBuilder fake_query_builder; @@ -435,7 +435,7 @@ TEST_F(QueryBuilderTest, test_and_not2) { LOG_INFO(oss.str()); // apply query builder Vector hints; - FullTextQueryContext context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, 
hints); + FullTextQueryContext context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, 10, hints); context.early_term_algo_ = EarlyTermAlgo::kNaive; context.query_tree_ = std::move(and_not_root); FakeQueryBuilder fake_query_builder; diff --git a/src/unit_test/storage/invertedindex/search/query_match.cpp b/src/unit_test/storage/invertedindex/search/query_match.cpp index 0ffcc91633..d03343aa11 100644 --- a/src/unit_test/storage/invertedindex/search/query_match.cpp +++ b/src/unit_test/storage/invertedindex/search/query_match.cpp @@ -338,7 +338,7 @@ void QueryMatchTest::QueryMatch(const String &db_name, Status status = Status::ParseMatchExprFailed(match_expr->fields_, match_expr->matching_text_); RecoverableError(status); } - FullTextQueryContext full_text_query_context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, index_hints); + FullTextQueryContext full_text_query_context(FulltextSimilarity::kBM25, MinimumShouldMatchOption{}, 10, index_hints); full_text_query_context.early_term_algo_ = EarlyTermAlgo::kNaive; full_text_query_context.query_tree_ = std::move(query_tree); UniquePtr doc_iterator = query_builder.CreateSearch(full_text_query_context); diff --git a/src/unit_test/storage/knnindex/knn_sparse/test_bmp_index.cpp b/src/unit_test/storage/knnindex/knn_sparse/test_bmp_index.cpp index 8c470a681c..72376aba37 100644 --- a/src/unit_test/storage/knnindex/knn_sparse/test_bmp_index.cpp +++ b/src/unit_test/storage/knnindex/knn_sparse/test_bmp_index.cpp @@ -58,7 +58,7 @@ class BMPIndexTest : public BaseTest { { SparseMatrixIter iter(query_set); SparseVecRef vec = iter.val(); - auto [indices, scores] = index.SearchKnn(vec, 0/*topk*/, options); + auto [indices, scores] = index.SearchKnn(vec, 0 /*topk*/, options); EXPECT_EQ(indices.size(), 0); EXPECT_EQ(scores.size(), 0); } @@ -80,7 +80,7 @@ class BMPIndexTest : public BaseTest { // SparseTestUtil::PrintQuery(query_id, gt_indices, gt_scores, gt_size, indices, scores); // std::cout << fmt::format("accuracy: {}\n", (f32)hit / total); } -// std::cout << fmt::format("hit: {}, total: {}\n", hit_all, total_all); + // std::cout << fmt::format("hit: {}, total: {}\n", hit_all, total_all); if (hit_all < total_all * accuracy_all) { EXPECT_TRUE(false); } @@ -133,4 +133,54 @@ TEST_F(BMPIndexTest, test1) { u32 block_size = 8; TestFunc(block_size); } -} \ No newline at end of file +} + +TEST_F(BMPIndexTest, test2) { + using BMPAlg = BMPAlg; + + u32 ncol = 2; + u32 block_size = 2; + + u32 topk = 5; + + BmpSearchOptions options; + options.use_lock_ = false; + + Vector query_idx = {0, 1}; + Vector query_data = {1.0, 1.0}; + SparseVecRef query(query_idx.size(), query_idx.data(), query_data.data()); + + Vector vec1_idx = {0}; + Vector vec1_data = {3.0}; + SparseVecRef vec1(vec1_idx.size(), vec1_idx.data(), vec1_data.data()); + + Vector vec2_idx = {1}; + Vector vec2_data = {1.0}; + SparseVecRef vec2(vec2_idx.size(), vec2_idx.data(), vec2_data.data()); + + Vector vec3_idx = {0, 1}; + Vector vec3_data = {1.0, 1.0}; + SparseVecRef vec3(vec3_idx.size(), vec3_idx.data(), vec3_data.data()); + + BMPAlg index(ncol, block_size); + index.AddDoc(vec1, 0); + index.AddDoc(vec2, 1); + index.AddDoc(vec3, 2); + index.AddDoc(vec3, 3); + index.AddDoc(vec3, 4); + index.AddDoc(vec3, 5); + + [[maybe_unused]] auto [indices, scores] = index.SearchKnn(query, topk, options); + ASSERT_EQ(indices.size(), topk); + ASSERT_EQ(indices[0], 0); + ASSERT_EQ(indices[1], 2); + ASSERT_EQ(indices[2], 3); + ASSERT_EQ(indices[3], 4); + ASSERT_EQ(indices[4], 5); + ASSERT_EQ(scores.size(), topk); 
+ ASSERT_EQ(scores[0], 3.0); + ASSERT_EQ(scores[1], 2.0); + ASSERT_EQ(scores[2], 2.0); + ASSERT_EQ(scores[3], 2.0); + ASSERT_EQ(scores[4], 2.0); +} diff --git a/src/unit_test/storage/tracer/test_tracer.cpp b/src/unit_test/storage/tracer/test_tracer.cpp index 0c90d3993c..664e7a5689 100644 --- a/src/unit_test/storage/tracer/test_tracer.cpp +++ b/src/unit_test/storage/tracer/test_tracer.cpp @@ -41,7 +41,7 @@ class TestMemIndex : public BaseMemIndex { TableIndexEntry *table_index_entry() const override { return nullptr; } - void AddMemUsed(SizeT usage, SizeT row_cnt); + void IncreaseMemoryUsage(SizeT usage, SizeT row_cnt); void Dump(SizeT &usage, SizeT &row_cnt) && { std::lock_guard lck(mtx_); @@ -120,7 +120,7 @@ TestMemIndex *TestCatalog::GetMemIndex(const String &index_name) { void TestCatalog::AppendMemIndex(const String &index_name, SizeT mem_used, SizeT row_cnt) { std::lock_guard lck(mtx_); auto *memindex = GetMemIndexInner(index_name); - memindex->AddMemUsed(mem_used, row_cnt); + memindex->IncreaseMemoryUsage(mem_used, row_cnt); } bool TestCatalog::DumpMemIndex(const String &index_name, SizeT &mem_used, SizeT &row_cnt) { @@ -188,13 +188,13 @@ TestMemIndex::~TestMemIndex() { } } -void TestMemIndex::AddMemUsed(SizeT usage, SizeT row_cnt) { +void TestMemIndex::IncreaseMemoryUsage(SizeT usage, SizeT row_cnt) { { std::lock_guard lck(mtx_); mem_used_ += usage; row_count_ += row_cnt; } - tracer_->AddMemUsed(usage); + tracer_->IncreaseMemoryUsage(usage); } void TestMemIndexTracer::HandleDump(UniquePtr task) { diff --git a/test/data/config/restart_test/test_compact/1.toml b/test/data/config/restart_test/test_compact/1.toml new file mode 100644 index 0000000000..3f38d61d0d --- /dev/null +++ b/test/data/config/restart_test/test_compact/1.toml @@ -0,0 +1,22 @@ +[general] +version = "0.5.0" +time_zone = "utc-8" + +[network] +[log] +log_to_stdout = true +log_level = "trace" + +[storage] +data_dir = "/var/infinity/data" +optimize_interval = "0s" +cleanup_interval = "0s" +compact_interval = "0s" +persistence_dir = "" + +[buffer] +[wal] +delta_checkpoint_interval = "0s" +full_checkpoint_interval = "0s" + +[resource] diff --git a/test/sql/ddl/alter/lock_table.slt b/test/sql/ddl/alter/lock_table.slt index f964b39af7..1aca10fdc5 100644 --- a/test/sql/ddl/alter/lock_table.slt +++ b/test/sql/ddl/alter/lock_table.slt @@ -42,11 +42,25 @@ UNLOCK TABLE products; statement ok INSERT INTO products VALUES (2, 2, 'abcdef'); +statement ok +DELETE FROM products WHERE product_no=1; + +statement ok +DELETE FROM products WHERE product_no=-1; + +statement ok +COMPACT TABLE products; + +statement ok +LOCK TABLE products; + query SELECT * FROM products; ---- -1 2 abcdef 2 2 abcdef statement ok -DROP TABLE products; \ No newline at end of file +UNLOCK TABLE products; + +statement ok +DROP TABLE products; diff --git a/test/sql/dql/fulltext/fulltext_whitespace.slt b/test/sql/dql/fulltext/fulltext_whitespace.slt new file mode 100644 index 0000000000..6fd3e039ee --- /dev/null +++ b/test/sql/dql/fulltext/fulltext_whitespace.slt @@ -0,0 +1,29 @@ + +statement ok +DROP TABLE IF EXISTS ft_whitespace; + +statement ok +CREATE TABLE ft_whitespace(num int, doc varchar DEFAULT 'default text'); + +statement ok +INSERT INTO ft_whitespace VALUES (1, '2020-01-01 2023-01-01'), (2, '2023 01 01'), (3, '01 01 2023'), (4); + +statement ok +CREATE INDEX ft_index ON ft_whitespace(doc) USING FULLTEXT WITH (analyzer = whitespace); + +query I +SELECT * FROM ft_whitespace; +---- +1 2020-01-01 2023-01-01 +2 2023 01 01 +3 01 01 2023 +4 default text + 
+query I rowsort +SELECT * FROM ft_whitespace SEARCH MATCH TEXT ('doc^4.5', '2023-01-01^6.6', 'topn=10'); +---- +1 2020-01-01 2023-01-01 + +# Clean up +statement ok +DROP TABLE ft_whitespace; diff --git a/test/sql/dql/knn/embedding/test_knn_cos.slt b/test/sql/dql/knn/embedding/test_knn_cos.slt index 7de5f7d889..3907cbc6da 100644 --- a/test/sql/dql/knn/embedding/test_knn_cos.slt +++ b/test/sql/dql/knn/embedding/test_knn_cos.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_cos(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_cos FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# metric cos will order descendingly. The query will return row 1, 2, 3 +# metric cos will be in descending order. The query will return row 1, 2, 3 query I SELECT c1 FROM test_knn_cos SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'cosine', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_hnsw_cos.slt b/test/sql/dql/knn/embedding/test_knn_hnsw_cos.slt index 1cc4b49d7e..513daf6785 100644 --- a/test/sql/dql/knn/embedding/test_knn_hnsw_cos.slt +++ b/test/sql/dql/knn/embedding/test_knn_hnsw_cos.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_hnsw_cos(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_hnsw_cos FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic cos will order ascendingly. The query will return row 4, 3, 2 +# metric cos will be in descending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_hnsw_cos SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'cosine', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_hnsw_ip.slt b/test/sql/dql/knn/embedding/test_knn_hnsw_ip.slt index 041fda5768..7e242e73f4 100644 --- a/test/sql/dql/knn/embedding/test_knn_hnsw_ip.slt +++ b/test/sql/dql/knn/embedding/test_knn_hnsw_ip.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_hnsw_ip(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_hnsw_ip FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic ip will order descendingly. The query will return row 4, 3, 2 +# metric ip will be in descending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_hnsw_ip SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'ip', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_hnsw_ip_filter.slt b/test/sql/dql/knn/embedding/test_knn_hnsw_ip_filter.slt index 2c4a7aae8f..c40f084bef 100644 --- a/test/sql/dql/knn/embedding/test_knn_hnsw_ip_filter.slt +++ b/test/sql/dql/knn/embedding/test_knn_hnsw_ip_filter.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_hnsw_ip_filter(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_hnsw_ip_filter FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic ip will order descendingly. The query will return row 4, 3, 2 +# metric ip will be in descending order. 
The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_hnsw_ip_filter SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'ip', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_hnsw_l2.slt b/test/sql/dql/knn/embedding/test_knn_hnsw_l2.slt index 077c012b08..0214a4512c 100644 --- a/test/sql/dql/knn/embedding/test_knn_hnsw_l2.slt +++ b/test/sql/dql/knn/embedding/test_knn_hnsw_l2.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_hnsw_l2(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_hnsw_l2 FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic l2 will order ascendingly. The query will return row 4, 3, 2 +# metric l2 will be in ascending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_hnsw_l2 SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'l2', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_hnsw_l2_filter.slt b/test/sql/dql/knn/embedding/test_knn_hnsw_l2_filter.slt index 33daaec42a..57735469a9 100644 --- a/test/sql/dql/knn/embedding/test_knn_hnsw_l2_filter.slt +++ b/test/sql/dql/knn/embedding/test_knn_hnsw_l2_filter.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_hnsw_l2_filter(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_hnsw_l2_filter FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic l2 will order ascendingly. The query will return row 4, 3, 2 +# metric l2 will be in ascending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_hnsw_l2_filter SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'l2', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_ip.slt b/test/sql/dql/knn/embedding/test_knn_ip.slt index 0dc7cd67f5..8882e477d4 100644 --- a/test/sql/dql/knn/embedding/test_knn_ip.slt +++ b/test/sql/dql/knn/embedding/test_knn_ip.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_ip(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_ip FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic ip will order descendingly. The query will return row 4, 3, 2 +# metric ip will be in descending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_ip SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'ip', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_ip_filter.slt b/test/sql/dql/knn/embedding/test_knn_ip_filter.slt index 7ffe65f87a..269dca13cf 100644 --- a/test/sql/dql/knn/embedding/test_knn_ip_filter.slt +++ b/test/sql/dql/knn/embedding/test_knn_ip_filter.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_ip_filter(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_ip_filter FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic ip will order descendingly. The query will return row 4, 3, 2 +# metric ip will be in descending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_ip_filter SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'ip', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_ivf_ip_filter.slt b/test/sql/dql/knn/embedding/test_knn_ivf_ip_filter.slt index a0835e802d..64dd6025a2 100644 --- a/test/sql/dql/knn/embedding/test_knn_ivf_ip_filter.slt +++ b/test/sql/dql/knn/embedding/test_knn_ivf_ip_filter.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_ivf_ip_filter(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_ivf_ip_filter FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic ip will order descendingly. The query will return row 4, 3, 2 +# metric ip will be in descending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_ivf_ip_filter SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'ip', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_ivf_l2.slt b/test/sql/dql/knn/embedding/test_knn_ivf_l2.slt index f44ddd303c..c8e733c722 100644 --- a/test/sql/dql/knn/embedding/test_knn_ivf_l2.slt +++ b/test/sql/dql/knn/embedding/test_knn_ivf_l2.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_ivf_l2(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_ivf_l2 FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic l2 will order ascendingly. The query will return row 4, 3, 2 +# metric l2 will be in ascending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_ivf_l2 SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'l2', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_ivf_l2_filter.slt b/test/sql/dql/knn/embedding/test_knn_ivf_l2_filter.slt index 01d51f1335..27e94494a7 100644 --- a/test/sql/dql/knn/embedding/test_knn_ivf_l2_filter.slt +++ b/test/sql/dql/knn/embedding/test_knn_ivf_l2_filter.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_ivf_l2_filter(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_ivf_l2_filter FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic l2 will order ascendingly. The query will return row 4, 3, 2 +# metric l2 will be in ascending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_ivf_l2_filter SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'l2', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_l2.slt b/test/sql/dql/knn/embedding/test_knn_l2.slt index 4317bd9439..75352540c5 100644 --- a/test/sql/dql/knn/embedding/test_knn_l2.slt +++ b/test/sql/dql/knn/embedding/test_knn_l2.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_l2(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_l2 FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic l2 will order ascendingly. The query will return row 4, 3, 2 +# metric l2 will be in ascending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_l2 SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'l2', 3); ---- diff --git a/test/sql/dql/knn/embedding/test_knn_l2_filter.slt b/test/sql/dql/knn/embedding/test_knn_l2_filter.slt index 3be5df7456..ecfa04eabd 100644 --- a/test/sql/dql/knn/embedding/test_knn_l2_filter.slt +++ b/test/sql/dql/knn/embedding/test_knn_l2_filter.slt @@ -13,7 +13,7 @@ CREATE TABLE test_knn_l2_filter(c1 INT, c2 EMBEDDING(FLOAT, 4)); statement ok COPY test_knn_l2_filter FROM '/var/infinity/test_data/embedding_float_dim4.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic l2 will order ascendingly. The query will return row 4, 3, 2 +# metric l2 will be in ascending order. The query will return row 4, 3, 2 query I SELECT c1 FROM test_knn_l2_filter SEARCH MATCH VECTOR (c2, [0.3, 0.3, 0.2, 0.2], 'float', 'l2', 3); ---- diff --git a/test/sql/dql/knn/sparse/test_knn_sparse.slt b/test/sql/dql/knn/sparse/test_knn_sparse.slt index 067f310c4c..a44ede94a6 100644 --- a/test/sql/dql/knn/sparse/test_knn_sparse.slt +++ b/test/sql/dql/knn/sparse/test_knn_sparse.slt @@ -14,7 +14,7 @@ CREATE TABLE test_knn_sparse(c1 INT, c2 SPARSE(FLOAT, 100)); statement ok COPY test_knn_sparse FROM '/var/infinity/test_data/sparse_knn.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic ip will order descendingly. The query will return row 4, 2, 1 +# metric ip will be in descending order. The query will return row 4, 2, 1 query I SELECT c1 FROM test_knn_sparse SEARCH MATCH SPARSE (c2, [0:1.0,20:2.0,80:3.0], 'ip', 3); ---- diff --git a/test/sql/dql/knn/sparse/test_knn_sparse_bit.slt b/test/sql/dql/knn/sparse/test_knn_sparse_bit.slt index 99418f94b7..d4bc79bbd9 100644 --- a/test/sql/dql/knn/sparse/test_knn_sparse_bit.slt +++ b/test/sql/dql/knn/sparse/test_knn_sparse_bit.slt @@ -14,7 +14,7 @@ CREATE TABLE test_knn_sparse_bit(c1 INT, c2 SPARSE(BIT, 100)); statement ok COPY test_knn_sparse_bit FROM '/var/infinity/test_data/sparse_knn_bit.csv' WITH (DELIMITER ',', FORMAT CSV); -# mertic ip will order descendingly. The query will return row 1, 2, 3 +# metric ip will be in descending order. The query will return row 1, 2, 3 query I SELECT c1 FROM test_knn_sparse_bit SEARCH MATCH SPARSE (c2, [20,30,40,60], 'ip', 3); ----
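These comment fixes track the metric semantics: ip and cosine are similarities, so matches come back best-first in descending score order, while l2 is a distance, so matches come back in ascending order (hence the l2 comments now read "ascending"). A small self-contained sketch with made-up vectors; the tests' real rows live in embedding_float_dim4.csv, which is not shown in this diff:

```python
import math

query = [0.3, 0.3, 0.2, 0.2]
rows = {1: [0.1, 0.0, 0.0, 0.1], 2: [0.2, 0.1, 0.1, 0.1],
        3: [0.3, 0.2, 0.2, 0.2], 4: [0.4, 0.4, 0.3, 0.2]}

def ip(a, b):  # inner product: higher score means a better match
    return sum(x * y for x, y in zip(a, b))

def l2(a, b):  # Euclidean distance: lower distance means a better match
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

top_ip = sorted(rows, key=lambda r: ip(query, rows[r]), reverse=True)  # descending
top_l2 = sorted(rows, key=lambda r: l2(query, rows[r]))                # ascending
print(top_ip[:3], top_l2[:3])  # [4, 3, 2] [3, 4, 2]
```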
diff --git a/thrift/infinity.thrift b/thrift/infinity.thrift index 714858da0e..b026dc00e5 100644 --- a/thrift/infinity.thrift +++ b/thrift/infinity.thrift @@ -590,17 +590,18 @@ struct ExplainResponse { struct SelectRequest { 1: i64 session_id, -2: string db_name, -3: string table_name, -4: list<ParsedExpr> select_list = [], -5: optional list<ParsedExpr> highlight_list = [], -6: optional SearchExpr search_expr, -7: optional ParsedExpr where_expr, -8: optional list<ParsedExpr> group_by_list = [], -9: optional ParsedExpr having_expr, -10: optional ParsedExpr limit_expr, -11: optional ParsedExpr offset_expr, -12: optional list<OrderByExpr> order_by_list = [], +2: string db_name, +3: string table_name, +4: list<ParsedExpr> select_list = [], +5: optional list<ParsedExpr> highlight_list = [], +6: optional SearchExpr search_expr, +7: optional ParsedExpr where_expr, +8: optional list<ParsedExpr> group_by_list = [], +9: optional ParsedExpr having_expr, +10: optional ParsedExpr limit_expr, +11: optional ParsedExpr offset_expr, +12: optional list<OrderByExpr> order_by_list = [], +13: optional bool total_hits_count, } struct SelectResponse { @@ -608,6 +609,7 @@ 2: string error_msg, 3: list<ColumnDef> column_defs = [], 4: list<ColumnField> column_fields = []; +5: string extra_result; } struct DeleteRequest { @@ -745,6 +747,12 @@ struct FlushRequest { 2: string flush_type, } +struct CompactRequest { +1: i64 session_id +2: string db_name, +3: string table_name, +} + // Service service InfinityService { CommonResponse Connect(1:ConnectRequest request), @@ -798,4 +806,6 @@ CommonResponse Command(1: CommandRequest request), CommonResponse Flush(1: FlushRequest request), +CommonResponse Compact(1: CompactRequest request), + } diff --git a/tools/generate_embedding_parquet.py b/tools/generate_embedding_parquet.py index 647189933d..6eb97069a2 100644 --- a/tools/generate_embedding_parquet.py +++ b/tools/generate_embedding_parquet.py @@ -11,6 +11,7 @@ def generate(generate_if_exists: bool, copy_dir: str): table_name = "parquet_embedding_table" table_name1 = "parquet_embedding_table1" + table_name_err = "parquet_embedding_table_err" parquet_filename = "gen_embedding.parquet" parquet_filename1 = "gen_embedding1.parquet" parquet_path = parquet_dir + "/" + parquet_filename @@ -54,9 +55,7 @@ def generate(generate_if_exists: bool, copy_dir: str): slt_file.write("statement ok\n") slt_file.write( - "CREATE TABLE {} (c1 INT, c2 EMBEDDING(INT, {}));\n".format( - table_name, dim - ) + "CREATE TABLE {} (c1 INT, c2 EMBEDDING(INT, {}));\n".format(table_name, dim) ) slt_file.write("\n") @@ -91,7 +90,9 @@ def generate(generate_if_exists: bool, copy_dir: str): slt_file.write("statement ok\n") slt_file.write( - "CREATE TABLE {} (c1 INT, c2 EMBEDDING(INT, {}));\n".format(table_name1, dim) + "CREATE TABLE {} (c1 INT, c2 EMBEDDING(INT, {}));\n".format( + table_name1, dim + ) ) slt_file.write("\n") @@ -122,6 +123,31 @@ def generate(generate_if_exists: bool, copy_dir: str): slt_file.write("DROP TABLE {};\n".format(table_name)) slt_file.write("\n") + # import with incompatible schema + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE IF EXISTS {};\n".format(table_name_err)) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write( + "CREATE TABLE {} (c1 INT, c2 EMBEDDING(INT, {}));\n".format( + table_name_err, dim + 1 + ) + ) + slt_file.write("\n") + + slt_file.write("statement error\n") + slt_file.write( + "COPY {} FROM '{}' WITH (FORMAT PARQUET);\n".format( + table_name_err, copy_path + ) + ) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE {};\n".format(table_name_err)) + slt_file.write("\n") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate parquet data for test") diff --git a/tools/generate_multivector_parquet.py b/tools/generate_multivector_parquet.py index ad99ce966e..92ba9b082f 100644 --- a/tools/generate_multivector_parquet.py +++ b/tools/generate_multivector_parquet.py @@ -11,6 +11,7 @@ def generate(generate_if_exists: bool, copy_dir: str): table_name = "parquet_multivector_table" table_name1 = "parquet_multivector_table1" + table_name_err = "parquet_multivector_table_err" parquet_filename = "gen_multivector.parquet" parquet_filename1 = "gen_multivector1.parquet" parquet_path = parquet_dir + "/" + parquet_filename @@ -56,6 +57,7 @@ def generate(generate_if_exists: bool, copy_dir: str): # t = pq.read_table(parquet_path) # print(t) with open(import_slt_path, "w") as slt_file: + def write_query(): for row_id in range(row_n): slt_file.write("{} [".format(row_id)) @@ -77,7 +79,9 @@ def write_query(): slt_file.write("statement ok\n") slt_file.write( - "CREATE TABLE {} (c1 INT, c2 MULTIVECTOR(INT, {}));\n".format(table_name, dim) + "CREATE TABLE {} (c1 INT, c2 MULTIVECTOR(INT, {}));\n".format( + table_name, dim + ) ) slt_file.write("\n") @@ -104,7 +108,9 @@ def write_query(): slt_file.write("statement ok\n") slt_file.write( - "CREATE TABLE {} (c1 INT, c2 MULTIVECTOR(INT, {}));\n".format(table_name1, dim) + "CREATE TABLE {} (c1 INT, c2 MULTIVECTOR(INT, {}));\n".format( + table_name1, dim + ) ) slt_file.write("\n") @@ -127,6 +133,31 @@ def write_query(): slt_file.write("DROP TABLE {};\n".format(table_name)) slt_file.write("\n") + # import with incompatible schema + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE IF EXISTS {};\n".format(table_name_err)) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write( + "CREATE TABLE {} (c1 INT, c2 MULTIVECTOR(INT, {}));\n".format( + table_name_err, dim + 1 + ) + ) + slt_file.write("\n") + + slt_file.write("statement error\n") + slt_file.write( + "COPY {} FROM '{}' WITH (FORMAT PARQUET);\n".format( + table_name_err, copy_path + ) + ) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE {};\n".format(table_name_err)) + slt_file.write("\n") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate parquet data for test") diff --git a/tools/generate_sparse_parquet.py b/tools/generate_sparse_parquet.py index 43702cc8b5..70ac9d4115 100644 --- a/tools/generate_sparse_parquet.py +++ b/tools/generate_sparse_parquet.py @@ -13,6 +13,7 @@ def generate(generate_if_exists: bool, copy_dir: str): table_name = "parquet_sparse_table" table_name1 = "parquet_sparse_table1" + table_name_err = "parquet_sparse_table_err" parquet_filename = "gen_sparse.parquet" parquet_filename1 = "gen_sparse1.parquet" parquet_path = parquet_dir + "/" + parquet_filename @@ -116,6 +117,31 @@ def write_query(): slt_file.write("DROP TABLE {};\n".format(table_name)) slt_file.write("\n") + # import with incompatible schema + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE IF EXISTS {};\n".format(table_name_err)) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write( + "CREATE TABLE {} (c1 INT, c2 SPARSE(DOUBLE, {}) WITH (SORTED));\n".format( + table_name_err, max_dim + ) + ) + slt_file.write("\n") + + slt_file.write("statement error\n") + slt_file.write( + "COPY {} FROM '{}' WITH (FORMAT PARQUET);\n".format( + table_name_err, copy_path + ) + ) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE {};\n".format(table_name_err)) + slt_file.write("\n") + for row_id in range(row_n): start, end = indptr[row_id], indptr[row_id + 1] col2_vec.append( diff --git a/tools/generate_tensor_array_parquet.py b/tools/generate_tensor_array_parquet.py index 4004fc35c4..928f82a495 100644 --- a/tools/generate_tensor_array_parquet.py +++ b/tools/generate_tensor_array_parquet.py @@ -11,6 +11,7 @@ def generate(generate_if_exists: bool, copy_dir: str): table_name = "parquet_tensor_array_table" table_name1 = "parquet_tensor_array_table1" + table_name_err = "parquet_tensor_array_table_err" parquet_filename = "gen_tensor_array.parquet" parquet_filename1 = "gen_tensor_array1.parquet" parquet_path = parquet_dir + "/" + parquet_filename @@ -145,6 +146,31 @@ def write_query(): slt_file.write("DROP TABLE {};\n".format(table_name)) slt_file.write("\n") + # import with incompatible schema + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE IF EXISTS {};\n".format(table_name_err)) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write( + "CREATE TABLE {} (c1 INT, c2 TENSORARRAY(INT, {}));\n".format( + table_name_err, dim + 1 + ) + ) + slt_file.write("\n") + + slt_file.write("statement error\n") + slt_file.write( + "COPY {} FROM '{}' WITH (FORMAT PARQUET);\n".format( + table_name_err, copy_path + ) + ) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE {};\n".format(table_name_err)) + slt_file.write("\n") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate parquet data for test") diff --git a/tools/generate_tensor_parquet.py b/tools/generate_tensor_parquet.py index b60eb28a21..fa7d3745d8 100644 --- a/tools/generate_tensor_parquet.py +++ b/tools/generate_tensor_parquet.py @@ -11,6 +11,7 @@ def generate(generate_if_exists: bool, copy_dir: str): table_name = "parquet_tensor_table" table_name1 = "parquet_tensor_table1" + table_name_err = "parquet_tensor_table_err" parquet_filename = "gen_tensor.parquet" parquet_filename1 = "gen_tensor1.parquet" parquet_path = parquet_dir + "/" + parquet_filename @@ -56,6 +57,7 @@ def generate(generate_if_exists: bool, copy_dir: str): # t = pq.read_table(parquet_path) # print(t) with open(import_slt_path, "w") as slt_file: + def write_query(): for row_id in range(row_n): slt_file.write("{} [".format(row_id)) @@ -127,6 +129,31 @@ def write_query(): slt_file.write("DROP TABLE {};\n".format(table_name)) slt_file.write("\n") + # import with incompatible schema + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE IF EXISTS {};\n".format(table_name_err)) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write( + "CREATE TABLE {} (c1 INT, c2 TENSOR(INT, {}));\n".format( + table_name_err, dim + 1 + ) + ) + slt_file.write("\n") + + slt_file.write("statement error\n") + slt_file.write( + "COPY {} FROM '{}' WITH (FORMAT PARQUET);\n".format( + table_name_err, copy_path + ) + ) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE {};\n".format(table_name_err)) + slt_file.write("\n") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate parquet data for test") diff --git a/tools/generate_test_parquet.py b/tools/generate_test_parquet.py index d4bceac5e5..725bd0edb1 100644 --- a/tools/generate_test_parquet.py +++ b/tools/generate_test_parquet.py @@ -11,6 +11,7 @@ def generate(generate_if_exist: bool, copy_dir: str): table_name = "parquet_test_table" table_name1 = "parquet_test_table1" + table_name_err = "parquet_test_table_err" parquet_filename = "gen_test.parquet" parquet_filename1 = "gen_test1.parquet" parquet_path = parquet_dir + "/" + parquet_filename @@ -130,6 +131,29 @@ def generate(generate_if_exist: bool, copy_dir: str): slt_file.write("DROP TABLE {};\n".format(table_name)) slt_file.write("\n") + # import with incompatible schema + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE IF EXISTS {};\n".format(table_name_err)) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write( + f"CREATE TABLE {table_name_err} (col1 Boolean, col2 TINYINT, col3 BIGINT);\n" + ) + slt_file.write("\n") + + slt_file.write("statement error\n") + slt_file.write( + "COPY {} FROM '{}' WITH (FORMAT PARQUET);\n".format( + table_name_err, copy_path + ) + ) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE {};\n".format(table_name_err)) + slt_file.write("\n") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate parquet data for test") diff --git a/tools/generate_varchar_parquet.py b/tools/generate_varchar_parquet.py index d9ae8ce19b..29dd07483c 100644 --- a/tools/generate_varchar_parquet.py +++ b/tools/generate_varchar_parquet.py @@ -11,6 +11,7 @@ def generate(generate_if_exists: bool, copy_dir: str): table_name = "parquet_varchar_table" table_name1 = "parquet_varchar_table1" + table_name_err = "parquet_varchar_table_err" parquet_filename = "gen_varchar.parquet" parquet_filename1 = "gen_varchar1.parquet" parquet_path = parquet_dir + "/" + parquet_filename @@ -100,6 +101,27 @@ def generate(generate_if_exists: bool, copy_dir: str): slt_file.write("DROP TABLE {};\n".format(table_name)) slt_file.write("\n") + # import with incompatible schema + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE IF EXISTS {};\n".format(table_name_err)) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write("CREATE TABLE {} (c1 INT, c2 INT);\n".format(table_name_err)) + slt_file.write("\n") + + slt_file.write("statement error\n") + slt_file.write( + "COPY {} FROM '{}' WITH (FORMAT PARQUET);\n".format( + table_name_err, copy_path + ) + ) + slt_file.write("\n") + + slt_file.write("statement ok\n") + slt_file.write("DROP TABLE {};\n".format(table_name_err)) + slt_file.write("\n") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate parquet data for test") diff --git a/tools/run_cluster_test.py b/tools/run_cluster_test.py index 9d89a3f77e..64f3cf9dbe 100644 --- a/tools/run_cluster_test.py +++ b/tools/run_cluster_test.py @@ -33,12 +33,24 @@ default=False, help="Use sudo to run command", ) + parser.add_argument( + "--minio_port", + type=int, + default=9000, + ) + parser.add_argument( + "--minio_console_port", + type=int, + default=9001, + ) args = parser.parse_args() infinity_path = args.infinity_path docker = args.docker infinity_dir = args.infinity_dir use_sudo = args.use_sudo + minio_port = args.minio_port + minio_console_port = args.minio_console_port current_path = os.getcwd() python_test_dir = current_path + "/python" @@ -61,6 +73,8 @@ "-m", "not slow", f"--infinity_dir={infinity_dir}", + f"--minio_port={minio_port}", + f"--minio_console_port={minio_console_port}", ] if use_sudo: cmd.append("--use_sudo")
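The two new run_cluster_test.py flags default to MinIO's stock ports, so existing invocations keep their behavior and callers relocate MinIO only by passing both flags explicitly. A minimal standalone sketch of that argparse behavior (not repo code):

```python
import argparse

# Mirrors the two options added above; the defaults preserve MinIO's stock ports.
parser = argparse.ArgumentParser()
parser.add_argument("--minio_port", type=int, default=9000)
parser.add_argument("--minio_console_port", type=int, default=9001)

args = parser.parse_args(["--minio_port", "9005", "--minio_console_port", "9006"])
assert (args.minio_port, args.minio_console_port) == (9005, 9006)

args = parser.parse_args([])  # omitted flags fall back to 9000/9001
assert (args.minio_port, args.minio_console_port) == (9000, 9001)
```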
diff --git a/tools/run_http_api.py b/tools/run_http_api.py index 1812d2a90d..ba75651b20 100644 --- a/tools/run_http_api.py +++ b/tools/run_http_api.py @@ -44,7 +44,7 @@ def python_sdk_test(python_test_dir: str, pytest_mark: str): print("Note: this script must be run under root directory of the project.") current_path = os.getcwd() python_test_dir = current_path + "/python" - parser = argparse.ArgumentParser(description="Http Api Test For Infinity") + parser = argparse.ArgumentParser(description="HTTP API Test For Infinity") parser.add_argument( "-m", "--pytest_mark", @@ -54,7 +54,7 @@ def python_sdk_test(python_test_dir: str, pytest_mark: str): ) args = parser.parse_args() - print("Start Http Api testing...") + print("Start HTTP API testing...") start = time.time() try: python_sdk_test(python_test_dir, args.pytest_mark) diff --git a/tools/run_restart_test.py b/tools/run_restart_test.py index 0140c94ff5..a7584f0aa6 100644 --- a/tools/run_restart_test.py +++ b/tools/run_restart_test.py @@ -18,6 +18,11 @@ type=bool, default=False, ) + parser.add_argument( + "--test_case", + type=str, + required=False, + ) args = parser.parse_args() infinity_path = args.infinity_path @@ -26,13 +31,20 @@ current_path = os.getcwd() python_test_dir = current_path + "/python" + test_case = None + if args.test_case: + test_case = f"{python_test_dir}/restart_test/{args.test_case}" + else: + test_case = f"{python_test_dir}/restart_test" + if not slow: process = subprocess.Popen( [ python_executable, "-m", "pytest", - f"{python_test_dir}/restart_test", + "-v", + test_case, f"--infinity_path={infinity_path}", # "-x", "-s", @@ -48,7 +60,8 @@ python_executable, "-m", "pytest", - f"{python_test_dir}/restart_test", + "-v", + test_case, f"--infinity_path={infinity_path}", "-x", "-s",
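The new --test_case flag narrows the restart suite to a single file; as the diff shows, the script resolves the name relative to python/restart_test/. A hedged usage sketch, run from the repo root (the file name test_compact.py is illustrative; any test file in that directory works):

```python
import subprocess
import sys

# Run one restart test file instead of the whole python/restart_test directory.
# run_restart_test.py prefixes the value with python/restart_test/ itself.
subprocess.run(
    [sys.executable, "tools/run_restart_test.py",
     "--infinity_path=cmake-build-debug/src/infinity",
     "--test_case=test_compact.py"],
    check=True,
)
```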