Skip to content

Commit

Permalink
Finish diskann search module and unit test (infiniflow#2128)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Finish diskann search module and related unit tests

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Test cases

---------

Co-authored-by: Yingfeng <[email protected]>
Co-authored-by: Jin Hai <[email protected]>
  • Loading branch information
3 people authored Oct 31, 2024
1 parent 35f6cfd commit 2291af7
Show file tree
Hide file tree
Showing 8 changed files with 1,251 additions and 181 deletions.
97 changes: 48 additions & 49 deletions src/common/default_values.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -95,42 +95,41 @@ export {
constexpr SizeT MIN_CLEANUP_INTERVAL_SEC = 0; // 0 means disable the function
constexpr SizeT DEFAULT_CLEANUP_INTERVAL_SEC = 10;
constexpr std::string_view DEFAULT_CLEANUP_INTERVAL_SEC_STR = "10s"; // 10 seconds
constexpr SizeT MAX_CLEANUP_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month
constexpr SizeT MAX_CLEANUP_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month

constexpr SizeT MIN_COMPACT_INTERVAL_SEC = 0; // 0 means disable the function
constexpr SizeT DEFAULT_COMPACT_INTERVAL_SEC = 10;
constexpr std::string_view DEFAULT_COMPACT_INTERVAL_SEC_STR = "10s"; // 10 seconds
constexpr SizeT MAX_COMPACT_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month
constexpr SizeT MAX_COMPACT_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month

constexpr SizeT MIN_OPTIMIZE_INTERVAL_SEC = 1;
constexpr SizeT DEFAULT_OPTIMIZE_INTERVAL_SEC = 10;
constexpr std::string_view DEFAULT_OPTIMIZE_INTERVAL_SEC_STR = "10s"; // 10 seconds
constexpr SizeT MAX_OPTIMIZE_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month
constexpr SizeT MAX_OPTIMIZE_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month

constexpr SizeT MIN_MEMINDEX_CAPACITY = DEFAULT_BLOCK_CAPACITY; // 1 Block
constexpr SizeT DEFAULT_MEMINDEX_CAPACITY = 128 * DEFAULT_BLOCK_CAPACITY; // 128 * 8192 = 1M rows
constexpr SizeT MAX_MEMINDEX_CAPACITY = DEFAULT_SEGMENT_CAPACITY; // 1 Segment

constexpr i64 MIN_WAL_FILE_SIZE_THRESHOLD = 1024; // 1KB
constexpr i64 DEFAULT_WAL_FILE_SIZE_THRESHOLD = 1 * 1024l * 1024l * 1024l; // 1GB
constexpr std::string_view DEFAULT_WAL_FILE_SIZE_THRESHOLD_STR = "1GB"; // 1GB
constexpr std::string_view DEFAULT_WAL_FILE_SIZE_THRESHOLD_STR = "1GB"; // 1GB
constexpr i64 MAX_WAL_FILE_SIZE_THRESHOLD = 1024l * DEFAULT_WAL_FILE_SIZE_THRESHOLD; // 1TB

constexpr i64 MIN_FULL_CHECKPOINT_INTERVAL_SEC = 0; // 0 means disable full checkpoint
constexpr i64 DEFAULT_FULL_CHECKPOINT_INTERVAL_SEC = 30; // 30 seconds
constexpr i64 MIN_FULL_CHECKPOINT_INTERVAL_SEC = 0; // 0 means disable full checkpoint
constexpr i64 DEFAULT_FULL_CHECKPOINT_INTERVAL_SEC = 30; // 30 seconds
constexpr std::string_view DEFAULT_FULL_CHECKPOINT_INTERVAL_SEC_STR = "30s"; // 30 seconds
constexpr i64 MAX_FULL_CHECKPOINT_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month
constexpr i64 MAX_FULL_CHECKPOINT_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month

constexpr i64 MIN_DELTA_CHECKPOINT_INTERVAL_SEC = 0; // 0 means disable delta checkpoint
constexpr i64 DEFAULT_DELTA_CHECKPOINT_INTERVAL_SEC = 5; // 5 seconds
constexpr i64 MIN_DELTA_CHECKPOINT_INTERVAL_SEC = 0; // 0 means disable delta checkpoint
constexpr i64 DEFAULT_DELTA_CHECKPOINT_INTERVAL_SEC = 5; // 5 seconds
constexpr std::string_view DEFAULT_DELTA_CHECKPOINT_INTERVAL_SEC_STR = "5s"; // 5 seconds
constexpr i64 MAX_DELTA_CHECKPOINT_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month
constexpr i64 MAX_DELTA_CHECKPOINT_INTERVAL_SEC = 60 * 60 * 24 * 30; // 1 month

constexpr i64 MIN_CHECKPOINT_INTERVAL_WAL_BYTES = 1024; // 1K
constexpr i64 DELTA_CHECKPOINT_INTERVAL_WAL_BYTES = 64 * 1024l * 1024l; // 64 MB
constexpr i64 MIN_CHECKPOINT_INTERVAL_WAL_BYTES = 1024; // 1K
constexpr i64 DELTA_CHECKPOINT_INTERVAL_WAL_BYTES = 64 * 1024l * 1024l; // 64 MB
constexpr std::string_view DELTA_CHECKPOINT_INTERVAL_WAL_BYTES_STR = "64MB"; // 64 MB
constexpr i64 MAX_CHECKPOINT_INTERVAL_WAL_BYTES = 1024l * 1024l * 1024l; // 1GB

constexpr i64 MAX_CHECKPOINT_INTERVAL_WAL_BYTES = 1024l * 1024l * 1024l; // 1GB

constexpr std::string_view WAL_FILE_TEMP_FILE = "wal.log";
constexpr std::string_view WAL_FILE_PREFIX = "wal.log";
Expand All @@ -154,16 +153,16 @@ export {
constexpr SizeT DISKANN_NUM_PQ_CHUNKS = 4;
constexpr SizeT DISKANN_NUM_PARTS = 1;
constexpr SizeT DISKANN_MAX_PQ_CHUNKS = 100;
constexpr SizeT DISKANN_TRAINING_SET_SIZE = 100000; // sample rate = data_size/TRAINING_SET_SIZE
constexpr SizeT DISKANN_TRAINING_SET_SIZE = 100000; // sample rate = data_size/TRAINING_SET_SIZE
constexpr f64 DISKANN_SPACE_FOR_CACHED_NODES_IN_GB = 0.25; // cache vector memory in GB
constexpr f64 DISKANN_THRESHOLD_FOR_CACHING_IN_GB = 1.0; //
constexpr u32 DISKANN_NUM_NODES_TO_CACHE = 250000; // cache node num
constexpr u32 DISKANN_WARMUP_L = 20;
constexpr u32 DISKANN_NUM_KMEANS_REPS = 12; // max iterations of lloyds kmeans
constexpr u32 DISKANN_NUM_CENTERS = 256; // number of centers for pq chunk
constexpr f32 DISKANN_GRAPH_SLACK_FACTOR = 1.3f; // In-mem index reserve factor
constexpr SizeT DISKANN_MAX_GRAPH_DEGREE = 512; // SSD index max degree
constexpr SizeT DISKANN_SECTOR_LEN = 4096u; // SSD index sector size
constexpr f64 DISKANN_THRESHOLD_FOR_CACHING_IN_GB = 1.0; //
constexpr u32 DISKANN_NUM_NODES_TO_CACHE = 250000; // cache node num
constexpr u32 DISKANN_WARMUP_L = 20;
constexpr u32 DISKANN_NUM_KMEANS_REPS = 12; // max iterations of lloyds kmeans
constexpr u32 DISKANN_NUM_CENTERS = 256; // number of centers for pq chunk
constexpr f32 DISKANN_GRAPH_SLACK_FACTOR = 1.3f; // In-mem index reserve factor
constexpr SizeT DISKANN_MAX_GRAPH_DEGREE = 512; // SSD index max degree
constexpr SizeT DISKANN_SECTOR_LEN = 4096u; // SSD index sector size
constexpr SizeT DISKANN_MAX_N_SECTOR_READS = 128; // SSD index max sector reads

// default hnsw parameter
Expand Down Expand Up @@ -191,9 +190,9 @@ export {
constexpr std::string_view DEFAULT_BUFFER_MANAGER_SIZE_STR = "8GB"; // 8GB

constexpr SizeT DEFAULT_MEMINDEX_MEMORY_QUOTA = 4 * 1024lu * 1024lu * 1024lu; // 4GB
constexpr std::string_view DEFAULT_MEMINDEX_MEMORY_QUOTA_STR = "4GB"; // 4GB
constexpr std::string_view DEFAULT_MEMINDEX_MEMORY_QUOTA_STR = "4GB"; // 4GB

constexpr SizeT DEFAULT_LOG_FILE_SIZE = 64 * 1024lu * 1024lu; // 64MB
constexpr SizeT DEFAULT_LOG_FILE_SIZE = 64 * 1024lu * 1024lu; // 64MB
constexpr std::string_view DEFAULT_LOG_FILE_SIZE_STR = "64MB"; // 64MB

constexpr SizeT INSERT_BATCH_ROW_LIMIT = 8192;
Expand Down Expand Up @@ -280,30 +279,30 @@ export {
constexpr std::string_view RECORD_RUNNING_QUERY_OPTION_NAME = "record_running_query";

// Variable name
constexpr std::string_view QUERY_COUNT_VAR_NAME = "query_count"; // global and session
constexpr std::string_view SESSION_COUNT_VAR_NAME = "session_count"; // global
constexpr std::string_view BUFFER_USAGE_VAR_NAME = "buffer_usage"; // global
constexpr std::string_view SCHEDULE_POLICY_VAR_NAME = "schedule_policy"; // global
constexpr std::string_view DELTA_LOG_COUNT_VAR_NAME = "delta_log_count"; // global
constexpr std::string_view NEXT_TXN_ID_VAR_NAME = "next_transaction_id"; // global
constexpr std::string_view BUFFER_OBJECT_COUNT_VAR_NAME = "buffer_object_count"; // global
constexpr std::string_view UNUSED_BUFFER_OBJECT_COUNT_VAR_NAME = "unused_buffer_object"; // global
constexpr std::string_view ACTIVE_TXN_COUNT_VAR_NAME = "active_txn_count"; // global
constexpr std::string_view CURRENT_TS_VAR_NAME = "current_timestamp"; // global
constexpr std::string_view TOTAL_COMMIT_COUNT_VAR_NAME = "total_commit_count"; // global and session
constexpr std::string_view TOTAL_ROLLBACK_COUNT_VAR_NAME = "total_rollback_count"; // global and session
constexpr std::string_view CONNECTED_TS_VAR_NAME = "connected_timestamp"; // session
constexpr std::string_view CATALOG_VERSION_VAR_NAME = "catalog_version"; // global
constexpr std::string_view ACTIVE_WAL_FILENAME_VAR_NAME = "active_wal_filename"; // global
constexpr std::string_view ENABLE_PROFILE_VAR_NAME = "enable_profile"; // session
constexpr std::string_view PROFILE_RECORD_CAPACITY_VAR_NAME = "profile_record_capacity"; // session
constexpr std::string_view BG_TASK_COUNT_VAR_NAME = "bg_task_count"; // global
constexpr std::string_view RUNNING_BG_TASK_VAR_NAME = "running_bg_task"; // global
constexpr std::string_view RUNNING_COMPACT_TASK_VAR_NAME = "running_compact_task"; // global
constexpr std::string_view SYSTEM_MEMORY_USAGE_VAR_NAME = "system_memory_usage"; // global
constexpr std::string_view OPEN_FILE_COUNT_VAR_NAME = "open_file_count"; // global
constexpr std::string_view CPU_USAGE_VAR_NAME = "cpu_usage"; // global
constexpr std::string_view FOLLOWER_NUMBER = "follower_number"; // global
constexpr std::string_view QUERY_COUNT_VAR_NAME = "query_count"; // global and session
constexpr std::string_view SESSION_COUNT_VAR_NAME = "session_count"; // global
constexpr std::string_view BUFFER_USAGE_VAR_NAME = "buffer_usage"; // global
constexpr std::string_view SCHEDULE_POLICY_VAR_NAME = "schedule_policy"; // global
constexpr std::string_view DELTA_LOG_COUNT_VAR_NAME = "delta_log_count"; // global
constexpr std::string_view NEXT_TXN_ID_VAR_NAME = "next_transaction_id"; // global
constexpr std::string_view BUFFER_OBJECT_COUNT_VAR_NAME = "buffer_object_count"; // global
constexpr std::string_view UNUSED_BUFFER_OBJECT_COUNT_VAR_NAME = "unused_buffer_object"; // global
constexpr std::string_view ACTIVE_TXN_COUNT_VAR_NAME = "active_txn_count"; // global
constexpr std::string_view CURRENT_TS_VAR_NAME = "current_timestamp"; // global
constexpr std::string_view TOTAL_COMMIT_COUNT_VAR_NAME = "total_commit_count"; // global and session
constexpr std::string_view TOTAL_ROLLBACK_COUNT_VAR_NAME = "total_rollback_count"; // global and session
constexpr std::string_view CONNECTED_TS_VAR_NAME = "connected_timestamp"; // session
constexpr std::string_view CATALOG_VERSION_VAR_NAME = "catalog_version"; // global
constexpr std::string_view ACTIVE_WAL_FILENAME_VAR_NAME = "active_wal_filename"; // global
constexpr std::string_view ENABLE_PROFILE_VAR_NAME = "enable_profile"; // session
constexpr std::string_view PROFILE_RECORD_CAPACITY_VAR_NAME = "profile_record_capacity"; // session
constexpr std::string_view BG_TASK_COUNT_VAR_NAME = "bg_task_count"; // global
constexpr std::string_view RUNNING_BG_TASK_VAR_NAME = "running_bg_task"; // global
constexpr std::string_view RUNNING_COMPACT_TASK_VAR_NAME = "running_compact_task"; // global
constexpr std::string_view SYSTEM_MEMORY_USAGE_VAR_NAME = "system_memory_usage"; // global
constexpr std::string_view OPEN_FILE_COUNT_VAR_NAME = "open_file_count"; // global
constexpr std::string_view CPU_USAGE_VAR_NAME = "cpu_usage"; // global
constexpr std::string_view FOLLOWER_NUMBER = "follower_number"; // global

// IO related
constexpr SizeT DEFAULT_READ_BUFFER_SIZE = 4096;
Expand Down
8 changes: 4 additions & 4 deletions src/storage/knn_index/knn_diskann/diskann_index_data.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,12 @@ private:
SizeT medoid, vamana_frozen_num, vamana_frozen_loc = 0;
mem_index_file_handle.Read(&actual_file_size, sizeof(SizeT));
mem_index_file_handle.Read(&width, sizeof(u32));
mem_index_file_handle.Read(&medoid, sizeof(SizeT));
mem_index_file_handle.Read(&medoid, sizeof(SizeT)); // medoid node id, i.e. the entry point
mem_index_file_handle.Read(&vamana_frozen_num, sizeof(SizeT));

if (vamana_frozen_num == 1)
vamana_frozen_loc = medoid;
// node structure: [neighbor_size(u32), neighbor_id(SizeT)*]
// node structure: [vector(VectorDataType)*, neighbor_size(u32), neighbor_id(SizeT)*]
u64 max_node_len = (width) * sizeof(SizeT) + sizeof(u32) + ndims * sizeof(VectorDataType); // graph node + vector
u32 nnodes_per_sector = DISKANN_SECTOR_LEN / max_node_len; // 0 if max_node_len > SECTOR_LEN(multi-sector for a node)

Expand All @@ -125,7 +125,7 @@ private:
output_file_meta[5] = n_sectors;
output_file_meta[6] = vamana_frozen_num;
output_file_meta[7] = vamana_frozen_loc;
u64 append_reorder_data = 0; // not used for now
u64 append_reorder_data = 0; // whether to append reorder data; not used for now
output_file_meta[8] = append_reorder_data;
output_file_meta[9] = disk_index_file_size;
index_file_handle.Append(output_file_meta.data(), DISKANN_SECTOR_LEN);
Expand Down Expand Up @@ -278,7 +278,7 @@ public:
LOG_TRACE(fmt::format("Sample training size :{}", train_size));
}

// step 2. generate pq pivots and pq data
// step 2. generate pq pivots from training data and compress pq data
{
data_file_handle->Seek(0);
auto [pqCompressed_data_file_handle, pq_data_file_status] = VirtualStore::Open(pqCompressed_data_file_path_, FileAccessMode::kWrite);
Expand Down
Loading

0 comments on commit 2291af7

Please sign in to comment.