Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release v0.8.0 #523

Merged
merged 52 commits into from
Dec 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
361893c
#438: fix reordering warning
jlmelville Mar 5, 2023
b3fb472
Merge pull request #443 from jlmelville/438-gcc-reorder
yurymalkov Mar 5, 2023
dccd4f9
Add CMake install targets (#446)
moritz-h May 7, 2023
a9de060
Add HierarchicalNSW::indexFileSize() function for precise memory foot…
drons Dec 5, 2022
6aac477
Add multithread search to BF index (#425)
dyashuni May 12, 2023
cd844b5
Merge pull request #427 from drons/indexFileSize
yurymalkov May 13, 2023
1925428
add virtual destructor to BaseFilterFunctor (#460)
yoshoku May 13, 2023
2c538db
Add static again
alxvth May 17, 2023
a4c8b0b
Fix mac setup (#461)
dyashuni May 21, 2023
643e9dc
Merge pull request #463 from alxvth/fix_global_linkage_again
yurymalkov Jun 13, 2023
47c0a32
[warnings] Fix build warnings.
ttsugriy Jun 19, 2023
3ca227a
Replace priority_queue::push with emplace.
ttsugriy Jun 19, 2023
013def5
[bruteforce] Fix bruteforce removePoint.
ttsugriy Jun 19, 2023
b977d25
Merge pull request #472 from ttsugriy/emplace
yurymalkov Jun 19, 2023
70ce84e
Merge branch 'nmslib:develop' into develop
ttsugriy Jun 19, 2023
1ee95db
Use unique_ptr to manage visited_list_pool_.
ttsugriy Jun 19, 2023
802a0ec
Merge pull request #471 from ttsugriy/warnings
yurymalkov Jun 20, 2023
74f14d2
InnerProductSIMD16ExtAVX512 functions are implemented using the more …
aurora327 Jun 20, 2023
9291020
InnerProductSIMD16ExtAVX512 Efficient AVX512 instruction implementat…
aurora327 Jun 20, 2023
e7c66c8
Merge pull request #473 from ttsugriy/develop
yurymalkov Jul 4, 2023
0df757e
Merge pull request #474 from ttsugriy/uniq-ptr
yurymalkov Jul 4, 2023
8911d9e
Fix memory leak on loadIndex with non-empty HierarchicalNSW object
drons Jul 2, 2023
f30b6e1
Merge pull request #475 from aurora327/efficient_avx512_instruction
yurymalkov Jul 10, 2023
006d7b2
Merge pull request #477 from drons/fixLoadIndexLeak
yurymalkov Jul 10, 2023
6a3a0f4
hnswalg.h: cap M to 100000
emollier Jul 18, 2023
5aba6c6
hnswalg.h: reduce M cap further to 10000
emollier Jul 19, 2023
f6d170c
Merge pull request #484 from emollier/cve-2023-37365
yurymalkov Jul 20, 2023
08569de
Bring back HNSWLIB_NO_NATIVE
dyashuni Aug 11, 2023
e023f7e
Fix
dyashuni Aug 11, 2023
4f7b192
get_items return numpy array
dyashuni Aug 12, 2023
f22a5b1
Fix tests
dyashuni Aug 12, 2023
eecd540
Rename flag, use class name to access static variables
dyashuni Aug 13, 2023
db19931
Add type check for return results of get_items
dyashuni Aug 13, 2023
39b210d
Merge pull request #494 from dyashuni/get_items_numpy
yurymalkov Aug 14, 2023
fd027d4
Merge pull request #493 from dyashuni/no_native
yurymalkov Aug 19, 2023
f27913b
python_bindings/bindings.cpp: fix typo.
emollier Aug 21, 2023
ca85f0d
Merge pull request #500 from emollier/typo
yurymalkov Aug 23, 2023
9a33d05
Linking error fix
jrade Aug 24, 2023
5b3d81b
define a macro for error stream
jlmelville Sep 17, 2023
f7ec147
use HNSWERR instead of std::cerr directly
jlmelville Sep 17, 2023
92e053a
Add a comment
jlmelville Sep 17, 2023
431efa8
Avoid sign mismatch in loop
jlmelville Sep 17, 2023
3de1d69
Resolve initialisation order warning
stephematician Sep 27, 2023
a9e62cb
Add link to PR in comment
jlmelville Sep 29, 2023
ae5ba1b
Merge pull request #501 from jrade/master
yurymalkov Oct 1, 2023
c4418ea
Merge pull request #508 from jlmelville/develop
yurymalkov Oct 1, 2023
898bf5d
Merge pull request #509 from jlmelville/patch-1
yurymalkov Oct 1, 2023
d44bd5d
Merge pull request #511 from stephematician/master
yurymalkov Oct 1, 2023
2142dc6
Stop condition (#490)
dyashuni Nov 6, 2023
ba284f5
update repo testing code
yurymalkov Nov 20, 2023
b9e0597
Bump library version
yurymalkov Dec 3, 2023
5a8fd34
Update README.md
yurymalkov Dec 3, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ${{matrix.os}}
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ["3.7", "3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout@v3
Expand All @@ -28,7 +28,7 @@ jobs:
runs-on: ${{matrix.os}}
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
os: [ubuntu-latest, windows-latest, macos-latest]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
Expand All @@ -40,10 +40,10 @@ jobs:
mkdir build
cd build
cmake ..
if [ "$RUNNER_OS" == "Linux" ]; then
make
elif [ "$RUNNER_OS" == "Windows" ]; then
if [ "$RUNNER_OS" == "Windows" ]; then
cmake --build ./ --config Release
else
make
fi
shell: bash

Expand All @@ -67,10 +67,14 @@ jobs:
./example_mt_search
./example_mt_filter
./example_mt_replace_deleted
./example_multivector_search
./example_epsilon_search
./searchKnnCloserFirst_test
./searchKnnWithFilter_test
./multiThreadLoad_test
./multiThread_replace_test
./test_updates
./test_updates update
./multivector_search_test
./epsilon_search_test
shell: bash
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ var/
.vscode/
.vs/
**.DS_Store
*.pyc
63 changes: 56 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,25 +1,68 @@
cmake_minimum_required (VERSION 2.6)
project(hnsw_lib
cmake_minimum_required(VERSION 3.0...3.26)

project(hnswlib
LANGUAGES CXX)

include(GNUInstallDirs)
include(CheckCXXCompilerFlag)

add_library(hnswlib INTERFACE)
target_include_directories(hnswlib INTERFACE .)
add_library(hnswlib::hnswlib ALIAS hnswlib)

target_include_directories(hnswlib INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

# Install
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/hnswlib
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

install(TARGETS hnswlib
EXPORT hnswlibTargets)

install(EXPORT hnswlibTargets
FILE hnswlibConfig.cmake
NAMESPACE hnswlib::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hnswlib)

# Examples and tests
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
option(HNSWLIB_EXAMPLES "Build examples and tests." ON)
else()
option(HNSWLIB_EXAMPLES "Build examples and tests." OFF)
endif()
if(HNSWLIB_EXAMPLES)
set(CMAKE_CXX_STANDARD 11)

if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET( CMAKE_CXX_FLAGS "-Ofast -std=c++11 -DHAVE_CXX0X -openmp -fpic -ftree-vectorize" )
check_cxx_compiler_flag("-march=native" COMPILER_SUPPORT_NATIVE_FLAG)
if(COMPILER_SUPPORT_NATIVE_FLAG)
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native" )
message("set -march=native flag")
else()
check_cxx_compiler_flag("-mcpu=apple-m1" COMPILER_SUPPORT_M1_FLAG)
if(COMPILER_SUPPORT_M1_FLAG)
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1" )
message("set -mcpu=apple-m1 flag")
endif()
endif()
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
SET( CMAKE_CXX_FLAGS "/O2 -DHAVE_CXX0X /W1 /openmp /EHsc" )
endif()

# examples
add_executable(example_search examples/cpp/example_search.cpp)
target_link_libraries(example_search hnswlib)

add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp)
target_link_libraries(example_epsilon_search hnswlib)

add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp)
target_link_libraries(example_multivector_search hnswlib)

add_executable(example_filter examples/cpp/example_filter.cpp)
target_link_libraries(example_filter hnswlib)

Expand All @@ -36,6 +79,12 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
target_link_libraries(example_mt_replace_deleted hnswlib)

# tests
add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp)
target_link_libraries(multivector_search_test hnswlib)

add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp)
target_link_libraries(epsilon_search_test hnswlib)

add_executable(test_updates tests/cpp/updates_test.cpp)
target_link_libraries(test_updates hnswlib)

Expand Down
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@ Header-only C++ HNSW implementation with python bindings, insertions and updates

**NEWS:**

**version 0.8.0**

* Multi-vector document search and epsilon search (for now, only in C++)
* By default, there is no statistic aggregation, which speeds up the multi-threaded search (it does not seem like people are using it anyway: [Issue #495](https://github.com/nmslib/hnswlib/issues/495)).
* Various bugfixes and improvements
* `get_items` now has a `return_type` parameter, which can be either 'numpy' or 'list'

Full list of changes: https://github.com/nmslib/hnswlib/pull/523

**version 0.7.0**

* Added support to filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc)
Expand Down Expand Up @@ -79,7 +88,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.

* `set_num_threads(num_threads)` set the default number of cpu threads used during data insertion/querying.

* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). Note that for cosine similarity it currently returns **normalized** vectors.
* `get_items(ids, return_type = 'numpy')` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). If `return_type` is `'list'`, returns a list of lists instead. Note that for cosine similarity it currently returns **normalized** vectors.

* `get_ids_list()` - returns a list of all elements' ids.

Expand Down Expand Up @@ -229,6 +238,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat
* filtering during the search with a boolean function
* deleting the elements and reusing the memory of the deleted elements for newly added elements
* multithreaded usage
* multivector search
* epsilon search


### Bindings installation
Expand Down
6 changes: 5 additions & 1 deletion examples/cpp/EXAMPLES.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,8 @@ int main() {
Multithreaded examples:
* Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp)
* Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp)
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)

More examples:
* Multivector search [example_multivector_search.cpp](example_multivector_search.cpp)
* Epsilon search [example_epsilon_search.cpp](example_epsilon_search.cpp)
66 changes: 66 additions & 0 deletions examples/cpp/example_epsilon_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#include "../../hnswlib/hnswlib.h"

// Type aliases used by the example: document identifier and distance value.
using docidtype = unsigned int;
using dist_t = float;

/**
 * Epsilon-search example.
 *
 * Builds an HNSW index over random vectors in an L2 space, then runs a few
 * random queries with an EpsilonSearchStopCondition and prints how many
 * vectors each query returned.
 *
 * All resources (index, dataset buffer, query buffer) are owned by automatic
 * objects, so nothing has to be released by hand and nothing leaks if
 * indexing or searching throws.
 *
 * @return 0 on success.
 */
int main() {
    const int dim = 16;                   // Dimension of the elements
    const int max_elements = 10000;       // Maximum number of elements, should be known beforehand
    const int M = 16;                     // Tightly connected with internal dimensionality of the data,
                                          // strongly affects the memory consumption
    const int ef_construction = 200;      // Controls index search speed/build speed tradeoff
    const int min_num_candidates = 100;   // Minimum number of candidates to search in the epsilon region;
                                          // this parameter is similar to ef

    const int num_queries = 5;
    const float epsilon2 = 2.0;           // Squared distance to query

    // Initialize the index. `space` is declared first so it outlives the index
    // that holds a pointer to it.
    hnswlib::L2Space space(dim);
    hnswlib::HierarchicalNSW<dist_t> alg_hnsw(&space, max_elements, M, ef_construction);

    // Random generator with a fixed seed so every run produces the same data.
    std::mt19937 rng;
    rng.seed(47);
    std::uniform_real_distribution<> distrib_real;

    // Generate random data: max_elements points of data_point_size bytes each,
    // stored back to back in a single byte buffer.
    const size_t data_point_size = space.get_data_size();
    std::vector<char> data(data_point_size * max_elements);
    for (int i = 0; i < max_elements; i++) {
        char* point_data = data.data() + i * data_point_size;
        for (int j = 0; j < dim; j++) {
            const float value = distrib_real(rng);
            *reinterpret_cast<float*>(point_data + j * sizeof(float)) = value;
        }
    }

    // Add data to index; the label of a point is its position in the buffer.
    for (int i = 0; i < max_elements; i++) {
        const hnswlib::labeltype label = i;
        alg_hnsw.addPoint(data.data() + i * data_point_size, label);
    }

    // Query random vectors. The buffer is allocated once and every float slot
    // is overwritten before each search.
    std::vector<char> query_data(data_point_size);
    for (int i = 0; i < num_queries; i++) {
        for (int j = 0; j < dim; j++) {
            const float value = distrib_real(rng);
            *reinterpret_cast<float*>(query_data.data() + j * sizeof(float)) = value;
        }
        std::cout << "Query #" << i << "\n";
        // A fresh stop condition is created for every query.
        hnswlib::EpsilonSearchStopCondition<dist_t> stop_condition(epsilon2, min_num_candidates, max_elements);
        std::vector<std::pair<float, hnswlib::labeltype>> result =
            alg_hnsw.searchStopConditionClosest(query_data.data(), stop_condition);
        const size_t num_vectors = result.size();
        std::cout << "Found " << num_vectors << " vectors\n";
    }

    return 0;
}
83 changes: 83 additions & 0 deletions examples/cpp/example_multivector_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include "../../hnswlib/hnswlib.h"

// Type aliases used by the example: document identifier and distance value.
using docidtype = unsigned int;
using dist_t = float;

int main() {
int dim = 16; // Dimension of the elements
int max_elements = 10000; // Maximum number of elements, should be known beforehand
int M = 16; // Tightly connected with internal dimensionality of the data
// strongly affects the memory consumption
int ef_construction = 200; // Controls index search speed/build speed tradeoff

int num_queries = 5;
int num_docs = 5; // Number of documents to search
int ef_collection = 6; // Number of candidate documents during the search
// Controlls the recall: higher ef leads to better accuracy, but slower search
docidtype min_doc_id = 0;
docidtype max_doc_id = 9;

// Initing index
hnswlib::MultiVectorL2Space<docidtype> space(dim);
hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);

// Generate random data
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;
std::uniform_int_distribution<docidtype> distrib_docid(min_doc_id, max_doc_id);

size_t data_point_size = space.get_data_size();
char* data = new char[data_point_size * max_elements];
for (int i = 0; i < max_elements; i++) {
// set vector value
char* point_data = data + i * data_point_size;
for (int j = 0; j < dim; j++) {
char* vec_data = point_data + j * sizeof(float);
float value = distrib_real(rng);
*(float*)vec_data = value;
}
// set document id
docidtype doc_id = distrib_docid(rng);
space.set_doc_id(point_data, doc_id);
}

// Add data to index
std::unordered_map<hnswlib::labeltype, docidtype> label_docid_lookup;
for (int i = 0; i < max_elements; i++) {
hnswlib::labeltype label = i;
char* point_data = data + i * data_point_size;
alg_hnsw->addPoint(point_data, label);
label_docid_lookup[label] = space.get_doc_id(point_data);
}

// Query random vectors
size_t query_size = dim * sizeof(float);
for (int i = 0; i < num_queries; i++) {
char* query_data = new char[query_size];
for (int j = 0; j < dim; j++) {
size_t offset = j * sizeof(float);
char* vec_data = query_data + offset;
float value = distrib_real(rng);
*(float*)vec_data = value;
}
std::cout << "Query #" << i << "\n";
hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, num_docs, ef_collection);
std::vector<std::pair<float, hnswlib::labeltype>> result =
alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
size_t num_vectors = result.size();

std::unordered_map<docidtype, size_t> doc_counter;
for (auto pair: result) {
hnswlib::labeltype label = pair.second;
docidtype doc_id = label_docid_lookup[label];
doc_counter[doc_id] += 1;
}
std::cout << "Found " << doc_counter.size() << " documents, " << num_vectors << " vectors\n";
delete[] query_data;
}

delete[] data;
delete alg_hnsw;
return 0;
}
14 changes: 10 additions & 4 deletions hnswlib/bruteforce.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,16 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {


void removePoint(labeltype cur_external) {
size_t cur_c = dict_external_to_internal[cur_external];
std::unique_lock<std::mutex> lock(index_lock);

dict_external_to_internal.erase(cur_external);
auto found = dict_external_to_internal.find(cur_external);
if (found == dict_external_to_internal.end()) {
return;
}

dict_external_to_internal.erase(found);

size_t cur_c = found->second;
labeltype label = *((labeltype*)(data_ + size_per_element_ * (cur_element_count-1) + data_size_));
dict_external_to_internal[label] = cur_c;
memcpy(data_ + size_per_element_ * cur_c,
Expand All @@ -106,7 +112,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_);
labeltype label = *((labeltype*) (data_ + size_per_element_ * i + data_size_));
if ((!isIdAllowed) || (*isIdAllowed)(label)) {
topResults.push(std::pair<dist_t, labeltype>(dist, label));
topResults.emplace(dist, label);
}
}
dist_t lastdist = topResults.empty() ? std::numeric_limits<dist_t>::max() : topResults.top().first;
Expand All @@ -115,7 +121,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
if (dist <= lastdist) {
labeltype label = *((labeltype *) (data_ + size_per_element_ * i + data_size_));
if ((!isIdAllowed) || (*isIdAllowed)(label)) {
topResults.push(std::pair<dist_t, labeltype>(dist, label));
topResults.emplace(dist, label);
}
if (topResults.size() > k)
topResults.pop();
Expand Down
Loading