diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8cfa469a..b75c72c7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{matrix.os}} strategy: matrix: - os: [ubuntu-latest, windows-latest] + os: [ubuntu-latest, windows-latest, macos-latest] python-version: ["3.7", "3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 @@ -28,7 +28,7 @@ jobs: runs-on: ${{matrix.os}} strategy: matrix: - os: [ubuntu-latest, windows-latest] + os: [ubuntu-latest, windows-latest, macos-latest] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 @@ -40,10 +40,10 @@ jobs: mkdir build cd build cmake .. - if [ "$RUNNER_OS" == "Linux" ]; then - make - elif [ "$RUNNER_OS" == "Windows" ]; then + if [ "$RUNNER_OS" == "Windows" ]; then cmake --build ./ --config Release + else + make fi shell: bash @@ -67,10 +67,14 @@ jobs: ./example_mt_search ./example_mt_filter ./example_mt_replace_deleted + ./example_multivector_search + ./example_epsilon_search ./searchKnnCloserFirst_test ./searchKnnWithFilter_test ./multiThreadLoad_test ./multiThread_replace_test ./test_updates ./test_updates update + ./multivector_search_test + ./epsilon_search_test shell: bash diff --git a/.gitignore b/.gitignore index 48f74604..d46c9890 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ var/ .vscode/ .vs/ **.DS_Store +*.pyc diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cebe600..be0d40f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,25 +1,68 @@ -cmake_minimum_required (VERSION 2.6) -project(hnsw_lib +cmake_minimum_required(VERSION 3.0...3.26) + +project(hnswlib LANGUAGES CXX) +include(GNUInstallDirs) +include(CheckCXXCompilerFlag) + add_library(hnswlib INTERFACE) -target_include_directories(hnswlib INTERFACE .) +add_library(hnswlib::hnswlib ALIAS hnswlib) + +target_include_directories(hnswlib INTERFACE + $ + $) + +# Install +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/hnswlib + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +install(TARGETS hnswlib + EXPORT hnswlibTargets) + +install(EXPORT hnswlibTargets + FILE hnswlibConfig.cmake + NAMESPACE hnswlib:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hnswlib) +# Examples and tests if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + option(HNSWLIB_EXAMPLES "Build examples and tests." ON) +else() + option(HNSWLIB_EXAMPLES "Build examples and tests." 
OFF) +endif() +if(HNSWLIB_EXAMPLES) set(CMAKE_CXX_STANDARD 11) - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize") + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + SET( CMAKE_CXX_FLAGS "-Ofast -std=c++11 -DHAVE_CXX0X -openmp -fpic -ftree-vectorize" ) + check_cxx_compiler_flag("-march=native" COMPILER_SUPPORT_NATIVE_FLAG) + if(COMPILER_SUPPORT_NATIVE_FLAG) + SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native" ) + message("set -march=native flag") + else() + check_cxx_compiler_flag("-mcpu=apple-m1" COMPILER_SUPPORT_M1_FLAG) + if(COMPILER_SUPPORT_M1_FLAG) + SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1" ) + message("set -mcpu=apple-m1 flag") + endif() + endif() elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" ) + SET( CMAKE_CXX_FLAGS "-Ofast -lrt -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" ) elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" ) + SET( CMAKE_CXX_FLAGS "/O2 -DHAVE_CXX0X /W1 /openmp /EHsc" ) endif() # examples add_executable(example_search examples/cpp/example_search.cpp) target_link_libraries(example_search hnswlib) + add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp) + target_link_libraries(example_epsilon_search hnswlib) + + add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp) + target_link_libraries(example_multivector_search hnswlib) + add_executable(example_filter examples/cpp/example_filter.cpp) target_link_libraries(example_filter hnswlib) @@ -36,6 +79,12 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) target_link_libraries(example_mt_replace_deleted hnswlib) # tests + add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp) + target_link_libraries(multivector_search_test hnswlib) + + add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp) + target_link_libraries(epsilon_search_test hnswlib) + add_executable(test_updates tests/cpp/updates_test.cpp) target_link_libraries(test_updates hnswlib) diff --git a/README.md b/README.md index 3ed466a7..6eefcf20 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,15 @@ Header-only C++ HNSW implementation with python bindings, insertions and updates **NEWS:** +**version 0.8.0** + +* Multi-vector document search and epsilon search (for now, only in C++) +* By default, there is no statistic aggregation, which speeds up the multi-threaded search (it does not seem like people are using it anyway: [Issue #495](https://github.com/nmslib/hnswlib/issues/495)). +* Various bugfixes and improvements +* `get_items` now have `return_type` parameter, which can be either 'numpy' or 'list' + +Full list of changes: https://github.com/nmslib/hnswlib/pull/523 + **version 0.7.0** * Added support to filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc) @@ -79,7 +88,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib. * `set_num_threads(num_threads)` set the default number of cpu threads used during data insertion/querying. -* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). 
Note that for cosine similarity it currently returns **normalized** vectors. +* `get_items(ids, return_type = 'numpy')` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`) if `return_type` is `list` return list of lists. Note that for cosine similarity it currently returns **normalized** vectors. * `get_ids_list()` - returns a list of all elements' ids. @@ -229,6 +238,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat * filtering during the search with a boolean function * deleting the elements and reusing the memory of the deleted elements for newly added elements * multithreaded usage +* multivector search +* epsilon search ### Bindings installation diff --git a/examples/cpp/EXAMPLES.md b/examples/cpp/EXAMPLES.md index 3af603d4..5f9adc30 100644 --- a/examples/cpp/EXAMPLES.md +++ b/examples/cpp/EXAMPLES.md @@ -182,4 +182,8 @@ int main() { Multithreaded examples: * Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp) * Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp) -* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp) \ No newline at end of file +* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp) + +More examples: +* Multivector search [example_multivector_search.cpp](example_multivector_search.cpp) +* Epsilon search [example_epsilon_search.cpp](example_epsilon_search.cpp) \ No newline at end of file diff --git a/examples/cpp/example_epsilon_search.cpp b/examples/cpp/example_epsilon_search.cpp new file mode 100644 index 00000000..49eec408 --- /dev/null +++ b/examples/cpp/example_epsilon_search.cpp @@ -0,0 +1,66 @@ +#include "../../hnswlib/hnswlib.h" + +typedef unsigned int docidtype; +typedef float dist_t; + +int main() { + int dim = 16; // Dimension of the elements + int max_elements = 10000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + int min_num_candidates = 100; // Minimum number of candidates to search in the epsilon region + // this parameter is similar to ef + + int num_queries = 5; + float epsilon2 = 2.0; // Squared distance to query + + // Initing index + hnswlib::L2Space space(dim); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction); + + // Generate random data + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib_real; + + size_t data_point_size = space.get_data_size(); + char* data = new char[data_point_size * max_elements]; + for (int i = 0; i < max_elements; i++) { + char* point_data = data + i * data_point_size; + for (int j = 0; j < dim; j++) { + char* vec_data = point_data + j * sizeof(float); + float value = distrib_real(rng); + *(float*)vec_data = value; + } + } + + // Add data to index + for (int i = 0; i < max_elements; i++) { + hnswlib::labeltype label = i; + char* point_data = data + i * data_point_size; + alg_hnsw->addPoint(point_data, label); + } + + // Query random vectors + for (int i = 0; i < num_queries; i++) { + char* query_data = new char[data_point_size]; + for (int j = 0; j < dim; 
j++) { + size_t offset = j * sizeof(float); + char* vec_data = query_data + offset; + float value = distrib_real(rng); + *(float*)vec_data = value; + } + std::cout << "Query #" << i << "\n"; + hnswlib::EpsilonSearchStopCondition stop_condition(epsilon2, min_num_candidates, max_elements); + std::vector> result = + alg_hnsw->searchStopConditionClosest(query_data, stop_condition); + size_t num_vectors = result.size(); + std::cout << "Found " << num_vectors << " vectors\n"; + delete[] query_data; + } + + delete[] data; + delete alg_hnsw; + return 0; +} diff --git a/examples/cpp/example_multivector_search.cpp b/examples/cpp/example_multivector_search.cpp new file mode 100644 index 00000000..06aafe0b --- /dev/null +++ b/examples/cpp/example_multivector_search.cpp @@ -0,0 +1,83 @@ +#include "../../hnswlib/hnswlib.h" + +typedef unsigned int docidtype; +typedef float dist_t; + +int main() { + int dim = 16; // Dimension of the elements + int max_elements = 10000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + + int num_queries = 5; + int num_docs = 5; // Number of documents to search + int ef_collection = 6; // Number of candidate documents during the search + // Controlls the recall: higher ef leads to better accuracy, but slower search + docidtype min_doc_id = 0; + docidtype max_doc_id = 9; + + // Initing index + hnswlib::MultiVectorL2Space space(dim); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction); + + // Generate random data + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib_real; + std::uniform_int_distribution distrib_docid(min_doc_id, max_doc_id); + + size_t data_point_size = space.get_data_size(); + char* data = new char[data_point_size * max_elements]; + for (int i = 0; i < max_elements; i++) { + // set vector value + char* point_data = data + i * data_point_size; + for (int j = 0; j < dim; j++) { + char* vec_data = point_data + j * sizeof(float); + float value = distrib_real(rng); + *(float*)vec_data = value; + } + // set document id + docidtype doc_id = distrib_docid(rng); + space.set_doc_id(point_data, doc_id); + } + + // Add data to index + std::unordered_map label_docid_lookup; + for (int i = 0; i < max_elements; i++) { + hnswlib::labeltype label = i; + char* point_data = data + i * data_point_size; + alg_hnsw->addPoint(point_data, label); + label_docid_lookup[label] = space.get_doc_id(point_data); + } + + // Query random vectors + size_t query_size = dim * sizeof(float); + for (int i = 0; i < num_queries; i++) { + char* query_data = new char[query_size]; + for (int j = 0; j < dim; j++) { + size_t offset = j * sizeof(float); + char* vec_data = query_data + offset; + float value = distrib_real(rng); + *(float*)vec_data = value; + } + std::cout << "Query #" << i << "\n"; + hnswlib::MultiVectorSearchStopCondition stop_condition(space, num_docs, ef_collection); + std::vector> result = + alg_hnsw->searchStopConditionClosest(query_data, stop_condition); + size_t num_vectors = result.size(); + + std::unordered_map doc_counter; + for (auto pair: result) { + hnswlib::labeltype label = pair.second; + docidtype doc_id = label_docid_lookup[label]; + doc_counter[doc_id] += 1; + } + std::cout << "Found " << doc_counter.size() << " documents, " << num_vectors << " vectors\n"; + delete[] query_data; + 
} + + delete[] data; + delete alg_hnsw; + return 0; +} diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 30b33ae9..8727cc8a 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -84,10 +84,16 @@ class BruteforceSearch : public AlgorithmInterface { void removePoint(labeltype cur_external) { - size_t cur_c = dict_external_to_internal[cur_external]; + std::unique_lock lock(index_lock); - dict_external_to_internal.erase(cur_external); + auto found = dict_external_to_internal.find(cur_external); + if (found == dict_external_to_internal.end()) { + return; + } + + dict_external_to_internal.erase(found); + size_t cur_c = found->second; labeltype label = *((labeltype*)(data_ + size_per_element_ * (cur_element_count-1) + data_size_)); dict_external_to_internal[label] = cur_c; memcpy(data_ + size_per_element_ * cur_c, @@ -106,7 +112,7 @@ class BruteforceSearch : public AlgorithmInterface { dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_); labeltype label = *((labeltype*) (data_ + size_per_element_ * i + data_size_)); if ((!isIdAllowed) || (*isIdAllowed)(label)) { - topResults.push(std::pair(dist, label)); + topResults.emplace(dist, label); } } dist_t lastdist = topResults.empty() ? std::numeric_limits::max() : topResults.top().first; @@ -115,7 +121,7 @@ class BruteforceSearch : public AlgorithmInterface { if (dist <= lastdist) { labeltype label = *((labeltype *) (data_ + size_per_element_ * i + data_size_)); if ((!isIdAllowed) || (*isIdAllowed)(label)) { - topResults.push(std::pair(dist, label)); + topResults.emplace(dist, label); } if (topResults.size() > k) topResults.pop(); diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index bef00170..e269ae69 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace hnswlib { typedef unsigned int tableint; @@ -33,7 +34,7 @@ class HierarchicalNSW : public AlgorithmInterface { double mult_{0.0}, revSize_{0.0}; int maxlevel_{0}; - VisitedListPool *visited_list_pool_{nullptr}; + std::unique_ptr visited_list_pool_{nullptr}; // Locks operations with element by label value mutable std::vector label_op_locks_; @@ -92,8 +93,8 @@ class HierarchicalNSW : public AlgorithmInterface { size_t ef_construction = 200, size_t random_seed = 100, bool allow_replace_deleted = false) - : link_list_locks_(max_elements), - label_op_locks_(MAX_LABEL_OPERATION_LOCKS), + : label_op_locks_(MAX_LABEL_OPERATION_LOCKS), + link_list_locks_(max_elements), element_levels_(max_elements), allow_replace_deleted_(allow_replace_deleted) { max_elements_ = max_elements; @@ -101,7 +102,13 @@ class HierarchicalNSW : public AlgorithmInterface { data_size_ = s->get_data_size(); fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); - M_ = M; + if ( M <= 10000 ) { + M_ = M; + } else { + HNSWERR << "warning: M parameter exceeds 10000 which may lead to adverse effects." << std::endl; + HNSWERR << " Cap to 10000 will be applied for the rest of the processing." 
<< std::endl; + M_ = 10000; + } maxM_ = M_; maxM0_ = M_ * 2; ef_construction_ = std::max(ef_construction, M_); @@ -122,7 +129,7 @@ class HierarchicalNSW : public AlgorithmInterface { cur_element_count = 0; - visited_list_pool_ = new VisitedListPool(1, max_elements); + visited_list_pool_ = std::unique_ptr(new VisitedListPool(1, max_elements)); // initializations for special treatment of the first node enterpoint_node_ = -1; @@ -138,13 +145,20 @@ class HierarchicalNSW : public AlgorithmInterface { ~HierarchicalNSW() { + clear(); + } + + void clear() { free(data_level0_memory_); + data_level0_memory_ = nullptr; for (tableint i = 0; i < cur_element_count; i++) { if (element_levels_[i] > 0) free(linkLists_[i]); } free(linkLists_); - delete visited_list_pool_; + linkLists_ = nullptr; + cur_element_count = 0; + visited_list_pool_.reset(nullptr); } @@ -291,9 +305,15 @@ class HierarchicalNSW : public AlgorithmInterface { } - template + // bare_bone_search means there is no check for deletions and stop condition is ignored in return of extra performance + template std::priority_queue, std::vector>, CompareByFirst> - searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef, BaseFilterFunctor* isIdAllowed = nullptr) const { + searchBaseLayerST( + tableint ep_id, + const void *data_point, + size_t ef, + BaseFilterFunctor* isIdAllowed = nullptr, + BaseSearchStopCondition* stop_condition = nullptr) const { VisitedList *vl = visited_list_pool_->getFreeVisitedList(); vl_type *visited_array = vl->mass; vl_type visited_array_tag = vl->curV; @@ -302,10 +322,15 @@ class HierarchicalNSW : public AlgorithmInterface { std::priority_queue, std::vector>, CompareByFirst> candidate_set; dist_t lowerBound; - if ((!has_deletions || !isMarkedDeleted(ep_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(ep_id)))) { - dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_); + if (bare_bone_search || + (!isMarkedDeleted(ep_id) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(ep_id))))) { + char* ep_data = getDataByInternalId(ep_id); + dist_t dist = fstdistfunc_(data_point, ep_data, dist_func_param_); lowerBound = dist; top_candidates.emplace(dist, ep_id); + if (!bare_bone_search && stop_condition) { + stop_condition->add_point_to_result(getExternalLabel(ep_id), ep_data, dist); + } candidate_set.emplace(-dist, ep_id); } else { lowerBound = std::numeric_limits::max(); @@ -316,9 +341,19 @@ class HierarchicalNSW : public AlgorithmInterface { while (!candidate_set.empty()) { std::pair current_node_pair = candidate_set.top(); + dist_t candidate_dist = -current_node_pair.first; - if ((-current_node_pair.first) > lowerBound && - (top_candidates.size() == ef || (!isIdAllowed && !has_deletions))) { + bool flag_stop_search; + if (bare_bone_search) { + flag_stop_search = candidate_dist > lowerBound; + } else { + if (stop_condition) { + flag_stop_search = stop_condition->should_stop_search(candidate_dist, lowerBound); + } else { + flag_stop_search = candidate_dist > lowerBound && top_candidates.size() == ef; + } + } + if (flag_stop_search) { break; } candidate_set.pop(); @@ -353,7 +388,14 @@ class HierarchicalNSW : public AlgorithmInterface { char *currObj1 = (getDataByInternalId(candidate_id)); dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_); - if (top_candidates.size() < ef || lowerBound > dist) { + bool flag_consider_candidate; + if (!bare_bone_search && stop_condition) { + flag_consider_candidate = stop_condition->should_consider_candidate(dist, lowerBound); + 
} else { + flag_consider_candidate = top_candidates.size() < ef || lowerBound > dist; + } + + if (flag_consider_candidate) { candidate_set.emplace(-dist, candidate_id); #ifdef USE_SSE _mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_ + @@ -361,11 +403,30 @@ class HierarchicalNSW : public AlgorithmInterface { _MM_HINT_T0); //////////////////////// #endif - if ((!has_deletions || !isMarkedDeleted(candidate_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id)))) + if (bare_bone_search || + (!isMarkedDeleted(candidate_id) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))) { top_candidates.emplace(dist, candidate_id); + if (!bare_bone_search && stop_condition) { + stop_condition->add_point_to_result(getExternalLabel(candidate_id), currObj1, dist); + } + } - if (top_candidates.size() > ef) + bool flag_remove_extra = false; + if (!bare_bone_search && stop_condition) { + flag_remove_extra = stop_condition->should_remove_extra(); + } else { + flag_remove_extra = top_candidates.size() > ef; + } + while (flag_remove_extra) { + tableint id = top_candidates.top().second; top_candidates.pop(); + if (!bare_bone_search && stop_condition) { + stop_condition->remove_point_from_result(getExternalLabel(id), getDataByInternalId(id), dist); + flag_remove_extra = stop_condition->should_remove_extra(); + } else { + flag_remove_extra = top_candidates.size() > ef; + } + } if (!top_candidates.empty()) lowerBound = top_candidates.top().first; @@ -380,8 +441,8 @@ class HierarchicalNSW : public AlgorithmInterface { void getNeighborsByHeuristic2( - std::priority_queue, std::vector>, CompareByFirst> &top_candidates, - const size_t M) { + std::priority_queue, std::vector>, CompareByFirst> &top_candidates, + const size_t M) { if (top_candidates.size() < M) { return; } @@ -573,8 +634,7 @@ class HierarchicalNSW : public AlgorithmInterface { if (new_max_elements < cur_element_count) throw std::runtime_error("Cannot resize, max element is less than the current number of elements"); - delete visited_list_pool_; - visited_list_pool_ = new VisitedListPool(1, new_max_elements); + visited_list_pool_.reset(new VisitedListPool(1, new_max_elements)); element_levels_.resize(new_max_elements); @@ -595,6 +655,32 @@ class HierarchicalNSW : public AlgorithmInterface { max_elements_ = new_max_elements; } + size_t indexFileSize() const { + size_t size = 0; + size += sizeof(offsetLevel0_); + size += sizeof(max_elements_); + size += sizeof(cur_element_count); + size += sizeof(size_data_per_element_); + size += sizeof(label_offset_); + size += sizeof(offsetData_); + size += sizeof(maxlevel_); + size += sizeof(enterpoint_node_); + size += sizeof(maxM_); + + size += sizeof(maxM0_); + size += sizeof(M_); + size += sizeof(mult_); + size += sizeof(ef_construction_); + + size += cur_element_count * size_data_per_element_; + + for (size_t i = 0; i < cur_element_count; i++) { + unsigned int linkListSize = element_levels_[i] > 0 ? 
size_links_per_element_ * element_levels_[i] : 0; + size += sizeof(linkListSize); + size += linkListSize; + } + return size; + } void saveIndex(const std::string &location) { std::ofstream output(location, std::ios::binary); @@ -633,6 +719,7 @@ class HierarchicalNSW : public AlgorithmInterface { if (!input.is_open()) throw std::runtime_error("Cannot open file"); + clear(); // get file size: input.seekg(0, input.end); std::streampos total_filesize = input.tellg(); @@ -698,7 +785,7 @@ class HierarchicalNSW : public AlgorithmInterface { std::vector(max_elements).swap(link_list_locks_); std::vector(MAX_LABEL_OPERATION_LOCKS).swap(label_op_locks_); - visited_list_pool_ = new VisitedListPool(1, max_elements); + visited_list_pool_.reset(new VisitedListPool(1, max_elements)); linkLists_ = (char **) malloc(sizeof(void *) * max_elements); if (linkLists_ == nullptr) @@ -752,7 +839,7 @@ class HierarchicalNSW : public AlgorithmInterface { size_t dim = *((size_t *) dist_func_param_); std::vector data; data_t* data_ptr = (data_t*) data_ptrv; - for (int i = 0; i < dim; i++) { + for (size_t i = 0; i < dim; i++) { data.push_back(*data_ptr); data_ptr += 1; } @@ -1216,11 +1303,12 @@ class HierarchicalNSW : public AlgorithmInterface { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (num_deleted_) { - top_candidates = searchBaseLayerST( + bool bare_bone_search = !num_deleted_ && !isIdAllowed; + if (bare_bone_search) { + top_candidates = searchBaseLayerST( currObj, query_data, std::max(ef_, k), isIdAllowed); } else { - top_candidates = searchBaseLayerST( + top_candidates = searchBaseLayerST( currObj, query_data, std::max(ef_, k), isIdAllowed); } @@ -1236,6 +1324,60 @@ class HierarchicalNSW : public AlgorithmInterface { } + std::vector> + searchStopConditionClosest( + const void *query_data, + BaseSearchStopCondition& stop_condition, + BaseFilterFunctor* isIdAllowed = nullptr) const { + std::vector> result; + if (cur_element_count == 0) return result; + + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); + + for (int level = maxlevel_; level > 0; level--) { + bool changed = true; + while (changed) { + changed = false; + unsigned int *data; + + data = (unsigned int *) get_linklist(currObj, level); + int size = getListCount(data); + metric_hops++; + metric_distance_computations+=size; + + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; + if (cand < 0 || cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + + if (d < curdist) { + curdist = d; + currObj = cand; + changed = true; + } + } + } + } + + std::priority_queue, std::vector>, CompareByFirst> top_candidates; + top_candidates = searchBaseLayerST(currObj, query_data, 0, isIdAllowed, &stop_condition); + + size_t sz = top_candidates.size(); + result.resize(sz); + while (!top_candidates.empty()) { + result[--sz] = top_candidates.top(); + top_candidates.pop(); + } + + stop_condition.filter_results(result); + + return result; + } + + void checkIntegrity() { int connections_checked = 0; std::vector inbound_connections_num(cur_element_count, 0); @@ -1246,7 +1388,6 @@ class HierarchicalNSW : public AlgorithmInterface { tableint *data = (tableint *) (ll_cur + 1); std::unordered_set s; for (int j = 0; j < size; j++) { - assert(data[j] > 0); assert(data[j] < cur_element_count); assert(data[j] != i); 
inbound_connections_num[data[j]]++; diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index fb7118fa..7ccfbba5 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -1,4 +1,13 @@ #pragma once + +// https://github.com/nmslib/hnswlib/pull/508 +// This allows others to provide their own error stream (e.g. RcppHNSW) +#ifndef HNSWLIB_ERR_OVERRIDE + #define HNSWERR std::cerr +#else + #define HNSWERR HNSWLIB_ERR_OVERRIDE +#endif + #ifndef NO_MANUAL_VECTORIZATION #if (defined(__SSE__) || _M_IX86_FP > 0 || defined(_M_AMD64) || defined(_M_X64)) #define USE_SSE @@ -15,7 +24,7 @@ #ifdef _MSC_VER #include #include -void cpuid(int32_t out[4], int32_t eax, int32_t ecx) { +static void cpuid(int32_t out[4], int32_t eax, int32_t ecx) { __cpuidex(out, eax, ecx); } static __int64 xgetbv(unsigned int x) { @@ -119,6 +128,25 @@ typedef size_t labeltype; class BaseFilterFunctor { public: virtual bool operator()(hnswlib::labeltype id) { return true; } + virtual ~BaseFilterFunctor() {}; +}; + +template +class BaseSearchStopCondition { + public: + virtual void add_point_to_result(labeltype label, const void *datapoint, dist_t dist) = 0; + + virtual void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) = 0; + + virtual bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) = 0; + + virtual bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) = 0; + + virtual bool should_remove_extra() = 0; + + virtual void filter_results(std::vector> &candidates) = 0; + + virtual ~BaseSearchStopCondition() {} }; template @@ -195,5 +223,6 @@ AlgorithmInterface::searchKnnCloserFirst(const void* query_data, size_t #include "space_l2.h" #include "space_ip.h" +#include "stop_condition.h" #include "bruteforce.h" #include "hnswalg.h" diff --git a/hnswlib/space_ip.h b/hnswlib/space_ip.h index 2b1c359e..0e6834c1 100644 --- a/hnswlib/space_ip.h +++ b/hnswlib/space_ip.h @@ -157,19 +157,44 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void __m512 sum512 = _mm512_set1_ps(0); - while (pVect1 < pEnd1) { - //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0); - + size_t loop = qty16 / 4; + + while (loop--) { __m512 v1 = _mm512_loadu_ps(pVect1); - pVect1 += 16; __m512 v2 = _mm512_loadu_ps(pVect2); + pVect1 += 16; + pVect2 += 16; + + __m512 v3 = _mm512_loadu_ps(pVect1); + __m512 v4 = _mm512_loadu_ps(pVect2); + pVect1 += 16; + pVect2 += 16; + + __m512 v5 = _mm512_loadu_ps(pVect1); + __m512 v6 = _mm512_loadu_ps(pVect2); + pVect1 += 16; pVect2 += 16; - sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2)); + + __m512 v7 = _mm512_loadu_ps(pVect1); + __m512 v8 = _mm512_loadu_ps(pVect2); + pVect1 += 16; + pVect2 += 16; + + sum512 = _mm512_fmadd_ps(v1, v2, sum512); + sum512 = _mm512_fmadd_ps(v3, v4, sum512); + sum512 = _mm512_fmadd_ps(v5, v6, sum512); + sum512 = _mm512_fmadd_ps(v7, v8, sum512); } - _mm512_store_ps(TmpRes, sum512); - float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15]; + while (pVect1 < pEnd1) { + __m512 v1 = _mm512_loadu_ps(pVect1); + __m512 v2 = _mm512_loadu_ps(pVect2); + pVect1 += 16; + pVect2 += 16; + sum512 = _mm512_fmadd_ps(v1, v2, sum512); + } + float sum = _mm512_reduce_add_ps(sum512); return sum; } diff --git a/hnswlib/stop_condition.h b/hnswlib/stop_condition.h new file mode 100644 index 00000000..acc80ebe --- /dev/null +++ b/hnswlib/stop_condition.h @@ -0,0 +1,276 @@ +#pragma once 
+#include "space_l2.h" +#include "space_ip.h" +#include +#include + +namespace hnswlib { + +template +class BaseMultiVectorSpace : public SpaceInterface { + public: + virtual DOCIDTYPE get_doc_id(const void *datapoint) = 0; + + virtual void set_doc_id(void *datapoint, DOCIDTYPE doc_id) = 0; +}; + + +template +class MultiVectorL2Space : public BaseMultiVectorSpace { + DISTFUNC fstdistfunc_; + size_t data_size_; + size_t vector_size_; + size_t dim_; + + public: + MultiVectorL2Space(size_t dim) { + fstdistfunc_ = L2Sqr; +#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512) + #if defined(USE_AVX512) + if (AVX512Capable()) + L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX512; + else if (AVXCapable()) + L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX; + #elif defined(USE_AVX) + if (AVXCapable()) + L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX; + #endif + + if (dim % 16 == 0) + fstdistfunc_ = L2SqrSIMD16Ext; + else if (dim % 4 == 0) + fstdistfunc_ = L2SqrSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = L2SqrSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = L2SqrSIMD4ExtResiduals; +#endif + dim_ = dim; + vector_size_ = dim * sizeof(float); + data_size_ = vector_size_ + sizeof(DOCIDTYPE); + } + + size_t get_data_size() override { + return data_size_; + } + + DISTFUNC get_dist_func() override { + return fstdistfunc_; + } + + void *get_dist_func_param() override { + return &dim_; + } + + DOCIDTYPE get_doc_id(const void *datapoint) override { + return *(DOCIDTYPE *)((char *)datapoint + vector_size_); + } + + void set_doc_id(void *datapoint, DOCIDTYPE doc_id) override { + *(DOCIDTYPE*)((char *)datapoint + vector_size_) = doc_id; + } + + ~MultiVectorL2Space() {} +}; + + +template +class MultiVectorInnerProductSpace : public BaseMultiVectorSpace { + DISTFUNC fstdistfunc_; + size_t data_size_; + size_t vector_size_; + size_t dim_; + + public: + MultiVectorInnerProductSpace(size_t dim) { + fstdistfunc_ = InnerProductDistance; +#if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512) + #if defined(USE_AVX512) + if (AVX512Capable()) { + InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX512; + InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX512; + } else if (AVXCapable()) { + InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX; + InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX; + } + #elif defined(USE_AVX) + if (AVXCapable()) { + InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX; + InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX; + } + #endif + #if defined(USE_AVX) + if (AVXCapable()) { + InnerProductSIMD4Ext = InnerProductSIMD4ExtAVX; + InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtAVX; + } + #endif + + if (dim % 16 == 0) + fstdistfunc_ = InnerProductDistanceSIMD16Ext; + else if (dim % 4 == 0) + fstdistfunc_ = InnerProductDistanceSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = InnerProductDistanceSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = InnerProductDistanceSIMD4ExtResiduals; +#endif + vector_size_ = dim * sizeof(float); + data_size_ = vector_size_ + sizeof(DOCIDTYPE); + } + + size_t get_data_size() override { + return data_size_; + } + + DISTFUNC get_dist_func() override { + return fstdistfunc_; + } + + void *get_dist_func_param() override { + return &dim_; + } + + DOCIDTYPE get_doc_id(const void *datapoint) override { + return *(DOCIDTYPE *)((char *)datapoint + vector_size_); + } + + void set_doc_id(void *datapoint, DOCIDTYPE doc_id) override { + *(DOCIDTYPE*)((char *)datapoint + vector_size_) = doc_id; + } + + ~MultiVectorInnerProductSpace() {} 
+}; + + +template +class MultiVectorSearchStopCondition : public BaseSearchStopCondition { + size_t curr_num_docs_; + size_t num_docs_to_search_; + size_t ef_collection_; + std::unordered_map doc_counter_; + std::priority_queue> search_results_; + BaseMultiVectorSpace& space_; + + public: + MultiVectorSearchStopCondition( + BaseMultiVectorSpace& space, + size_t num_docs_to_search, + size_t ef_collection = 10) + : space_(space) { + curr_num_docs_ = 0; + num_docs_to_search_ = num_docs_to_search; + ef_collection_ = std::max(ef_collection, num_docs_to_search); + } + + void add_point_to_result(labeltype label, const void *datapoint, dist_t dist) override { + DOCIDTYPE doc_id = space_.get_doc_id(datapoint); + if (doc_counter_[doc_id] == 0) { + curr_num_docs_ += 1; + } + search_results_.emplace(dist, doc_id); + doc_counter_[doc_id] += 1; + } + + void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) override { + DOCIDTYPE doc_id = space_.get_doc_id(datapoint); + doc_counter_[doc_id] -= 1; + if (doc_counter_[doc_id] == 0) { + curr_num_docs_ -= 1; + } + search_results_.pop(); + } + + bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) override { + bool stop_search = candidate_dist > lowerBound && curr_num_docs_ == ef_collection_; + return stop_search; + } + + bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) override { + bool flag_consider_candidate = curr_num_docs_ < ef_collection_ || lowerBound > candidate_dist; + return flag_consider_candidate; + } + + bool should_remove_extra() override { + bool flag_remove_extra = curr_num_docs_ > ef_collection_; + return flag_remove_extra; + } + + void filter_results(std::vector> &candidates) override { + while (curr_num_docs_ > num_docs_to_search_) { + dist_t dist_cand = candidates.back().first; + dist_t dist_res = search_results_.top().first; + assert(dist_cand == dist_res); + DOCIDTYPE doc_id = search_results_.top().second; + doc_counter_[doc_id] -= 1; + if (doc_counter_[doc_id] == 0) { + curr_num_docs_ -= 1; + } + search_results_.pop(); + candidates.pop_back(); + } + } + + ~MultiVectorSearchStopCondition() {} +}; + + +template +class EpsilonSearchStopCondition : public BaseSearchStopCondition { + float epsilon_; + size_t min_num_candidates_; + size_t max_num_candidates_; + size_t curr_num_items_; + + public: + EpsilonSearchStopCondition(float epsilon, size_t min_num_candidates, size_t max_num_candidates) { + assert(min_num_candidates <= max_num_candidates); + epsilon_ = epsilon; + min_num_candidates_ = min_num_candidates; + max_num_candidates_ = max_num_candidates; + curr_num_items_ = 0; + } + + void add_point_to_result(labeltype label, const void *datapoint, dist_t dist) override { + curr_num_items_ += 1; + } + + void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) override { + curr_num_items_ -= 1; + } + + bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) override { + if (candidate_dist > lowerBound && curr_num_items_ == max_num_candidates_) { + // new candidate can't improve found results + return true; + } + if (candidate_dist > epsilon_ && curr_num_items_ >= min_num_candidates_) { + // new candidate is out of epsilon region and + // minimum number of candidates is checked + return true; + } + return false; + } + + bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) override { + bool flag_consider_candidate = curr_num_items_ < max_num_candidates_ || lowerBound > candidate_dist; + return flag_consider_candidate; + } + + 
bool should_remove_extra() { + bool flag_remove_extra = curr_num_items_ > max_num_candidates_; + return flag_remove_extra; + } + + void filter_results(std::vector> &candidates) override { + while (!candidates.empty() && candidates.back().first > epsilon_) { + candidates.pop_back(); + } + while (candidates.size() > max_num_candidates_) { + candidates.pop_back(); + } + } + + ~EpsilonSearchStopCondition() {} +}; +} // namespace hnswlib diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 5153bb58..dd09e80a 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -218,6 +218,9 @@ class Index { this->num_threads_default = num_threads; } + size_t indexFileSize() const { + return appr_alg->indexFileSize(); + } void saveIndex(const std::string &path_to_index) { appr_alg->saveIndex(path_to_index); @@ -301,7 +304,11 @@ class Index { } - std::vector> getDataReturnList(py::object ids_ = py::none()) { + py::object getData(py::object ids_ = py::none(), std::string return_type = "numpy") { + std::vector return_types{"numpy", "list"}; + if (std::find(std::begin(return_types), std::end(return_types), return_type) == std::end(return_types)) { + throw std::invalid_argument("return_type should be \"numpy\" or \"list\""); + } std::vector ids; if (!ids_.is_none()) { py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_); @@ -322,7 +329,12 @@ class Index { for (auto id : ids) { data.push_back(appr_alg->template getDataByLabel(id)); } - return data; + if (return_type == "list") { + return py::cast(data); + } + if (return_type == "numpy") { + return py::array_t< data_t, py::array::c_style | py::array::forcecast >(py::cast(data)); + } } @@ -633,7 +645,7 @@ class Index { (void*)items.data(row), k, p_idFilter); if (result.size() != k) throw std::runtime_error( - "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); + "Cannot return the results in a contiguous 2D array. Probably ef or M is too small"); for (int i = k - 1; i >= 0; i--) { auto& result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; @@ -653,7 +665,7 @@ class Index { (void*)(norm_array.data() + start_idx), k, p_idFilter); if (result.size() != k) throw std::runtime_error( - "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); + "Cannot return the results in a contiguous 2D array. 
Probably ef or M is too small"); for (int i = k - 1; i >= 0; i--) { auto& result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; @@ -719,6 +731,7 @@ class BFIndex { int dim; bool index_inited; bool normalize; + int num_threads_default; hnswlib::labeltype cur_l; hnswlib::BruteforceSearch* alg; @@ -739,6 +752,8 @@ class BFIndex { } alg = NULL; index_inited = false; + + num_threads_default = std::thread::hardware_concurrency(); } @@ -749,6 +764,21 @@ class BFIndex { } + size_t getMaxElements() const { + return alg->maxelements_; + } + + + size_t getCurrentCount() const { + return alg->cur_element_count; + } + + + void set_num_threads(int num_threads) { + this->num_threads_default = num_threads; + } + + void init_new_index(const size_t maxElements) { if (alg) { throw std::runtime_error("The index is already initiated."); @@ -820,15 +850,19 @@ class BFIndex { py::object knnQuery_return_numpy( py::object input, size_t k = 1, + int num_threads = -1, const std::function& filter = nullptr) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); hnswlib::labeltype *data_numpy_l; dist_t *data_numpy_d; size_t rows, features; + + if (num_threads <= 0) + num_threads = num_threads_default; + { py::gil_scoped_release l; - get_input_array_shapes(buffer, &rows, &features); data_numpy_l = new hnswlib::labeltype[rows * k]; @@ -837,16 +871,16 @@ class BFIndex { CustomFilterFunctor idFilter(filter); CustomFilterFunctor* p_idFilter = filter ? &idFilter : nullptr; - for (size_t row = 0; row < rows; row++) { + ParallelFor(0, rows, num_threads, [&](size_t row, size_t threadId) { std::priority_queue> result = alg->searchKnn( - (void *) items.data(row), k, p_idFilter); + (void*)items.data(row), k, p_idFilter); for (int i = k - 1; i >= 0; i--) { - auto &result_tuple = result.top(); + auto& result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; data_numpy_l[row * k + i] = result_tuple.second; result.pop(); } - } + }); } py::capsule free_when_done_l(data_numpy_l, [](void *f) { @@ -900,10 +934,11 @@ PYBIND11_PLUGIN(hnswlib) { py::arg("ids") = py::none(), py::arg("num_threads") = -1, py::arg("replace_deleted") = false) - .def("get_items", &Index::getDataReturnList, py::arg("ids") = py::none()) + .def("get_items", &Index::getData, py::arg("ids") = py::none(), py::arg("return_type") = "numpy") .def("get_ids_list", &Index::getIdsList) .def("set_ef", &Index::set_ef, py::arg("ef")) .def("set_num_threads", &Index::set_num_threads, py::arg("num_threads")) + .def("index_file_size", &Index::indexFileSize) .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", &Index::loadIndex, @@ -957,13 +992,22 @@ PYBIND11_PLUGIN(hnswlib) { py::class_>(m, "BFIndex") .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) - .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none()) + .def("knn_query", + &BFIndex::knnQuery_return_numpy, + py::arg("data"), + py::arg("k") = 1, + py::arg("num_threads") = -1, + py::arg("filter") = py::none()) .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) .def("delete_vector", &BFIndex::deleteVector, py::arg("label")) + .def("set_num_threads", &BFIndex::set_num_threads, py::arg("num_threads")) .def("save_index", &BFIndex::saveIndex, py::arg("path_to_index")) .def("load_index", &BFIndex::loadIndex, py::arg("path_to_index"), 
py::arg("max_elements") = 0) .def("__repr__", [](const BFIndex &a) { return ""; - }); + }) + .def("get_max_elements", &BFIndex::getMaxElements) + .def("get_current_count", &BFIndex::getCurrentCount) + .def_readwrite("num_threads", &BFIndex::num_threads_default); return m.ptr(); } diff --git a/python_bindings/tests/bindings_test_bf_index.py b/python_bindings/tests/bindings_test_bf_index.py new file mode 100644 index 00000000..060b9943 --- /dev/null +++ b/python_bindings/tests/bindings_test_bf_index.py @@ -0,0 +1,49 @@ +import unittest + +import numpy as np + +import hnswlib + + +class RandomSelfTestCase(unittest.TestCase): + def testBFIndex(self): + + dim = 16 + num_elements = 10000 + num_queries = 1000 + k = 20 + + # Generating sample data + data = np.float32(np.random.random((num_elements, dim))) + + # Declaring index + bf_index = hnswlib.BFIndex(space='l2', dim=dim) # possible options are l2, cosine or ip + bf_index.init_index(max_elements=num_elements) + + num_threads = 8 + bf_index.set_num_threads(num_threads) # by default using all available cores + + print(f"Adding all elements {num_elements}") + bf_index.add_items(data) + + self.assertEqual(bf_index.num_threads, num_threads) + self.assertEqual(bf_index.get_max_elements(), num_elements) + self.assertEqual(bf_index.get_current_count(), num_elements) + + queries = np.float32(np.random.random((num_queries, dim))) + print("Searching nearest neighbours") + labels, distances = bf_index.knn_query(queries, k=k) + + print("Checking results") + for i in range(num_queries): + query = queries[i] + sq_dists = (data - query)**2 + dists = np.sum(sq_dists, axis=1) + labels_gt = np.argsort(dists)[:k] + dists_gt = dists[labels_gt] + dists_bf = distances[i] + # we can compare labels but because of numeric errors in distance calculation in C++ and numpy + # sometimes we get different order of labels, therefore we compare distances + max_diff_with_gt = np.max(np.abs(dists_gt - dists_bf)) + + self.assertTrue(max_diff_with_gt < 1e-5) diff --git a/setup.py b/setup.py index 0126585e..d96aea49 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -__version__ = '0.7.0' +__version__ = '0.8.0' include_dirs = [ @@ -73,22 +73,20 @@ def cpp_flag(compiler): class BuildExt(build_ext): """A custom build extension for adding compiler-specific options.""" + compiler_flag_native = '-march=native' c_opts = { 'msvc': ['/EHsc', '/openmp', '/O2'], - #'unix': ['-O3', '-march=native'], # , '-w' - 'unix': ['-O3'], # , '-w' + 'unix': ['-O3', compiler_flag_native], # , '-w' } - if not os.environ.get("HNSWLIB_NO_NATIVE"): - c_opts['unix'].append('-march=native') - link_opts = { 'unix': [], 'msvc': [], } + if os.environ.get("HNSWLIB_NO_NATIVE"): + c_opts['unix'].remove(compiler_flag_native) + if sys.platform == 'darwin': - if platform.machine() == 'arm64': - c_opts['unix'].remove('-march=native') c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] link_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] else: @@ -97,18 +95,35 @@ class BuildExt(build_ext): def build_extensions(self): ct = self.compiler.compiler_type - opts = self.c_opts.get(ct, []) + opts = BuildExt.c_opts.get(ct, []) if ct == 'unix': opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) opts.append(cpp_flag(self.compiler)) if has_flag(self.compiler, '-fvisibility=hidden'): opts.append('-fvisibility=hidden') + if not os.environ.get("HNSWLIB_NO_NATIVE"): + # check that native flag is 
available + print('checking avalability of flag:', BuildExt.compiler_flag_native) + if not has_flag(self.compiler, BuildExt.compiler_flag_native): + print('removing unsupported compiler flag:', BuildExt.compiler_flag_native) + opts.remove(BuildExt.compiler_flag_native) + # for macos add apple-m1 flag if it's available + if sys.platform == 'darwin': + m1_flag = '-mcpu=apple-m1' + print('checking avalability of flag:', m1_flag) + if has_flag(self.compiler, m1_flag): + print('adding flag:', m1_flag) + opts.append(m1_flag) + else: + print(f'flag: {m1_flag} is not available') + else: + print(f'flag: {BuildExt.compiler_flag_native} is available') elif ct == 'msvc': opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) for ext in self.extensions: ext.extra_compile_args.extend(opts) - ext.extra_link_args.extend(self.link_opts.get(ct, [])) + ext.extra_link_args.extend(BuildExt.link_opts.get(ct, [])) build_ext.build_extensions(self) diff --git a/tests/cpp/epsilon_search_test.cpp b/tests/cpp/epsilon_search_test.cpp new file mode 100644 index 00000000..38df6246 --- /dev/null +++ b/tests/cpp/epsilon_search_test.cpp @@ -0,0 +1,114 @@ +#include "assert.h" +#include "../../hnswlib/hnswlib.h" + +typedef unsigned int docidtype; +typedef float dist_t; + +int main() { + int dim = 16; // Dimension of the elements + int max_elements = 10000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + + int num_queries = 100; + float epsilon2 = 1.0; // Squared distance to query + int max_num_candidates = max_elements; // Upper bound on the number of returned elements in the epsilon region + int min_num_candidates = 2000; // Minimum number of candidates to search in the epsilon region + // this parameter is similar to ef + + // Initing index + hnswlib::L2Space space(dim); + hnswlib::BruteforceSearch* alg_brute = new hnswlib::BruteforceSearch(&space, max_elements); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction); + + // Generate random data + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib_real; + + float* data = new float[dim * max_elements]; + for (int i = 0; i < dim * max_elements; i++) { + data[i] = distrib_real(rng); + } + + // Add data to index + std::cout << "Building index ...\n"; + for (int i = 0; i < max_elements; i++) { + hnswlib::labeltype label = i; + float* point_data = data + i * dim; + alg_hnsw->addPoint(point_data, label); + alg_brute->addPoint(point_data, label); + } + std::cout << "Index is ready\n"; + + // Query random vectors + for (int i = 0; i < num_queries; i++) { + float* query_data = new float[dim]; + for (int j = 0; j < dim; j++) { + query_data[j] = distrib_real(rng); + } + hnswlib::EpsilonSearchStopCondition stop_condition(epsilon2, min_num_candidates, max_num_candidates); + std::vector> result_hnsw = + alg_hnsw->searchStopConditionClosest(query_data, stop_condition); + + // check that returned results are in epsilon region + size_t num_vectors = result_hnsw.size(); + std::unordered_set hnsw_labels; + for (auto pair: result_hnsw) { + float dist = pair.first; + hnswlib::labeltype label = pair.second; + hnsw_labels.insert(label); + assert(dist >=0 && dist <= epsilon2); + } + std::priority_queue> result_brute = + alg_brute->searchKnn(query_data, max_elements); + + // check recall + 
std::unordered_set gt_labels; + while (!result_brute.empty()) { + float dist = result_brute.top().first; + hnswlib::labeltype label = result_brute.top().second; + if (dist < epsilon2) { + gt_labels.insert(label); + } + result_brute.pop(); + } + float correct = 0; + for (const auto& hnsw_label: hnsw_labels) { + if (gt_labels.find(hnsw_label) != gt_labels.end()) { + correct += 1; + } + } + if (gt_labels.size() == 0) { + assert(correct == 0); + continue; + } + float recall = correct / gt_labels.size(); + assert(recall > 0.95); + delete[] query_data; + } + std::cout << "Recall is OK\n"; + + // Query the elements for themselves and check that query can be found + float epsilon2_small = 0.0001f; + int min_candidates_small = 500; + for (size_t i = 0; i < max_elements; i++) { + hnswlib::EpsilonSearchStopCondition stop_condition(epsilon2_small, min_candidates_small, max_num_candidates); + std::vector> result = + alg_hnsw->searchStopConditionClosest(alg_hnsw->getDataByInternalId(i), stop_condition); + size_t num_vectors = result.size(); + // get closest distance + float dist = -1; + if (!result.empty()) { + dist = result[0].first; + } + assert(dist == 0); + } + std::cout << "Small epsilon search is OK\n"; + + delete[] data; + delete alg_brute; + delete alg_hnsw; + return 0; +} diff --git a/tests/cpp/multivector_search_test.cpp b/tests/cpp/multivector_search_test.cpp new file mode 100644 index 00000000..be783176 --- /dev/null +++ b/tests/cpp/multivector_search_test.cpp @@ -0,0 +1,126 @@ +#include +#include "../../hnswlib/hnswlib.h" + +typedef unsigned int docidtype; +typedef float dist_t; + +int main() { + int dim = 16; // Dimension of the elements + int max_elements = 1000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + + int num_queries = 100; + int num_docs = 10; // Number of documents to search + int ef_collection = 15; // Number of candidate documents during the search + // Controlls the recall: higher ef leads to better accuracy, but slower search + docidtype min_doc_id = 0; + docidtype max_doc_id = 49; + + // Initing index + hnswlib::MultiVectorL2Space space(dim); + hnswlib::BruteforceSearch* alg_brute = new hnswlib::BruteforceSearch(&space, max_elements); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction); + + // Generate random data + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib_real; + std::uniform_int_distribution distrib_docid(min_doc_id, max_doc_id); + + size_t data_point_size = space.get_data_size(); + char* data = new char[data_point_size * max_elements]; + for (int i = 0; i < max_elements; i++) { + // set vector value + char* point_data = data + i * data_point_size; + for (int j = 0; j < dim; j++) { + char* vec_data = point_data + j * sizeof(float); + float value = distrib_real(rng); + *(float*)vec_data = value; + } + // set document id + docidtype doc_id = distrib_docid(rng); + space.set_doc_id(point_data, doc_id); + } + + // Add data to index + std::unordered_map label_docid_lookup; + for (int i = 0; i < max_elements; i++) { + hnswlib::labeltype label = i; + char* point_data = data + i * data_point_size; + alg_hnsw->addPoint(point_data, label); + alg_brute->addPoint(point_data, label); + label_docid_lookup[label] = space.get_doc_id(point_data); + } + + // Query random vectors and 
check overall recall + float correct = 0; + float total_num_elements = 0; + size_t query_size = dim * sizeof(float); + for (int i = 0; i < num_queries; i++) { + char* query_data = new char[query_size]; + for (int j = 0; j < dim; j++) { + size_t offset = j * sizeof(float); + char* vec_data = query_data + offset; + float value = distrib_real(rng); + *(float*)vec_data = value; + } + hnswlib::MultiVectorSearchStopCondition stop_condition(space, num_docs, ef_collection); + std::vector> hnsw_results = + alg_hnsw->searchStopConditionClosest(query_data, stop_condition); + + // check number of found documents + std::unordered_set hnsw_docs; + std::unordered_set hnsw_labels; + for (auto pair: hnsw_results) { + hnswlib::labeltype label = pair.second; + hnsw_labels.emplace(label); + docidtype doc_id = label_docid_lookup[label]; + hnsw_docs.emplace(doc_id); + } + assert(hnsw_docs.size() == num_docs); + + // Check overall recall + std::vector> gt_results = + alg_brute->searchKnnCloserFirst(query_data, max_elements); + std::unordered_set gt_docs; + for (int i = 0; i < gt_results.size(); i++) { + if (gt_docs.size() == num_docs) { + break; + } + hnswlib::labeltype gt_label = gt_results[i].second; + if (hnsw_labels.find(gt_label) != hnsw_labels.end()) { + correct += 1; + } + docidtype gt_doc_id = label_docid_lookup[gt_label]; + gt_docs.emplace(gt_doc_id); + total_num_elements += 1; + } + delete[] query_data; + } + float recall = correct / total_num_elements; + std::cout << "random elements search recall : " << recall << "\n"; + assert(recall > 0.95); + + // Query the elements for themselves and measure recall + correct = 0; + for (int i = 0; i < max_elements; i++) { + hnswlib::MultiVectorSearchStopCondition stop_condition(space, num_docs, ef_collection); + std::vector> result = + alg_hnsw->searchStopConditionClosest(data + i * data_point_size, stop_condition); + hnswlib::labeltype label = -1; + if (!result.empty()) { + label = result[0].second; + } + if (label == i) correct++; + } + recall = correct / max_elements; + std::cout << "same elements search recall : " << recall << "\n"; + assert(recall > 0.99); + + delete[] data; + delete alg_brute; + delete alg_hnsw; + return 0; +} diff --git a/tests/cpp/sift_1b.cpp b/tests/cpp/sift_1b.cpp index 43777ff6..c0f296c2 100644 --- a/tests/cpp/sift_1b.cpp +++ b/tests/cpp/sift_1b.cpp @@ -250,11 +250,11 @@ void sift_test1B() { size_t vecdim = 128; char path_index[1024]; char path_gt[1024]; - char *path_q = "../bigann/bigann_query.bvecs"; - char *path_data = "../bigann/bigann_base.bvecs"; - sprintf(path_index, "sift1b_%dm_ef_%d_M_%d.bin", subset_size_milllions, efConstruction, M); + const char *path_q = "../bigann/bigann_query.bvecs"; + const char *path_data = "../bigann/bigann_base.bvecs"; + snprintf(path_index, sizeof(path_index), "sift1b_%dm_ef_%d_M_%d.bin", subset_size_milllions, efConstruction, M); - sprintf(path_gt, "../bigann/gnd/idx_%dM.ivecs", subset_size_milllions); + snprintf(path_gt, sizeof(path_gt), "../bigann/gnd/idx_%dM.ivecs", subset_size_milllions); unsigned char *massb = new unsigned char[vecdim]; diff --git a/tests/cpp/updates_test.cpp b/tests/cpp/updates_test.cpp index 52e1fa14..4dff2f85 100644 --- a/tests/cpp/updates_test.cpp +++ b/tests/cpp/updates_test.cpp @@ -239,7 +239,7 @@ int main(int argc, char **argv) { for (int b = 1; b < dummy_data_multiplier; b++) { std::cout << "Update iteration " << b << "\n"; char cpath[1024]; - sprintf(cpath, "batch_dummy_%02d.bin", b); + snprintf(cpath, sizeof(cpath), "batch_dummy_%02d.bin", b); std::vector 
dummy_batchb = load_batch(path + cpath, N * d); ParallelFor(0, N, num_threads, [&](size_t i, size_t threadId) { diff --git a/tests/python/bindings_test_getdata.py b/tests/python/bindings_test_getdata.py index 515ecebd..3e16f9b9 100644 --- a/tests/python/bindings_test_getdata.py +++ b/tests/python/bindings_test_getdata.py @@ -45,5 +45,11 @@ def testGettingItems(self): self.assertRaises(ValueError, lambda: p.get_items(labels[0])) # After adding them, all labels should be retrievable - returned_items = p.get_items(labels) - self.assertSequenceEqual(data.tolist(), returned_items) + returned_items_np = p.get_items(labels) + self.assertTrue((data == returned_items_np).all()) + + # check returned type of get_items + self.assertTrue(isinstance(returned_items_np, np.ndarray)) + returned_items_list = p.get_items(labels, return_type="list") + self.assertTrue(isinstance(returned_items_list, list)) + self.assertTrue(isinstance(returned_items_list[0], list)) diff --git a/tests/python/bindings_test_replace.py b/tests/python/bindings_test_replace.py index 80003a3a..09c1299e 100644 --- a/tests/python/bindings_test_replace.py +++ b/tests/python/bindings_test_replace.py @@ -94,10 +94,10 @@ def testRandomSelf(self): remaining_data = comb_data[remaining_labels_list] returned_items = hnsw_index.get_items(remaining_labels_list) - self.assertSequenceEqual(remaining_data.tolist(), returned_items) + self.assertTrue((remaining_data == returned_items).all()) returned_items = hnsw_index.get_items(labels3_tr) - self.assertSequenceEqual(data3_tr.tolist(), returned_items) + self.assertTrue((data3_tr == returned_items).all()) # Check index serialization # Delete batch 3 diff --git a/tests/python/draw_git_test_plots.py b/tests/python/draw_git_test_plots.py new file mode 100644 index 00000000..c91c8f5d --- /dev/null +++ b/tests/python/draw_git_test_plots.py @@ -0,0 +1,48 @@ +import os +import glob +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +def plot_data_from_file(file_path): + # Load the data, assuming the last column is text + data = pd.read_csv(file_path, header=None) + rep_size=len(set(data[data.columns[-1]])) + data.drop(data.columns[-1], axis=1, inplace=True) # Drop the last column (text) + + # Number of numerical columns + num_columns = data.shape[1] + + # Create a subplot for each column + fig, axes = plt.subplots(num_columns, 1, figsize=(10, 6 * num_columns)) + + # In case there is only one column, axes will not be an array, so we convert it + if num_columns == 1: + axes = [axes] + + for i, ax in enumerate(axes): + idx=0 + ax.scatter(np.asarray(data.index,dtype=np.int64)%rep_size, data[i], label=f'Column {i+1}') + ax.set_title(f'Column {i+1}') + ax.set_xlabel('ID Number') + ax.set_ylabel('Value') + ax.legend() + ax.grid(True) + + plt.tight_layout() + plt.suptitle(f'Data from {os.path.basename(file_path)}') + + # Save the plot to a file + plt.savefig(file_path.replace('.txt', '.png')) + plt.close() + +def scan_and_plot(directory): + # Scan for .txt files in the given directory + txt_files = glob.glob(os.path.join(directory, '*.txt')) + + # Process each file + for file in txt_files: + print(f'Processing {file}...') + plot_data_from_file(file) + print(f'Plot saved for {file}') +# Replace 'your_folder_path' with the path to the folder containing the .txt files +scan_and_plot('./') \ No newline at end of file diff --git a/tests/python/git_tester.py b/tests/python/git_tester.py index 1f9c2ba7..e7657fee 100644 --- a/tests/python/git_tester.py +++ b/tests/python/git_tester.py @@ -9,16 +9,18 
@@ speedtest_copy_path = os.path.join("tests", "python", "speedtest2.py") shutil.copyfile(speedtest_src_path, speedtest_copy_path) # the file has to be outside of git -commits = list(Repository('.', from_tag="v0.6.2").traverse_commits()) +commits = list(Repository('.', from_tag="v0.7.0").traverse_commits()) print("Found commits:") for idx, commit in enumerate(commits): name = commit.msg.replace('\n', ' ').replace('\r', ' ') print(idx, commit.hash, name) for commit in commits: - name = commit.msg.replace('\n', ' ').replace('\r', ' ').replace(",", ";") + commit_time = commit.author_date.strftime("%Y-%m-%d %H:%M:%S") + author_name = commit.author.name + name = "auth:"+author_name+"_"+commit_time+"_msg:"+commit.msg.replace('\n', ' ').replace('\r', ' ').replace(",", ";") print("\nProcessing", commit.hash, name) - + if os.path.exists("build"): shutil.rmtree("build") os.system(f"git checkout {commit.hash}") @@ -43,10 +45,11 @@ print("build failed!!!!") continue - # os.system(f'python {speedtest_copy_path} -n "{hash[:4]}_{name}" -d 32 -t 1') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 16 -t 1') os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 16 -t 64') - # os.system(f'python {speedtest_copy_path} -n "{name}" -d 64 -t 1') - # os.system(f'python {speedtest_copy_path} -n "{name}" -d 128 -t 1') - # os.system(f'python {speedtest_copy_path} -n "{name}" -d 4 -t 24') - # os.system(f'python {speedtest_copy_path} -n "{name}" -d 128 -t 24') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 4 -t 1') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 4 -t 64') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 128 -t 1') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 128 -t 64') +
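
For quick reference, a minimal usage sketch of the Python-side additions in this PR — `get_items` with `return_type`, the new `index_file_size()` method, and the multi-threaded `BFIndex.knn_query` — the dataset sizes, thread counts, and `k` below are illustrative assumptions, not values taken from the PR:

```python
import numpy as np
import hnswlib

dim, num_elements = 16, 1000  # illustrative sizes, not from the PR
data = np.float32(np.random.random((num_elements, dim)))

# HNSW index: exercises get_items(return_type=...) and index_file_size()
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=num_elements, ef_construction=200, M=16)
p.add_items(data)

items_np = p.get_items(np.arange(10))                        # numpy array, shape (10, dim)
items_list = p.get_items(np.arange(10), return_type='list')  # list of lists
print("index file size in bytes:", p.index_file_size())

# Brute-force index: knn_query now accepts num_threads and runs in parallel
bf = hnswlib.BFIndex(space='l2', dim=dim)
bf.init_index(max_elements=num_elements)
bf.add_items(data)
bf.set_num_threads(4)                                        # also exposed as bf.num_threads
labels, distances = bf.knn_query(data[:5], k=3, num_threads=2)
print(bf.get_max_elements(), bf.get_current_count())
```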