From 149ff4590f16f3dbff9ec891dde8189fafb30987 Mon Sep 17 00:00:00 2001 From: Wenzel Jakob Date: Fri, 19 Apr 2024 09:18:13 +0200 Subject: [PATCH] Add ``robin_map/set::erase_fast()`` method (fixes #75) (#76) This commit introduces a new method ``void erase_fast(iterator)`` to both set and map classes resembling the existing ``iterator erase(iterator)``. The main difference is that it does _not_ return an iterator, which is useful to avoid a performance pitfall explained in issue #75. --- README.md | 40 ++++++++++++++++++++++++++++++++++++++- include/tsl/robin_hash.h | 9 +++++---- include/tsl/robin_map.h | 8 ++++++++ include/tsl/robin_set.h | 8 ++++++++ tests/robin_map_tests.cpp | 11 +++++++++++ tests/robin_set_tests.cpp | 10 ++++++++++ 6 files changed, 81 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f8901e4..f012a2b 100644 --- a/README.md +++ b/README.md @@ -275,7 +275,6 @@ int main() { } ``` - #### Serialization The library provides an efficient way to serialize and deserialize a map or a set so that it can be saved to a file or send through the network. @@ -478,6 +477,45 @@ int main() { } ``` +#### Performance pitfalls + +Two potential performance pitfalls involving `tsl::robin_map` and +`tsl::robin_set` are noteworthy: + +1. *Bad hashes*. Hash functions that produce many collisions can lead to the + following surprising behavior: when the number of collisions exceeds a + certain threshold, the hash table will automatically expand to fix the + problem. However, in degenerate cases, this expansion might have _no effect_ + on the collision count, causing a failure mode where a linear sequence of + insertion leads to exponential storage growth. + + This case has mainly been observed when using the default power-of-two + growth strategy with the default STL `std::hash` for arithmetic types + `T`, which is often an identity! See issue + [#39](https://github.com/Tessil/robin-map/issues/39) for an example. The + solution is simple: use a better hash function and/or `tsl::robin_pg_set` / + `tsl::robin_pg_map`. + +2. *Element erasure and low load factors*. `tsl::robin_map` and + `tsl::robin_set` mirror the STL map/set API, which exposes an `iterator + erase(iterator)` method that removes an element at a certain position, + returning a valid iterator that points to the next element. + + Constructing this new iterator object requires walking to the next nonempty + bucket in the table, which can be a expensive operation when the hash table + has a low *load factor* (i.e., when `capacity()` is much larger then + `size()`). + + The `erase()` method furthermore never shrinks & re-hashes the table as + this is not permitted by the specification of this function. A linear + sequence of random removals without intermediate insertions can then lead to + a degenerate case with quadratic runtime cost. + + In such cases, an iterator return value is often not even needed, so the + cost is entirely unnecessary. Both `tsl::robin_set` and `tsl::robin_map` + therefore provide an alternative erasure method `void erase_fast(iterator)` + that does not return an iterator to avoid having to find the next element. + ### License The code is licensed under the MIT license, see the [LICENSE file](LICENSE) for details. diff --git a/include/tsl/robin_hash.h b/include/tsl/robin_hash.h index 14c3943..03d23a9 100644 --- a/include/tsl/robin_hash.h +++ b/include/tsl/robin_hash.h @@ -820,6 +820,10 @@ class robin_hash : private Hash, private KeyEqual, private GrowthPolicy { return try_emplace(std::forward(key), std::forward(args)...).first; } + void erase_fast(iterator pos) { + erase_from_bucket(pos); + } + /** * Here to avoid `template size_type erase(const K& key)` being used * when we use an `iterator` instead of a `const_iterator`. @@ -836,8 +840,6 @@ class robin_hash : private Hash, private KeyEqual, private GrowthPolicy { ++pos; } - m_try_shrink_on_next_insert = true; - return pos; } @@ -916,8 +918,6 @@ class robin_hash : private Hash, private KeyEqual, private GrowthPolicy { auto it = find(key, hash); if (it != end()) { erase_from_bucket(it); - m_try_shrink_on_next_insert = true; - return 1; } else { return 0; @@ -1211,6 +1211,7 @@ class robin_hash : private Hash, private KeyEqual, private GrowthPolicy { previous_ibucket = ibucket; ibucket = next_bucket(ibucket); } + m_try_shrink_on_next_insert = true; } template diff --git a/include/tsl/robin_map.h b/include/tsl/robin_map.h index aeb354c..b594810 100644 --- a/include/tsl/robin_map.h +++ b/include/tsl/robin_map.h @@ -339,6 +339,14 @@ class robin_map { } size_type erase(const key_type& key) { return m_ht.erase(key); } + /** + * Erase the element at position 'pos'. In contrast to the regular erase() + * function, erase_fast() does not return an iterator. This allows it to be + * faster especially in hash tables with a low load factor, where finding the + * next nonempty bucket would be costly. + */ + void erase_fast(iterator pos) { return m_ht.erase_fast(pos); } + /** * Use the hash value 'precalculated_hash' instead of hashing the key. The * hash value should be the same as hash_function()(key). Useful to speed-up diff --git a/include/tsl/robin_set.h b/include/tsl/robin_set.h index 5478950..e115007 100644 --- a/include/tsl/robin_set.h +++ b/include/tsl/robin_set.h @@ -263,6 +263,14 @@ class robin_set { } size_type erase(const key_type& key) { return m_ht.erase(key); } + /** + * Erase the element at position 'pos'. In contrast to the regular erase() + * function, erase_fast() does not return an iterator. This allows it to be + * faster especially in hash sets with a low load factor, where finding the + * next nonempty bucket would be costly. + */ + void erase_fast(iterator pos) { return m_ht.erase_fast(pos); } + /** * Use the hash value 'precalculated_hash' instead of hashing the key. The * hash value should be the same as hash_function()(key). Useful to speed-up diff --git a/tests/robin_map_tests.cpp b/tests/robin_map_tests.cpp index 80858e8..68398e2 100644 --- a/tests/robin_map_tests.cpp +++ b/tests/robin_map_tests.cpp @@ -1448,4 +1448,15 @@ BOOST_AUTO_TEST_CASE(test_precalculated_hash) { BOOST_CHECK_EQUAL(map.erase(4, map.hash_function()(2)), 0); } +BOOST_AUTO_TEST_CASE(test_erase_fast) { + using Map = tsl::robin_map; + Map map; + map.emplace(4, 5); + auto it = map.find(4); + BOOST_CHECK(it != map.end()); + map.erase_fast(it); + BOOST_CHECK(map.size() == 0); +} + + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/robin_set_tests.cpp b/tests/robin_set_tests.cpp index a710b79..e68b514 100644 --- a/tests/robin_set_tests.cpp +++ b/tests/robin_set_tests.cpp @@ -161,4 +161,14 @@ BOOST_AUTO_TEST_CASE(test_serialize_deserialize) { BOOST_CHECK(set_deserialized == set); } +BOOST_AUTO_TEST_CASE(test_erase_fast) { + using Set = tsl::robin_set; + Set set; + set.emplace(4); + auto it = set.find(4); + BOOST_CHECK(it != set.end()); + set.erase_fast(it); + BOOST_CHECK(set.size() == 0); +} + BOOST_AUTO_TEST_SUITE_END()