From b7a037fac36ab4c39bb24f92cd2146df8ad9f4f6 Mon Sep 17 00:00:00 2001 From: vegetableysm Date: Thu, 28 Mar 2024 16:06:20 +0800 Subject: [PATCH] Add pthash. Signed-off-by: vegetableysm --- NOTICE.txt | 4 + .../external_memory_builder_single_phf.hpp | 753 ++++++++++++++++++ .../internal_memory_builder_single_phf.hpp | 365 +++++++++ thirdparty/pthash/builders/search.hpp | 358 +++++++++ thirdparty/pthash/builders/util.hpp | 301 +++++++ thirdparty/pthash/encoders/bit_vector.hpp | 347 ++++++++ thirdparty/pthash/encoders/compact_vector.hpp | 306 +++++++ thirdparty/pthash/encoders/darray.hpp | 185 +++++ thirdparty/pthash/encoders/ef_sequence.hpp | 133 ++++ thirdparty/pthash/encoders/encoders.hpp | 161 ++++ thirdparty/pthash/encoders/util.hpp | 84 ++ thirdparty/pthash/essentials/essentials.hpp | 612 ++++++++++++++ thirdparty/pthash/fastmod/fastmod.h | 209 +++++ thirdparty/pthash/mm_file/mm_file.hpp | 170 ++++ thirdparty/pthash/pthash.hpp | 25 + thirdparty/pthash/single_phf.hpp | 151 ++++ thirdparty/pthash/utils/bucketers.hpp | 92 +++ thirdparty/pthash/utils/hasher.hpp | 188 +++++ thirdparty/pthash/utils/logger.hpp | 87 ++ thirdparty/pthash/utils/util.hpp | 57 ++ 20 files changed, 4588 insertions(+) create mode 100644 thirdparty/pthash/builders/external_memory_builder_single_phf.hpp create mode 100644 thirdparty/pthash/builders/internal_memory_builder_single_phf.hpp create mode 100644 thirdparty/pthash/builders/search.hpp create mode 100644 thirdparty/pthash/builders/util.hpp create mode 100644 thirdparty/pthash/encoders/bit_vector.hpp create mode 100644 thirdparty/pthash/encoders/compact_vector.hpp create mode 100644 thirdparty/pthash/encoders/darray.hpp create mode 100644 thirdparty/pthash/encoders/ef_sequence.hpp create mode 100644 thirdparty/pthash/encoders/encoders.hpp create mode 100644 thirdparty/pthash/encoders/util.hpp create mode 100644 thirdparty/pthash/essentials/essentials.hpp create mode 100644 thirdparty/pthash/fastmod/fastmod.h create mode 100644 thirdparty/pthash/mm_file/mm_file.hpp create mode 100644 thirdparty/pthash/pthash.hpp create mode 100644 thirdparty/pthash/single_phf.hpp create mode 100644 thirdparty/pthash/utils/bucketers.hpp create mode 100644 thirdparty/pthash/utils/hasher.hpp create mode 100644 thirdparty/pthash/utils/logger.hpp create mode 100644 thirdparty/pthash/utils/util.hpp diff --git a/NOTICE.txt b/NOTICE.txt index c326975c27..ae1b2258a7 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -59,3 +59,7 @@ This product includes software from the BBHash project This product includes software from the rax project (BSD, 2-clause) * Copyright (c) 2017-2019, Salvatore Sanfilippo * https://github.com/antirez/rax + +This product includes software from the pthash project (MIT License) + * Copyright (c) 2020-2024, Giulio Ermanno Pibiri and Roberto Trani + * https://github.com/jermp/pthash diff --git a/thirdparty/pthash/builders/external_memory_builder_single_phf.hpp b/thirdparty/pthash/builders/external_memory_builder_single_phf.hpp new file mode 100644 index 0000000000..605f4809c2 --- /dev/null +++ b/thirdparty/pthash/builders/external_memory_builder_single_phf.hpp @@ -0,0 +1,753 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. 
+ * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pthash/builders/search.hpp" +#include "pthash/builders/util.hpp" +#include "pthash/mm_file/mm_file.hpp" + +#include "pthash/utils/bucketers.hpp" +#include "pthash/utils/hasher.hpp" +#include "pthash/utils/logger.hpp" + +namespace pthash { + +template +struct external_memory_builder_single_phf { + typedef Hasher hasher_type; + + external_memory_builder_single_phf() + : m_pilots_filename(""), m_free_slots_filename("") {} + // non construction-copyable + external_memory_builder_single_phf( + external_memory_builder_single_phf const&) = delete; + // non copyable + external_memory_builder_single_phf& operator=( + external_memory_builder_single_phf const&) = delete; + + ~external_memory_builder_single_phf() { + if (m_pilots_filename != "") + std::remove(m_pilots_filename.c_str()); + m_pilots_filename = ""; + if (m_free_slots_filename != "") + std::remove(m_free_slots_filename.c_str()); + m_free_slots_filename = ""; + } + + template + build_timings build_from_keys(Iterator keys, uint64_t num_keys, + build_configuration const& config) { + assert(num_keys > 1); + if (config.alpha == 0 or config.alpha > 1.0) { + throw std::invalid_argument("load factor must be > 0 and <= 1.0"); + } + + build_timings time; + uint64_t table_size = static_cast(num_keys) / config.alpha; + if ((table_size & (table_size - 1)) == 0) + table_size += 1; + uint64_t num_buckets = + std::ceil((config.c * num_keys) / std::log2(num_keys)); + + if (sizeof(bucket_id_type) != sizeof(uint64_t) and + num_buckets > (1ULL << (sizeof(bucket_id_type) * 8))) { + throw std::runtime_error( + "using too many buckets: change bucket_id_type to uint64_t or use a " + "smaller c"); + } + + m_num_keys = num_keys; + m_table_size = table_size; + m_num_buckets = num_buckets; + m_seed = + config.seed == constants::invalid_seed ? 
random_value() : config.seed; + m_bucketer.init(num_buckets); + + uint64_t ram = config.ram; + + uint64_t bitmap_taken_bytes = 8 * ((table_size + 63) / 64); + uint64_t hashed_pilots_cache_bytes = search_cache_size * sizeof(uint64_t); + if (bitmap_taken_bytes + hashed_pilots_cache_bytes >= ram) { + std::stringstream ss; + ss << "not enough RAM available, the bitmap alone takes " + << static_cast(bitmap_taken_bytes) / 1000000000 + << " GB of space."; + throw std::runtime_error(ss.str()); + } + + if (config.verbose_output) { + constexpr uint64_t GB = 1000000000; + uint64_t peak = + num_keys * (sizeof(bucket_payload_pair) + sizeof(uint64_t)) + + (num_keys + num_buckets) * sizeof(uint64_t); + std::cout << "c = " << config.c << std::endl; + std::cout << "alpha = " << config.alpha << std::endl; + std::cout << "num_keys = " << num_keys << std::endl; + std::cout << "table_size = " << table_size << std::endl; + std::cout << "num_buckets = " << num_buckets << std::endl; + std::cout << "using " << static_cast(ram) / GB << " GB of RAM" + << " (" << static_cast(bitmap_taken_bytes) / GB + << " GB occupied by the bitmap)" << std::endl; + std::cout << "using a peak of " << static_cast(peak) / GB + << " GB of disk space" << std::endl; + } + + uint64_t run_identifier = clock_type::now().time_since_epoch().count(); + temporary_files_manager tfm(config.tmp_dir, run_identifier); + + uint64_t num_non_empty_buckets = 0; + + try { + auto start = clock_type::now(); + { + auto start = clock_type::now(); + std::vector pairs_blocks; + map(keys, num_keys, pairs_blocks, tfm, config); + auto stop = clock_type::now(); + if (config.verbose_output) { + std::cout << " == map+sort " << tfm.get_num_pairs_files() + << " files(s) took: " << seconds(stop - start) << " seconds" + << std::endl; + } + start = clock_type::now(); + buckets_t buckets = tfm.buckets(config); + merge(pairs_blocks, buckets, config.verbose_output); + buckets.flush(); + for (auto& pairs_block : pairs_blocks) + pairs_block.close(); + num_non_empty_buckets = buckets.num_buckets(); + tfm.remove_all_pairs_files(); + stop = clock_type::now(); + if (config.verbose_output) { + std::cout << " == merge+check took: " << seconds(stop - start) + << " seconds" << std::endl; + std::cout << " == max bucket size = " << int(tfm.max_bucket_size()) + << std::endl; + } + } + auto stop = clock_type::now(); + time.mapping_ordering_seconds = seconds(stop - start); + if (config.verbose_output) { + std::cout << " == map+ordering took " << time.mapping_ordering_seconds + << " seconds" << std::endl; + } + } catch (...) 
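+      // mapping or merging failed: remove the partially written temporary
+      // files before rethrowing to the caller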
{ + tfm.remove_all_pairs_files(); + tfm.remove_all_merge_files(); + throw; + } + + try { + auto start = clock_type::now(); + bit_vector_builder taken(m_table_size); + + { // search + auto buckets_iterator = tfm.buckets_iterator(); + + // write all bucket-pilot pairs to files + uint64_t ram_for_pilots = + ram - bitmap_taken_bytes - hashed_pilots_cache_bytes; + auto pilots = tfm.get_multifile_pairs_writer(num_non_empty_buckets, + ram_for_pilots, 1, 0); + + search(m_num_keys, m_num_buckets, num_non_empty_buckets, m_seed, config, + buckets_iterator, taken, pilots); + + pilots.flush(); + buckets_iterator.close(); + // merge all sorted bucket-pilot pairs on a single file, saving only the + // pilot + pilots_merger_t pilots_merger(tfm.get_pilots_filename(), ram); + merge(tfm.pairs_blocks(), pilots_merger, false); + pilots_merger.finalize_and_close(m_num_buckets); + + if (m_pilots_filename != "") + std::remove(m_pilots_filename.c_str()); + m_pilots_filename = tfm.get_pilots_filename(); + + // remove unused temporary files + tfm.remove_all_pairs_files(); + tfm.remove_all_merge_files(); + } + + if (config.minimal_output) { // fill free slots + // write all free slots to file + buffered_file_t writer(tfm.get_free_slots_filename(), + ram - bitmap_taken_bytes); + fill_free_slots(taken, num_keys, writer); + writer.close(); + if (m_free_slots_filename != "") + std::remove(m_free_slots_filename.c_str()); + m_free_slots_filename = tfm.get_free_slots_filename(); + } + + auto stop = clock_type::now(); + time.searching_seconds = seconds(stop - start); + if (config.verbose_output) { + std::cout << " == search took " << time.searching_seconds << " seconds" + << std::endl; + } + } catch (...) { + tfm.remove_all_pairs_files(); + tfm.remove_all_merge_files(); + throw; + } + + return time; + } + + uint64_t seed() const { return m_seed; } + + uint64_t num_keys() const { return m_num_keys; } + + uint64_t table_size() const { return m_table_size; } + + skew_bucketer bucketer() const { return m_bucketer; } + + mm::file_source pilots() const { + return mm::file_source(m_pilots_filename); + } + + mm::file_source free_slots() const { + return mm::file_source(m_free_slots_filename); + } + + private: + uint64_t m_seed; + uint64_t m_num_keys; + uint64_t m_table_size; + uint64_t m_num_buckets; + skew_bucketer m_bucketer; + std::string m_pilots_filename; + std::string m_free_slots_filename; + + template + struct buffer_t { + buffer_t(uint64_t ram) : m_buffer_capacity(ram / sizeof(T)) { + m_buffer.reserve(m_buffer_capacity); + assert(m_buffer_capacity > 0); + } + + template + void emplace_back(_Args&&... 
__args) { + m_buffer.emplace_back(std::forward<_Args>(__args)...); + if (--m_buffer_capacity == 0) + flush(); + } + + void flush() { + if (!m_buffer.empty()) { + uint64_t buffer_size = m_buffer.size(); + flush_impl(m_buffer); + m_buffer_capacity += buffer_size; + m_buffer.clear(); + } + } + + protected: + virtual void flush_impl(std::vector& buffer) = 0; + + private: + uint64_t m_buffer_capacity; + std::vector m_buffer; + }; + + template + struct buffered_file_t : buffer_t { + buffered_file_t(std::string const& filename, uint64_t ram) + : buffer_t(ram) { + m_out.open(filename, std::ofstream::out | std::ofstream::binary); + if (!m_out.is_open()) + throw std::runtime_error("cannot open binary file in write mode"); + } + + void close() { + buffer_t::flush(); + m_out.close(); + } + + protected: + void flush_impl(std::vector& buffer) { + m_out.write(reinterpret_cast(buffer.data()), + buffer.size() * sizeof(T)); + } + + private: + std::ofstream m_out; + }; + + template + struct memory_view { + typedef T* iterator; + typedef const T* const_iterator; + + memory_view() : m_begin(nullptr), m_end(nullptr){}; + memory_view(T* begin, uint64_t size) + : m_begin(begin), m_end(begin + size) {} + + inline T* begin() const { return m_begin; } + inline T* end() const { return m_end; } + inline T& operator[](uint64_t pos) const { return *(m_begin + pos); } + inline uint64_t size() const { return std::distance(m_begin, m_end); } + + protected: + T *m_begin, *m_end; + }; + + template + struct reader_t : memory_view { + void open(std::string const& filename) { + if (m_is.is_open()) + m_is.close(); + m_is.open(filename, mm::advice::sequential); + if (!m_is.is_open()) + throw std::runtime_error("cannot open temporary file (read)"); + memory_view::m_begin = m_is.data(); + memory_view::m_end = m_is.data() + m_is.size(); + } + + void close() { m_is.close(); } + + private: + mm::file_source m_is; + }; + + typedef reader_t pairs_t; + + struct pairs_merger_t { + pairs_merger_t(std::string const& filename, uint64_t ram) + : m_buffer(filename, ram) {} + + template + void add(bucket_id_type bucket_id, bucket_size_type bucket_size, + HashIterator hashes) { + for (uint64_t k = 0; k != bucket_size; ++k, ++hashes) { + m_buffer.emplace_back(bucket_id, *hashes); + } + } + + void close() { m_buffer.close(); } + + private: + buffered_file_t m_buffer; + }; + + struct buckets_t { // merger + buckets_t(std::vector const& filenames, uint64_t ram, + std::vector& used_bucket_sizes) + : m_filenames(filenames), + m_buffers(filenames.size()), + m_buffer_capacity(ram / (sizeof(uint64_t) * 2)), + m_ram(ram / (sizeof(uint64_t) * 2)), + m_used_bucket_sizes(used_bucket_sizes), + m_outs(filenames.size()), + m_num_buckets(0) { + assert(m_filenames.size() == m_used_bucket_sizes.size()); + m_non_empty_buckets.reserve(filenames.size()); + for (uint64_t i = 0; i != filenames.size(); ++i) { + if (m_used_bucket_sizes[i]) { + throw std::runtime_error("One of the output files is already open"); + } + } + } + + template + void add(bucket_id_type bucket_id, bucket_size_type bucket_size, + HashIterator hashes) { + assert(bucket_size > 0 and bucket_size <= MAX_BUCKET_SIZE); + ensure_capacity(bucket_size); + uint64_t i = bucket_size - 1; + if (m_buffers[i].empty()) + m_non_empty_buckets.push_back(bucket_size - 1); + m_buffers[i].push_back(bucket_id); + for (uint64_t k = 0; k != bucket_size; ++k, ++hashes) + m_buffers[i].push_back(*hashes); + m_buffer_capacity -= bucket_size + 1; + ++m_num_buckets; + } + + uint64_t num_buckets() const { return m_num_buckets; 
}; + + void flush() { + for (uint64_t i = 0; i != m_buffers.size(); ++i) + flush_i(i); + m_non_empty_buckets.clear(); + } + + private: + void ensure_capacity(uint64_t bucket_size) { + if (bucket_size + 1 > m_buffer_capacity) { + std::sort(m_non_empty_buckets.begin(), m_non_empty_buckets.end(), + [&](uint64_t i, uint64_t j) { + return m_buffers[i].size() < m_buffers[j].size(); + }); + + uint64_t target = + std::max((uint64_t) std::ceil(0.999 * m_ram), bucket_size + 1); + while (m_buffer_capacity < target) { + flush_i(m_non_empty_buckets.back()); + m_non_empty_buckets.pop_back(); + } + } + } + + void flush_i(uint64_t i) { + if (m_buffers[i].size() == 0) + return; + if (!m_used_bucket_sizes[i]) { + m_outs[i].open(m_filenames[i].c_str(), + std::ofstream::out | std::ofstream::binary); + if (!m_outs[i].is_open()) { + throw std::runtime_error("cannot open temporary file (write)"); + } + m_used_bucket_sizes[i] = true; + } + m_outs[i].write(reinterpret_cast(m_buffers[i].data()), + m_buffers[i].size() * sizeof(uint64_t)); + m_buffer_capacity += m_buffers[i].size(); + std::vector().swap(m_buffers[i]); + } + + std::vector m_filenames; + std::vector> m_buffers; + uint64_t m_buffer_capacity; + uint64_t m_ram; + std::vector m_non_empty_buckets; + std::vector& m_used_bucket_sizes; + std::vector m_outs; + uint64_t m_num_buckets; + }; + + struct buckets_iterator_t { + buckets_iterator_t( + std::vector> const& + sizes_filenames) + : m_sizes(sizes_filenames.size()), m_sources(sizes_filenames.size()) { + m_pos = sizes_filenames.size(); + for (uint64_t i = 0, i_end = m_pos; i < i_end; ++i) { + m_sizes[i] = sizes_filenames[i].first; + m_sources[i].open(sizes_filenames[i].second, mm::advice::sequential); + assert(i == 0 or m_sizes[i - 1] < m_sizes[i]); + } + read_next_file(); + } + + void close() { + for (auto& is : m_sources) + is.close(); + } + + inline bucket_t operator*() { + bucket_t bucket; + bucket.init(m_it, m_bucket_size); + return bucket; + } + + void operator++() { + m_it += m_bucket_size + 1; + if (m_it >= m_end) + read_next_file(); + } + + private: + void read_next_file() { + if (m_pos == 0) { + m_it = m_end; + return; + } + --m_pos; + m_bucket_size = m_sizes[m_pos]; + m_it = m_sources[m_pos].data(); + m_end = m_it + m_sources[m_pos].size(); + } + + uint64_t m_pos; + std::vector m_sizes; + std::vector> m_sources; + bucket_size_type m_bucket_size; + uint64_t const* m_it; + uint64_t const* m_end; + }; + + struct pilots_merger_t { + pilots_merger_t(std::string const& filename, uint64_t ram) + : m_buffer(filename, ram), m_next_bucket_id(0) {} + + template + void add(bucket_id_type bucket_id, bucket_size_type bucket_size, + HashIterator hashes) { + assert(bucket_size == 1); + (void) bucket_size; // avoid unused warning in release mode + emplace_back_and_fill(bucket_id, *hashes); + } + + void finalize_and_close(uint64_t num_buckets) { + if (m_next_bucket_id < num_buckets) + emplace_back_and_fill(num_buckets - 1, 0); + m_buffer.close(); + } + + private: + inline void emplace_back_and_fill(bucket_id_type bucket_id, + uint64_t pilot) { + assert(m_next_bucket_id <= bucket_id); + + while (m_next_bucket_id++ < bucket_id) { + m_buffer.emplace_back(0); + } + m_buffer.emplace_back(pilot); + } + + buffered_file_t m_buffer; + uint64_t m_next_bucket_id; + }; + + struct multifile_pairs_writer : buffer_t { + multifile_pairs_writer(std::vector const& filenames, + uint64_t& num_pairs_files, uint64_t num_pairs, + uint64_t ram, uint64_t num_threads_sort = 1, + uint64_t ram_parallel_merge = 0) + : 
buffer_t(get_balanced_ram(num_pairs, ram)), + m_filenames(filenames), + m_num_pairs_files(num_pairs_files), + m_num_threads_sort(num_threads_sort), + m_ram_parallel_merge(ram_parallel_merge) { + assert(num_threads_sort > 1 or ram_parallel_merge == 0); + } + + protected: + void flush_impl(std::vector& buffer) { + const uint64_t size = buffer.size(); + + if (m_num_threads_sort > 1) { // parallel + std::vector> blocks; + uint64_t num_keys_per_thread = + (size + m_num_threads_sort - 1) / m_num_threads_sort; + auto exe = [&](uint64_t tid) { + std::sort(blocks[tid].begin(), blocks[tid].end()); + }; + + std::vector threads(m_num_threads_sort); + for (uint64_t i = 0; i != m_num_threads_sort; ++i) { + auto begin = buffer.data() + i * num_keys_per_thread; + auto end = + buffer.data() + std::min((i + 1) * num_keys_per_thread, size); + uint64_t block_size = std::distance(begin, end); + + blocks.emplace_back(begin, block_size); + threads[i] = std::thread(exe, i); + } + for (uint64_t i = 0; i != m_num_threads_sort; ++i) { + if (threads[i].joinable()) + threads[i].join(); + } + pairs_merger_t pairs_merger(m_filenames[m_num_pairs_files], + m_ram_parallel_merge); + ++m_num_pairs_files; + merge(blocks, pairs_merger, false); + pairs_merger.close(); + } else { // sequential + std::ofstream out(m_filenames[m_num_pairs_files], + std::ofstream::out | std::ofstream::binary); + if (!out.is_open()) + throw std::runtime_error("cannot open temporary file (write)"); + ++m_num_pairs_files; + std::sort(buffer.begin(), buffer.end()); + out.write(reinterpret_cast(buffer.data()), + size * sizeof(bucket_payload_pair)); + out.close(); + } + } + + private: + std::vector m_filenames; + uint64_t& m_num_pairs_files; + uint64_t m_num_threads_sort; + uint64_t m_ram_parallel_merge; + + static uint64_t get_balanced_ram(uint64_t num_pairs, uint64_t ram) { + uint64_t num_pairs_per_file = ram / sizeof(bucket_payload_pair); + uint64_t num_temporary_files = + (num_pairs + num_pairs_per_file - 1) / num_pairs_per_file; + uint64_t balanced_num_pairs_per_temporary_file = + (num_pairs + num_temporary_files - 1) / num_temporary_files; + uint64_t balanced_ram = + balanced_num_pairs_per_temporary_file * sizeof(bucket_payload_pair); + assert(balanced_ram <= ram); + + return balanced_ram; + } + }; + + struct temporary_files_manager { + temporary_files_manager(std::string const& dir_name, + uint64_t run_identifier) + : m_dir_name(dir_name), + m_run_identifier(run_identifier), + m_num_pairs_files(0), + m_used_bucket_sizes(MAX_BUCKET_SIZE) { + std::fill(m_used_bucket_sizes.begin(), m_used_bucket_sizes.end(), false); + } + + multifile_pairs_writer get_multifile_pairs_writer( + uint64_t num_pairs, uint64_t ram, uint64_t num_threads_sort = 1, + uint64_t ram_parallel_merge = 0) { + uint64_t num_pairs_per_file = ram / sizeof(bucket_payload_pair); + uint64_t num_temporary_files = + (num_pairs + num_pairs_per_file - 1) / num_pairs_per_file; + std::vector filenames; + filenames.reserve(num_temporary_files); + for (uint64_t i = 0; i < num_temporary_files; ++i) { + filenames.emplace_back(get_pairs_filename(m_num_pairs_files + i)); + } + return multifile_pairs_writer(filenames, m_num_pairs_files, num_pairs, + ram, num_threads_sort, ram_parallel_merge); + } + + uint64_t get_num_pairs_files() const { return m_num_pairs_files; } + + void remove_all_pairs_files() { + while (m_num_pairs_files > 0) { + std::remove(get_pairs_filename(--m_num_pairs_files).c_str()); + } + } + + void remove_all_merge_files() { + for (uint64_t i = 0; i != MAX_BUCKET_SIZE; ++i) { + if 
(m_used_bucket_sizes[i]) { + std::remove(get_buckets_filename(i + 1).c_str()); + m_used_bucket_sizes[i] = false; + } + } + } + + std::vector pairs_blocks() const { + std::vector result(m_num_pairs_files); + for (uint64_t i = 0; i != m_num_pairs_files; ++i) + result[i].open(get_pairs_filename(i)); + return result; + }; + + buckets_t buckets(build_configuration const& config) { + std::vector filenames; + filenames.reserve(MAX_BUCKET_SIZE); + for (uint64_t bucket_size = 1; bucket_size <= MAX_BUCKET_SIZE; + ++bucket_size) { + filenames.emplace_back(get_buckets_filename(bucket_size)); + } + return buckets_t(filenames, config.ram, m_used_bucket_sizes); + } + + buckets_iterator_t buckets_iterator() { + std::vector> sizes_filenames; + for (uint64_t i = 0; i != MAX_BUCKET_SIZE; ++i) { + if (m_used_bucket_sizes[i]) { + uint64_t bucket_size = i + 1; + sizes_filenames.emplace_back(bucket_size, + get_buckets_filename(bucket_size)); + } + } + assert(sizes_filenames.size() > 0); + return buckets_iterator_t(sizes_filenames); + } + + bucket_size_type max_bucket_size() { + bucket_size_type bucket_size = 0; + for (uint64_t i = 0, i_end = m_used_bucket_sizes.size(); i < i_end; ++i) { + if (m_used_bucket_sizes[i]) + bucket_size = i; + } + return bucket_size + 1; + } + + std::string get_pilots_filename() const { + std::stringstream filename; + filename << m_dir_name << "/pthash.tmp.run" << m_run_identifier + << ".pilots" + << ".bin"; + return filename.str(); + } + + std::string get_free_slots_filename() const { + std::stringstream filename; + filename << m_dir_name << "/pthash.tmp.run" << m_run_identifier + << ".free_slots" + << ".bin"; + return filename.str(); + } + + private: + std::string get_pairs_filename(uint32_t file_id) const { + std::stringstream filename; + filename << m_dir_name << "/pthash.tmp.run" << m_run_identifier + << ".pairs" << file_id << ".bin"; + return filename.str(); + } + + std::string get_buckets_filename(bucket_size_type bucket_size) const { + std::stringstream filename; + filename << m_dir_name << "/pthash.tmp.run" << m_run_identifier << ".size" + << static_cast(bucket_size) << ".bin"; + return filename.str(); + } + + std::string m_dir_name; + uint64_t m_run_identifier; + uint64_t m_num_pairs_files; + std::vector m_used_bucket_sizes; + }; + + template + void map(Iterator keys, uint64_t num_keys, std::vector& pairs_blocks, + temporary_files_manager& tfm, build_configuration const& config) { + progress_logger logger(num_keys, " == processed ", " keys from input", + config.verbose_output); + + uint64_t ram = config.ram; + uint64_t ram_parallel_merge = 0; + if (config.num_threads > 1) { + ram_parallel_merge = ram * 0.01; + assert(ram_parallel_merge >= + MAX_BUCKET_SIZE * sizeof(bucket_payload_pair)); + } + + auto writer = + tfm.get_multifile_pairs_writer(num_keys, ram - ram_parallel_merge, + config.num_threads, ram_parallel_merge); + try { + for (uint64_t i = 0; i != num_keys; ++i, ++keys) { + auto const& key = *keys; + auto hash = hasher_type::hash(key, m_seed); + bucket_id_type bucket_id = m_bucketer.bucket(hash.first()); + writer.emplace_back(bucket_id, hash.second()); + logger.log(); + } + writer.flush(); + logger.finalize(); + } catch (std::runtime_error const& e) { throw e; } + + auto tmp = tfm.pairs_blocks(); + pairs_blocks.swap(tmp); + } +}; + +} // namespace pthash diff --git a/thirdparty/pthash/builders/internal_memory_builder_single_phf.hpp b/thirdparty/pthash/builders/internal_memory_builder_single_phf.hpp new file mode 100644 index 0000000000..df9be29900 --- /dev/null +++ 
b/thirdparty/pthash/builders/internal_memory_builder_single_phf.hpp @@ -0,0 +1,365 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pthash/builders/search.hpp" +#include "pthash/builders/util.hpp" +#include "pthash/utils/bucketers.hpp" +#include "pthash/utils/hasher.hpp" +#include "pthash/utils/logger.hpp" + +namespace pthash { + +template +struct internal_memory_builder_single_phf { + typedef Hasher hasher_type; + + template + build_timings build_from_keys(RandomAccessIterator keys, uint64_t num_keys, + build_configuration const& config) { + if (config.seed == constants::invalid_seed) { + for (auto attempt = 0; attempt < 10; ++attempt) { + m_seed = random_value(); + try { + return build_from_hashes( + hash_generator(keys, m_seed), num_keys, + config); + } catch (seed_runtime_error const& error) { + std::cout << "attempt " << attempt + 1 << " failed" << std::endl; + } + } + throw seed_runtime_error(); + } + m_seed = config.seed; + return build_from_hashes(hash_generator(keys, m_seed), + num_keys, config); + } + + template + build_timings build_from_hashes(RandomAccessIterator hashes, + uint64_t num_keys, + build_configuration const& config) { + assert(num_keys > 1); + if (config.alpha == 0 or config.alpha > 1.0) { + throw std::invalid_argument("load factor must be > 0 and <= 1.0"); + } + + clock_type::time_point start; + + start = clock_type::now(); + + build_timings time; + + uint64_t table_size = static_cast(num_keys) / config.alpha; + if ((table_size & (table_size - 1)) == 0) + table_size += 1; + uint64_t num_buckets = + (config.num_buckets == constants::invalid_num_buckets) + ? 
(std::ceil((config.c * num_keys) / std::log2(num_keys))) + : config.num_buckets; + + m_num_keys = num_keys; + m_table_size = table_size; + m_num_buckets = num_buckets; + m_bucketer.init(m_num_buckets); + + if (config.verbose_output) { + std::cout << "c = " << config.c << std::endl; + std::cout << "alpha = " << config.alpha << std::endl; + std::cout << "num_keys = " << num_keys << std::endl; + std::cout << "table_size = " << table_size << std::endl; + std::cout << "num_buckets = " << num_buckets << std::endl; + } + + buckets_t buckets; + { + auto start = clock_type::now(); + std::vector pairs_blocks; + map(hashes, num_keys, pairs_blocks, config); + auto elapsed = seconds(clock_type::now() - start); + if (config.verbose_output) { + std::cout << " == map+sort took: " << elapsed << " seconds" + << std::endl; + } + + start = clock_type::now(); + merge(pairs_blocks, buckets, config.verbose_output); + elapsed = seconds(clock_type::now() - start); + if (config.verbose_output) { + std::cout << " == merge+check took: " << elapsed << " seconds" + << std::endl; + } + } + auto buckets_iterator = buckets.begin(); + time.mapping_ordering_seconds = seconds(clock_type::now() - start); + if (config.verbose_output) { + std::cout << " == mapping+ordering took " << time.mapping_ordering_seconds + << " seconds " << std::endl; + std::cout << " == max bucket size = " << int((*buckets_iterator).size()) + << std::endl; + } + + start = clock_type::now(); + { + m_pilots.resize(num_buckets); + std::fill(m_pilots.begin(), m_pilots.end(), 0); + bit_vector_builder taken(m_table_size); + uint64_t num_non_empty_buckets = buckets.num_buckets(); + pilots_wrapper_t pilots_wrapper(m_pilots); + search(m_num_keys, m_num_buckets, num_non_empty_buckets, m_seed, config, + buckets_iterator, taken, pilots_wrapper); + if (config.minimal_output) { + m_free_slots.clear(); + m_free_slots.reserve(taken.size() - num_keys); + fill_free_slots(taken, num_keys, m_free_slots); + } + } + time.searching_seconds = seconds(clock_type::now() - start); + if (config.verbose_output) { + std::cout << " == search took " << time.searching_seconds << " seconds" + << std::endl; + } + + return time; + } + + uint64_t seed() const { return m_seed; } + + uint64_t num_keys() const { return m_num_keys; } + + uint64_t table_size() const { return m_table_size; } + + skew_bucketer bucketer() const { return m_bucketer; } + + std::vector const& pilots() const { return m_pilots; } + + std::vector const& free_slots() const { return m_free_slots; } + + void swap(internal_memory_builder_single_phf& other) { + std::swap(m_seed, other.m_seed); + std::swap(m_num_keys, other.m_num_keys); + std::swap(m_num_buckets, other.m_num_buckets); + std::swap(m_table_size, other.m_table_size); + std::swap(m_bucketer, other.m_bucketer); + m_pilots.swap(other.m_pilots); + m_free_slots.swap(other.m_free_slots); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_seed); + visitor.visit(m_num_keys); + visitor.visit(m_num_buckets); + visitor.visit(m_table_size); + visitor.visit(m_bucketer); + visitor.visit(m_pilots); + visitor.visit(m_free_slots); + } + + static size_t estimate_num_bytes_for_construction( + uint64_t num_keys, build_configuration const& config) { + uint64_t table_size = static_cast(num_keys) / config.alpha; + if ((table_size & (table_size - 1)) == 0) + table_size += 1; + uint64_t num_buckets = + (config.num_buckets == constants::invalid_num_buckets) + ? 
(std::ceil((config.c * num_keys) / std::log2(num_keys))) + : config.num_buckets; + + size_t mapping_bytes = + num_keys * sizeof(bucket_payload_pair) // pairs + + (num_keys + num_buckets) * sizeof(uint64_t); // buckets + + size_t search_bytes = + num_buckets * sizeof(uint64_t) // pilots + + num_buckets * sizeof(uint64_t) // buckets + + (config.minimal_output ? (table_size - num_keys) * sizeof(uint64_t) + : 0) // free_slots + + num_keys * sizeof(uint64_t) // hashes + + table_size / 8; // bitmap taken + return std::max(mapping_bytes, search_bytes); + } + + private: + uint64_t m_seed; + uint64_t m_num_keys; + uint64_t m_num_buckets; + uint64_t m_table_size; + skew_bucketer m_bucketer; + std::vector m_pilots; + std::vector m_free_slots; + + template + struct hash_generator { + hash_generator(RandomAccessIterator keys, uint64_t seed) + : m_iterator(keys), m_seed(seed) {} + + inline typename hasher_type::hash_type operator*() { + return hasher_type::hash(*m_iterator, m_seed); + } + + inline void operator++() { ++m_iterator; } + + inline hash_generator operator+(uint64_t offset) const { + return hash_generator(m_iterator + offset, m_seed); + } + + private: + RandomAccessIterator m_iterator; + uint64_t m_seed; + }; + + typedef std::vector pairs_t; + + struct buckets_iterator_t { + buckets_iterator_t(std::vector> const& buffers) + : m_buffers_it(buffers.end() - 1), m_bucket_size(buffers.size()) { + m_bucket.init(m_buffers_it->data(), m_bucket_size); + skip_empty_buckets(); + } + + inline void operator++() { + uint64_t const* begin = m_bucket.begin() + m_bucket_size; + uint64_t const* end = m_buffers_it->data() + m_buffers_it->size(); + m_bucket.init(begin, m_bucket_size); + if ((m_bucket.begin() - 1) == end and m_bucket_size != 0) { + --m_bucket_size; + --m_buffers_it; + skip_empty_buckets(); + } + } + + inline bucket_t operator*() const { return m_bucket; } + + private: + std::vector>::const_iterator m_buffers_it; + bucket_size_type m_bucket_size; + bucket_t m_bucket; + + void skip_empty_buckets() { + while (m_bucket_size != 0 and m_buffers_it->empty()) { + --m_bucket_size; + --m_buffers_it; + } + if (m_bucket_size != 0) + m_bucket.init(m_buffers_it->data(), m_bucket_size); + } + }; + + struct buckets_t { + buckets_t() : m_buffers(MAX_BUCKET_SIZE), m_num_buckets(0) {} + + template + void add(bucket_id_type bucket_id, bucket_size_type bucket_size, + HashIterator hashes) { + assert(bucket_size > 0); + uint64_t i = bucket_size - 1; + m_buffers[i].push_back(bucket_id); + for (uint64_t k = 0; k != bucket_size; ++k, ++hashes) + m_buffers[i].push_back(*hashes); + ++m_num_buckets; + } + + uint64_t num_buckets() const { return m_num_buckets; }; + + buckets_iterator_t begin() const { return buckets_iterator_t(m_buffers); } + + private: + std::vector> m_buffers; + uint64_t m_num_buckets; + }; + + struct pilots_wrapper_t { + pilots_wrapper_t(std::vector& pilots) : m_pilots(pilots) {} + + inline void emplace_back(bucket_id_type bucket_id, uint64_t pilot) { + m_pilots[bucket_id] = pilot; + } + + private: + std::vector& m_pilots; + }; + + template + void map_sequential(RandomAccessIterator hashes, uint64_t num_keys, + std::vector& pairs_blocks, + build_configuration const&) const { + pairs_t pairs(num_keys); + RandomAccessIterator begin = hashes; + for (uint64_t i = 0; i != num_keys; ++i, ++begin) { + auto hash = *begin; + auto bucket_id = m_bucketer.bucket(hash.first()); + pairs[i] = {static_cast(bucket_id), hash.second()}; + } + std::sort(pairs.begin(), pairs.end()); + pairs_blocks.resize(1); + 
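+    // hand the sorted pairs to the single output block via swap (no copy)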
pairs_blocks.front().swap(pairs); + } + + template + void map_parallel(RandomAccessIterator hashes, uint64_t num_keys, + std::vector& pairs_blocks, + build_configuration const& config) const { + pairs_blocks.resize(config.num_threads); + uint64_t num_keys_per_thread = + (num_keys + config.num_threads - 1) / config.num_threads; + + auto exe = [&](uint64_t tid) { + auto& local_pairs = pairs_blocks[tid]; + RandomAccessIterator begin = hashes + tid * num_keys_per_thread; + uint64_t local_num_keys = (tid != config.num_threads - 1) + ? num_keys_per_thread + : (num_keys - tid * num_keys_per_thread); + local_pairs.resize(local_num_keys); + + for (uint64_t local_i = 0; local_i != local_num_keys; + ++local_i, ++begin) { + auto hash = *begin; + auto bucket_id = m_bucketer.bucket(hash.first()); + local_pairs[local_i] = {static_cast(bucket_id), + hash.second()}; + } + std::sort(local_pairs.begin(), local_pairs.end()); + }; + + std::vector threads(config.num_threads); + for (uint64_t i = 0; i != config.num_threads; ++i) + threads[i] = std::thread(exe, i); + for (auto& t : threads) { + if (t.joinable()) + t.join(); + } + } + + template + void map(RandomAccessIterator hashes, uint64_t num_keys, + std::vector& pairs_blocks, + build_configuration const& config) const { + if (config.num_threads > 1) { + map_parallel(hashes, num_keys, pairs_blocks, config); + } else { + map_sequential(hashes, num_keys, pairs_blocks, config); + } + } +}; + +} // namespace pthash diff --git a/thirdparty/pthash/builders/search.hpp b/thirdparty/pthash/builders/search.hpp new file mode 100644 index 0000000000..3924443030 --- /dev/null +++ b/thirdparty/pthash/builders/search.hpp @@ -0,0 +1,358 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include // for pow, round, log2 +#include // for stringbuf +#include +#include "pthash/essentials/essentials.hpp" + +#include "pthash/builders/util.hpp" +#include "pthash/encoders/bit_vector.hpp" +#include "pthash/utils/hasher.hpp" + +namespace pthash { + +constexpr uint64_t search_cache_size = 1000; + +struct search_logger { + search_logger(uint64_t num_keys, uint64_t table_size, uint64_t num_buckets) + : m_num_keys(num_keys), + m_table_size(table_size), + m_num_buckets(num_buckets), + m_step(m_num_buckets > 20 ? m_num_buckets / 20 : 1), + m_bucket(0), + m_placed_keys(0), + m_trials(0), + m_total_trials(0), + m_expected_trials(0.0), + m_total_expected_trials(0.0) {} + + void init() { + essentials::logger("search starts"); + m_timer.start(); + } + + /* If X_i is the random variable counting the number of trials + for bucket i, then Pr(X_i <= N - 1) = 1 - (1 - p_i)^N, + where p_i is the success probability for bucket i. 
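+     (In update() below, p_i is estimated as
+     ((m_table_size - m_placed_keys) / m_table_size)^bucket_size: every key
+     of bucket i must hash to a slot that is still free.)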
+ By solving 1 - (1 - p_i)^N >= T wrt N and for a given target + probability T < 1, we obtain N <= log_{1-p_i}(1-T), that is: + we get a pilot <= N with probability T. + Of course, the closer T is to 1, the higher N becomes. + In practice T = 0.65 suffices to have + N > # trials per bucket, for all buckets. + */ + double pilot_wp_T(double T, double p) { + assert(T > 0 and p > 0); + double x = std::log2(1.0 - T) / std::log2(1.0 - p); + return round(x); + } + + void update(uint64_t bucket, uint64_t bucket_size, uint64_t pilot) { + if (bucket > 0) { + double base = + static_cast(m_table_size - m_placed_keys) / m_table_size; + double p = pow(base, bucket_size); + double e = 1.0 / p; + m_expected_trials += e; + m_total_expected_trials += e; + } + + m_placed_keys += bucket_size; + m_trials += pilot + 1; + m_total_trials += pilot + 1; + + if (bucket > 0 and bucket % m_step == 0) + print(bucket); + } + + void finalize(uint64_t bucket) { + m_step = bucket - m_bucket; + print(bucket); + essentials::logger("search ends"); + std::cout << " == " << m_num_buckets - bucket << " empty buckets (" + << ((m_num_buckets - bucket) * 100.0) / m_num_buckets << "%)" + << std::endl; + std::cout << " == total trials = " << m_total_trials << std::endl; + std::cout << " == total expected trials = " + << uint64_t(m_total_expected_trials) << std::endl; + } + + private: + uint64_t m_num_keys; + uint64_t m_table_size; + uint64_t m_num_buckets; + uint64_t m_step; + uint64_t m_bucket; + uint64_t m_placed_keys; + + uint64_t m_trials; + uint64_t m_total_trials; + double m_expected_trials; + double m_total_expected_trials; + + essentials::timer + m_timer; + + void print(uint64_t bucket) { + m_timer.stop(); + std::stringbuf buffer; + std::ostream os(&buffer); + os << m_step << " buckets done in " << m_timer.elapsed() << " seconds (" + << (m_placed_keys * 100.0) / m_num_keys << "% of keys, " + << (bucket * 100.0) / m_num_buckets << "% of buckets, " + << static_cast(m_trials) / m_step << " trials per bucket, " + << m_expected_trials / m_step << " expected trials per bucket)"; + essentials::logger(buffer.str()); + m_bucket = bucket; + m_trials = 0; + m_expected_trials = 0.0; + m_timer.reset(); + m_timer.start(); + } +}; + +template +void search_sequential(uint64_t num_keys, uint64_t num_buckets, + uint64_t num_non_empty_buckets, uint64_t seed, + build_configuration const& config, + BucketsIterator& buckets, bit_vector_builder& taken, + PilotsBuffer& pilots) { + uint64_t max_bucket_size = (*buckets).size(); + uint64_t table_size = taken.size(); + std::vector positions; + positions.reserve(max_bucket_size); + __uint128_t M = fastmod::computeM_u64(table_size); + + std::vector hashed_pilots_cache(search_cache_size); + for (uint64_t pilot = 0; pilot != search_cache_size; ++pilot) { + hashed_pilots_cache[pilot] = default_hash64(pilot, seed); + } + + search_logger log(num_keys, table_size, num_buckets); + if (config.verbose_output) + log.init(); + + uint64_t processed_buckets = 0; + for (; processed_buckets < num_non_empty_buckets; + ++processed_buckets, ++buckets) { + auto const& bucket = *buckets; + assert(bucket.size() > 0); + + for (uint64_t pilot = 0; true; ++pilot) { + uint64_t hashed_pilot = PTHASH_LIKELY(pilot < search_cache_size) + ? 
hashed_pilots_cache[pilot] + : default_hash64(pilot, seed); + + positions.clear(); + + auto bucket_begin = bucket.begin(), bucket_end = bucket.end(); + for (; bucket_begin != bucket_end; ++bucket_begin) { + uint64_t hash = *bucket_begin; + uint64_t p = fastmod::fastmod_u64(hash ^ hashed_pilot, M, table_size); + if (taken.get(p)) + break; + positions.push_back(p); + } + + if (bucket_begin == + bucket_end) { // all keys do not have collisions with taken + + // check for in-bucket collisions + std::sort(positions.begin(), positions.end()); + auto it = std::adjacent_find(positions.begin(), positions.end()); + if (it != positions.end()) + continue; // in-bucket collision detected, try next pilot + + pilots.emplace_back(bucket.id(), pilot); + for (auto p : positions) { + assert(taken.get(p) == false); + taken.set(p, true); + } + if (config.verbose_output) + log.update(processed_buckets, bucket.size(), pilot); + break; + } + } + } + + if (config.verbose_output) + log.finalize(processed_buckets); +} + +template +void search_parallel(uint64_t num_keys, uint64_t num_buckets, + uint64_t num_non_empty_buckets, uint64_t seed, + build_configuration const& config, + BucketsIterator& buckets, bit_vector_builder& taken, + PilotsBuffer& pilots) { + uint64_t max_bucket_size = (*buckets).size(); + uint64_t table_size = taken.size(); + __uint128_t M = fastmod::computeM_u64(table_size); + + const uint64_t num_threads = config.num_threads; + std::vector hashed_pilots_cache(search_cache_size); + for (uint64_t pilot = 0; pilot != search_cache_size; ++pilot) { + hashed_pilots_cache[pilot] = default_hash64(pilot, seed); + } + + search_logger log(num_keys, table_size, num_buckets); + if (config.verbose_output) + log.init(); + + volatile uint64_t next_bucket_idx = 0; + + auto exe = [&](uint64_t local_bucket_idx, bucket_t bucket) { + std::vector positions; + positions.reserve(max_bucket_size); + + while (true) { + uint64_t pilot = 0; + bool pilot_checked = false; + + while (true) { + uint64_t local_next_bucket_idx = next_bucket_idx; + + for (; true; ++pilot) { + if (PTHASH_LIKELY(!pilot_checked)) { + uint64_t hashed_pilot = PTHASH_LIKELY(pilot < search_cache_size) + ? 
hashed_pilots_cache[pilot] + : default_hash64(pilot, seed); + + positions.clear(); + + auto bucket_begin = bucket.begin(), bucket_end = bucket.end(); + for (; bucket_begin != bucket_end; ++bucket_begin) { + uint64_t hash = *bucket_begin; + uint64_t p = + fastmod::fastmod_u64(hash ^ hashed_pilot, M, table_size); + if (taken.get(p)) + break; + positions.push_back(p); + } + + if (bucket_begin == bucket_end) { + std::sort(positions.begin(), positions.end()); + auto it = std::adjacent_find(positions.begin(), positions.end()); + if (it != positions.end()) + continue; + + // I can stop the pilot search as there are not collisions + pilot_checked = true; + break; + } + } else { + // I already computed the positions and checked the in-bucket + // collisions I must only check the bitmap again + for (auto p : positions) { + if (taken.get(p)) { + pilot_checked = false; + break; + } + } + // I can stop the pilot search as there are not collisions + if (pilot_checked) + break; + } + } + + // I am the first thread: this is the only condition that can stop the + // loop + if (local_next_bucket_idx == local_bucket_idx) + break; + + // active wait until another thread pushes a change in the bitmap + while (local_next_bucket_idx == next_bucket_idx) + ; + } + assert(local_bucket_idx == next_bucket_idx); + + /* thread-safe from now on */ + + pilots.emplace_back(bucket.id(), pilot); + for (auto p : positions) { + assert(taken.get(p) == false); + taken.set(p, true); + } + if (config.verbose_output) + log.update(local_bucket_idx, bucket.size(), pilot); + + // update (local) local_bucket_idx + local_bucket_idx = next_bucket_idx + num_threads; + + if (local_bucket_idx >= num_non_empty_buckets) { // stop the thread + // update (global) next_bucket_idx, which may unlock other threads + ++next_bucket_idx; + break; + } + + // read the next bucket and advance the iterator + bucket = (*buckets); + ++buckets; + + // update (global) next_bucket_idx, which may unlock other threads + ++next_bucket_idx; + } + }; + + std::vector threads; + threads.reserve(num_threads); + next_bucket_idx = static_cast( + -1); // avoid that some thread advances the iterator + for (uint64_t i = 0; i != num_threads and i < num_non_empty_buckets; + ++i, ++buckets) { + bucket_t bucket = *buckets; + threads.emplace_back(exe, i, bucket); + } + + next_bucket_idx = 0; // notify the first thread + for (auto& t : threads) { + if (t.joinable()) + t.join(); + } + assert(next_bucket_idx == num_non_empty_buckets); + + if (config.verbose_output) + log.finalize(next_bucket_idx); +} + +template +void search(uint64_t num_keys, uint64_t num_buckets, + uint64_t num_non_empty_buckets, uint64_t seed, + build_configuration const& config, BucketsIterator& buckets, + bit_vector_builder& taken, PilotsBuffer& pilots) { + if (config.num_threads > 1) { + if (config.num_threads > std::thread::hardware_concurrency()) { + throw std::invalid_argument( + "parallel search should use at most " + + std::to_string(std::thread::hardware_concurrency()) + " threads"); + } + search_parallel(num_keys, num_buckets, num_non_empty_buckets, seed, config, + buckets, taken, pilots); + } else { + search_sequential(num_keys, num_buckets, num_non_empty_buckets, seed, + config, buckets, taken, pilots); + } +} + +} // namespace pthash diff --git a/thirdparty/pthash/builders/util.hpp b/thirdparty/pthash/builders/util.hpp new file mode 100644 index 0000000000..98ac7c3cef --- /dev/null +++ b/thirdparty/pthash/builders/util.hpp @@ -0,0 +1,301 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and 
Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "pthash/encoders/bit_vector.hpp" +#include "pthash/utils/logger.hpp" +#include "pthash/utils/util.hpp" + +namespace pthash { + +typedef uint32_t bucket_id_type; +typedef uint8_t bucket_size_type; +#define MAX_BUCKET_SIZE static_cast(100) + +static inline std::string get_tmp_builder_filename(std::string const& dir_name, + uint64_t id) { + return dir_name + "/pthash.temp." + std::to_string(id) + ".builder"; +} + +struct build_timings { + build_timings() + : partitioning_seconds(0.0), + mapping_ordering_seconds(0.0), + searching_seconds(0.0), + encoding_seconds(0.0) {} + + double partitioning_seconds; + double mapping_ordering_seconds; + double searching_seconds; + double encoding_seconds; +}; + +struct build_configuration { + build_configuration() + : c(4.5), + alpha(0.98), + num_partitions(1), + num_buckets(constants::invalid_num_buckets), + num_threads(1), + seed(constants::invalid_seed), + ram(static_cast(constants::available_ram) * 0.75), + tmp_dir(constants::default_tmp_dirname), + minimal_output(false), + verbose_output(true) {} + + double c; + double alpha; + uint64_t num_partitions; + uint64_t num_buckets; + uint64_t num_threads; + uint64_t seed; + uint64_t ram; + std::string tmp_dir; + bool minimal_output; + bool verbose_output; +}; + +struct seed_runtime_error : public std::runtime_error { + seed_runtime_error() : std::runtime_error("seed did not work") {} +}; + +#pragma pack(push, 4) +struct bucket_payload_pair { + bucket_id_type bucket_id; + uint64_t payload; + + bucket_payload_pair() {} + bucket_payload_pair(bucket_id_type bucket_id, uint64_t payload) + : bucket_id(bucket_id), payload(payload) {} + + bool operator<(bucket_payload_pair const& other) const { + return (bucket_id < other.bucket_id) or + (bucket_id == other.bucket_id and payload < other.payload); + } +}; +#pragma pack(pop) + +struct bucket_t { + bucket_t() : m_begin(nullptr), m_size(0) {} + + void init(uint64_t const* begin, bucket_size_type size) { + m_begin = begin; + m_size = size; + } + + inline bucket_id_type id() const { return *m_begin; } + + inline uint64_t const* begin() const { return m_begin + 1; } + + inline uint64_t const* end() const { return m_begin + 1 + m_size; } + + inline bucket_size_type size() const { return m_size; } + + private: + uint64_t const* m_begin; + bucket_size_type m_size; +}; + +template +struct payload_iterator { + payload_iterator(PairsRandomAccessIterator const& iterator) + : m_iterator(iterator) {} + + uint64_t operator*() const { return (*m_iterator).payload; } + + void operator++() { ++m_iterator; } + + private: + PairsRandomAccessIterator m_iterator; +}; + +template +void merge_single_block(Pairs const& pairs, Merger& merger, bool verbose) { + 
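  // The pairs arrive sorted by (bucket_id, payload): a single scan counts
  // the run length of each bucket_id and hands every completed bucket to
  // the merger. Two equal payloads inside the same bucket mean a full hash
  // collision under the current seed, so seed_runtime_error is thrown and
  // the caller retries with a different seed.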
progress_logger logger(pairs.size(), " == merged ", " pairs", verbose); + + bucket_size_type bucket_size = 1; + uint64_t num_pairs = pairs.size(); + logger.log(); + for (uint64_t i = 1; i != num_pairs; ++i) { + if (pairs[i].bucket_id == pairs[i - 1].bucket_id) { + if (PTHASH_LIKELY(pairs[i].payload != pairs[i - 1].payload)) { + ++bucket_size; + } else { + throw seed_runtime_error(); + } + } else { + merger.add(pairs[i - 1].bucket_id, bucket_size, + payload_iterator( + pairs.begin() + i - bucket_size)); + bucket_size = 1; + } + logger.log(); + } + + // add the last bucket + merger.add(pairs[num_pairs - 1].bucket_id, bucket_size, + payload_iterator(pairs.end() - + bucket_size)); + logger.finalize(); +} + +template +void merge_multiple_blocks(std::vector const& pairs_blocks, + Merger& merger, bool verbose) { + uint64_t num_pairs = std::accumulate( + pairs_blocks.begin(), pairs_blocks.end(), static_cast(0), + [](uint64_t sum, Pairs const& pairs) { return sum + pairs.size(); }); + progress_logger logger(num_pairs, " == merged ", " pairs", verbose); + + // input iterators and heap + std::vector iterators; + std::vector idx_heap; + iterators.reserve(pairs_blocks.size()); + idx_heap.reserve(pairs_blocks.size()); + + // heap functions + auto stdheap_idx_comparator = [&](uint32_t idxa, uint32_t idxb) { + return !((*iterators[idxa]) < (*iterators[idxb])); + }; + auto advance_heap_head = [&]() { + auto idx = idx_heap[0]; + ++iterators[idx]; + if (PTHASH_LIKELY(iterators[idx] != pairs_blocks[idx].end())) { + // percolate down the head + uint64_t pos = 0; + uint64_t size = idx_heap.size(); + while (2 * pos + 1 < size) { + uint64_t i = 2 * pos + 1; + if (i + 1 < size and + stdheap_idx_comparator(idx_heap[i], idx_heap[i + 1])) + ++i; + if (stdheap_idx_comparator(idx_heap[i], idx_heap[pos])) + break; + std::swap(idx_heap[pos], idx_heap[i]); + pos = i; + } + } else { + std::pop_heap(idx_heap.begin(), idx_heap.end(), stdheap_idx_comparator); + idx_heap.pop_back(); + } + }; + + // create the input iterators and the heap + for (uint64_t i = 0; i != pairs_blocks.size(); ++i) { + iterators.push_back(pairs_blocks[i].begin()); + idx_heap.push_back(i); + } + std::make_heap(idx_heap.begin(), idx_heap.end(), stdheap_idx_comparator); + + bucket_id_type bucket_id; + std::vector bucket_payloads; + bucket_payloads.reserve(MAX_BUCKET_SIZE); + + // read the first pair + { + bucket_payload_pair pair = (*iterators[idx_heap[0]]); + bucket_id = pair.bucket_id; + bucket_payloads.push_back(pair.payload); + advance_heap_head(); + logger.log(); + } + + // merge + for (uint64_t i = 0; (PTHASH_LIKELY(idx_heap.size())); + ++i, advance_heap_head()) { + bucket_payload_pair pair = (*iterators[idx_heap[0]]); + + if (pair.bucket_id == bucket_id) { + if (PTHASH_LIKELY(pair.payload != bucket_payloads.back())) { + bucket_payloads.push_back(pair.payload); + } else { + throw seed_runtime_error(); + } + } else { + merger.add(bucket_id, bucket_payloads.size(), bucket_payloads.begin()); + bucket_id = pair.bucket_id; + bucket_payloads.clear(); + bucket_payloads.push_back(pair.payload); + } + logger.log(); + } + + // add the last bucket + merger.add(bucket_id, bucket_payloads.size(), bucket_payloads.begin()); + logger.finalize(); +} + +template +void merge(std::vector const& pairs_blocks, Merger& merger, + bool verbose) { + if (pairs_blocks.size() == 1) { + merge_single_block(pairs_blocks[0], merger, verbose); + } else { + merge_multiple_blocks(pairs_blocks, merger, verbose); + } +} + +template +void fill_free_slots(bit_vector_builder const& taken, 
uint64_t num_keys, + FreeSlots& free_slots) { + uint64_t table_size = taken.size(); + if (table_size <= num_keys) + return; + + uint64_t next_used_slot = num_keys; + uint64_t last_free_slot = 0, last_valid_free_slot = 0; + + while (true) { + // find the next free slot (on the left) + while (last_free_slot < num_keys && taken.get(last_free_slot)) + ++last_free_slot; + // exit condition + if (last_free_slot == num_keys) + break; + // fill with the last free slot (on the left) until I find a new used slot + // (on the right) note: since I found a free slot on the left, there must be + // an used slot on the right + assert(next_used_slot < table_size); + while (!taken.get(next_used_slot)) { + free_slots.emplace_back(last_free_slot); + ++next_used_slot; + } + assert(next_used_slot < table_size); + // fill the used slot (on the right) with the last free slot and advance all + // cursors + free_slots.emplace_back(last_free_slot); + last_valid_free_slot = last_free_slot; + ++next_used_slot; + ++last_free_slot; + } + // fill the tail with the last valid slot that I found + while (next_used_slot != table_size) { + free_slots.emplace_back(last_valid_free_slot); + ++next_used_slot; + } + assert(next_used_slot == table_size); +} + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/bit_vector.hpp b/thirdparty/pthash/encoders/bit_vector.hpp new file mode 100644 index 0000000000..27547a7df3 --- /dev/null +++ b/thirdparty/pthash/encoders/bit_vector.hpp @@ -0,0 +1,347 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "pthash/encoders/util.hpp" +#include "pthash/essentials/essentials.hpp" + +namespace pthash { + +struct bit_vector_builder { + bit_vector_builder(uint64_t size = 0, bool init = 0) : m_size(size) { + m_bits.resize(essentials::words_for(size), uint64_t(-init)); + if (size) { + m_cur_word = &m_bits.back(); + // clear padding bits + if (init && (size & 63)) { + *m_cur_word >>= 64 - (size & 63); + } + } + } + + void reserve(uint64_t num_bits) { + m_bits.reserve(essentials::words_for(num_bits)); + } + + inline void push_back(bool b) { + uint64_t pos_in_word = m_size % 64; + if (pos_in_word == 0) { + m_bits.push_back(0); + m_cur_word = &m_bits.back(); + } + *m_cur_word |= (uint64_t) b << pos_in_word; + ++m_size; + } + + inline void zero_extend(uint64_t n) { + m_size += n; + uint64_t needed = essentials::words_for(m_size) - m_bits.size(); + if (needed) { + m_bits.insert(m_bits.end(), needed, 0); + m_cur_word = &m_bits.back(); + } + } + + inline void set(uint64_t pos, bool b = true) { + assert(pos < size()); + uint64_t word = pos >> 6; + uint64_t pos_in_word = pos & 63; + m_bits[word] &= ~(uint64_t(1) << pos_in_word); + m_bits[word] |= uint64_t(b) << pos_in_word; + } + + inline uint64_t get(uint64_t pos) const { + assert(pos < size()); + uint64_t word = pos >> 6; + uint64_t pos_in_word = pos & 63; + return m_bits[word] >> pos_in_word & uint64_t(1); + } + + inline void set_bits(uint64_t pos, uint64_t bits, size_t len) { + assert(pos + len <= size()); + // check there are no spurious bits + assert(len == 64 || (bits >> len) == 0); + if (!len) + return; + uint64_t mask = (len == 64) ? uint64_t(-1) : ((uint64_t(1) << len) - 1); + uint64_t word = pos >> 6; + uint64_t pos_in_word = pos & 63; + + m_bits[word] &= ~(mask << pos_in_word); + m_bits[word] |= bits << pos_in_word; + + uint64_t stored = 64 - pos_in_word; + if (stored < len) { + m_bits[word + 1] &= ~(mask >> stored); + m_bits[word + 1] |= bits >> stored; + } + } + + inline void append_bits(uint64_t bits, size_t len) { + // check there are no spurious bits + assert(len == 64 || (bits >> len) == 0); + if (!len) + return; + uint64_t pos_in_word = m_size & 63; + m_size += len; + if (pos_in_word == 0) { + m_bits.push_back(bits); + } else { + *m_cur_word |= bits << pos_in_word; + if (len > 64 - pos_in_word) { + m_bits.push_back(bits >> (64 - pos_in_word)); + } + } + m_cur_word = &m_bits.back(); + } + + inline uint64_t get_word64(uint64_t pos) const { + assert(pos < size()); + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + uint64_t word = m_bits[block] >> shift; + if (shift && block + 1 < m_bits.size()) { + word |= m_bits[block + 1] << (64 - shift); + } + return word; + } + + void append(bit_vector_builder const& rhs) { + if (!rhs.size()) + return; + + uint64_t pos = m_bits.size(); + uint64_t shift = size() % 64; + m_size = size() + rhs.size(); + m_bits.resize(essentials::words_for(m_size)); + + if (shift == 0) { // word-aligned, easy case + std::copy(rhs.m_bits.begin(), rhs.m_bits.end(), + m_bits.begin() + ptrdiff_t(pos)); + } else { + uint64_t* cur_word = &m_bits.front() + pos - 1; + for (size_t i = 0; i < rhs.m_bits.size() - 1; ++i) { + uint64_t w = rhs.m_bits[i]; + *cur_word |= w << shift; + *++cur_word = w >> (64 - shift); + } + *cur_word |= rhs.m_bits.back() << shift; + if (cur_word < &m_bits.back()) { + *++cur_word = rhs.m_bits.back() >> (64 - shift); + } + } + m_cur_word = &m_bits.back(); + } + + void resize(uint64_t size) { + m_size = size; + 
m_bits.resize(essentials::words_for(m_size)); + } + + void swap(bit_vector_builder& other) { + m_bits.swap(other.m_bits); + std::swap(m_size, other.m_size); + std::swap(m_cur_word, other.m_cur_word); + } + + std::vector& data() { return m_bits; } + + uint64_t size() const { return m_size; } + + private: + std::vector m_bits; + uint64_t m_size; + uint64_t* m_cur_word; +}; + +struct bit_vector { + bit_vector() : m_size(0) {} + + void build(bit_vector_builder* in) { + m_size = in->size(); + m_bits.swap(in->data()); + } + + bit_vector(bit_vector_builder* in) { build(in); } + + void swap(bit_vector& other) { + std::swap(other.m_size, m_size); + other.m_bits.swap(m_bits); + } + + inline size_t size() const { return m_size; } + + uint64_t bytes() const { + return sizeof(m_size) + essentials::vec_bytes(m_bits); + } + + // get i-th bit + inline uint64_t operator[](uint64_t i) const { + assert(i < size()); + uint64_t block = i >> 6; + uint64_t shift = i & 63; + return m_bits[block] >> shift & uint64_t(1); + } + + inline uint64_t get_bits(uint64_t pos, uint64_t len) const { + assert(pos + len <= size()); + if (!len) + return 0; + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + uint64_t mask = -(len == 64) | ((1ULL << len) - 1); + if (shift + len <= 64) { + return m_bits[block] >> shift & mask; + } else { + return (m_bits[block] >> shift) | + (m_bits[block + 1] << (64 - shift) & mask); + } + } + + // fast and unsafe version: it retrieves at least 56 bits + inline uint64_t get_word56(uint64_t pos) const { + const char* base_ptr = reinterpret_cast(m_bits.data()); + return *(reinterpret_cast(base_ptr + (pos >> 3))) >> + (pos & 7); + } + + // pad with zeros if extension further size is needed + inline uint64_t get_word64(uint64_t pos) const { + assert(pos < size()); + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + uint64_t word = m_bits[block] >> shift; + if (shift && block + 1 < m_bits.size()) { + word |= m_bits[block + 1] << (64 - shift); + } + return word; + } + + inline uint64_t predecessor1(uint64_t pos) const { + assert(pos < m_size); + uint64_t block = pos / 64; + uint64_t shift = 64 - pos % 64 - 1; + uint64_t word = m_bits[block]; + word = (word << shift) >> shift; + + unsigned long ret; + while (!util::msb(word, ret)) { + assert(block); + word = m_bits[--block]; + }; + return block * 64 + ret; + } + + std::vector const& data() const { return m_bits; } + + struct unary_iterator { + unary_iterator() : m_data(0), m_position(0), m_buf(0) {} + + unary_iterator(bit_vector const& bv, uint64_t pos = 0) { + m_data = bv.data().data(); + m_position = pos; + m_buf = m_data[pos >> 6]; + // clear low bits + m_buf &= uint64_t(-1) << (pos & 63); + } + + uint64_t position() const { return m_position; } + + uint64_t next() { + unsigned long pos_in_word; + uint64_t buf = m_buf; + while (!util::lsb(buf, pos_in_word)) { + m_position += 64; + buf = m_data[m_position >> 6]; + } + + m_buf = buf & (buf - 1); // clear LSB + m_position = (m_position & ~uint64_t(63)) + pos_in_word; + return m_position; + } + + // skip to the k-th one after the current position + void skip(uint64_t k) { + uint64_t skipped = 0; + uint64_t buf = m_buf; + uint64_t w = 0; + while (skipped + (w = util::popcount(buf)) <= k) { + skipped += w; + m_position += 64; + buf = m_data[m_position / 64]; + } + assert(buf); + uint64_t pos_in_word = util::select_in_word(buf, k - skipped); + m_buf = buf & (uint64_t(-1) << pos_in_word); + m_position = (m_position & ~uint64_t(63)) + pos_in_word; + } + + // skip to the k-th zero after the 
current position + void skip0(uint64_t k) { + uint64_t skipped = 0; + uint64_t pos_in_word = m_position % 64; + uint64_t buf = ~m_buf & (uint64_t(-1) << pos_in_word); + uint64_t w = 0; + while (skipped + (w = util::popcount(buf)) <= k) { + skipped += w; + m_position += 64; + buf = ~m_data[m_position / 64]; + } + assert(buf); + pos_in_word = util::select_in_word(buf, k - skipped); + m_buf = ~buf & (uint64_t(-1) << pos_in_word); + m_position = (m_position & ~uint64_t(63)) + pos_in_word; + } + + private: + uint64_t const* m_data; + uint64_t m_position; + uint64_t m_buf; + }; + + template + void visit(Visitor& visitor) { + visitor.visit(m_size); + visitor.visit(m_bits); + } + + template + void load(Loader& loader) { + loader.load(m_size); + loader.load_vec(m_bits); + } + + template + void dump(Dumper& dumper) const { + dumper.dump(m_size); + dumper.dump_vec(m_bits); + } + + protected: + size_t m_size; + std::vector m_bits; +}; + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/compact_vector.hpp b/thirdparty/pthash/encoders/compact_vector.hpp new file mode 100644 index 0000000000..b2ec2a69ff --- /dev/null +++ b/thirdparty/pthash/encoders/compact_vector.hpp @@ -0,0 +1,306 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace pthash { + +struct compact_vector { + template + struct enumerator { + enumerator() {} + + enumerator(Data const* data, uint64_t i = 0) + : m_i(i), + m_cur_val(0), + m_cur_block((i * data->m_width) >> 6), + m_cur_shift((i * data->m_width) & 63), + m_data(data) {} + + uint64_t operator*() { + read(); + return m_cur_val; + } + + enumerator& operator++() { + ++m_i; + return *this; + } + + inline uint64_t value() { + read(); + return m_cur_val; + } + + inline void next() { ++m_i; } + + bool operator==(enumerator const& other) const { return m_i == other.m_i; } + + bool operator!=(enumerator const& other) const { return !(*this == other); } + + private: + uint64_t m_i; + uint64_t m_cur_val; + uint64_t m_cur_block; + int64_t m_cur_shift; + Data const* m_data; + + void read() { + if (m_cur_shift + m_data->m_width <= 64) { + m_cur_val = m_data->m_bits[m_cur_block] >> m_cur_shift & m_data->m_mask; + } else { + uint64_t res_shift = 64 - m_cur_shift; + m_cur_val = + (m_data->m_bits[m_cur_block] >> m_cur_shift) | + (m_data->m_bits[m_cur_block + 1] << res_shift & m_data->m_mask); + ++m_cur_block; + m_cur_shift = -res_shift; + } + + m_cur_shift += m_data->m_width; + + if (m_cur_shift == 64) { + m_cur_shift = 0; + ++m_cur_block; + } + } + }; + + struct builder { + builder() + : m_size(0), + m_width(0), + m_mask(0), + m_back(0), + m_cur_block(0), + m_cur_shift(0) {} + + builder(uint64_t n, uint64_t w) { resize(n, w); } + + void resize(size_t n, uint64_t w) { + m_size = n; + m_width = w; + m_mask = -(w == 64) | ((uint64_t(1) << w) - 1); + m_back = 0; + m_cur_block = 0; + m_cur_shift = 0; + m_bits.resize( + /* use 1 word more for safe access() */ + essentials::words_for(m_size * m_width) + 1, 0); + } + + template + builder(Iterator begin, uint64_t n, uint64_t w) : builder(n, w) { + fill(begin, n); + } + + template + void fill(Iterator begin, uint64_t n) { + if (!m_width) + throw std::runtime_error("width must be greater than 0"); + for (uint64_t i = 0; i != n; ++i, ++begin) + push_back(*begin); + } + + void set(uint64_t i, uint64_t v) { + assert(m_width); + assert(i < m_size); + if (i == m_size - 1) + m_back = v; + + uint64_t pos = i * m_width; + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + + m_bits[block] &= ~(m_mask << shift); + m_bits[block] |= v << shift; + + uint64_t res_shift = 64 - shift; + if (res_shift < m_width) { + m_bits[block + 1] &= ~(m_mask >> res_shift); + m_bits[block + 1] |= v >> res_shift; + } + } + + void push_back(uint64_t v) { + assert(m_width); + m_back = v; + m_bits[m_cur_block] &= ~(m_mask << m_cur_shift); + m_bits[m_cur_block] |= v << m_cur_shift; + + uint64_t res_shift = 64 - m_cur_shift; + if (res_shift < m_width) { + ++m_cur_block; + m_bits[m_cur_block] &= ~(m_mask >> res_shift); + m_bits[m_cur_block] |= v >> res_shift; + m_cur_shift = -res_shift; + } + + m_cur_shift += m_width; + + if (m_cur_shift == 64) { + m_cur_shift = 0; + ++m_cur_block; + } + } + + friend struct enumerator; + + typedef enumerator iterator; + + iterator begin() const { return iterator(this); } + + iterator end() const { return iterator(this, size()); } + + void build(compact_vector& cv) { + cv.m_size = m_size; + cv.m_width = m_width; + cv.m_mask = m_mask; + cv.m_bits.swap(m_bits); + builder().swap(*this); + } + + void swap(compact_vector::builder& other) { + std::swap(m_size, other.m_size); + std::swap(m_width, other.m_width); + std::swap(m_mask, other.m_mask); + std::swap(m_cur_block, other.m_cur_block); + std::swap(m_cur_shift, 
other.m_cur_shift); + m_bits.swap(other.m_bits); + } + + uint64_t back() const { return m_back; } + + uint64_t size() const { return m_size; } + + uint64_t width() const { return m_width; } + + std::vector& bits() { return m_bits; } + + private: + uint64_t m_size; + uint64_t m_width; + uint64_t m_mask; + uint64_t m_back; + uint64_t m_cur_block; + int64_t m_cur_shift; + std::vector m_bits; + }; + + compact_vector() : m_size(0), m_width(0), m_mask(0) {} + + template + void build(Iterator begin, uint64_t n) { + assert(n > 0); + uint64_t max = *std::max_element(begin, begin + n); + uint64_t width = max == 0 ? 1 : std::ceil(std::log2(max + 1)); + build(begin, n, width); + } + + template + void build(Iterator begin, uint64_t n, uint64_t w) { + compact_vector::builder builder(begin, n, w); + builder.build(*this); + } + + inline uint64_t operator[](uint64_t i) const { + assert(i < size()); + uint64_t pos = i * m_width; + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + return shift + m_width <= 64 + ? m_bits[block] >> shift & m_mask + : (m_bits[block] >> shift) | + (m_bits[block + 1] << (64 - shift) & m_mask); + } + + // it retrieves at least 57 bits + inline uint64_t access(uint64_t pos) const { + assert(pos < size()); + uint64_t i = pos * m_width; + const char* ptr = reinterpret_cast(m_bits.data()); + return (*(reinterpret_cast(ptr + (i >> 3))) >> (i & 7)) & + m_mask; + } + + uint64_t back() const { return operator[](size() - 1); } + + inline uint64_t size() const { return m_size; } + + inline uint64_t width() const { return m_width; } + + typedef enumerator iterator; + + iterator begin() const { return iterator(this); } + + iterator end() const { return iterator(this, size()); } + + iterator at(uint64_t pos) const { return iterator(this, pos); } + + std::vector const& bits() const { return m_bits; } + + size_t bytes() const { + return sizeof(m_size) + sizeof(m_width) + sizeof(m_mask) + + essentials::vec_bytes(m_bits); + } + + void swap(compact_vector& other) { + std::swap(m_size, other.m_size); + std::swap(m_width, other.m_width); + std::swap(m_mask, other.m_mask); + m_bits.swap(other.m_bits); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_size); + visitor.visit(m_width); + visitor.visit(m_mask); + visitor.visit(m_bits); + } + + template + void load(Loader& loader) { + loader.load(m_size); + loader.load(m_width); + loader.load(m_mask); + loader.load_vec(m_bits); + } + + template + void dump(Dumper& dumper) const { + dumper.dump(m_size); + dumper.dump(m_width); + dumper.dump(m_mask); + dumper.dump_vec(m_bits); + } + + private: + uint64_t m_size; + uint64_t m_width; + uint64_t m_mask; + std::vector m_bits; +}; + +} // namespace pthash \ No newline at end of file diff --git a/thirdparty/pthash/encoders/darray.hpp b/thirdparty/pthash/encoders/darray.hpp new file mode 100644 index 0000000000..48de5991da --- /dev/null +++ b/thirdparty/pthash/encoders/darray.hpp @@ -0,0 +1,185 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pthash/encoders/bit_vector.hpp" +#include "pthash/encoders/util.hpp" + +namespace pthash { +namespace detail { + +template +struct darray { + darray() : m_positions() {} + + darray(bit_vector const& bv) : m_positions() { + std::vector const& data = bv.data(); + std::vector cur_block_positions; + std::vector block_inventory; + std::vector subblock_inventory; + std::vector overflow_positions; + + for (size_t word_idx = 0; word_idx < data.size(); ++word_idx) { + size_t cur_pos = word_idx << 6; + uint64_t cur_word = WordGetter()(data, word_idx); + unsigned long l; + while (util::lsb(cur_word, l)) { + cur_pos += l; + cur_word >>= l; + if (cur_pos >= bv.size()) + break; + + cur_block_positions.push_back(cur_pos); + + if (cur_block_positions.size() == block_size) { + flush_cur_block(cur_block_positions, block_inventory, + subblock_inventory, overflow_positions); + } + + // can't do >>= l + 1, can be 64 + cur_word >>= 1; + cur_pos += 1; + m_positions += 1; + } + } + if (cur_block_positions.size()) { + flush_cur_block(cur_block_positions, block_inventory, subblock_inventory, + overflow_positions); + } + m_block_inventory.swap(block_inventory); + m_subblock_inventory.swap(subblock_inventory); + m_overflow_positions.swap(overflow_positions); + } + + void swap(darray& other) { + std::swap(other.m_positions, m_positions); + m_block_inventory.swap(other.m_block_inventory); + m_subblock_inventory.swap(other.m_subblock_inventory); + m_overflow_positions.swap(other.m_overflow_positions); + } + + inline uint64_t select(bit_vector const& bv, uint64_t idx) const { + assert(idx < num_positions()); + uint64_t block = idx / block_size; + int64_t block_pos = m_block_inventory[block]; + if (block_pos < 0) { // sparse super-block + uint64_t overflow_pos = uint64_t(-block_pos - 1); + return m_overflow_positions[overflow_pos + (idx & (block_size - 1))]; + } + + size_t subblock = idx / subblock_size; + size_t start_pos = uint64_t(block_pos) + m_subblock_inventory[subblock]; + size_t reminder = idx & (subblock_size - 1); + if (!reminder) + return start_pos; + + std::vector const& data = bv.data(); + size_t word_idx = start_pos >> 6; + size_t word_shift = start_pos & 63; + uint64_t word = WordGetter()(data, word_idx) & (uint64_t(-1) << word_shift); + while (true) { + size_t popcnt = util::popcount(word); + if (reminder < popcnt) + break; + reminder -= popcnt; + word = WordGetter()(data, ++word_idx); + } + return (word_idx << 6) + util::select_in_word(word, reminder); + } + + inline uint64_t num_positions() const { return m_positions; } + + uint64_t bytes() const { + return sizeof(m_positions) + essentials::vec_bytes(m_block_inventory) + + essentials::vec_bytes(m_subblock_inventory) + + essentials::vec_bytes(m_overflow_positions); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_positions); + visitor.visit(m_block_inventory); + visitor.visit(m_subblock_inventory); + visitor.visit(m_overflow_positions); + } + + template + void load(Loader& loader) { + loader.load(m_positions); + loader.load_vec(m_block_inventory); + loader.load_vec(m_subblock_inventory); + 
loader.load_vec(m_overflow_positions); + } + + template + void dump(Dumper& dumper) const { + dumper.dump(m_positions); + dumper.dump_vec(m_block_inventory); + dumper.dump_vec(m_subblock_inventory); + dumper.dump_vec(m_overflow_positions); + } + + protected: + static void flush_cur_block(std::vector& cur_block_positions, + std::vector& block_inventory, + std::vector& subblock_inventory, + std::vector& overflow_positions) { + if (cur_block_positions.back() - cur_block_positions.front() < + max_in_block_distance) { + block_inventory.push_back(int64_t(cur_block_positions.front())); + for (size_t i = 0; i < cur_block_positions.size(); i += subblock_size) { + subblock_inventory.push_back( + uint16_t(cur_block_positions[i] - cur_block_positions.front())); + } + } else { + block_inventory.push_back(-int64_t(overflow_positions.size()) - 1); + for (size_t i = 0; i < cur_block_positions.size(); ++i) { + overflow_positions.push_back(cur_block_positions[i]); + } + for (size_t i = 0; i < cur_block_positions.size(); i += subblock_size) { + subblock_inventory.push_back(uint16_t(-1)); + } + } + cur_block_positions.clear(); + } + + static const size_t block_size = 1024; // 2048 + static const size_t subblock_size = 32; + static const size_t max_in_block_distance = 1 << 16; + + size_t m_positions; + std::vector m_block_inventory; + std::vector m_subblock_inventory; + std::vector m_overflow_positions; +}; + +struct identity_getter { + uint64_t operator()(std::vector const& data, size_t idx) const { + return data[idx]; + } +}; + +} // namespace detail + +typedef detail::darray darray1; + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/ef_sequence.hpp b/thirdparty/pthash/encoders/ef_sequence.hpp new file mode 100644 index 0000000000..637c873eb0 --- /dev/null +++ b/thirdparty/pthash/encoders/ef_sequence.hpp @@ -0,0 +1,133 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "pthash/encoders/bit_vector.hpp" +#include "pthash/encoders/compact_vector.hpp" +#include "pthash/encoders/darray.hpp" + +namespace pthash { + +template +struct ef_sequence { + ef_sequence() {} + + template + void encode(Iterator begin, uint64_t n) { + if (n == 0) + return; + uint64_t u; + if constexpr (encode_prefix_sum) { + u = std::accumulate(begin, begin + n, static_cast(0)); + n = n + 1; // because I will add a zero at the beginning + } else { + u = *(begin + n - 1); + }; + + uint64_t l = uint64_t((n && u / n) ? 
util::msb(u / n) : 0); + bit_vector_builder bvb_high_bits(n + (u >> l) + 1); + compact_vector::builder cv_builder_low_bits(n, l); + + uint64_t low_mask = (uint64_t(1) << l) - 1; + uint64_t last = 0; + // I add a zero at the beginning + if constexpr (encode_prefix_sum) { + if (l) + cv_builder_low_bits.push_back(0); + bvb_high_bits.set(0, 1); + n = n - 1; // restore n + } + for (size_t i = 0; i < n; ++i, ++begin) { + auto v = *begin; + if constexpr (encode_prefix_sum) { + v = v + last; // prefix sum + } else if (i and v < last) { // check the order + std::cerr << "error at " << i << "/" << n << ":\n"; + std::cerr << "last " << last << "\n"; + std::cerr << "current " << v << "\n"; + throw std::runtime_error("ef_sequence is not sorted"); + } + if (l) + cv_builder_low_bits.push_back(v & low_mask); + bvb_high_bits.set((v >> l) + i + encode_prefix_sum, 1); + last = v; + } + + bit_vector(&bvb_high_bits).swap(m_high_bits); + cv_builder_low_bits.build(m_low_bits); + darray1(m_high_bits).swap(m_high_bits_d1); + } + + inline uint64_t access(uint64_t i) const { + assert(i < size()); + return ((m_high_bits_d1.select(m_high_bits, i) - i) << m_low_bits.width()) | + m_low_bits.access(i); + } + + inline uint64_t diff(uint64_t i) const { + assert(i < size() && encode_prefix_sum); + uint64_t low1 = m_low_bits.access(i); + uint64_t low2 = m_low_bits.access(i + 1); + uint64_t l = m_low_bits.width(); + uint64_t pos = m_high_bits_d1.select(m_high_bits, i); + uint64_t h1 = pos - i; + uint64_t h2 = + bit_vector::unary_iterator(m_high_bits, pos + 1).next() - i - 1; + uint64_t val1 = (h1 << l) | low1; + uint64_t val2 = (h2 << l) | low2; + return val2 - val1; + } + + inline uint64_t size() const { return m_low_bits.size(); } + + uint64_t num_bits() const { + return 8 * + (m_high_bits.bytes() + m_high_bits_d1.bytes() + m_low_bits.bytes()); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_high_bits); + visitor.visit(m_high_bits_d1); + visitor.visit(m_low_bits); + } + + template + void load(Loader& loader) { + m_high_bits.load(loader); + m_high_bits_d1.load(loader); + m_low_bits.load(loader); + } + + template + void dump(Dumper& dumper) const { + m_high_bits.dump(dumper); + m_high_bits_d1.dump(dumper); + m_low_bits.dump(dumper); + } + + private: + bit_vector m_high_bits; + darray1 m_high_bits_d1; + compact_vector m_low_bits; +}; + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/encoders.hpp b/thirdparty/pthash/encoders/encoders.hpp new file mode 100644 index 0000000000..75ab6f8571 --- /dev/null +++ b/thirdparty/pthash/encoders/encoders.hpp @@ -0,0 +1,161 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "pthash/essentials/essentials.hpp" + +#include "pthash/encoders/compact_vector.hpp" +#include "pthash/encoders/ef_sequence.hpp" + +#include +#include +#include + +namespace pthash { + +template +std::pair, std::vector> +compute_ranks_and_dictionary(Iterator begin, uint64_t n) { + // accumulate frequencies + std::unordered_map distinct; + for (auto it = begin, end = begin + n; it != end; ++it) { + auto find_it = distinct.find(*it); + if (find_it != distinct.end()) { // found + (*find_it).second += 1; + } else { + distinct[*it] = 1; + } + } + std::vector> vec; + vec.reserve(distinct.size()); + for (auto p : distinct) + vec.emplace_back(p.first, p.second); + std::sort(vec.begin(), vec.end(), + [](const std::pair& x, + const std::pair& y) { + return x.second > y.second; + }); + distinct.clear(); + // assign codewords by non-increasing frequency + std::vector dict; + dict.reserve(distinct.size()); + for (uint64_t i = 0; i != vec.size(); ++i) { + auto p = vec[i]; + distinct.insert({p.first, i}); + dict.push_back(p.first); + } + + std::vector ranks; + ranks.reserve(n); + for (auto it = begin, end = begin + n; it != end; ++it) + ranks.push_back(distinct[*it]); + return {ranks, dict}; +} + +struct dictionary { + template + void encode(Iterator begin, uint64_t n) { + auto [ranks, dict] = compute_ranks_and_dictionary(begin, n); + m_ranks.build(ranks.begin(), ranks.size()); + m_dict.build(dict.begin(), dict.size()); + } + + static std::string name() { return "dictionary"; } + + size_t size() const { return m_ranks.size(); } + + size_t num_bits() const { return (m_ranks.bytes() + m_dict.bytes()) * 8; } + + uint64_t access(uint64_t i) const { + uint64_t rank = m_ranks.access(i); + return m_dict.access(rank); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_ranks); + visitor.visit(m_dict); + } + + template + void load(Loader& loader) { + m_ranks.load(loader); + m_dict.load(loader); + } + + template + void dump(Dumper& dumper) const { + m_ranks.dump(dumper); + m_dict.dump(dumper); + } + + private: + compact_vector m_ranks; + compact_vector m_dict; +}; + +template +struct dual { + template + void encode(Iterator begin, uint64_t n) { + size_t front_size = n * 0.3; + m_front.encode(begin, front_size); + m_back.encode(begin + front_size, n - front_size); + } + + static std::string name() { return Front::name() + "-" + Back::name(); } + + size_t num_bits() const { return m_front.num_bits() + m_back.num_bits(); } + + uint64_t access(uint64_t i) const { + if (i < m_front.size()) + return m_front.access(i); + return m_back.access(i - m_front.size()); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_front); + visitor.visit(m_back); + } + + template + void load(Loader& loader) { + m_front.load(loader); + m_back.load(loader); + } + + template + void dump(Dumper& dumper) const { + m_front.dump(dumper); + m_back.dump(dumper); + } + + private: + Front m_front; + Back m_back; +}; + +/* dual encoders */ +typedef dual dictionary_dictionary; + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/util.hpp b/thirdparty/pthash/encoders/util.hpp new file mode 100644 index 0000000000..b44884ff43 --- /dev/null +++ b/thirdparty/pthash/encoders/util.hpp @@ -0,0 +1,84 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. 
+ * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace pthash::util { + +template +inline void prefetch(T const* ptr) { + _mm_prefetch(reinterpret_cast(ptr), _MM_HINT_T0); +} + +inline uint8_t msb(uint64_t x) { + assert(x); + unsigned long ret = -1U; + if (x) { + ret = (unsigned long) (63 - __builtin_clzll(x)); + } + return (uint8_t) ret; +} + +inline bool bsr64(unsigned long* const index, const uint64_t mask) { + if (mask) { + *index = (unsigned long) (63 - __builtin_clzll(mask)); + return true; + } else { + return false; + } +} + +inline uint8_t msb(uint64_t x, unsigned long& ret) { return bsr64(&ret, x); } + +inline uint8_t lsb(uint64_t x, unsigned long& ret) { + if (x) { + ret = (unsigned long) __builtin_ctzll(x); + return true; + } + return false; +} + +inline uint8_t lsb(uint64_t x) { + assert(x); + unsigned long ret = -1U; + lsb(x, ret); + return (uint8_t) ret; +} + +inline uint64_t popcount(uint64_t x) { + return static_cast(_mm_popcnt_u64(x)); +} + +inline uint64_t select64_pdep_tzcnt(uint64_t x, const uint64_t k) { + uint64_t i = 1ULL << k; + asm("pdep %[x], %[mask], %[x]" : [x] "+r"(x) : [mask] "r"(i)); + asm("tzcnt %[bit], %[index]" : [index] "=r"(i) : [bit] "g"(x) : "cc"); + return i; +} + +inline uint64_t select_in_word(const uint64_t x, const uint64_t k) { + assert(k < popcount(x)); + return select64_pdep_tzcnt(x, k); +} + +} // namespace pthash::util \ No newline at end of file diff --git a/thirdparty/pthash/essentials/essentials.hpp b/thirdparty/pthash/essentials/essentials.hpp new file mode 100644 index 0000000000..8f044731ab --- /dev/null +++ b/thirdparty/pthash/essentials/essentials.hpp @@ -0,0 +1,612 @@ +/** Copyright 2019-2021 Giulio Ermanno Pibiri + * + * The following sets forth attribution notices for third party software. + * + * C++ Essentials: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/essentials + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __GNUG__ +#include // for name demangling +#endif + +namespace essentials { + +inline void logger(std::string const& msg) { + time_t t = std::time(nullptr); + std::locale loc; + const std::time_put& tp = std::use_facet>(loc); + const char* fmt = "%F %T"; + tp.put(std::cout, std::cout, ' ', std::localtime(&t), fmt, fmt + strlen(fmt)); + std::cout << ": " << msg << std::endl; +} + +static const uint64_t GB = 1000 * 1000 * 1000; +static const uint64_t GiB = uint64_t(1) << 30; +static const uint64_t MB = 1000 * 1000; +static const uint64_t MiB = uint64_t(1) << 20; +static const uint64_t KB = 1000; +static const uint64_t KiB = uint64_t(1) << 10; + +inline double convert(size_t bytes, uint64_t unit) { + return static_cast(bytes) / unit; +} + +template +size_t vec_bytes(T const& vec) { + return vec.size() * sizeof(vec.front()) + sizeof(typename T::size_type); +} + +template +size_t pod_bytes(T const& pod) { + static_assert(std::is_pod::value); + return sizeof(pod); +} + +inline size_t file_size(char const* filename) { + std::ifstream is(filename, std::ios::binary | std::ios::ate); + if (!is.good()) { + throw std::runtime_error( + "Error in opening binary " + "file."); + } + size_t bytes = (size_t) is.tellg(); + is.close(); + return bytes; +} + +template +uint64_t words_for(uint64_t bits) { + uint64_t word_bits = sizeof(WordType) * 8; + return (bits + word_bits - 1) / word_bits; +} + +template +inline void do_not_optimize_away(T&& value) { + asm volatile("" : "+r"(value)); +} + +inline uint64_t maxrss_in_bytes() { + struct rusage ru; + if (getrusage(RUSAGE_SELF, &ru) == 0) { + // NOTE: ru_maxrss is in kilobytes on Linux, but not on Apple... 
+#ifdef __APPLE__ + return ru.ru_maxrss; +#endif + return ru.ru_maxrss * 1000; + } + return 0; +} + +template +void load_pod(std::istream& is, T& val) { + static_assert(std::is_pod::value); + is.read(reinterpret_cast(&val), sizeof(T)); +} + +template +void load_vec(std::istream& is, std::vector& vec) { + size_t n; + load_pod(is, n); + vec.resize(n); + is.read(reinterpret_cast(vec.data()), + static_cast(sizeof(T) * n)); +} + +template +void save_pod(std::ostream& os, T const& val) { + static_assert(std::is_pod::value); + os.write(reinterpret_cast(&val), sizeof(T)); +} + +template +void save_vec(std::ostream& os, std::vector const& vec) { + static_assert(std::is_pod::value); + size_t n = vec.size(); + save_pod(os, n); + os.write(reinterpret_cast(vec.data()), + static_cast(sizeof(T) * n)); +} + +template +struct timer { + void start() { m_start = ClockType::now(); } + + void stop() { + m_stop = ClockType::now(); + auto elapsed = std::chrono::duration_cast(m_stop - m_start); + m_timings.push_back(elapsed.count()); + } + + size_t runs() const { return m_timings.size(); } + + void reset() { m_timings.clear(); } + + double min() const { + return *std::min_element(m_timings.begin(), m_timings.end()); + } + + double max() const { + return *std::max_element(m_timings.begin(), m_timings.end()); + } + + void discard_first() { + if (runs()) { + m_timings.erase(m_timings.begin()); + } + } + + void discard_min() { + if (runs() > 1) { + m_timings.erase(std::min_element(m_timings.begin(), m_timings.end())); + } + } + + void discard_max() { + if (runs() > 1) { + m_timings.erase(std::max_element(m_timings.begin(), m_timings.end())); + } + } + + double elapsed() { + return std::accumulate(m_timings.begin(), m_timings.end(), 0.0); + } + + double average() { return elapsed() / runs(); } + + private: + typename ClockType::time_point m_start; + typename ClockType::time_point m_stop; + std::vector m_timings; +}; + +typedef std::chrono::high_resolution_clock clock_type; +typedef std::chrono::microseconds duration_type; +typedef timer timer_type; + +inline unsigned get_random_seed() { + return std::chrono::system_clock::now().time_since_epoch().count(); +} + +template +struct uniform_int_rng { + uniform_int_rng(IntType from, IntType to, unsigned seed = 13) + : m_rng(seed), m_distr(from, to) {} + + IntType gen() { return m_distr(m_rng); } + + private: + std::mt19937_64 m_rng; + std::uniform_int_distribution m_distr; +}; + +struct loader { + loader(char const* filename) + : m_num_bytes_pods(0), + m_num_bytes_vecs_of_pods(0), + m_is(filename, std::ios::binary) { + if (!m_is.good()) { + throw std::runtime_error( + "Error in opening binary " + "file."); + } + } + + ~loader() { m_is.close(); } + + template + void visit(T& val) { + if constexpr (std::is_pod::value) { + load_pod(m_is, val); + m_num_bytes_pods += pod_bytes(val); + } else { + val.visit(*this); + } + } + + template + void visit(std::vector& vec) { + size_t n; + visit(n); + vec.resize(n); + if constexpr (std::is_pod::value) { + m_is.read(reinterpret_cast(vec.data()), + static_cast(sizeof(T) * n)); + m_num_bytes_vecs_of_pods += n * sizeof(T); + } else { + for (auto& v : vec) + visit(v); + } + } + + size_t bytes() { return m_is.tellg(); } + + size_t bytes_pods() { return m_num_bytes_pods; } + + size_t bytes_vecs_of_pods() { return m_num_bytes_vecs_of_pods; } + + private: + size_t m_num_bytes_pods; + size_t m_num_bytes_vecs_of_pods; + std::ifstream m_is; +}; + +struct saver { + saver(char const* filename) : m_os(filename, std::ios::binary) { + if (!m_os.good()) { 
+ throw std::runtime_error( + "Error in opening binary " + "file."); + } + } + + ~saver() { m_os.close(); } + + template + void visit(T& val) { + if constexpr (std::is_pod::value) { + save_pod(m_os, val); + } else { + val.visit(*this); + } + } + + template + void visit(std::vector& vec) { + if constexpr (std::is_pod::value) { + save_vec(m_os, vec); + } else { + size_t n = vec.size(); + visit(n); + for (auto& v : vec) + visit(v); + } + } + + size_t bytes() { return m_os.tellp(); } + + private: + std::ofstream m_os; +}; + +inline std::string demangle(char const* mangled_name) { + size_t len = 0; + int status = 0; + std::unique_ptr ptr( + __cxxabiv1::__cxa_demangle(mangled_name, nullptr, &len, &status), + &std::free); + return ptr.get(); +} + +struct sizer { + sizer(std::string const& root_name = "") + : m_root(0, 0, root_name), m_current(&m_root) {} + + struct node { + node(size_t b, size_t d, std::string const& n = "") + : bytes(b), depth(d), name(n) {} + + size_t bytes; + size_t depth; + std::string name; + std::vector children; + }; + + template + void visit(T& val) { + if constexpr (std::is_pod::value) { + node n(pod_bytes(val), m_current->depth + 1, demangle(typeid(T).name())); + m_current->children.push_back(n); + m_current->bytes += n.bytes; + } else { + val.visit(*this); + } + } + + template + void visit(std::vector& vec) { + if constexpr (std::is_pod::value) { + node n(vec_bytes(vec), m_current->depth + 1, + demangle(typeid(std::vector).name())); + m_current->children.push_back(n); + m_current->bytes += n.bytes; + } else { + size_t n = vec.size(); + m_current->bytes += pod_bytes(n); + node* parent = m_current; + for (auto& v : vec) { + node n(0, parent->depth + 1, demangle(typeid(T).name())); + parent->children.push_back(n); + m_current = &parent->children.back(); + visit(v); + parent->bytes += m_current->bytes; + } + m_current = parent; + } + } + + template + void print(node const& n, size_t total_bytes, Device& device) const { + auto indent = std::string(n.depth * 4, ' '); + device << indent << "'" << n.name << "' - bytes = " << n.bytes << " (" + << n.bytes * 100.0 / total_bytes << "%)" << std::endl; + for (auto const& child : n.children) { + device << indent; + print(child, total_bytes, device); + } + } + + template + void print(Device& device) const { + print(m_root, bytes(), device); + } + + size_t bytes() const { return m_root.bytes; } + + private: + node m_root; + node* m_current; +}; + +template +struct allocator : std::allocator { + typedef T value_type; + + allocator() : m_addr(nullptr) {} + + allocator(T* addr) : m_addr(addr) {} + + T* allocate(size_t n) { + if (m_addr == nullptr) + return std::allocator::allocate(n); + return m_addr; + } + + void deallocate(T* p, size_t n) { + if (m_addr == nullptr) + return std::allocator::deallocate(p, n); + } + + private: + T* m_addr; +}; + +struct contiguous_memory_allocator { + contiguous_memory_allocator() : m_begin(nullptr), m_end(nullptr), m_size(0) {} + + struct visitor { + visitor(uint8_t* begin, size_t size, char const* filename) + : m_begin(begin), + m_end(begin), + m_size(size), + m_is(filename, std::ios::binary) { + if (!m_is.good()) { + throw std::runtime_error( + "Error in opening binary " + "file."); + } + } + + ~visitor() { m_is.close(); } + + template + void visit(T& val) { + if constexpr (std::is_pod::value) { + load_pod(m_is, val); + } else { + val.visit(*this); + } + } + + template + void visit(std::vector& vec) { + if constexpr (std::is_pod::value) { + vec = std::vector(make_allocator()); + load_vec(m_is, vec); + 
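+ // (descriptive note, added: advance the arena cursor past the bytes just
+ // placed, so the next vector is laid out contiguously after this one)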
consume(vec.size() * sizeof(T)); + } else { + size_t n; + visit(n); + vec.resize(n); + for (auto& v : vec) + visit(v); + } + } + + uint8_t* end() { return m_end; } + + size_t size() const { return m_size; } + + size_t allocated() const { + assert(m_end >= m_begin); + return m_end - m_begin; + } + + template + allocator make_allocator() { + return allocator(reinterpret_cast(m_end)); + } + + void consume(size_t num_bytes) { + if (m_end == nullptr) + return; + if (allocated() + num_bytes > size()) { + throw std::runtime_error("allocation failed"); + } + m_end += num_bytes; + } + + private: + uint8_t* m_begin; + uint8_t* m_end; + size_t m_size; + std::ifstream m_is; + }; + + template + size_t allocate(T& data_structure, char const* filename) { + loader l(filename); + l.visit(data_structure); + m_size = l.bytes_vecs_of_pods(); + m_begin = reinterpret_cast(malloc(m_size)); + if (m_begin == nullptr) + throw std::runtime_error("malloc failed"); + visitor v(m_begin, m_size, filename); + v.visit(data_structure); + m_end = v.end(); + return l.bytes(); + } + + ~contiguous_memory_allocator() { free(m_begin); } + + uint8_t* begin() { return m_begin; } + + uint8_t* end() { return m_end; } + + size_t size() const { return m_size; } + + private: + uint8_t* m_begin; + uint8_t* m_end; + size_t m_size; +}; + +template +size_t visit(T& data_structure, char const* filename) { + Visitor visitor(filename); + visitor.visit(data_structure); + return visitor.bytes(); +} + +template +size_t load(T& data_structure, char const* filename) { + return visit(data_structure, filename); +} + +template +size_t load_with_custom_memory_allocation(T& data_structure, + char const* filename) { + return data_structure.get_allocator().allocate(data_structure, filename); +} + +template +size_t save(T& data_structure, char const* filename) { + return visit(data_structure, filename); +} + +template +size_t print_size(T& data_structure, Device& device) { + sizer visitor(demangle(typeid(T).name())); + visitor.visit(data_structure); + visitor.print(device); + return visitor.bytes(); +} + +#if defined(__CYGWIN__) || defined(_WIN32) || defined(_WIN64) +#else +struct directory { + struct file_name { + std::string name; + std::string fullpath; + std::string extension; + }; + + ~directory() { + for (int i = 0; i != items(); ++i) { + free(m_items_names[i]); + } + free(m_items_names); + } + + directory(std::string const& name) : m_name(name) { + m_n = scandir(m_name.c_str(), &m_items_names, NULL, alphasort); + if (m_n < 0) { + throw std::runtime_error("error during scandir"); + } + } + + std::string const& name() const { return m_name; } + + int items() const { return m_n; } + + struct iterator { + iterator(directory const* d, int i) : m_d(d), m_i(i) {} + + file_name operator*() { + file_name fn; + fn.name = m_d->m_items_names[m_i]->d_name; + fn.fullpath = m_d->name() + "/" + fn.name; + size_t p = fn.name.find_last_of("."); + fn.extension = fn.name.substr(p + 1); + return fn; + } + + void operator++() { ++m_i; } + + bool operator==(iterator const& rhs) const { return m_i == rhs.m_i; } + + bool operator!=(iterator const& rhs) const { return !(*this == rhs); } + + private: + directory const* m_d; + int m_i; + }; + + iterator begin() { return iterator(this, 0); } + + iterator end() { return iterator(this, items()); } + + private: + std::string m_name; + struct dirent** m_items_names; + int m_n; +}; +#endif + +inline bool create_directory(std::string const& name) { + if (mkdir(name.c_str(), 0777) != 0) { + if (errno == EEXIST) { + std::cerr << 
"directory already exists" << std::endl; + } + return false; + } + return true; +} + +inline bool remove_directory(std::string const& name) { + return rmdir(name.c_str()) == 0; +} + +} // namespace essentials \ No newline at end of file diff --git a/thirdparty/pthash/fastmod/fastmod.h b/thirdparty/pthash/fastmod/fastmod.h new file mode 100644 index 0000000000..8ac9743d50 --- /dev/null +++ b/thirdparty/pthash/fastmod/fastmod.h @@ -0,0 +1,209 @@ +// credits to Daniel Lemire: https://github.com/lemire/fastmod + +#ifndef FASTMOD_H +#define FASTMOD_H + +#ifndef __cplusplus +#include +#include +#else +// In C++ / are irelevant as bool is already a type +#include +#endif + +#ifndef __cplusplus +#define FASTMOD_API static inline +#else +// In C++ we mark all the functions inline. +// If C++14 relaxed constexpr is supported we use constexpr so functions +// can be used at compile-time. +#if __cpp_constexpr >= 201304 && !defined(_MSC_VER) +// visual studio does not like constexpr +#define FASTMOD_API constexpr +#define FASTMOD_CONSTEXPR constexpr +#else +#define FASTMOD_API inline +#define FASTMOD_CONSTEXPR +#endif +#endif + +#ifdef _MSC_VER +#include +#endif + +#ifdef __cplusplus +namespace fastmod { +#endif + +#ifdef _MSC_VER + +// __umulh is only available in x64 mode under Visual Studio: don't compile to +// 32-bit! +FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) { + return __umulh(lowbits, d); +} + +#else // _MSC_VER NOT defined + +FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) { + return ((__uint128_t) lowbits * d) >> 64; +} + +FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) { + return ((__int128_t) lowbits * d) >> 64; +} + +// This is for the 64-bit functions. +FASTMOD_API uint64_t mul128_u64(__uint128_t lowbits, uint64_t d) { + __uint128_t bottom_half = + (lowbits & UINT64_C(0xFFFFFFFFFFFFFFFF)) * d; // Won't overflow + bottom_half >>= + 64; // Only need the top 64 bits, as we'll shift the lower half away; + __uint128_t top_half = (lowbits >> 64) * d; + __uint128_t both_halves = + bottom_half + top_half; // Both halves are already shifted down by 64 + both_halves >>= 64; // Get top half of both_halves + return (uint64_t) both_halves; +} + +#endif // _MSC_VER + +/** + * Unsigned integers. + * Usage: + * uint32_t d = ... ; // divisor, should be non-zero + * uint64_t M = computeM_u32(d); // do once + * fastmod_u32(a,M,d) is a % d for all 32-bit a. + * + **/ + +// M = ceil( (1<<64) / d ), d > 0 +FASTMOD_API uint64_t computeM_u32(uint32_t d) { + return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1; +} + +// fastmod computes (a % d) given precomputed M +FASTMOD_API uint32_t fastmod_u32(uint32_t a, uint64_t M, uint32_t d) { + uint64_t lowbits = M * a; + return (uint32_t) (mul128_u32(lowbits, d)); +} + +// fastmod computes (a / d) given precomputed M for d>1 +FASTMOD_API uint32_t fastdiv_u32(uint32_t a, uint64_t M) { + return (uint32_t) (mul128_u32(M, a)); +} + +// given precomputed M, checks whether n % d == 0 +FASTMOD_API bool is_divisible(uint32_t n, uint64_t M) { return n * M <= M - 1; } + +/** + * signed integers + * Usage: + * int32_t d = ... ; // should be non-zero and between [-2147483647,2147483647] + * int32_t positive_d = d < 0 ? -d : d; // absolute value + * uint64_t M = computeM_s32(d); // do once + * fastmod_s32(a,M,positive_d) is a % d for all 32-bit a. 
+ **/ + +// M = floor( (1<<64) / d ) + 1 +// you must have that d is different from 0 and -2147483648 +// if d = -1 and a = -2147483648, the result is undefined +FASTMOD_API uint64_t computeM_s32(int32_t d) { + if (d < 0) + d = -d; + return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1 + ((d & (d - 1)) == 0 ? 1 : 0); +} + +// fastmod computes (a % d) given precomputed M, +// you should pass the absolute value of d +FASTMOD_API int32_t fastmod_s32(int32_t a, uint64_t M, int32_t positive_d) { + uint64_t lowbits = M * a; + int32_t highbits = mul128_u32(lowbits, positive_d); + return highbits - ((positive_d - 1) & (a >> 31)); +} + +#ifndef _MSC_VER +// fastmod computes (a / d) given precomputed M, assumes that d must not +// be one of -1, 1, or -2147483648 +FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) { + uint64_t highbits = mul128_s32(M, a); + highbits += (a < 0 ? 1 : 0); + if (d < 0) + return -(int32_t) (highbits); + return (int32_t) (highbits); +} + +// What follows is the 64-bit functions. +// They are currently not supported on Visual Studio +// due to the lack of a mul128_u64 function. +// They may not be faster than what the compiler +// can produce. + +FASTMOD_API __uint128_t computeM_u64(uint64_t d) { + // what follows is just ((__uint128_t)0 - 1) / d) + 1 spelled out + __uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF); + M <<= 64; + M |= UINT64_C(0xFFFFFFFFFFFFFFFF); + M /= d; + M += 1; + return M; +} + +FASTMOD_API __uint128_t computeM_s64(int64_t d) { + if (d < 0) + d = -d; + __uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF); + M <<= 64; + M |= UINT64_C(0xFFFFFFFFFFFFFFFF); + M /= d; + M += 1; + M += ((d & (d - 1)) == 0 ? 1 : 0); + return M; +} + +FASTMOD_API uint64_t fastmod_u64(uint64_t a, __uint128_t M, uint64_t d) { + __uint128_t lowbits = M * a; + return mul128_u64(lowbits, d); +} + +FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) { + return mul128_u64(M, a); +} + +// End of the 64-bit functions + +#endif // #ifndef _MSC_VER + +#ifdef __cplusplus + +template +FASTMOD_API uint32_t fastmod(uint32_t x) { + FASTMOD_CONSTEXPR uint64_t v = computeM_u32(d); + return fastmod_u32(x, v, d); +} +template +FASTMOD_API uint32_t fastdiv(uint32_t x) { + FASTMOD_CONSTEXPR uint64_t v = computeM_u32(d); + return fastdiv_u32(x, v); +} +template +FASTMOD_API int32_t fastmod(int32_t x) { + FASTMOD_CONSTEXPR uint64_t v = computeM_s32(d); + return fastmod_s32(x, v, d); +} +template +FASTMOD_API int32_t fastdiv(int32_t x) { + FASTMOD_CONSTEXPR uint64_t v = computeM_s32(d); + return fastdiv_s32(x, v, d); +} + +} // fastmod +#endif + +// There's no reason to polute the global scope with this macro once its use +// ends This won't create any problems as the preprocessor will have done its +// thing once it reaches this point +#undef FASTMOD_API +#undef FASTMOD_CONSTEXPR + +#endif // FASTMOD_H \ No newline at end of file diff --git a/thirdparty/pthash/mm_file/mm_file.hpp b/thirdparty/pthash/mm_file/mm_file.hpp new file mode 100644 index 0000000000..53513f2c6d --- /dev/null +++ b/thirdparty/pthash/mm_file/mm_file.hpp @@ -0,0 +1,170 @@ +/** Copyright 2019 Giulio Ermanno Pibiri + * + * The following sets forth attribution notices for third party software. + * + * Memory-mapped files: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/mm_file + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include // close(fd) +#include +#include + +namespace mm { + +namespace advice { +static const int normal = POSIX_MADV_NORMAL; +static const int random = POSIX_MADV_RANDOM; +static const int sequential = POSIX_MADV_SEQUENTIAL; +} // namespace advice + +template +struct file { + file() { init(); } + + ~file() { close(); } + + file(file const&) = delete; // non construction-copyable + file& operator=(file const&) = delete; // non copyable + + bool is_open() const { return m_fd != -1; } + + void close() { + if (is_open()) { + if (munmap((char*) m_data, m_size) == -1) { + throw std::runtime_error("munmap failed when closing file"); + } + ::close(m_fd); + init(); + } + } + + size_t bytes() const { return m_size; } + + size_t size() const { return m_size / sizeof(T); } + + T* data() const { return m_data; } + + struct iterator { + iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {} + + T operator*() { return *m_ptr; } + + void operator++() { ++m_ptr; } + + bool operator==(iterator const& rhs) const { return m_ptr == rhs.m_ptr; } + + bool operator!=(iterator const& rhs) const { return !((*this) == rhs); } + + private: + T* m_ptr; + }; + + iterator begin() const { return iterator(m_data); } + + iterator end() const { return iterator(m_data, size()); } + + protected: + int m_fd; + size_t m_size; + T* m_data; + + void init() { + m_fd = -1; + m_size = 0; + m_data = nullptr; + } + + void check_fd() { + if (m_fd == -1) + throw std::runtime_error("cannot open file"); + } +}; + +template +Pointer mmap(int fd, size_t size, int prot) { + static const size_t offset = 0; + Pointer p = + static_cast(::mmap(NULL, size, prot, MAP_SHARED, fd, offset)); + if (p == MAP_FAILED) + throw std::runtime_error("mmap failed"); + return p; +} + +template +struct file_source : public file { + typedef file base; + + file_source() {} + + file_source(std::string const& path, int adv = advice::normal) { + open(path, adv); + } + + void open(std::string const& path, int adv = advice::normal) { + base::m_fd = ::open(path.c_str(), O_RDONLY); + base::check_fd(); + struct stat fs; + if (fstat(base::m_fd, &fs) == -1) { + throw std::runtime_error("cannot stat file"); + } + base::m_size = fs.st_size; + base::m_data = mmap(base::m_fd, base::m_size, PROT_READ); + if (posix_madvise((void*) base::m_data, base::m_size, adv)) { + throw std::runtime_error("madvise failed"); + } + } +}; + +template +struct file_sink : public file { + typedef file base; + + file_sink() {} + + file_sink(std::string const& path) { open(path); } + + file_sink(std::string const& path, size_t n) { open(path, n); } + + void open(std::string const& path) { + static const mode_t mode = 0600; // read/write + base::m_fd = ::open(path.c_str(), O_RDWR, mode); + base::check_fd(); + struct stat fs; + if (fstat(base::m_fd, &fs) == -1) { + throw std::runtime_error("cannot stat file"); + } + base::m_size = fs.st_size; + base::m_data = mmap(base::m_fd, base::m_size, PROT_READ | PROT_WRITE); + } + + void open(std::string const& path, size_t n) { + static const mode_t mode = 0600; // read/write + base::m_fd 
= ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode); + base::check_fd(); + base::m_size = n * sizeof(T); + ftruncate(base::m_fd, + base::m_size); // truncate the file at the new size + base::m_data = mmap(base::m_fd, base::m_size, PROT_READ | PROT_WRITE); + } +}; + +} // namespace mm \ No newline at end of file diff --git a/thirdparty/pthash/pthash.hpp b/thirdparty/pthash/pthash.hpp new file mode 100644 index 0000000000..d46f541218 --- /dev/null +++ b/thirdparty/pthash/pthash.hpp @@ -0,0 +1,25 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pthash/encoders/encoders.hpp" +#include "pthash/single_phf.hpp" diff --git a/thirdparty/pthash/single_phf.hpp b/thirdparty/pthash/single_phf.hpp new file mode 100644 index 0000000000..f8126fc584 --- /dev/null +++ b/thirdparty/pthash/single_phf.hpp @@ -0,0 +1,151 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "pthash/builders/external_memory_builder_single_phf.hpp" +#include "pthash/builders/internal_memory_builder_single_phf.hpp" +#include "pthash/builders/util.hpp" +#include "pthash/encoders/ef_sequence.hpp" +#include "pthash/utils/bucketers.hpp" + +namespace pthash { + +template +struct single_phf { + typedef Encoder encoder_type; + static constexpr bool minimal = Minimal; + + template + build_timings build_in_internal_memory(Iterator keys, uint64_t n, + build_configuration const& config) { + internal_memory_builder_single_phf builder; + auto timings = builder.build_from_keys(keys, n, config); + timings.encoding_seconds = build(builder, config); + return timings; + } + + template + build_timings build_in_external_memory(Iterator keys, uint64_t n, + build_configuration const& config) { + external_memory_builder_single_phf builder; + auto timings = builder.build_from_keys(keys, n, config); + timings.encoding_seconds = build(builder, config); + return timings; + } + + template + double build(Builder const& builder, build_configuration const&) { + auto start = clock_type::now(); + m_seed = builder.seed(); + m_num_keys = builder.num_keys(); + m_table_size = builder.table_size(); + m_M = fastmod::computeM_u64(m_table_size); + m_bucketer = builder.bucketer(); + m_pilots.encode(builder.pilots().data(), m_bucketer.num_buckets()); + if constexpr (Minimal) { + m_free_slots.encode(builder.free_slots().data(), + m_table_size - m_num_keys); + } + auto stop = clock_type::now(); + return seconds(stop - start); + } + + template + uint64_t operator()(T const& key) const { + auto hash = Hasher::hash(key, m_seed); + return position(hash); + } + + uint64_t position(typename Hasher::hash_type hash) const { + uint64_t bucket = m_bucketer.bucket(hash.first()); + uint64_t pilot = m_pilots.access(bucket); + uint64_t hashed_pilot = default_hash64(pilot, m_seed); + uint64_t p = + fastmod::fastmod_u64(hash.second() ^ hashed_pilot, m_M, m_table_size); + if constexpr (Minimal) { + if (PTHASH_LIKELY(p < num_keys())) + return p; + return m_free_slots.access(p - num_keys()); + } + return p; + } + + size_t num_bits_for_pilots() const { + return 8 * (sizeof(m_seed) + sizeof(m_num_keys) + sizeof(m_table_size) + + sizeof(m_M)) + + m_bucketer.num_bits() + m_pilots.num_bits(); + } + + size_t num_bits_for_mapper() const { return m_free_slots.num_bits(); } + + size_t num_bits() const { + return num_bits_for_pilots() + num_bits_for_mapper(); + } + + inline uint64_t num_keys() const { return m_num_keys; } + + inline uint64_t table_size() const { return m_table_size; } + + template + void visit(Visitor& visitor) { + visitor.visit(m_seed); + visitor.visit(m_num_keys); + visitor.visit(m_table_size); + visitor.visit(m_M); + visitor.visit(m_bucketer); + visitor.visit(m_pilots); + visitor.visit(m_free_slots); + } + + template + void load(Loader& loader) { + loader.load(m_seed); + loader.load(m_num_keys); + loader.load(m_table_size); + loader.load(m_M); + m_bucketer.load(loader); + m_pilots.load(loader); + m_free_slots.load(loader); + } + + template + void dump(Dumper& dumper) const { + dumper.dump(m_seed); + dumper.dump(m_num_keys); + dumper.dump(m_table_size); + dumper.dump(m_M); + m_bucketer.dump(dumper); + m_pilots.dump(dumper); + m_free_slots.dump(dumper); + } + + private: + uint64_t m_seed; + uint64_t m_num_keys; + uint64_t m_table_size; + __uint128_t m_M; + skew_bucketer m_bucketer; + Encoder m_pilots; + ef_sequence m_free_slots; +}; + +} // namespace pthash diff --git 
diff --git a/thirdparty/pthash/utils/bucketers.hpp b/thirdparty/pthash/utils/bucketers.hpp new file mode 100644 index 0000000000..3af0ce06ab --- /dev/null +++ b/thirdparty/pthash/utils/bucketers.hpp @@ -0,0 +1,92 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pthash/utils/util.hpp" + +namespace pthash { + +struct skew_bucketer { + skew_bucketer() {} + + void init(uint64_t num_buckets) { + m_num_dense_buckets = 0.3 * num_buckets; + m_num_sparse_buckets = num_buckets - m_num_dense_buckets; + m_M_num_dense_buckets = fastmod::computeM_u64(m_num_dense_buckets); + m_M_num_sparse_buckets = fastmod::computeM_u64(m_num_sparse_buckets); + } + + inline uint64_t bucket(uint64_t hash) const { + static const uint64_t T = UINT64_MAX / 5 * 3; + return (hash < T) ? fastmod::fastmod_u64(hash, m_M_num_dense_buckets, + m_num_dense_buckets) + : m_num_dense_buckets + + fastmod::fastmod_u64(hash, m_M_num_sparse_buckets, + m_num_sparse_buckets); + } + + uint64_t num_buckets() const { + return m_num_dense_buckets + m_num_sparse_buckets; + } + + size_t num_bits() const { + return 8 * (sizeof(m_num_dense_buckets) + sizeof(m_num_sparse_buckets) + + sizeof(m_M_num_dense_buckets) + sizeof(m_M_num_sparse_buckets)); + } + + void swap(skew_bucketer& other) { + std::swap(m_num_dense_buckets, other.m_num_dense_buckets); + std::swap(m_num_sparse_buckets, other.m_num_sparse_buckets); + std::swap(m_M_num_dense_buckets, other.m_M_num_dense_buckets); + std::swap(m_M_num_sparse_buckets, other.m_M_num_sparse_buckets); + } + + template <typename Visitor> + void visit(Visitor& visitor) { + visitor.visit(m_num_dense_buckets); + visitor.visit(m_num_sparse_buckets); + visitor.visit(m_M_num_dense_buckets); + visitor.visit(m_M_num_sparse_buckets); + } + + template <typename Loader> + void load(Loader& loader) { + loader.load(m_num_dense_buckets); + loader.load(m_num_sparse_buckets); + loader.load(m_M_num_dense_buckets); + loader.load(m_M_num_sparse_buckets); + } + + template <typename Dumper> + void dump(Dumper& dumper) const { + dumper.dump(m_num_dense_buckets); + dumper.dump(m_num_sparse_buckets); + dumper.dump(m_M_num_dense_buckets); + dumper.dump(m_M_num_sparse_buckets); + } + + private: + uint64_t m_num_dense_buckets, m_num_sparse_buckets; + __uint128_t m_M_num_dense_buckets, m_M_num_sparse_buckets; +}; + +} // namespace pthash \ No newline at end of file diff --git a/thirdparty/pthash/utils/hasher.hpp b/thirdparty/pthash/utils/hasher.hpp new file mode 100644 index 0000000000..9856b3be33 --- /dev/null +++ b/thirdparty/pthash/utils/hasher.hpp @@ -0,0 +1,188 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software.
+ * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// See also https://github.com/jermp/bench_hash_functions + +namespace pthash { + +struct byte_range { + uint8_t const* begin; + uint8_t const* end; +}; + +/* + This code is an adaptation from + https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp + by Austin Appleby +*/ +inline uint64_t MurmurHash2_64(void const* key, size_t len, uint64_t seed) { + const uint64_t m = 0xc6a4a7935bd1e995ULL; + const int r = 47; + + uint64_t h = seed ^ (len * m); + +#if defined(__arm) || defined(__arm__) + const size_t ksize = sizeof(uint64_t); + const unsigned char* data = (const unsigned char*) key; + const unsigned char* end = data + (std::size_t)(len / 8) * ksize; +#else + const uint64_t* data = (const uint64_t*) key; + const uint64_t* end = data + (len / 8); +#endif + + while (data != end) { +#if defined(__arm) || defined(__arm__) + uint64_t k; + memcpy(&k, data, ksize); + data += ksize; +#else + uint64_t k = *data++; +#endif + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char* data2 = (const unsigned char*) data; + + switch (len & 7) { + // fall through + case 7: + h ^= uint64_t(data2[6]) << 48; + // fall through + case 6: + h ^= uint64_t(data2[5]) << 40; + // fall through + case 5: + h ^= uint64_t(data2[4]) << 32; + // fall through + case 4: + h ^= uint64_t(data2[3]) << 24; + // fall through + case 3: + h ^= uint64_t(data2[2]) << 16; + // fall through + case 2: + h ^= uint64_t(data2[1]) << 8; + // fall through + case 1: + h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +inline uint64_t default_hash64(uint64_t val, uint64_t seed) { + return MurmurHash2_64(&val, sizeof(uint64_t), seed); +} + +struct hash64 { + hash64() {} + hash64(uint64_t hash) : m_hash(hash) {} + + inline uint64_t first() const { return m_hash; } + + inline uint64_t second() const { return m_hash; } + + inline uint64_t mix() const { + // From: + // http://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html + // 13-th variant + uint64_t z = m_hash; + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + return z ^ (z >> 31); + } + + private: + uint64_t m_hash; +}; + +struct hash128 { + hash128() {} + hash128(uint64_t first, uint64_t second) : m_first(first), m_second(second) {} + + inline uint64_t first() const { return m_first; } + + inline uint64_t second() const { return m_second; } + + inline uint64_t mix() const { return m_first ^ m_second; } + + private: + uint64_t m_first, m_second; +}; + +struct murmurhash2_64 { + typedef hash64 hash_type; + + // generic range of bytes + static inline hash64 hash(byte_range range, uint64_t seed) { + return MurmurHash2_64(range.begin, range.end - range.begin, seed); + } + + // specialization for std::string + static inline hash64 
hash(std::string const& val, uint64_t seed) { + return MurmurHash2_64(val.data(), val.size(), seed); + } + + // specialization for uint64_t + static inline hash64 hash(uint64_t val, uint64_t seed) { + return MurmurHash2_64(reinterpret_cast<char const*>(&val), sizeof(val), + seed); + } +}; + +struct murmurhash2_128 { + typedef hash128 hash_type; + + // generic range of bytes + static inline hash128 hash(byte_range range, uint64_t seed) { + return {MurmurHash2_64(range.begin, range.end - range.begin, seed), + MurmurHash2_64(range.begin, range.end - range.begin, ~seed)}; + } + + // specialization for std::string + static inline hash128 hash(std::string const& val, uint64_t seed) { + return {MurmurHash2_64(val.data(), val.size(), seed), + MurmurHash2_64(val.data(), val.size(), ~seed)}; + } + + // specialization for uint64_t + static inline hash128 hash(uint64_t val, uint64_t seed) { + return { + MurmurHash2_64(reinterpret_cast<char const*>(&val), sizeof(val), seed), + MurmurHash2_64(reinterpret_cast<char const*>(&val), sizeof(val), + ~seed)}; + } +}; + +} // namespace pthash \ No newline at end of file diff --git a/thirdparty/pthash/utils/logger.hpp b/thirdparty/pthash/utils/logger.hpp new file mode 100644 index 0000000000..068d04e889 --- /dev/null +++ b/thirdparty/pthash/utils/logger.hpp @@ -0,0 +1,87 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cassert> +#include <iostream> + +namespace pthash { + +struct progress_logger { + progress_logger(uint64_t total_events, std::string const& prefix = "", + std::string const& suffix = "", bool enable = true) + : m_total_events(total_events), + m_prefix(prefix), + m_suffix(suffix), + m_logged_events(0) { + // TODO: improve the computation of log_step using timings ! + uint64_t perc_fraction = (total_events >= 100000000) ?
100 : 20; + m_log_step = (total_events + perc_fraction - 1) / perc_fraction; + m_next_event_to_log = static_cast<uint64_t>(-1); + if (enable) { + m_next_event_to_log = m_log_step; + update(false); + } + } + + inline void log() { + if (++m_logged_events >= m_next_event_to_log) { + update(false); + m_next_event_to_log += m_log_step; + // the following ensures the last update on 100% + if (m_next_event_to_log > m_total_events) + m_next_event_to_log = m_total_events; + } + } + + void finalize() { + if (m_next_event_to_log != static_cast<uint64_t>(-1)) { + assert(m_next_event_to_log == m_total_events); + assert(m_logged_events == m_total_events); + update(true); + } + } + + uint64_t total_events() const { return m_total_events; } + + uint64_t logged_events() const { return m_logged_events; } + + private: + inline void update(bool final) const { + uint64_t perc = (100 * m_logged_events / m_total_events); + std::cout << "\r" << m_prefix << perc << "%" << m_suffix; + if (final) { + std::cout << std::endl; + } else { + std::cout << std::flush; + } + } + + const uint64_t m_total_events; + const std::string m_prefix = ""; + const std::string m_suffix = ""; + uint64_t m_logged_events; + uint64_t m_log_step; + uint64_t m_next_event_to_log; +}; + +} // namespace pthash \ No newline at end of file diff --git a/thirdparty/pthash/utils/util.hpp b/thirdparty/pthash/utils/util.hpp new file mode 100644 index 0000000000..c64452c327 --- /dev/null +++ b/thirdparty/pthash/utils/util.hpp @@ -0,0 +1,57 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <chrono> +#include <random> + +#include "pthash/essentials/essentials.hpp" +#include "pthash/fastmod/fastmod.h" + +#define PTHASH_LIKELY(expr) __builtin_expect((bool) (expr), true) + +namespace pthash { + +typedef std::chrono::high_resolution_clock clock_type; + +namespace constants { +static const uint64_t available_ram = + sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES); +static const uint64_t invalid_seed = uint64_t(-1); +static const uint64_t invalid_num_buckets = uint64_t(-1); +static const std::string default_tmp_dirname("."); +} // namespace constants + +inline uint64_t random_value() { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937_64 rng(seed); + return rng(); +} + +template <typename DurationType> +double seconds(DurationType const& d) { + return static_cast<double>( + std::chrono::duration_cast<std::chrono::milliseconds>(d).count()) / + 1000; // better resolution than std::chrono::seconds +} + +} // namespace pthash
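A small, hypothetical illustration of the progress_logger added in utils/logger.hpp (not part of the patch); the event count and messages are made up.

#include <cstdint>

#include "pthash/utils/logger.hpp"

int main() {
  // 1000 events < 100000000, so perc_fraction is 20 and progress is printed
  // roughly every 5%, with finalize() forcing the closing 100% line.
  pthash::progress_logger logger(1000, " == processed ", " of the events");
  for (uint64_t i = 0; i != 1000; ++i) {
    // ... one unit of work per event ...
    logger.log();
  }
  logger.finalize();
  return 0;
}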