diff --git a/include/flexflow/dominators.h b/include/flexflow/dominators.h
index d3460d3543..d22d708ff9 100644
--- a/include/flexflow/dominators.h
+++ b/include/flexflow/dominators.h
@@ -8,6 +8,7 @@
 #include "tl/optional.hpp"
 #include <algorithm>
 #include <queue>
+#include <unordered_map>
 #include <unordered_set>
 
 namespace FlexFlow::PCG::Utils {
diff --git a/include/flexflow/utils/hash_utils.h b/include/flexflow/utils/hash_utils.h
index 745ec04ec9..9e14dfe8f0 100644
--- a/include/flexflow/utils/hash_utils.h
+++ b/include/flexflow/utils/hash_utils.h
@@ -1,17 +1,117 @@
 #ifndef _FLEXFLOW_HASH_UTILS_H
 #define _FLEXFLOW_HASH_UTILS_H
 
+#include <climits>
+#include <cstdint>
 #include <functional>
 #include <tuple>
 #include <utility>
 #include <vector>
 
-// tuple hashing pulled from
-// https://www.variadic.xyz/2018/01/15/hashing-stdpair-and-stdtuple/
+// Copied directly from
+// https://github.com/boostorg/container_hash/blob/master/include/boost/container_hash/detail/hash_mix.hpp
+
+//
+// boost::hash_combine
+//
+namespace hash_detail {
+
+template <std::size_t Bits>
+struct hash_mix_impl;
+
+// hash_mix for 64 bit size_t
+//
+// The general "xmxmx" form of state of the art 64 bit mixers originates
+// from Murmur3 by Austin Appleby, which uses the following function as
+// its "final mix":
+//
+// k ^= k >> 33;
+// k *= 0xff51afd7ed558ccd;
+// k ^= k >> 33;
+// k *= 0xc4ceb9fe1a85ec53;
+// k ^= k >> 33;
+//
+// (https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp)
+//
+// It has subsequently been improved multiple times by different authors
+// by changing the constants. The most well known improvement is the
+// so-called "variant 13" function by David Stafford:
+//
+// k ^= k >> 30;
+// k *= 0xbf58476d1ce4e5b9;
+// k ^= k >> 27;
+// k *= 0x94d049bb133111eb;
+// k ^= k >> 31;
+//
+// (https://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html)
+//
+// This mixing function is used in the splitmix64 RNG:
+// http://xorshift.di.unimi.it/splitmix64.c
+//
+// We use Jon Maiga's implementation from
+// http://jonkagstrom.com/mx3/mx3_rev2.html
+//
+// x ^= x >> 32;
+// x *= 0xe9846af9b1a615d;
+// x ^= x >> 32;
+// x *= 0xe9846af9b1a615d;
+// x ^= x >> 28;
+//
+// An equally good alternative is Pelle Evensen's Moremur:
+//
+// x ^= x >> 27;
+// x *= 0x3C79AC492BA7B653;
+// x ^= x >> 33;
+// x *= 0x1C69B3F74AC4AE35;
+// x ^= x >> 27;
+//
+// (https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html)
+
+template <>
+struct hash_mix_impl<64> {
+  inline static std::uint64_t fn(std::uint64_t x) {
+    std::uint64_t const m = (std::uint64_t(0xe9846af) << 32) + 0x9b1a615d;
+
+    x ^= x >> 32;
+    x *= m;
+    x ^= x >> 32;
+    x *= m;
+    x ^= x >> 28;
+
+    return x;
+  }
+};
+
+// hash_mix for 32 bit size_t
+//
+// We use the "best xmxmx" implementation from
+// https://github.com/skeeto/hash-prospector/issues/19
+
+template <>
+struct hash_mix_impl<32> {
+  inline static std::uint32_t fn(std::uint32_t x) {
+    std::uint32_t const m1 = 0x21f0aaad;
+    std::uint32_t const m2 = 0x735a2d97;
+
+    x ^= x >> 16;
+    x *= m1;
+    x ^= x >> 15;
+    x *= m2;
+    x ^= x >> 15;
+
+    return x;
+  }
+};
+
+inline std::size_t hash_mix(std::size_t v) {
+  return hash_mix_impl<sizeof(std::size_t) * CHAR_BIT>::fn(v);
+}
+
+} // namespace hash_detail
+
 template <typename T>
 inline void hash_combine(std::size_t &seed, T const &v) {
-  std::hash<T> hasher;
-  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+  seed = hash_detail::hash_mix(seed + 0x9e3779b9 + std::hash<T>()(v));
 }
 
 namespace std {
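For orientation, here is a minimal usage sketch of the updated `hash_combine` (the `Point3` struct and `hash_point` helper are hypothetical, invented for illustration; only the header path comes from the patch above):

```cpp
#include "flexflow/utils/hash_utils.h"

#include <cstddef>
#include <iostream>

// Hypothetical aggregate, not part of FlexFlow.
struct Point3 {
  int x, y, z;
};

// Fold the fields into one seed, the same way MachineView::hash does below:
// each hash_combine call runs the accumulated seed through hash_mix before
// the next field is added.
std::size_t hash_point(Point3 const &p) {
  std::size_t seed = 0;
  hash_combine(seed, p.x);
  hash_combine(seed, p.y);
  hash_combine(seed, p.z);
  return seed;
}

int main() {
  std::cout << hash_point({1, 2, 3}) << "\n";
  std::cout << hash_point({3, 2, 1}) << "\n"; // order-sensitive: differs
  return 0;
}
```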
diff --git a/src/runtime/machine_view.cc b/src/runtime/machine_view.cc
index dadece7691..2966d56686 100644
--- a/src/runtime/machine_view.cc
+++ b/src/runtime/machine_view.cc
@@ -1,10 +1,11 @@
 #include "flexflow/machine_view.h"
+#include "flexflow/utils/hash_utils.h"
 
 namespace FlexFlow {
 
 using namespace Legion;
 
-const MachineView MachineView::NO_VIEW = MachineView();
+MachineView const MachineView::NO_VIEW = MachineView();
 
 MachineView::MachineView()
     : device_type(MachineView::GPU), ndims(0), start_device_id(0) {
@@ -47,13 +48,13 @@ size_t MachineView::num_parts() const {
 }
 
 size_t MachineView::hash() const {
-  size_t ret = 17;
-  ret = ret * 31 + std::hash<int>()(device_type);
-  ret = ret * 31 + std::hash<int>()(ndims);
-  ret = ret * 31 + std::hash<int>()(start_device_id);
+  size_t ret = 0;
+  hash_combine(ret, device_type);
+  hash_combine(ret, ndims);
+  hash_combine(ret, start_device_id);
   for (int i = 0; i < ndims; i++) {
-    ret = ret * 31 + std::hash<int>()(dim[i]);
-    ret = ret * 31 + std::hash<int>()(stride[i]);
+    hash_combine(ret, dim[i]);
+    hash_combine(ret, stride[i]);
   }
   return ret;
 }
@@ -116,12 +117,12 @@ MachineResource::MachineResource(FFConfig const &config)
       available_gpus_per_node(config.workersPerNode) {}
 
 size_t MachineResource::hash() const {
-  size_t ret = 17;
-  ret = ret * 31 + std::hash<int>()(num_nodes);
-  ret = ret * 31 + std::hash<int>()(available_gpus_per_node);
-  ret = ret * 31 + std::hash<int>()(available_cpus_per_node);
-  ret = ret * 31 + std::hash<int>()(start_gpu_id);
-  ret = ret * 31 + std::hash<int>()(start_cpu_id);
+  size_t ret = 0;
+  hash_combine(ret, num_nodes);
+  hash_combine(ret, available_gpus_per_node);
+  hash_combine(ret, available_cpus_per_node);
+  hash_combine(ret, start_gpu_id);
+  hash_combine(ret, start_cpu_id);
   return ret;
 }
@@ -132,4 +133,4 @@ size_t hash<FlexFlow::MachineView>::operator()(
     FlexFlow::MachineView const &mv) const {
   return mv.hash();
 }
-}; // namespace std
\ No newline at end of file
+}; // namespace std
diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc
index 202983e8f0..d55187fafe 100644
--- a/src/runtime/parallel_tensor.cc
+++ b/src/runtime/parallel_tensor.cc
@@ -274,7 +274,7 @@ void ParallelTensorBase::attach_raw_ptr(FFConfig &config,
   Runtime *runtime = config.lg_hlr;
   AttachLauncher launcher(EXTERNAL_INSTANCE, region, region);
   std::vector<FieldID> fields(1, FID_DATA);
-  const Memory local_sysmem =
+  Memory const local_sysmem =
       Machine::MemoryQuery(Machine::get_machine())
           .has_affinity_to(runtime->get_executing_processor(ctx))
           .only_kind(Memory::SYSTEM_MEM)
@@ -449,13 +449,14 @@ bool ParallelTensorBase::get_output_sub_tensor(ParallelConfig const &pc,
 }
 
 size_t ParallelTensorBase::get_owner_independent_hash() const {
-  size_t hash = 17 * 31 + std::hash<int>()((int)data_type);
-  hash = hash * 31 + std::hash<int>()((int)sync_type);
-  hash = hash * 31 + std::hash<int>()(num_dims);
+  size_t hash = 0;
+  hash_combine(hash, static_cast<int>(data_type));
+  hash_combine(hash, static_cast<int>(sync_type));
+  hash_combine(hash, num_dims);
   for (int i = 0; i < num_dims; i++) {
-    hash = hash * 31 + std::hash<int>()(dims[i].size);
-    hash = hash * 31 + std::hash<int>()(dims[i].degree);
-    hash = hash * 31 + std::hash<int>()(dims[i].parallel_idx);
+    hash_combine(hash, dims[i].size);
+    hash_combine(hash, dims[i].degree);
+    hash_combine(hash, dims[i].parallel_idx);
   }
   return hash;
 }
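The motivation for dropping the `ret = ret * 31 + std::hash<int>()(x)` pattern is easiest to see with concrete numbers. For a 1-D `MachineView`, the tail of the old hash reduces to `start_device_id * 31^2 + dim[0] * 31 + stride[0]` (on standard libraries where `std::hash<int>` is the identity, which holds for libstdc++ and libc++), so `(start=0, dim=32, stride=1)` and `(start=1, dim=1, stride=1)` both contribute `0*961 + 32*31 + 1 = 993` and `1*961 + 1*31 + 1 = 993`. A standalone sketch of that arithmetic, with the pre-patch combiner reproduced inline:

```cpp
#include <cstddef>
#include <functional>
#include <iostream>

// The pre-patch combining step, reproduced for illustration.
std::size_t old_step(std::size_t seed, int v) {
  return seed * 31 + std::hash<int>()(v);
}

// Mirrors the pre-patch MachineView::hash() field order for a 1-D view.
std::size_t old_mv_hash(int device_type, int ndims, int start, int dim0, int stride0) {
  std::size_t ret = 17;
  ret = old_step(ret, device_type); // device_type encoded as int; same for both views
  ret = old_step(ret, ndims);
  ret = old_step(ret, start);
  ret = old_step(ret, dim0);
  ret = old_step(ret, stride0);
  return ret;
}

int main() {
  // 0*31^2 + 32*31 + 1 == 1*31^2 + 1*31 + 1 == 993, so the hashes collide.
  std::cout << (old_mv_hash(0, 1, 0, 32, 1) == old_mv_hash(0, 1, 1, 1, 1))
            << "\n"; // prints 1
  return 0;
}
```

This is exactly the pair exercised by the `known_collision` test added below.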
diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc
index b71af0d47e..f6e72c79ee 100644
--- a/src/runtime/simulator.cc
+++ b/src/runtime/simulator.cc
@@ -315,8 +315,9 @@ SimTask *TaskManager::new_comm_task(std::string const &name,
 SimTask *TaskManager::new_forward_task(Op const *op, int idx) {
   SimTask *task = new_task();
   task->type = SimTask::TASK_FORWARD;
-  size_t hash = 17 * 31 + (size_t)(op);
-  hash = hash * 31 + std::hash<int>()(idx);
+  size_t hash = 0;
+  hash_combine(hash, (size_t)op);
+  hash_combine(hash, idx);
   hash_to_forward_task[hash] = task;
   task->name = op->name;
   return task;
 }
@@ -325,23 +326,26 @@ SimTask *TaskManager::new_forward_task(Op const *op, int idx) {
 SimTask *TaskManager::new_backward_task(Op const *op, int idx) {
   SimTask *task = new_task();
   task->type = SimTask::TASK_BACKWARD;
-  size_t hash = 17 * 31 + (size_t)(op);
-  hash = hash * 31 + std::hash<int>()(idx);
+  size_t hash = 0;
+  hash_combine(hash, (size_t)op);
+  hash_combine(hash, idx);
   hash_to_backward_task[hash] = task;
   task->name = op->name;
   return task;
 }
 
 SimTask *TaskManager::get_forward_task(Op const *op, int idx) {
-  size_t hash = 17 * 31 + (size_t)(op);
-  hash = hash * 31 + std::hash<int>()(idx);
+  size_t hash = 0;
+  hash_combine(hash, (size_t)op);
+  hash_combine(hash, idx);
   assert(hash_to_forward_task.find(hash) != hash_to_forward_task.end());
   return hash_to_forward_task[hash];
 }
 
 SimTask *TaskManager::get_backward_task(Op const *op, int idx) {
-  size_t hash = 17 * 31 + (size_t)(op);
-  hash = hash * 31 + std::hash<int>()(idx);
+  size_t hash = 0;
+  hash_combine(hash, (size_t)op);
+  hash_combine(hash, idx);
   assert(hash_to_backward_task.find(hash) != hash_to_backward_task.end());
   return hash_to_backward_task[hash];
 }
@@ -497,7 +501,7 @@ CostMetrics Simulator::measure_operator_cost(Op const *op,
 ParallelConfig Op::view_to_pc(MachineView const &view) const {
   ParallelConfig config;
   config.device_type = (ParallelConfig::DeviceType)view.device_type;
-  const ParallelTensor output = this->outputs[0];
+  ParallelTensor const output = this->outputs[0];
   config.nDims = output->num_dims;
   for (int i = 0; i < config.nDims; i++) {
     if (output->dims[i].parallel_idx == -1) {
@@ -535,11 +539,12 @@ CostMetrics Simulator::measure_operator_cost(Op const *op,
     return this->strict_hash_to_operator_cost.at(key);
   }
 
-  size_t hash = 17 * 31 + op->get_untyped_params_hash();
-  hash = hash * 31 + std::hash<int>()(mv.device_type);
-  hash = hash * 31 + std::hash<int>()(mv.ndims);
+  size_t hash = 0;
+  hash_combine(hash, op->get_untyped_params_hash());
+  hash_combine(hash, mv.device_type);
+  hash_combine(hash, mv.ndims);
   for (int i = 0; i < mv.ndims; i++) {
-    hash = hash * 31 + std::hash<int>()(mv.dim[i]);
+    hash_combine(hash, mv.dim[i]);
   }
   std::unordered_map<size_t, CostMetrics>::const_iterator iter =
       hash_to_operator_cost.find(hash);
@@ -607,14 +612,14 @@ float Simulator::estimate_xfer_cost(Op const *op,
                                     MachineView const &sink_view) {
   // assert(tensor->is_valid_machine_view(source_view));
   // assert(tensor->is_valid_machine_view(sink_view));
-  const ParallelTensor input_tensor = op->inputs[input_idx];
+  ParallelTensor const input_tensor = op->inputs[input_idx];
   if (input_tensor->owner_op->op_type == OP_INPUT) {
     return 0.0f;
   }
 
   if (op->is_parallel_op()) {
     assert(input_idx == 0);
-    const ParallelTensor output_tensor = op->outputs[0];
+    ParallelTensor const output_tensor = op->outputs[0];
     switch (op->op_type) {
       case OP_REPARTITION: {
         Repartition *rp = (Repartition *)op;
@@ -627,7 +632,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
       }
       case OP_COMBINE: {
         Combine *combine = (Combine *)op;
-        const ParallelTensor output_tensor = op->outputs[0];
+        ParallelTensor const output_tensor = op->outputs[0];
         return this->estimate_repartition_xfer_cost(combine->combine_dim,
                                                     combine->combine_degree,
                                                     output_tensor->get_shape(),
@@ -649,7 +654,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
       }
       case OP_REDUCTION: {
         Reduction *reduction = (Reduction *)op;
-        const ParallelTensor output_tensor = op->outputs[0];
+        ParallelTensor const output_tensor = op->outputs[0];
         ParallelTensorShape fake_output_shape = output_tensor->get_shape();
         fake_output_shape.dims[reduction->reduction_dim].size *=
             reduction->reduction_degree;
@@ -662,8 +667,8 @@ float Simulator::estimate_xfer_cost(Op const *op,
       }
       case OP_FUSED_PARALLEL: {
         FusedParallelOp const *fused = (FusedParallelOp const *)op;
-        const ParallelTensor input_tensor = op->inputs[0];
-        const ParallelTensor output_tensor = op->outputs[0];
+        ParallelTensor const input_tensor = op->inputs[0];
+        ParallelTensor const output_tensor = op->outputs[0];
         ParallelTensorShape input_shape = input_tensor->get_shape();
         ParallelTensorShape output_shape = output_tensor->get_shape();
         // FIXME: we currently calculate an over estimation
@@ -717,7 +722,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
       d.rect_data[i] = 0;
       d.rect_data[i + d.dim] = source_view.dim[i] - 1;
     }
-    const ParallelTensor input_tensor = op->inputs[input_idx];
+    ParallelTensor const input_tensor = op->inputs[input_idx];
    size_t total_size = data_type_size(input_tensor->data_type);
     for (int i = 0; i < input_tensor->num_dims; i++) {
       total_size *= input_tensor->dims[i].size / input_tensor->dims[i].degree;
@@ -748,7 +753,7 @@ bool Op::estimate_sync_cost(Simulator *sim,
 }
 
 float Simulator::default_estimate_sync_cost(
-    const ParallelDim tensor_dims[MAX_TENSOR_DIM],
+    ParallelDim const tensor_dims[MAX_TENSOR_DIM],
     int tensor_ndims,
     MachineView const &view) {
   ParallelTensorShape tensor_shape(tensor_ndims, tensor_dims, DT_FLOAT);
@@ -757,7 +762,7 @@ float Simulator::default_estimate_sync_cost(
       tensor_shape, view, tensor_shape.get_num_replica_dims());
 }
 
-float Simulator::default_estimate_sync_cost(const ParallelTensor tensor,
+float Simulator::default_estimate_sync_cost(ParallelTensor const tensor,
                                             MachineView const &view,
                                             int num_replica_dims) {
   return this->default_estimate_sync_cost(
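The `TaskManager` changes fold a raw `Op` pointer and an index into a single map key. A self-contained sketch of the same pattern (the `Task` struct and `task_key` helper are stand-ins, not FlexFlow code; the include assumes the patched header above):

```cpp
#include "flexflow/utils/hash_utils.h"

#include <cstddef>
#include <unordered_map>

struct Task {}; // stand-in for SimTask

// Key a (pointer, index) pair the same way new_forward_task does:
// hash_combine the pointer value, then the index.
std::size_t task_key(void const *op, int idx) {
  std::size_t h = 0;
  hash_combine(h, reinterpret_cast<std::size_t>(op));
  hash_combine(h, idx);
  return h;
}

int main() {
  std::unordered_map<std::size_t, Task *> forward_tasks;
  Task t;
  int op_storage = 0; // pretend this object is an Op
  forward_tasks[task_key(&op_storage, 0)] = &t;
  return forward_tasks.count(task_key(&op_storage, 0)) == 1 ? 0 : 1;
}
```

An alternative with the same header would be keying the map directly on `std::pair<Op const *, int>`, since `hash_utils.h` retains its `std::hash` specializations for pairs and tuples.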
diff --git a/tests/unit/test_machine_view.cc b/tests/unit/test_machine_view.cc
index eea084db48..94a9f7e3b5 100644
--- a/tests/unit/test_machine_view.cc
+++ b/tests/unit/test_machine_view.cc
@@ -1,6 +1,7 @@
 #include "flexflow/config.h"
 #include "flexflow/machine_view.h"
 #include "gtest/gtest.h"
+#include <cstddef>
 
 using namespace Legion;
 using namespace FlexFlow;
@@ -31,3 +32,120 @@ TEST(machine_view_get_device_id, basic) {
   EXPECT_EQ(mv.get_device_id({0}), 2);
   EXPECT_EQ(mv.get_device_id({1}), 3);
 }
+
+TEST(machine_view_hash, basic) {
+  MachineView mv1;
+  mv1.ndims = 1;
+  mv1.start_device_id = 2;
+  mv1.dim[0] = 2;
+  mv1.stride[0] = 1;
+
+  MachineView mv2;
+  mv2.ndims = 1;
+  mv2.start_device_id = 2;
+  mv2.dim[0] = 2;
+  mv2.stride[0] = 1;
+
+  EXPECT_EQ(mv1.hash(), mv2.hash());
+}
+
+TEST(machine_view_hash, different_device_type) {
+  MachineView mv1;
+  mv1.device_type = MachineView::GPU;
+  mv1.ndims = 1;
+  mv1.start_device_id = 2;
+  mv1.dim[0] = 2;
+  mv1.stride[0] = 1;
+
+  MachineView mv2;
+  mv2.device_type = MachineView::CPU;
+  mv2.ndims = 1;
+  mv2.start_device_id = 2;
+  mv2.dim[0] = 2;
+  mv2.stride[0] = 1;
+
+  EXPECT_NE(mv1.hash(), mv2.hash());
+}
+
+TEST(machine_view_hash, different_ndims) {
+  MachineView mv1;
+  mv1.ndims = 1;
+  mv1.start_device_id = 2;
+  mv1.dim[0] = 2;
+  mv1.stride[0] = 1;
+
+  MachineView mv2;
+  mv2.ndims = 2;
+  mv2.start_device_id = 2;
+  mv2.dim[0] = 2;
+  mv2.stride[0] = 1;
+
+  EXPECT_NE(mv1.hash(), mv2.hash());
+}
+
+TEST(machine_view_hash, different_start_device_id) {
+  MachineView mv1;
+  mv1.ndims = 1;
+  mv1.start_device_id = 2;
+  mv1.dim[0] = 2;
+  mv1.stride[0] = 1;
+
+  MachineView mv2;
+  mv2.ndims = 1;
+  mv2.start_device_id = 3;
+  mv2.dim[0] = 2;
+  mv2.stride[0] = 1;
+
+  EXPECT_NE(mv1.hash(), mv2.hash());
+}
+
+TEST(machine_view_hash, different_dim) {
+  MachineView mv1;
+  mv1.ndims = 1;
+  mv1.start_device_id = 2;
+  mv1.dim[0] = 2;
+  mv1.stride[0] = 1;
+
+  MachineView mv2;
+  mv2.ndims = 1;
+  mv2.start_device_id = 2;
+  mv2.dim[0] = 3;
+  mv2.stride[0] = 1;
+
+  EXPECT_NE(mv1.hash(), mv2.hash());
+}
+
+TEST(machine_view_hash, different_stride) {
+  MachineView mv1;
+  mv1.ndims = 1;
+  mv1.start_device_id = 2;
+  mv1.dim[0] = 2;
+  mv1.stride[0] = 1;
+
+  MachineView mv2;
+  mv2.ndims = 1;
+  mv2.start_device_id = 2;
+  mv2.dim[0] = 2;
+  mv2.stride[0] = 2;
+
+  EXPECT_NE(mv1.hash(), mv2.hash());
+}
+
+TEST(machine_view_hash, known_collision) {
+  MachineView mv1;
+  mv1.device_type = MachineView::GPU;
+  mv1.ndims = 1;
+  mv1.start_device_id = 0;
+  mv1.dim[0] = 32;
+  mv1.stride[0] = 1;
+
+  MachineView mv2;
+  mv2.device_type = MachineView::GPU;
+  mv2.ndims = 1;
+  mv2.start_device_id = 1;
+  mv2.dim[0] = 1;
+  mv2.stride[0] = 1;
+  std::size_t h2 = mv2.hash();
+
+  EXPECT_NE(mv1.hash(), h2);
+}
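As a quick cross-check of `known_collision`, the same five fields run through the new combiner come out distinct. A sketch mirroring the field order of `MachineView::hash()` (the `new_mv_hash` helper is hypothetical; the concrete hash values depend on whether `size_t` is 32 or 64 bits, but the inequality is what the unit test above pins down):

```cpp
#include "flexflow/utils/hash_utils.h"

#include <cassert>
#include <cstddef>

// Mirror MachineView::hash() field order for a 1-D view.
std::size_t new_mv_hash(int device_type, int ndims, int start, int dim0, int stride0) {
  std::size_t ret = 0;
  hash_combine(ret, device_type);
  hash_combine(ret, ndims);
  hash_combine(ret, start);
  hash_combine(ret, dim0);
  hash_combine(ret, stride0);
  return ret;
}

int main() {
  // The pair that collided under the old multiply-by-31 scheme no longer does.
  assert(new_mv_hash(0, 1, 0, 32, 1) != new_mv_hash(0, 1, 1, 1, 1));
  return 0;
}
```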