Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change std::hash<int> to hash_combine #1557

Open
wants to merge 4 commits into
base: inference
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/flexflow/dominators.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "tl/optional.hpp"
#include <algorithm>
#include <functional>
#include <limits>
#include <queue>

namespace FlexFlow::PCG::Utils {
Expand Down
108 changes: 104 additions & 4 deletions include/flexflow/utils/hash_utils.h
Original file line number Diff line number Diff line change
@@ -1,17 +1,117 @@
#ifndef _FLEXFLOW_HASH_UTILS_H
#define _FLEXFLOW_HASH_UTILS_H

#include <climits>
#include <cstdint>
#include <functional>
#include <tuple>
#include <type_traits>
#include <vector>

// tuple hashing pulled from
// https://www.variadic.xyz/2018/01/15/hashing-stdpair-and-stdtuple/
// Copied directly from
// https://github.com/boostorg/container_hash/blob/master/include/boost/container_hash/detail/hash_mix.hpp

//
// boost::hash_combine
//
namespace hash_detail {

template <std::size_t Bits>
struct hash_mix_impl;

// hash_mix for 64 bit size_t
//
// The general "xmxmx" form of state of the art 64 bit mixers originates
// from Murmur3 by Austin Appleby, which uses the following function as
// its "final mix":
//
// k ^= k >> 33;
// k *= 0xff51afd7ed558ccd;
// k ^= k >> 33;
// k *= 0xc4ceb9fe1a85ec53;
// k ^= k >> 33;
//
// (https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp)
//
// It has subsequently been improved multiple times by different authors
// by changing the constants. The most well known improvement is the
// so-called "variant 13" function by David Stafford:
//
// k ^= k >> 30;
// k *= 0xbf58476d1ce4e5b9;
// k ^= k >> 27;
// k *= 0x94d049bb133111eb;
// k ^= k >> 31;
//
// (https://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html)
//
// This mixing function is used in the splitmix64 RNG:
// http://xorshift.di.unimi.it/splitmix64.c
//
// We use Jon Maiga's implementation from
// http://jonkagstrom.com/mx3/mx3_rev2.html
//
// x ^= x >> 32;
// x *= 0xe9846af9b1a615d;
// x ^= x >> 32;
// x *= 0xe9846af9b1a615d;
// x ^= x >> 28;
//
// An equally good alternative is Pelle Evensen's Moremur:
//
// x ^= x >> 27;
// x *= 0x3C79AC492BA7B653;
// x ^= x >> 33;
// x *= 0x1C69B3F74AC4AE35;
// x ^= x >> 27;
//
// (https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html)

// 64-bit mixer: Jon Maiga's mx3_rev2-style "xmxmx" finalizer (see the
// derivation in the comment block above). Both multiply steps reuse a
// single 64-bit odd constant.
template <>
struct hash_mix_impl<64> {
  inline static std::uint64_t fn(std::uint64_t x) {
    // 0xe9846af9b1a615d, assembled from two 32-bit halves so the literal
    // is portable even where 64-bit literals need a suffix.
    std::uint64_t const mul = (std::uint64_t(0xe9846af) << 32) + 0x9b1a615d;

    x = (x ^ (x >> 32)) * mul;
    x = (x ^ (x >> 32)) * mul;
    return x ^ (x >> 28);
  }
};

// hash_mix for 32 bit size_t
//
// We use the "best xmxmx" implementation from
// https://github.com/skeeto/hash-prospector/issues/19

// 32-bit mixer: the "best xmxmx" constants found by the hash-prospector
// search (https://github.com/skeeto/hash-prospector/issues/19).
template <>
struct hash_mix_impl<32> {
  inline static std::uint32_t fn(std::uint32_t x) {
    std::uint32_t const mul1 = 0x21f0aaad;
    std::uint32_t const mul2 = 0x735a2d97;

    x = (x ^ (x >> 16)) * mul1;
    x = (x ^ (x >> 15)) * mul2;
    return x ^ (x >> 15);
  }
};

// Dispatch to the mixer matching this platform's size_t width (32 or 64 bits).
inline std::size_t hash_mix(std::size_t v) {
  std::size_t const bits = sizeof(std::size_t) * CHAR_BIT;
  return hash_mix_impl<bits>::fn(v);
}

} // namespace hash_detail

// Folds the hash of `v` into `seed` (boost::hash_combine semantics).
// 0x9e3779b9 (the 32-bit golden-ratio constant) decorrelates successive
// combines; hash_detail::hash_mix then diffuses the bits across the word.
//
// NOTE: the previous rendering of this function still contained the old
// xor/shift combine lines interleaved with the new one (diff residue),
// which hashed `v` twice and left an unused `hasher` local. Only the
// mix-based line is the intended implementation.
template <class T>
inline void hash_combine(std::size_t &seed, T const &v) {
  seed = hash_detail::hash_mix(seed + 0x9e3779b9 + std::hash<T>()(v));
}

namespace std {
Expand Down
29 changes: 15 additions & 14 deletions src/runtime/machine_view.cc
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#include "flexflow/machine_view.h"
#include "flexflow/utils/hash_utils.h"

namespace FlexFlow {

using namespace Legion;

const MachineView MachineView::NO_VIEW = MachineView();
MachineView const MachineView::NO_VIEW = MachineView();

MachineView::MachineView()
: device_type(MachineView::GPU), ndims(0), start_device_id(0) {
Expand Down Expand Up @@ -47,13 +48,13 @@ size_t MachineView::num_parts() const {
}

// Order-sensitive hash over every field that defines a MachineView's
// identity: device type, dimensionality, starting device, and each
// per-dimension size/stride pair.
//
// Fix: the diff rendering left both the old `17 * 31 + std::hash<int>`
// body and the new hash_combine body in place, redeclaring `ret` (a
// compile error) and hashing every field twice. Keep only the
// hash_combine-based implementation.
size_t MachineView::hash() const {
  size_t ret = 0;
  hash_combine(ret, device_type);
  hash_combine(ret, ndims);
  hash_combine(ret, start_device_id);
  for (int i = 0; i < ndims; i++) {
    hash_combine(ret, dim[i]);
    hash_combine(ret, stride[i]);
  }
  return ret;
}
Expand Down Expand Up @@ -116,12 +117,12 @@ MachineResource::MachineResource(FFConfig const &config)
available_gpus_per_node(config.workersPerNode) {}

// Hash over the resource-describing fields; used to key cached results
// for a given machine configuration.
//
// Fix: diff residue left the old multiply-by-31 body interleaved with
// the new one, redeclaring `ret`. Keep only the hash_combine version.
size_t MachineResource::hash() const {
  size_t ret = 0;
  hash_combine(ret, num_nodes);
  hash_combine(ret, available_gpus_per_node);
  hash_combine(ret, available_cpus_per_node);
  hash_combine(ret, start_gpu_id);
  hash_combine(ret, start_cpu_id);
  return ret;
}

Expand All @@ -132,4 +133,4 @@ size_t hash<FlexFlow::MachineView>::operator()(
FlexFlow::MachineView const &mv) const {
return mv.hash();
}
}; // namespace std
}; // namespace std
15 changes: 8 additions & 7 deletions src/runtime/parallel_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ void ParallelTensorBase::attach_raw_ptr(FFConfig &config,
Runtime *runtime = config.lg_hlr;
AttachLauncher launcher(EXTERNAL_INSTANCE, region, region);
std::vector<FieldID> fields(1, FID_DATA);
const Memory local_sysmem =
Memory const local_sysmem =
Machine::MemoryQuery(Machine::get_machine())
.has_affinity_to(runtime->get_executing_processor(ctx))
.only_kind(Memory::SYSTEM_MEM)
Expand Down Expand Up @@ -449,13 +449,14 @@ bool ParallelTensorBase::get_output_sub_tensor(ParallelConfig const &pc,
}

// Hash of the tensor's type/layout properties only — deliberately
// excludes the owning op so structurally identical tensors from
// different owners hash equal.
//
// Fix: diff residue left both the old `17 * 31` body and the new
// hash_combine body present, redeclaring `hash` (a compile error) and
// folding each field in twice. Keep only the hash_combine version.
size_t ParallelTensorBase::get_owner_independent_hash() const {
  size_t hash = 0;
  // Enums are cast to int so std::hash<int> is used consistently.
  hash_combine(hash, static_cast<int>(data_type));
  hash_combine(hash, static_cast<int>(sync_type));
  hash_combine(hash, num_dims);
  for (int i = 0; i < num_dims; i++) {
    hash_combine(hash, dims[i].size);
    hash_combine(hash, dims[i].degree);
    hash_combine(hash, dims[i].parallel_idx);
  }
  return hash;
}
Expand Down
49 changes: 27 additions & 22 deletions src/runtime/simulator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,9 @@ SimTask *TaskManager::new_comm_task(std::string const &name,
SimTask *TaskManager::new_forward_task(Op const *op, int idx) {
SimTask *task = new_task();
task->type = SimTask::TASK_FORWARD;
size_t hash = 17 * 31 + (size_t)(op);
hash = hash * 31 + std::hash<int>()(idx);
size_t hash = 0;
hash_combine(hash, (size_t)op);
hash_combine(hash, idx);
hash_to_forward_task[hash] = task;
task->name = op->name;
return task;
Expand All @@ -325,23 +326,26 @@ SimTask *TaskManager::new_forward_task(Op const *op, int idx) {
// Allocates a backward SimTask for (op, idx) and registers it under a
// hash of the op pointer and index so get_backward_task can find it.
// NOTE(review): keying on the raw op pointer means the map is only valid
// for the lifetime of these Op objects — appears intentional here.
//
// Fix: diff residue left the old `17 * 31` hash lines next to the new
// hash_combine lines, redeclaring `hash`. Keep only the new scheme, which
// must match get_backward_task's key computation exactly.
SimTask *TaskManager::new_backward_task(Op const *op, int idx) {
  SimTask *task = new_task();
  task->type = SimTask::TASK_BACKWARD;
  size_t hash = 0;
  hash_combine(hash, (size_t)op);
  hash_combine(hash, idx);
  hash_to_backward_task[hash] = task;
  task->name = op->name;
  return task;
}

// Looks up the forward SimTask previously registered by new_forward_task
// for (op, idx); asserts the task exists.
//
// Fix: diff residue duplicated the hash computation (old `17 * 31` lines
// plus new hash_combine lines, redeclaring `hash`). Keep only the
// hash_combine scheme — it must mirror new_forward_task byte-for-byte or
// lookups will miss.
SimTask *TaskManager::get_forward_task(Op const *op, int idx) {
  size_t hash = 0;
  hash_combine(hash, (size_t)op);
  hash_combine(hash, idx);
  assert(hash_to_forward_task.find(hash) != hash_to_forward_task.end());
  return hash_to_forward_task[hash];
}

// Looks up the backward SimTask previously registered by
// new_backward_task for (op, idx); asserts the task exists.
//
// Fix: diff residue duplicated the hash computation and redeclared
// `hash`. Keep only the hash_combine scheme, matching new_backward_task.
SimTask *TaskManager::get_backward_task(Op const *op, int idx) {
  size_t hash = 0;
  hash_combine(hash, (size_t)op);
  hash_combine(hash, idx);
  assert(hash_to_backward_task.find(hash) != hash_to_backward_task.end());
  return hash_to_backward_task[hash];
}
Expand Down Expand Up @@ -497,7 +501,7 @@ CostMetrics Simulator::measure_operator_cost(Op const *op,
ParallelConfig Op::view_to_pc(MachineView const &view) const {
ParallelConfig config;
config.device_type = (ParallelConfig::DeviceType)view.device_type;
const ParallelTensor output = this->outputs[0];
ParallelTensor const output = this->outputs[0];
config.nDims = output->num_dims;
for (int i = 0; i < config.nDims; i++) {
if (output->dims[i].parallel_idx == -1) {
Expand Down Expand Up @@ -535,11 +539,12 @@ CostMetrics Simulator::measure_operator_cost(Op const *op,
return this->strict_hash_to_operator_cost.at(key);
}

size_t hash = 17 * 31 + op->get_untyped_params_hash();
hash = hash * 31 + std::hash<int>()(mv.device_type);
hash = hash * 31 + std::hash<int>()(mv.ndims);
size_t hash = 0;
hash_combine(hash, op->get_untyped_params_hash());
hash_combine(hash, mv.device_type);
hash_combine(hash, mv.ndims);
for (int i = 0; i < mv.ndims; i++) {
hash = hash * 31 + std::hash<int>()(mv.dim[i]);
hash_combine(hash, mv.dim[i]);
}
std::unordered_map<size_t, CostMetrics>::const_iterator iter =
hash_to_operator_cost.find(hash);
Expand Down Expand Up @@ -607,14 +612,14 @@ float Simulator::estimate_xfer_cost(Op const *op,
MachineView const &sink_view) {
// assert(tensor->is_valid_machine_view(source_view));
// assert(tensor->is_valid_machine_view(sink_view));
const ParallelTensor input_tensor = op->inputs[input_idx];
ParallelTensor const input_tensor = op->inputs[input_idx];
if (input_tensor->owner_op->op_type == OP_INPUT) {
return 0.0f;
}

if (op->is_parallel_op()) {
assert(input_idx == 0);
const ParallelTensor output_tensor = op->outputs[0];
ParallelTensor const output_tensor = op->outputs[0];
switch (op->op_type) {
case OP_REPARTITION: {
Repartition *rp = (Repartition *)op;
Expand All @@ -627,7 +632,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
}
case OP_COMBINE: {
Combine *combine = (Combine *)op;
const ParallelTensor output_tensor = op->outputs[0];
ParallelTensor const output_tensor = op->outputs[0];
return this->estimate_repartition_xfer_cost(combine->combine_dim,
combine->combine_degree,
output_tensor->get_shape(),
Expand All @@ -649,7 +654,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
}
case OP_REDUCTION: {
Reduction *reduction = (Reduction *)op;
const ParallelTensor output_tensor = op->outputs[0];
ParallelTensor const output_tensor = op->outputs[0];
ParallelTensorShape fake_output_shape = output_tensor->get_shape();
fake_output_shape.dims[reduction->reduction_dim].size *=
reduction->reduction_degree;
Expand All @@ -662,8 +667,8 @@ float Simulator::estimate_xfer_cost(Op const *op,
}
case OP_FUSED_PARALLEL: {
FusedParallelOp const *fused = (FusedParallelOp const *)op;
const ParallelTensor input_tensor = op->inputs[0];
const ParallelTensor output_tensor = op->outputs[0];
ParallelTensor const input_tensor = op->inputs[0];
ParallelTensor const output_tensor = op->outputs[0];
ParallelTensorShape input_shape = input_tensor->get_shape();
ParallelTensorShape output_shape = output_tensor->get_shape();
// FIXME: we currently calculate an over estimation
Expand Down Expand Up @@ -717,7 +722,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
d.rect_data[i] = 0;
d.rect_data[i + d.dim] = source_view.dim[i] - 1;
}
const ParallelTensor input_tensor = op->inputs[input_idx];
ParallelTensor const input_tensor = op->inputs[input_idx];
size_t total_size = data_type_size(input_tensor->data_type);
for (int i = 0; i < input_tensor->num_dims; i++) {
total_size *= input_tensor->dims[i].size / input_tensor->dims[i].degree;
Expand Down Expand Up @@ -748,7 +753,7 @@ bool Op::estimate_sync_cost(Simulator *sim,
}

float Simulator::default_estimate_sync_cost(
const ParallelDim tensor_dims[MAX_TENSOR_DIM],
ParallelDim const tensor_dims[MAX_TENSOR_DIM],
int tensor_ndims,
MachineView const &view) {
ParallelTensorShape tensor_shape(tensor_ndims, tensor_dims, DT_FLOAT);
Expand All @@ -757,7 +762,7 @@ float Simulator::default_estimate_sync_cost(
tensor_shape, view, tensor_shape.get_num_replica_dims());
}

float Simulator::default_estimate_sync_cost(const ParallelTensor tensor,
float Simulator::default_estimate_sync_cost(ParallelTensor const tensor,
MachineView const &view,
int num_replica_dims) {
return this->default_estimate_sync_cost(
Expand Down
Loading
Loading