Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change std::hash<int> to hash_combine #1557

Open
wants to merge 4 commits into
base: inference
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/flexflow/dominators.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "tl/optional.hpp"
#include <algorithm>
#include <functional>
#include <limits>
#include <queue>

namespace FlexFlow::PCG::Utils {
Expand Down
108 changes: 104 additions & 4 deletions include/flexflow/utils/hash_utils.h
Original file line number Diff line number Diff line change
@@ -1,17 +1,117 @@
#ifndef _FLEXFLOW_HASH_UTILS_H
#define _FLEXFLOW_HASH_UTILS_H

#include <climits>
#include <cstdint>
#include <functional>
#include <tuple>
#include <type_traits>
#include <vector>

// tuple hashing pulled from
// https://www.variadic.xyz/2018/01/15/hashing-stdpair-and-stdtuple/
// Copied directly from
// https://github.com/boostorg/container_hash/blob/master/include/boost/container_hash/detail/hash_mix.hpp

//
// boost::hash_combine
//
namespace hash_detail {

template <std::size_t Bits>
struct hash_mix_impl;

// hash_mix for 64 bit size_t
//
// The general "xmxmx" form of state of the art 64 bit mixers originates
// from Murmur3 by Austin Appleby, which uses the following function as
// its "final mix":
//
// k ^= k >> 33;
// k *= 0xff51afd7ed558ccd;
// k ^= k >> 33;
// k *= 0xc4ceb9fe1a85ec53;
// k ^= k >> 33;
//
// (https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp)
//
// It has subsequently been improved multiple times by different authors
// by changing the constants. The most well known improvement is the
// so-called "variant 13" function by David Stafford:
//
// k ^= k >> 30;
// k *= 0xbf58476d1ce4e5b9;
// k ^= k >> 27;
// k *= 0x94d049bb133111eb;
// k ^= k >> 31;
//
// (https://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html)
//
// This mixing function is used in the splitmix64 RNG:
// http://xorshift.di.unimi.it/splitmix64.c
//
// We use Jon Maiga's implementation from
// http://jonkagstrom.com/mx3/mx3_rev2.html
//
// x ^= x >> 32;
// x *= 0xe9846af9b1a615d;
// x ^= x >> 32;
// x *= 0xe9846af9b1a615d;
// x ^= x >> 28;
//
// An equally good alternative is Pelle Evensen's Moremur:
//
// x ^= x >> 27;
// x *= 0x3C79AC492BA7B653;
// x ^= x >> 33;
// x *= 0x1C69B3F74AC4AE35;
// x ^= x >> 27;
//
// (https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html)

// 64-bit mixer: Jon Maiga's mx3_rev2-style "xmxmx" finalizer (see the
// derivation in the comment block above). Both multiply steps reuse a
// single 64-bit odd constant.
template <>
struct hash_mix_impl<64> {
  inline static std::uint64_t fn(std::uint64_t x) {
    // 0xe9846af9b1a615d, assembled from two 32-bit halves so the literal
    // is portable even where 64-bit literals need a suffix.
    std::uint64_t const mul = (std::uint64_t(0xe9846af) << 32) + 0x9b1a615d;

    x = (x ^ (x >> 32)) * mul;
    x = (x ^ (x >> 32)) * mul;
    return x ^ (x >> 28);
  }
};

// hash_mix for 32 bit size_t
//
// We use the "best xmxmx" implementation from
// https://github.com/skeeto/hash-prospector/issues/19

// 32-bit mixer: the "best xmxmx" constants found by the hash-prospector
// search (https://github.com/skeeto/hash-prospector/issues/19).
template <>
struct hash_mix_impl<32> {
  inline static std::uint32_t fn(std::uint32_t x) {
    std::uint32_t const mul1 = 0x21f0aaad;
    std::uint32_t const mul2 = 0x735a2d97;

    x = (x ^ (x >> 16)) * mul1;
    x = (x ^ (x >> 15)) * mul2;
    return x ^ (x >> 15);
  }
};

// Dispatch to the mixer matching this platform's size_t width (32 or 64 bits).
inline std::size_t hash_mix(std::size_t v) {
  std::size_t const bits = sizeof(std::size_t) * CHAR_BIT;
  return hash_mix_impl<bits>::fn(v);
}

} // namespace hash_detail

// Folds the hash of `v` into `seed` (boost::hash_combine semantics).
// 0x9e3779b9 (the 32-bit golden-ratio constant) decorrelates successive
// combines; hash_detail::hash_mix then diffuses the bits across the word.
//
// NOTE: the previous rendering of this function still contained the old
// xor/shift combine lines interleaved with the new one (diff residue),
// which hashed `v` twice and left an unused `hasher` local. Only the
// mix-based line is the intended implementation.
template <class T>
inline void hash_combine(std::size_t &seed, T const &v) {
  seed = hash_detail::hash_mix(seed + 0x9e3779b9 + std::hash<T>()(v));
}

namespace std {
Expand Down
29 changes: 15 additions & 14 deletions src/runtime/machine_view.cc
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#include "flexflow/machine_view.h"
#include "flexflow/utils/hash_utils.h"

namespace FlexFlow {

using namespace Legion;

const MachineView MachineView::NO_VIEW = MachineView();
MachineView const MachineView::NO_VIEW = MachineView();

MachineView::MachineView()
: device_type(MachineView::GPU), ndims(0), start_device_id(0) {
Expand Down Expand Up @@ -47,13 +48,13 @@ size_t MachineView::num_parts() const {
}

// Order-sensitive hash over every field that defines a MachineView's
// identity: device type, dimensionality, starting device, and each
// per-dimension size/stride pair.
//
// Fix: the diff rendering left both the old `17 * 31 + std::hash<int>`
// body and the new hash_combine body in place, redeclaring `ret` (a
// compile error) and hashing every field twice. Keep only the
// hash_combine-based implementation.
size_t MachineView::hash() const {
  size_t ret = 0;
  hash_combine(ret, device_type);
  hash_combine(ret, ndims);
  hash_combine(ret, start_device_id);
  for (int i = 0; i < ndims; i++) {
    hash_combine(ret, dim[i]);
    hash_combine(ret, stride[i]);
  }
  return ret;
}
Expand Down Expand Up @@ -116,12 +117,12 @@ MachineResource::MachineResource(FFConfig const &config)
available_gpus_per_node(config.workersPerNode) {}

// Hash over the resource-describing fields; used to key cached results
// for a given machine configuration.
//
// Fix: diff residue left the old multiply-by-31 body interleaved with
// the new one, redeclaring `ret`. Keep only the hash_combine version.
size_t MachineResource::hash() const {
  size_t ret = 0;
  hash_combine(ret, num_nodes);
  hash_combine(ret, available_gpus_per_node);
  hash_combine(ret, available_cpus_per_node);
  hash_combine(ret, start_gpu_id);
  hash_combine(ret, start_cpu_id);
  return ret;
}

Expand All @@ -132,4 +133,4 @@ size_t hash<FlexFlow::MachineView>::operator()(
FlexFlow::MachineView const &mv) const {
return mv.hash();
}
}; // namespace std
}; // namespace std
15 changes: 8 additions & 7 deletions src/runtime/parallel_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ void ParallelTensorBase::attach_raw_ptr(FFConfig &config,
Runtime *runtime = config.lg_hlr;
AttachLauncher launcher(EXTERNAL_INSTANCE, region, region);
std::vector<FieldID> fields(1, FID_DATA);
const Memory local_sysmem =
Memory const local_sysmem =
Machine::MemoryQuery(Machine::get_machine())
.has_affinity_to(runtime->get_executing_processor(ctx))
.only_kind(Memory::SYSTEM_MEM)
Expand Down Expand Up @@ -449,13 +449,14 @@ bool ParallelTensorBase::get_output_sub_tensor(ParallelConfig const &pc,
}

// Hash of the tensor's type/layout properties only — deliberately
// excludes the owning op so structurally identical tensors from
// different owners hash equal.
//
// Fix: diff residue left both the old `17 * 31` body and the new
// hash_combine body present, redeclaring `hash` (a compile error) and
// folding each field in twice. Keep only the hash_combine version.
size_t ParallelTensorBase::get_owner_independent_hash() const {
  size_t hash = 0;
  // Enums are cast to int so std::hash<int> is used consistently.
  hash_combine(hash, static_cast<int>(data_type));
  hash_combine(hash, static_cast<int>(sync_type));
  hash_combine(hash, num_dims);
  for (int i = 0; i < num_dims; i++) {
    hash_combine(hash, dims[i].size);
    hash_combine(hash, dims[i].degree);
    hash_combine(hash, dims[i].parallel_idx);
  }
  return hash;
}
Expand Down
49 changes: 27 additions & 22 deletions src/runtime/simulator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,9 @@ SimTask *TaskManager::new_comm_task(std::string const &name,
SimTask *TaskManager::new_forward_task(Op const *op, int idx) {
SimTask *task = new_task();
task->type = SimTask::TASK_FORWARD;
size_t hash = 17 * 31 + (size_t)(op);
hash = hash * 31 + std::hash<int>()(idx);
size_t hash = 0;
hash_combine(hash, (size_t)op);
hash_combine(hash, idx);
hash_to_forward_task[hash] = task;
task->name = op->name;
return task;
Expand All @@ -325,23 +326,26 @@ SimTask *TaskManager::new_forward_task(Op const *op, int idx) {
// Allocates a backward SimTask for (op, idx) and registers it under a
// hash of the op pointer and index so get_backward_task can find it.
// NOTE(review): keying on the raw op pointer means the map is only valid
// for the lifetime of these Op objects — appears intentional here.
//
// Fix: diff residue left the old `17 * 31` hash lines next to the new
// hash_combine lines, redeclaring `hash`. Keep only the new scheme, which
// must match get_backward_task's key computation exactly.
SimTask *TaskManager::new_backward_task(Op const *op, int idx) {
  SimTask *task = new_task();
  task->type = SimTask::TASK_BACKWARD;
  size_t hash = 0;
  hash_combine(hash, (size_t)op);
  hash_combine(hash, idx);
  hash_to_backward_task[hash] = task;
  task->name = op->name;
  return task;
}

// Looks up the forward SimTask previously registered by new_forward_task
// for (op, idx); asserts the task exists.
//
// Fix: diff residue duplicated the hash computation (old `17 * 31` lines
// plus new hash_combine lines, redeclaring `hash`). Keep only the
// hash_combine scheme — it must mirror new_forward_task byte-for-byte or
// lookups will miss.
SimTask *TaskManager::get_forward_task(Op const *op, int idx) {
  size_t hash = 0;
  hash_combine(hash, (size_t)op);
  hash_combine(hash, idx);
  assert(hash_to_forward_task.find(hash) != hash_to_forward_task.end());
  return hash_to_forward_task[hash];
}

// Looks up the backward SimTask previously registered by
// new_backward_task for (op, idx); asserts the task exists.
//
// Fix: diff residue duplicated the hash computation and redeclared
// `hash`. Keep only the hash_combine scheme, matching new_backward_task.
SimTask *TaskManager::get_backward_task(Op const *op, int idx) {
  size_t hash = 0;
  hash_combine(hash, (size_t)op);
  hash_combine(hash, idx);
  assert(hash_to_backward_task.find(hash) != hash_to_backward_task.end());
  return hash_to_backward_task[hash];
}
Expand Down Expand Up @@ -497,7 +501,7 @@ CostMetrics Simulator::measure_operator_cost(Op const *op,
ParallelConfig Op::view_to_pc(MachineView const &view) const {
ParallelConfig config;
config.device_type = (ParallelConfig::DeviceType)view.device_type;
const ParallelTensor output = this->outputs[0];
ParallelTensor const output = this->outputs[0];
config.nDims = output->num_dims;
for (int i = 0; i < config.nDims; i++) {
if (output->dims[i].parallel_idx == -1) {
Expand Down Expand Up @@ -535,11 +539,12 @@ CostMetrics Simulator::measure_operator_cost(Op const *op,
return this->strict_hash_to_operator_cost.at(key);
}

size_t hash = 17 * 31 + op->get_untyped_params_hash();
hash = hash * 31 + std::hash<int>()(mv.device_type);
hash = hash * 31 + std::hash<int>()(mv.ndims);
size_t hash = 0;
hash_combine(hash, op->get_untyped_params_hash());
hash_combine(hash, mv.device_type);
hash_combine(hash, mv.ndims);
for (int i = 0; i < mv.ndims; i++) {
hash = hash * 31 + std::hash<int>()(mv.dim[i]);
hash_combine(hash, mv.dim[i]);
}
std::unordered_map<size_t, CostMetrics>::const_iterator iter =
hash_to_operator_cost.find(hash);
Expand Down Expand Up @@ -607,14 +612,14 @@ float Simulator::estimate_xfer_cost(Op const *op,
MachineView const &sink_view) {
// assert(tensor->is_valid_machine_view(source_view));
// assert(tensor->is_valid_machine_view(sink_view));
const ParallelTensor input_tensor = op->inputs[input_idx];
ParallelTensor const input_tensor = op->inputs[input_idx];
if (input_tensor->owner_op->op_type == OP_INPUT) {
return 0.0f;
}

if (op->is_parallel_op()) {
assert(input_idx == 0);
const ParallelTensor output_tensor = op->outputs[0];
ParallelTensor const output_tensor = op->outputs[0];
switch (op->op_type) {
case OP_REPARTITION: {
Repartition *rp = (Repartition *)op;
Expand All @@ -627,7 +632,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
}
case OP_COMBINE: {
Combine *combine = (Combine *)op;
const ParallelTensor output_tensor = op->outputs[0];
ParallelTensor const output_tensor = op->outputs[0];
return this->estimate_repartition_xfer_cost(combine->combine_dim,
combine->combine_degree,
output_tensor->get_shape(),
Expand All @@ -649,7 +654,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
}
case OP_REDUCTION: {
Reduction *reduction = (Reduction *)op;
const ParallelTensor output_tensor = op->outputs[0];
ParallelTensor const output_tensor = op->outputs[0];
ParallelTensorShape fake_output_shape = output_tensor->get_shape();
fake_output_shape.dims[reduction->reduction_dim].size *=
reduction->reduction_degree;
Expand All @@ -662,8 +667,8 @@ float Simulator::estimate_xfer_cost(Op const *op,
}
case OP_FUSED_PARALLEL: {
FusedParallelOp const *fused = (FusedParallelOp const *)op;
const ParallelTensor input_tensor = op->inputs[0];
const ParallelTensor output_tensor = op->outputs[0];
ParallelTensor const input_tensor = op->inputs[0];
ParallelTensor const output_tensor = op->outputs[0];
ParallelTensorShape input_shape = input_tensor->get_shape();
ParallelTensorShape output_shape = output_tensor->get_shape();
// FIXME: we currently calculate an over estimation
Expand Down Expand Up @@ -717,7 +722,7 @@ float Simulator::estimate_xfer_cost(Op const *op,
d.rect_data[i] = 0;
d.rect_data[i + d.dim] = source_view.dim[i] - 1;
}
const ParallelTensor input_tensor = op->inputs[input_idx];
ParallelTensor const input_tensor = op->inputs[input_idx];
size_t total_size = data_type_size(input_tensor->data_type);
for (int i = 0; i < input_tensor->num_dims; i++) {
total_size *= input_tensor->dims[i].size / input_tensor->dims[i].degree;
Expand Down Expand Up @@ -748,7 +753,7 @@ bool Op::estimate_sync_cost(Simulator *sim,
}

float Simulator::default_estimate_sync_cost(
const ParallelDim tensor_dims[MAX_TENSOR_DIM],
ParallelDim const tensor_dims[MAX_TENSOR_DIM],
int tensor_ndims,
MachineView const &view) {
ParallelTensorShape tensor_shape(tensor_ndims, tensor_dims, DT_FLOAT);
Expand All @@ -757,7 +762,7 @@ float Simulator::default_estimate_sync_cost(
tensor_shape, view, tensor_shape.get_num_replica_dims());
}

float Simulator::default_estimate_sync_cost(const ParallelTensor tensor,
float Simulator::default_estimate_sync_cost(ParallelTensor const tensor,
MachineView const &view,
int num_replica_dims) {
return this->default_estimate_sync_cost(
Expand Down
Loading
Loading