-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathcommon.h
289 lines (248 loc) · 9.26 KB
/
common.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/******************************************************************************
# Copyright (c) 2022 Habana Labs, Ltd.
# SPDX-License-Identifier: Apache-2.0
******************************************************************************/
#pragma once
// C++ Standard Libraries
#include <cstdint>       // for uint types
#include <cstring>       // for std::memcpy
#include <functional>    // for std::function
#include <iomanip>       // for std::setprecision, std::put_time
#include <iostream>      // for io stream
#include <sstream>       // for std::stringstream
#include <stdexcept>     // for std::runtime_error
#include <string>        // for std::string
#include <unordered_map> // for std::unordered_map
#include <vector>        // for std::vector
#include "hcl_inc.h" // for HCL_Rank
// HCCL :: Habana Collective Communications Library
#include <hccl.h>
// Synapse :: Habana Synapse training API
#include <synapse_api.h>
#if AFFINITY_ENABLED
#include "affinity.h"
#endif
// MPI handling
#if MPI_ENABLED
// Open MPI (v4.0.2)
#include <mpi.h>
// Evaluates the MPI call `x` exactly once; if the result is not MPI_SUCCESS, throws
// std::runtime_error whose message names the enclosing function, the stringified call
// and the numeric return code.
// The body is wrapped in do { } while (false) so that `CHECK_MPI_STATUS(...);` is a
// single statement and stays valid in an unbraced if/else (same form as ASSERT).
#define CHECK_MPI_STATUS(x)                                                                    \
    do                                                                                         \
    {                                                                                          \
        const auto _res = (x);                                                                 \
        if (_res != MPI_SUCCESS)                                                               \
            throw std::runtime_error {"In function " + std::string {__FUNCTION__} +           \
                                      "(): " #x " failed with code: " + std::to_string(_res)}; \
    } while (false)
#endif //MPI_ENABLED
// Error handling
// Evaluates the HCCL call `x` exactly once; if the result is not hcclSuccess, throws
// std::runtime_error whose message names the enclosing function, the stringified call
// and the text returned by hcclGetErrorString().
// The body is wrapped in do { } while (false) so that `CHECK_HCCL_STATUS(...);` is a
// single statement and stays valid in an unbraced if/else (same form as ASSERT).
#define CHECK_HCCL_STATUS(x)                                                             \
    do                                                                                   \
    {                                                                                    \
        const auto _res = (x);                                                           \
        if (_res != hcclSuccess)                                                         \
            throw std::runtime_error {"In function " + std::string {__FUNCTION__} +     \
                                      "(): " #x " failed: " + hcclGetErrorString(_res)}; \
    } while (false)
// Evaluates the Synapse call `x` exactly once; if the result is not synSuccess, throws
// std::runtime_error whose message names the enclosing function, the stringified call
// and the numeric status code.
// The body is wrapped in do { } while (false) so that `CHECK_SYNAPSE_STATUS(...);` is a
// single statement and stays valid in an unbraced if/else (same form as ASSERT).
#define CHECK_SYNAPSE_STATUS(x)                                                                           \
    do                                                                                                    \
    {                                                                                                     \
        const auto _res = (x);                                                                            \
        if (_res != synSuccess)                                                                           \
            throw std::runtime_error {"In function " + std::string {__FUNCTION__} +                      \
                                      "(): " #x " failed with synapse error: " + std::to_string((_res))}; \
    } while (false)
// Throws std::runtime_error when the condition `x` evaluates to false.
// The message names the enclosing function and appends the stringified condition, so a
// failure can be identified from the text alone (the CHECK_* macros report the failing
// call the same way).
#define ASSERT(x)                                                                    \
    do                                                                               \
    {                                                                                \
        if (!(x))                                                                    \
            throw std::runtime_error {"In function " + std::string {__FUNCTION__} + \
                                      " assertion failed: " #x};                     \
    } while (false)
// Constants
// Modulus applied to the element index in getInput().
static constexpr int DATA_ELEMENTS_MAX = 13;
// 2GB, written as 2 * 2^30.
static constexpr uint64_t ALLOCATED_HBM_SIZE = uint64_t {2} << 30;
static constexpr uint64_t AMOUNT_JUMBO_BUFFERS = 2;
static constexpr uint64_t MAX_BUFFER_COUNT = 33;
// demo structures
// Settings for one demo run together with the calling process's rank information.
// NOTE(review): the meaning of fields without a comment is not established by this
// header; presumably they are filled in by getenvData() - confirm against its definition.
struct EnvData
{
HCL_Rank root; // compared with `rank` by isRoot()
std::string testType;
std::string dataType; // "float" or "bfloat16"; any other value makes getDataType()/getDataTypeSize() throw
uint64_t sizeMin;
uint64_t sizeMax;
uint64_t sizeInc;
std::string redop; // "sum", "min" or "max"; any other value makes getReductionOp() throw
size_t numIters;
bool shouldCheckCorrectness;
std::string dataCSVPath;
std::string resultsCSVPath;
std::string ranksList;
uint64_t expectedScaleoutBW;
HCL_Rank rank; // compared with `root` by isRoot()
size_t nranks;
size_t ranksPerNode;
size_t scaleupGroupSize;
std::vector<HCL_Rank> customComm;
};
// Bundles a Synapse device id, an HCCL communicator handle, the communicator's root rank
// and three Synapse stream handles.
// NOTE(review): stream roles are inferred from the member names only - confirm at the
// point where the streams are created.
struct DeviceResources
{
synDeviceId deviceHandle;
hcclComm_t comm;
HCL_Rank commRoot;
synStreamHandle collectiveStream;
synStreamHandle deviceToHostStream;
synStreamHandle hostToDeviceStream;
};
// Sizes and 64-bit handles of the input, output and correctness buffers of a test.
// NOTE(review): presumably the uint64_t values are device addresses and the sizes are in
// bytes - confirm against the allocation code.
struct Buffers
{
uint64_t inputSize;
uint64_t outputSize;
std::vector<uint64_t> inputDevPtrs;
std::vector<uint64_t> outputDevPtrs;
uint64_t correctnessDevPtr;
};
// Per-test statistics record.
// Every scalar member has a default initializer so that a default-constructed Stats holds
// well-defined values (previously only isDescribing was initialized, leaving factor and
// rankDurationInSec indeterminate until assigned).
// NOTE(review): presumably `factor` scales the measured bandwidth and
// `expectedOutputs` feeds the correctness check - confirm against the test drivers.
struct Stats
{
    bool               isDescribing = false;
    std::string        statName;
    double             factor            = 0.0;
    double             rankDurationInSec = 0.0;
    std::vector<float> expectedOutputs;
};
// One row of reported results: a size, a count, a time and two bandwidth figures.
// NOTE(review): units are not established by this header (presumably bytes, elements,
// seconds and bytes/sec) - confirm against the code that fills and prints the report.
struct ReportEntry
{
uint64_t size;
uint64_t count;
double time;
double algoBW;
double avgBW;
};
// A pair of ranks for a send/receive exchange.
// NOTE(review): presumably data flows from sendFromRank to recvInRank - confirm against
// the send-recv test driver.
struct RanksPairSendRecv
{
HCL_Rank sendFromRank;
HCL_Rank recvInRank;
};
// Returns the output stream used for demo logging (std::cout).
inline std::ostream& log()
{
    std::ostream& sink = std::cout;
    return sink;
}
// Returns true when the configured data type string is exactly "bfloat16".
inline bool isBfloat16(const EnvData& envData)
{
    const bool matchesBf16 = (envData.dataType.compare("bfloat16") == 0);
    return matchesBf16;
}
// Converts a 32-bit float to bfloat16 by keeping the upper 16 bits of its bit pattern
// (truncation, no rounding).
// The bits are read with std::memcpy, as bf16ToFloat() does, rather than by dereferencing a
// casted pointer: accessing a float through a uint32_t lvalue violates strict aliasing.
inline uint16_t floatToBf16(const float f)
{
    uint32_t bits;
    static_assert(sizeof(float) == sizeof(bits), "`float` size is incompatible!");
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}
// Expands a bfloat16 bit pattern to a 32-bit float: the 16 input bits become the upper
// half of the float's bit pattern and the lower half is zero.
inline float bf16ToFloat(const uint16_t a)
{
    static_assert(sizeof(float) == sizeof(uint32_t), "`float` size is incompatible!");
    const uint32_t widenedBits = static_cast<uint32_t>(a) << 16;
    float          result;
    // memcpy reinterprets the bits without breaking aliasing rules.
    std::memcpy(&result, &widenedBits, sizeof(result));
    return result;
}
// Returns the bfloat16 comparison tolerance coefficient for a given number of ranks:
// 0 for at most one rank (tolerance should be 0), otherwise min(numberOfRanks, 8) / 256.
inline float bf16AccuracyCoefficient(size_t numberOfRanks)
{
    if (numberOfRanks <= 1)
    {
        return 0.0f;
    }
    const size_t cappedRanks = (numberOfRanks < 8) ? numberOfRanks : 8;
    return static_cast<float>(cappedRanks) / 256.0f;
}
// Returns true when the calling rank (envData.rank) equals the configured root rank.
inline bool isRoot(const EnvData& envData)
{
    const bool callerIsRoot = (envData.rank == envData.root);
    return callerIsRoot;
}
// Maps envData.dataType ("float" or "bfloat16") to the matching hcclDataType_t.
// Throws std::runtime_error for any other string.
inline hcclDataType_t getDataType(const EnvData& envData)
{
    static const std::unordered_map<std::string, hcclDataType_t> nameToHcclType = {
        {"float", hcclFloat32},
        {"bfloat16", hcclBfloat16},
    };

    const auto match = nameToHcclType.find(envData.dataType);
    if (match == nameToHcclType.end())
    {
        throw std::runtime_error("Unknown data type.");
    }
    return match->second;
}
// Returns the element size in bytes for envData.dataType: sizeof(float) for "float",
// sizeof(uint16_t) for "bfloat16". Throws std::runtime_error for any other string.
inline uint64_t getDataTypeSize(const EnvData& envData)
{
    static const std::unordered_map<std::string, uint64_t> nameToElementSize = {
        {"float", sizeof(float)},
        {"bfloat16", sizeof(uint16_t)},
    };

    const auto match = nameToElementSize.find(envData.dataType);
    if (match == nameToElementSize.end())
    {
        throw std::runtime_error("Unknown data type.");
    }
    return match->second;
}
// Maps envData.redop ("sum", "min" or "max") to the matching hcclRedOp_t.
// Throws std::runtime_error for any other string.
inline hcclRedOp_t getReductionOp(const EnvData& envData)
{
    static const std::unordered_map<std::string, hcclRedOp_t> nameToRedOp = {
        {"sum", hcclSum},
        {"min", hcclMin},
        {"max", hcclMax},
    };

    const auto match = nameToRedOp.find(envData.redop);
    if (match == nameToRedOp.end())
    {
        throw std::runtime_error("Unknown reduction op.");
    }
    return match->second;
}
// Formats a bandwidth given in bytes per second as "<value> GB/s", where the value is
// bytesPerSec / 1e9 printed in fixed notation with six decimal places.
inline std::string formatBW(const double bytesPerSec)
{
    const double gigabytesPerSec = bytesPerSec / 1e9;

    std::stringstream text;
    text << std::fixed << std::setprecision(6);
    text << gigabytesPerSec;
    text << " GB/s";
    return text.str();
}
// Returns a string made of `length` copies of `delimiter` (empty when length is 0).
// Uses std::string's fill constructor instead of streaming one character at a time
// through a std::stringstream.
inline std::string getPrintDelimiter(size_t length, char delimiter)
{
    return std::string(length, delimiter);
}
// Converts one element of type T to float. Only the float and uint16_t specializations
// below are defined in this header.
template<typename T>
float getFloat(T value);

// float elements are returned unchanged.
template<>
inline float getFloat(float value)
{
    return value;
}

// uint16_t elements are decoded with bf16ToFloat().
template<>
inline float getFloat(uint16_t value)
{
    const float decoded = bf16ToFloat(value);
    return decoded;
}
// Produces the input value for element `i` on rank `rank` out of `nranks` ranks:
// rank + nranks * (i % DATA_ELEMENTS_MAX), converted to float.
inline float getInput(const HCL_Rank rank, const size_t nranks, const uint64_t i)
{
    // Values should differ between cells and between ranks, yet the summation must not
    // grow too large, so the element index is wrapped by DATA_ELEMENTS_MAX.
    const auto wrappedIndex = i % DATA_ELEMENTS_MAX;
    return rank + nranks * wrappedIndex;
}
// demo infrastructure functions
// Declaration only; no definition is visible in this header.
// Takes the run settings, the device resources, a callback receiving a uint64_t and a
// second callback with no arguments; returns a double.
// NOTE(review): presumably `fn` is the measured operation (its argument looks like an
// iteration index), `fnCorrectness` validates the result, and the return value is a
// duration - confirm against the definition.
double benchmark(const EnvData& envData,
const DeviceResources& resources,
const std::function<void(uint64_t)>& fn,
const std::function<void()>& fnCorrectness);
// demo environmental variables
// Declaration only; returns an EnvData by value.
// NOTE(review): presumably populated from environment variables, per the section title -
// confirm against the definition.
EnvData getenvData();
// send-recv interface
// Declaration only; no definition is visible in this header.
// envData, buffers and stats are taken by non-const reference, so the definition may
// modify them; resources is read-only.
// NOTE(review): the unit of `size` (bytes vs. elements) is not established here - confirm
// against the definition.
void sendRecvTestDriver(
EnvData& envData, const DeviceResources& resources, Buffers& buffers, const uint64_t size, Stats& stats);
// scale validation interface
// Declaration only; no definition is visible in this header. Declared only when the
// MPI_ENABLED macro is defined.
// NOTE(review): this guard uses `#ifdef MPI_ENABLED`, while the <mpi.h> include and
// CHECK_MPI_STATUS earlier in this header use `#if MPI_ENABLED`; the two differ when
// MPI_ENABLED is defined as 0 - confirm which form the build system expects.
#ifdef MPI_ENABLED
void scaleValidationTestDriver(EnvData& envData,
const DeviceResources& resources,
const Buffers& buffers,
const uint64_t size);
#endif // MPI_ENABLED