Skip to content

Commit

Permalink
update for XSpace and XEvent
Browse files Browse the repository at this point in the history
  • Loading branch information
cj401-ai committed Nov 19, 2024
1 parent 030d8e8 commit 91535fb
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 183 deletions.
138 changes: 2 additions & 136 deletions xla/backends/profiler/gpu/device_tracer_rocm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,11 @@ class GpuTracer : public profiler::ProfilerInterface {
absl::Status Start() override;
absl::Status Stop() override;
absl::Status CollectData(XSpace* space) override;

private:
absl::Status DoStart();
absl::Status DoStop();

RocmTracerOptions GetRocmTracerOptions();

RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus);

enum State {
kNotStarted,
kStartedOk,
Expand All @@ -99,124 +95,17 @@ class GpuTracer : public profiler::ProfilerInterface {
State profiling_state_ = State::kNotStarted;

RocmTracer* rocm_tracer_;
std::unique_ptr<RocmTraceCollector> rocm_trace_collector_;
};

// Builds the RocmTracerOptions used when enabling the tracer.
//
// As written, the only field assigned on the returned options is
// `api_tracking_set`, which is filled from `hip_api_domain_ops` ({0, 1, 2}).
// The statements that would populate `api_callbacks` and `activity_tracing`
// are commented out at the end of the function.
// NOTE(review): the meaning of the raw ids 0/1/2 is not visible here; the
// commented-out list below suggests they stand in for HIP API ids -- confirm.
RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
// TODO(rocm-profiler): We need support for context similar to CUDA
RocmTracerOptions options;
// Only referenced by the commented-out emplace() calls near the end of this
// function; currently unused.
std::vector<uint32_t> empty_vec;

// clang formatting does not preserve one entry per line
// clang-format off
std::vector<uint32_t> hip_api_domain_ops{0,1,2,
/*
// KERNEL
HIP_API_ID_hipExtModuleLaunchKernel,
HIP_API_ID_hipModuleLaunchKernel,
HIP_API_ID_hipHccModuleLaunchKernel,
HIP_API_ID_hipLaunchKernel,
hipExtLaunchKernel,
// MEMCPY
hipMemcpy,
hipMemcpyAsync,
HIP_API_ID_hipMemcpyDtoD,
HIP_API_ID_hipMemcpyDtoDAsync,
HIP_API_ID_hipMemcpyDtoH,
HIP_API_ID_hipMemcpyDtoHAsync,
HIP_API_ID_hipMemcpyHtoD,
HIP_API_ID_hipMemcpyHtoDAsync,
HIP_API_ID_hipMemcpyPeer,
HIP_API_ID_hipMemcpyPeerAsync,
// MEMSet
HIP_API_ID_hipMemsetD32,
HIP_API_ID_hipMemsetD32Async,
HIP_API_ID_hipMemsetD16,
HIP_API_ID_hipMemsetD16Async,
HIP_API_ID_hipMemsetD8,
HIP_API_ID_hipMemsetD8Async,
HIP_API_ID_hipMemset,
HIP_API_ID_hipMemsetAsync,
// MEMAlloc
hipMalloc,
hipMallocPitch,
// MEMFree
hipFree,
// GENERIC
hipStreamSynchronize,
*/
};
// clang-format on

// The tracking set is built from the domain ops only, before the auxiliary
// ops below are appended to the vector.
options.api_tracking_set =
std::set<uint32_t>(hip_api_domain_ops.begin(), hip_api_domain_ops.end());

// This is the list of auxiliary APIs we track because roctracer activity
// records do not provide all the information necessary to fully populate the
// TF events. We need to track the APIs for those activities in the API domain,
// but we only use them for filling in the missing items of their corresponding
// activity (matched by correlation id).
// clang-format off
std::vector<uint32_t> hip_api_aux_ops{
0, 1,
// hipStreamWaitEvent,
// TODO(rocm-profiler): finding device ID from hipEventSynchronize need some
// extra work, we ignore it for now.
// hipEventSynchronize,
// HIP_API_ID_hipHostFree,
// HIP_API_ID_hipHostMalloc,
// HIP_API_ID_hipSetDevice // added to track default device
};

// clang-format on

// NOTE(review): this append runs after `api_tracking_set` was built, and the
// merged vector is only consumed by the commented-out `api_callbacks` line
// below, so it currently has no effect on the returned options.
hip_api_domain_ops.insert(hip_api_domain_ops.end(), hip_api_aux_ops.begin(),
hip_api_aux_ops.end());

// options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, hip_api_domain_ops);
// options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);

// options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_OPS, empty_vec);

return options;
}

// Returns the collector options for a session covering `num_gpus` devices.
// The event and annotation limits are fixed: 2 Mi callback API events,
// 2 Mi activity API events and 1 Mi annotation strings.
RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions(
    uint32_t num_gpus) {
  constexpr int kMebi = 1024 * 1024;

  RocmTraceCollectorOptions collector_options;
  collector_options.num_gpus = num_gpus;
  collector_options.max_annotation_strings = kMebi;
  collector_options.max_activity_api_events = 2 * kMebi;
  collector_options.max_callback_api_events = 2 * kMebi;
  return collector_options;
}

// Starts a profiling session on the ROCm tracer.
//
// Returns an Unavailable error when the tracer reports that it is not
// available. Otherwise the annotation stack is enabled, the tracer is set up
// and started, a trace collector is created from the start timestamps, tracing
// is enabled into that collector, and OK is returned.
absl::Status GpuTracer::DoStart() {
  if (!rocm_tracer_->IsAvailable()) {
    return tsl::errors::Unavailable("Another profile session running.");
  }

  AnnotationStack::Enable(true);

  RocmTraceCollectorOptions trace_collector_options =
      GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus());

  rocm_tracer_->setup();
  rocm_tracer_->start();

  // Sample the tracer timestamp and the wall clock back to back; both are
  // handed to the collector as the session's start reference.
  uint64_t start_gputime_ns = rocm_tracer_->GetTimestamp();
  uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();
  rocm_trace_collector_ = CreateRocmCollector(
      trace_collector_options, start_walltime_ns, start_gputime_ns);

  // This point is reached on every successful start, so it must not be
  // reported at ERROR severity or described as an interruption.
  VLOG(3) << "DoStart: trace collector created, enabling tracer";

  RocmTracerOptions tracer_options = GetRocmTracerOptions();
  rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get());

  return absl::OkStatus();
}

Expand Down Expand Up @@ -245,29 +134,6 @@ absl::Status GpuTracer::Stop() {
return absl::OkStatus();
}

// Exports the collected trace into `space`, depending on the session state.
//
// Returns:
//   - OK without exporting when the session was never started, failed to
//     start, or failed to stop.
//   - FailedPrecondition when the session is still running.
//   - OK after exporting (if a collector exists) when the session stopped
//     cleanly.
//   - Internal if `profiling_state_` holds a value outside the enum.
absl::Status GpuTracer::CollectData(XSpace* space) {
  // Diagnostic only: this runs on every call, so keep it out of the error log.
  VLOG(3) << "CollectData: profiling_state_=" << profiling_state_;
  switch (profiling_state_) {
    case State::kNotStarted:
      VLOG(3) << "No trace data collected, session wasn't started";
      return absl::OkStatus();
    case State::kStartedOk:
      return tsl::errors::FailedPrecondition(
          "Cannot collect trace before stopping");
    case State::kStartedError:
      // A failed start is logged but deliberately not surfaced as an error
      // status to the caller.
      LOG(ERROR) << "Cannot collect, roctracer failed to start";
      return absl::OkStatus();
    case State::kStoppedError:
      VLOG(3) << "No trace data collected";
      return absl::OkStatus();
    case State::kStoppedOk: {
      if (rocm_trace_collector_) rocm_trace_collector_->Export(space);
      return absl::OkStatus();
    }
  }
  return tsl::errors::Internal("Invalid profiling state: ", profiling_state_);
}

// Not in anonymous namespace for testing purposes.
std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
const ProfileOptions& options) {
Expand Down
23 changes: 16 additions & 7 deletions xla/backends/profiler/gpu/rocm_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ static void DumpRocmTracerEvent(const RocmTracerEvent& event,
// oss << ",source=" << GetRocmTracerEventSourceName(event.source);
// oss << ",domain=" << GetRocmTracerEventDomainName(event.domain);
oss << ",name=" << event.name;
oss << ",annotation=" << event.annotation;
// oss << ",annotation=" << event.annotation;
oss << ",start_time_us="
<< (start_walltime_ns + (start_gputime_ns - event.start_time_ns)) / 1000;
oss << ",duration=" << (event.end_time_ns - event.start_time_ns) / 1000;
Expand Down Expand Up @@ -253,14 +253,16 @@ class PerDeviceCollector {

bool IsHostEvent(const RocmTracerEvent& event, tsl::int64* line_id) {
// DriverCallback(i.e. kernel launching) events are host events.
return false;
/*
if (event.source == RocmTracerEventSource::ApiCallback) {
*line_id = event.thread_id;
return true;
} else { // activities
*line_id = event.stream_id;
return false;
}

*/
// TODO(rocm-profiler): do we have such a report in rocm?
// Non-overhead activity events are device events.
/* if (event.type != CuptiTracerEventType::Overhead) {
Expand All @@ -270,7 +272,7 @@ class PerDeviceCollector {
// Overhead events can be associated with a thread or a stream, etc.
// If a valid thread id is specified, we consider it as a host event.
//

/*
if (event.stream_id != RocmTracerEvent::kInvalidStreamId) {
*line_id = event.stream_id;
return false;
Expand All @@ -282,6 +284,7 @@ class PerDeviceCollector {
*line_id = tsl::profiler::kThreadIdOverhead;
return false;
}
*/
}

public:
Expand All @@ -294,20 +297,24 @@ class PerDeviceCollector {
absl::flat_hash_map<tsl::int64, absl::flat_hash_set<RocmTracerEventType>>
events_types_per_line;
for (const RocmTracerEvent& event : events) {
int64_t line_id = RocmTracerEvent::kInvalidThreadId;
bool is_host_event = IsHostEvent(event, &line_id);
// int64_t line_id = RocmTracerEvent::kInvalidThreadId;
// bool is_host_event = IsHostEvent(event, &line_id);
bool is_host_event = false;
tsl::int64 line_id = event.thread_id;

if (is_host_event) {
host_ev_cnt++;
} else {
dev_ev_cnt++;
}

/*
if (line_id == RocmTracerEvent::kInvalidThreadId ||
line_id == RocmTracerEvent::kInvalidStreamId) {
VLOG(3) << "Ignoring event, type=" << static_cast<int>(event.type);
continue;
}
*/
auto* plane = is_host_event ? host_plane : device_plane;
VLOG(9) << "Event" << " type=" << static_cast<int>(event.type)
<< " line_id=" << line_id
Expand Down Expand Up @@ -405,7 +412,8 @@ class PerDeviceCollector {

private:
mutex events_mutex;
std::vector<RocmTracerEvent> events TF_GUARDED_BY(events_mutex);
// std::vector<RocmTracerEvent>
RocmTracerEvent_t events TF_GUARDED_BY(events_mutex);
absl::flat_hash_map<uint32_t, CorrelationInfo> correlation_info_
TF_GUARDED_BY(events_mutex);
absl::flat_hash_map<RocmDeviceOccupancyParams, OccupancyStats>
Expand Down Expand Up @@ -441,7 +449,8 @@ class RocmTraceCollectorImpl : public profiler::RocmTraceCollector {
int num_gpus_;

mutex event_maps_mutex_;
std::vector<RocmTracerEvent> events_ TF_GUARDED_BY(event_maps_mutex_);
// std::vector<RocmTracerEvent>
RocmTracerEvent_t events_ TF_GUARDED_BY(event_maps_mutex_);
absl::flat_hash_map<uint32_t, PerDeviceCollector> per_device_collector_;

};
Expand Down
2 changes: 2 additions & 0 deletions xla/backends/profiler/gpu/rocm_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ struct RocmTracerEvent {
int64_t stream_id = 0;
};

// Alias for a sequence of RocmTracerEvent records.
using RocmTracerEvent_t = std::vector<RocmTracerEvent>;

struct RocmTraceCollectorOptions {
// Maximum number of events to collect from callback API; if -1, no limit.
// if 0, the callback API is enabled to build a correlation map, but no
Expand Down
Loading

0 comments on commit 91535fb

Please sign in to comment.