From 91535fb9cd29460472839e1502edd43a500e71d2 Mon Sep 17 00:00:00 2001 From: cj401-ai Date: Tue, 19 Nov 2024 09:28:24 +0000 Subject: [PATCH] update for XSpace and XEvent --- .../profiler/gpu/device_tracer_rocm.cc | 138 +----------------- xla/backends/profiler/gpu/rocm_collector.cc | 23 ++- xla/backends/profiler/gpu/rocm_collector.h | 2 + xla/backends/profiler/gpu/rocm_tracer.cc | 87 ++++++----- xla/backends/profiler/gpu/rocm_tracer.h | 14 +- 5 files changed, 81 insertions(+), 183 deletions(-) diff --git a/xla/backends/profiler/gpu/device_tracer_rocm.cc b/xla/backends/profiler/gpu/device_tracer_rocm.cc index 1fa827a3d7d60..b096cbd5c7a12 100644 --- a/xla/backends/profiler/gpu/device_tracer_rocm.cc +++ b/xla/backends/profiler/gpu/device_tracer_rocm.cc @@ -80,15 +80,11 @@ class GpuTracer : public profiler::ProfilerInterface { absl::Status Start() override; absl::Status Stop() override; absl::Status CollectData(XSpace* space) override; - + private: absl::Status DoStart(); absl::Status DoStop(); - RocmTracerOptions GetRocmTracerOptions(); - - RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus); - enum State { kNotStarted, kStartedOk, @@ -99,124 +95,17 @@ class GpuTracer : public profiler::ProfilerInterface { State profiling_state_ = State::kNotStarted; RocmTracer* rocm_tracer_; - std::unique_ptr rocm_trace_collector_; }; -RocmTracerOptions GpuTracer::GetRocmTracerOptions() { - // TODO(rocm-profiler): We need support for context similar to CUDA - RocmTracerOptions options; - std::vector empty_vec; - - // clang formatting does not preserve one entry per line - // clang-format off - std::vector hip_api_domain_ops{0,1,2, - /* - // KERNEL - HIP_API_ID_hipExtModuleLaunchKernel, - HIP_API_ID_hipModuleLaunchKernel, - HIP_API_ID_hipHccModuleLaunchKernel, - HIP_API_ID_hipLaunchKernel, - - hipExtLaunchKernel, - // MEMCPY - hipMemcpy, - hipMemcpyAsync, - - HIP_API_ID_hipMemcpyDtoD, - HIP_API_ID_hipMemcpyDtoDAsync, - HIP_API_ID_hipMemcpyDtoH, - 
HIP_API_ID_hipMemcpyDtoHAsync, - HIP_API_ID_hipMemcpyHtoD, - HIP_API_ID_hipMemcpyHtoDAsync, - HIP_API_ID_hipMemcpyPeer, - HIP_API_ID_hipMemcpyPeerAsync, - - // MEMSet - - HIP_API_ID_hipMemsetD32, - HIP_API_ID_hipMemsetD32Async, - HIP_API_ID_hipMemsetD16, - HIP_API_ID_hipMemsetD16Async, - HIP_API_ID_hipMemsetD8, - HIP_API_ID_hipMemsetD8Async, - HIP_API_ID_hipMemset, - HIP_API_ID_hipMemsetAsync, - - // MEMAlloc - hipMalloc, - hipMallocPitch, - // MEMFree - hipFree, - // GENERIC - hipStreamSynchronize, - */ - }; - // clang-format on - - options.api_tracking_set = - std::set(hip_api_domain_ops.begin(), hip_api_domain_ops.end()); - - // These are the list of APIs we track since roctracer activity - // does not provide all the information necessary to fully populate the - // TF events. We need to track the APIs for those activities in API domain but - // we only use them for filling the missing items in their corresponding - // activity (using correlation id). - // clang-format off - std::vector hip_api_aux_ops{ - 0, 1, - // hipStreamWaitEvent, - // TODO(rocm-profiler): finding device ID from hipEventSynchronize need some - // extra work, we ignore it for now. 
- // hipEventSynchronize, - // HIP_API_ID_hipHostFree, - // HIP_API_ID_hipHostMalloc, - // HIP_API_ID_hipSetDevice // added to track default device - }; - - // clang-format on - - hip_api_domain_ops.insert(hip_api_domain_ops.end(), hip_api_aux_ops.begin(), - hip_api_aux_ops.end()); - - // options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, hip_api_domain_ops); - // options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec); - - // options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_OPS, empty_vec); - - return options; -} - -RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions( - uint32_t num_gpus) { - RocmTraceCollectorOptions options; - options.max_callback_api_events = 2 * 1024 * 1024; - options.max_activity_api_events = 2 * 1024 * 1024; - options.max_annotation_strings = 1024 * 1024; - options.num_gpus = num_gpus; - return options; -} - absl::Status GpuTracer::DoStart() { if (!rocm_tracer_->IsAvailable()) { return tsl::errors::Unavailable("Another profile session running."); } - AnnotationStack::Enable(true); - - RocmTraceCollectorOptions trace_collector_options = - GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus()); + // AnnotationStack::Enable(true); rocm_tracer_->setup(); rocm_tracer_->start(); - - uint64_t start_gputime_ns = rocm_tracer_->GetTimestamp(); - uint64_t start_walltime_ns = tsl::EnvTime::NowNanos(); - rocm_trace_collector_ = CreateRocmCollector( - trace_collector_options, start_walltime_ns, start_gputime_ns); - LOG(ERROR) << "DoStart interrupted ..."; - RocmTracerOptions tracer_options = GetRocmTracerOptions(); - rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get()); - return absl::OkStatus(); } @@ -245,29 +134,6 @@ absl::Status GpuTracer::Stop() { return absl::OkStatus(); } -absl::Status GpuTracer::CollectData(XSpace* space) { - LOG(ERROR) << "profiling_state_" << profiling_state_; - switch (profiling_state_) { - case State::kNotStarted: - VLOG(3) << "No trace data collected, session wasn't started"; - 
return absl::OkStatus(); - case State::kStartedOk: - return tsl::errors::FailedPrecondition( - "Cannot collect trace before stopping"); - case State::kStartedError: - LOG(ERROR) << "Cannot collect, roctracer failed to start"; - return absl::OkStatus(); - case State::kStoppedError: - VLOG(3) << "No trace data collected"; - return absl::OkStatus(); - case State::kStoppedOk: { - if (rocm_trace_collector_) rocm_trace_collector_->Export(space); - return absl::OkStatus(); - } - } - return tsl::errors::Internal("Invalid profiling state: ", profiling_state_); -} - // Not in anonymous namespace for testing purposes. std::unique_ptr CreateGpuTracer( const ProfileOptions& options) { diff --git a/xla/backends/profiler/gpu/rocm_collector.cc b/xla/backends/profiler/gpu/rocm_collector.cc index 90e459ab3ab8f..8c8595716f888 100644 --- a/xla/backends/profiler/gpu/rocm_collector.cc +++ b/xla/backends/profiler/gpu/rocm_collector.cc @@ -110,7 +110,7 @@ static void DumpRocmTracerEvent(const RocmTracerEvent& event, // oss << ",source=" << GetRocmTracerEventSourceName(event.source); // oss << ",domain=" << GetRocmTracerEventDomainName(event.domain); oss << ",name=" << event.name; - oss << ",annotation=" << event.annotation; + // oss << ",annotation=" << event.annotation; oss << ",start_time_us=" << (start_walltime_ns + (start_gputime_ns - event.start_time_ns)) / 1000; oss << ",duration=" << (event.end_time_ns - event.start_time_ns) / 1000; @@ -253,6 +253,8 @@ class PerDeviceCollector { bool IsHostEvent(const RocmTracerEvent& event, tsl::int64* line_id) { // DriverCallback(i.e. kernel launching) events are host events. + return false; + /* if (event.source == RocmTracerEventSource::ApiCallback) { *line_id = event.thread_id; return true; @@ -260,7 +262,7 @@ class PerDeviceCollector { *line_id = event.stream_id; return false; } - + */ // TODO(rocm-profiler): do we have such a report in rocm? // Non-overhead activity events are device events. 
/* if (event.type != CuptiTracerEventType::Overhead) { @@ -270,7 +272,7 @@ class PerDeviceCollector { // Overhead events can be associated with a thread or a stream, etc. // If a valid thread id is specified, we consider it as a host event. // - + /* if (event.stream_id != RocmTracerEvent::kInvalidStreamId) { *line_id = event.stream_id; return false; @@ -282,6 +284,7 @@ class PerDeviceCollector { *line_id = tsl::profiler::kThreadIdOverhead; return false; } + */ } public: @@ -294,8 +297,10 @@ class PerDeviceCollector { absl::flat_hash_map> events_types_per_line; for (const RocmTracerEvent& event : events) { - int64_t line_id = RocmTracerEvent::kInvalidThreadId; - bool is_host_event = IsHostEvent(event, &line_id); + // int64_t line_id = RocmTracerEvent::kInvalidThreadId; + // bool is_host_event = IsHostEvent(event, &line_id); + bool is_host_event = false; + tsl::int64 line_id = event.thread_id; if (is_host_event) { host_ev_cnt++; @@ -303,11 +308,13 @@ class PerDeviceCollector { dev_ev_cnt++; } + /* if (line_id == RocmTracerEvent::kInvalidThreadId || line_id == RocmTracerEvent::kInvalidStreamId) { VLOG(3) << "Ignoring event, type=" << static_cast(event.type); continue; } + */ auto* plane = is_host_event ? 
host_plane : device_plane; VLOG(9) << "Event" << " type=" << static_cast(event.type) << " line_id=" << line_id @@ -405,7 +412,8 @@ class PerDeviceCollector { private: mutex events_mutex; - std::vector events TF_GUARDED_BY(events_mutex); + // std::vector + RocmTracerEvent_t events TF_GUARDED_BY(events_mutex); absl::flat_hash_map correlation_info_ TF_GUARDED_BY(events_mutex); absl::flat_hash_map @@ -441,7 +449,8 @@ class RocmTraceCollectorImpl : public profiler::RocmTraceCollector { int num_gpus_; mutex event_maps_mutex_; - std::vector events_ TF_GUARDED_BY(event_maps_mutex_); + // std::vector + RocmTracerEvent_t events_ TF_GUARDED_BY(event_maps_mutex_); absl::flat_hash_map per_device_collector_; }; diff --git a/xla/backends/profiler/gpu/rocm_collector.h b/xla/backends/profiler/gpu/rocm_collector.h index cf73fba2770bc..857fc3361fb7d 100644 --- a/xla/backends/profiler/gpu/rocm_collector.h +++ b/xla/backends/profiler/gpu/rocm_collector.h @@ -125,6 +125,8 @@ struct RocmTracerEvent { int64_t stream_id = 0; }; +using RocmTracerEvent_t = std::vector; + struct RocmTraceCollectorOptions { // Maximum number of events to collect from callback API; if -1, no limit. // if 0, the callback API is enabled to build a correlation map, but no diff --git a/xla/backends/profiler/gpu/rocm_tracer.cc b/xla/backends/profiler/gpu/rocm_tracer.cc index a5be64ed414b4..4e76169ca3f47 100644 --- a/xla/backends/profiler/gpu/rocm_tracer.cc +++ b/xla/backends/profiler/gpu/rocm_tracer.cc @@ -53,6 +53,9 @@ limitations under the License. 
#include #include #include +// #include +using tsl::profiler::XSpace; +XSpace* space; extern "C" rocprofiler_tool_configure_result_t* rocprofiler_configure( uint32_t version, const char* runtime_version, uint32_t priority, @@ -81,14 +84,6 @@ rocprofiler_buffer_id_t client_buffer = {}; buffer_name_info client_name_info = {}; kernel_symbol_map_t client_kernels = {}; -void -print_call_stack(const call_stack_t& _call_stack) -{ - LOG(ERROR) << "print out call stack..."; - common::print_call_stack("api_buffered_trace.log", _call_stack); - LOG(ERROR) << "complete print out call stack..."; -} - void tool_code_object_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data, @@ -246,15 +241,17 @@ tool_tracing_callback(rocprofiler_context_id_t context, // throw std::runtime_error{msg.str()}; } - static_cast(user_data)->emplace_back( + auto tmp_str = client_name_info[record->kind][record->operation].data(); + + static_cast(user_data)->emplace_back( RocmTracerEvent{RocmTracerEventType::HIP_RUNTIME_API, - client_name_info[record->kind][record->operation], + tmp_str, record->start_timestamp, record->end_timestamp, 0, // how to access device id, record->correlation_id.internal, record->thread_id, - record->stream_id}); + 0}); } else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) @@ -300,15 +297,15 @@ tool_tracing_callback(rocprofiler_context_id_t context, printf("kernel dispatch: start > end"); // throw std::runtime_error("kernel dispatch: start > end"); - static_cast(user_data)->emplace_back( + static_cast(user_data)->emplace_back( RocmTracerEvent{RocmTracerEventType::KERNEL_DISPATCH, - client_name_info[record->kind][record->operation], + client_kernels.at(record->dispatch_info.kernel_id).kernel_name, record->start_timestamp, record->end_timestamp, 0, // how to access device id, record->correlation_id.internal, record->thread_id, - record->stream_id}); + 0}); } else 
if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) @@ -333,15 +330,15 @@ tool_tracing_callback(rocprofiler_context_id_t context, printf("memory copy: start > end \n"); // throw std::runtime_error("memory copy: start > end"); - static_cast(user_data)->emplace_back( + static_cast(user_data)->emplace_back( RocmTracerEvent{RocmTracerEventType::MEMORY_COPY, - client_name_info[record->kind][record->operation], + client_name_info[record->kind][record->operation].data(), record->start_timestamp, record->end_timestamp, 0, // how to access device id, record->correlation_id.internal, record->thread_id, - record->stream_id}); + 0}); } else { @@ -424,16 +421,16 @@ int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) return 0; } -void tool_fini(void* tool_data){ - assert(tool_data != nullptr); - - print_call_stack(*_call_stack); - - delete _call_stack; +RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus) { + RocmTraceCollectorOptions options; + options.max_callback_api_events = 2 * 1024 * 1024; + options.max_activity_api_events = 2 * 1024 * 1024; + options.max_annotation_strings = 1024 * 1024; + options.num_gpus = num_gpus; + return options; } -} // end of namespace -int RocmTracer::NumGpus() { +int NumGpus() { static int num_gpus = []() -> int { if (hipInit(0) != hipSuccess) { return 0; @@ -448,13 +445,7 @@ int RocmTracer::NumGpus() { return num_gpus; } -void RocmTracer::Enable(const RocmTracerOptions& options, RocmTraceCollector* collector) { - options_ = options; - collector_ = collector; - LOG(ERROR) << "GpuTracer started"; -} - -/*static*/ uint64_t RocmTracer::GetTimestamp() { +/*static*/ uint64_t GetTimestamp() { uint64_t ts; rocprofiler_status_t CHECKSTATUS = se::wrap::rocprofiler_get_timestamp(&ts); if (CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) { @@ -466,6 +457,36 @@ void RocmTracer::Enable(const RocmTracerOptions& options, RocmTraceCollector* co return ts; } 
+// Replays the events buffered in `tool_data` into a new collector, then exports
+// to the global `space`. NOTE(review): start timestamps are sampled here, at
+// finalization rather than at profiling start -- confirm that is intended.
+void tool_fini(void* tool_data){
+  assert(tool_data != nullptr);
+  auto* tmp_events = static_cast(tool_data);
+
+  RocmTraceCollectorOptions trace_collector_options = GetRocmTraceCollectorOptions(NumGpus());
+  uint64_t start_gputime_ns = GetTimestamp();
+  uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();
+  auto rocm_trace_collector_ = CreateRocmCollector(trace_collector_options, start_walltime_ns, start_gputime_ns);
+  // assert() is compiled out under NDEBUG, so re-check before dereferencing.
+  if (!tmp_events || !rocm_trace_collector_) {
+    LOG(ERROR) << "tool_fini: missing event buffer or collector, nothing exported.";
+    return;
+  }
+
+  size_t num_forwarded = 0;
+  for (auto& itr : *tmp_events) {
+    rocm_trace_collector_->AddEvent(itr);
+    ++num_forwarded;
+  }
+  VLOG(3) << "tool_fini: forwarded " << num_forwarded << " events.";
+  rocm_trace_collector_->Flush();
+  if (space != nullptr) rocm_trace_collector_->Export(space);  // `space` may be unset
+}
+} // end of namespace
+
+
+
 void RocmTracer::setup(){ if(int status = 0; se::wrap::rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0){ @@ -526,7 +547,7 @@ rocprofiler_configure(uint32_t version, std::clog << info.str() << std::endl; - auto* client_tool_data = new std::vector{}; + auto* client_tool_data = new std::vector{}; // create configure data static auto cfg = diff --git a/xla/backends/profiler/gpu/rocm_tracer.h b/xla/backends/profiler/gpu/rocm_tracer.h index a8744aa3b4807..0e3ba4886f34c 100644 --- a/xla/backends/profiler/gpu/rocm_tracer.h +++ b/xla/backends/profiler/gpu/rocm_tracer.h @@ -58,10 +58,10 @@ class RocmTracer { // Only one profile session can be live in the same time. 
bool IsAvailable() const; - void Enable(const RocmTracerOptions& options, RocmTraceCollector* collector); + // void Enable(const RocmTracerOptions& options, RocmTraceCollector* collector); - static uint64_t GetTimestamp(); - static int NumGpus(); + // static uint64_t GetTimestamp(); + // static int NumGpus(); void setup() CLIENT_API; void start() CLIENT_API; @@ -70,7 +70,7 @@ class RocmTracer { private: // Private constructor for singleton - RocmTracer() : is_available_(true), num_gpus_(NumGpus()) { + RocmTracer() : is_available_(true) { LOG(INFO) << "RocmTracer initialized..."; } @@ -80,9 +80,9 @@ class RocmTracer { } bool is_available_; // availability status - int num_gpus_; - std::optional options_; - RocmTraceCollector* collector_ = nullptr; + // int num_gpus_; + // std::optional options_; + // RocmTraceCollector* collector_ = nullptr; // Disable copy constructor and assignment operator RocmTracer(const RocmTracer&) = delete;