[Fix] Fix profiler bug when using immediate command list (#2386)
cboss6 authored Sep 12, 2023
1 parent c83e870 commit 8208d49
Showing 3 changed files with 179 additions and 23 deletions.
11 changes: 11 additions & 0 deletions itex/core/profiler/gpu_profiler.cc
@@ -26,6 +26,7 @@ limitations under the License.
#include "itex/core/profiler/ze_tracer.h"
#include "itex/core/profiler/ze_utils.h"
#include "itex/core/utils/annotation_stack.h"
#include "itex/core/utils/hw_info.h"
#include "itex/core/utils/logging.h"
#include "itex/core/utils/strcat.h"
#include "protos/xplane.pb.h"
@@ -52,6 +53,16 @@ static bool IsItexProfilerEnabled() {

void EnableProfiling() {
assert(zeInit(ZE_INIT_FLAG_GPU_ONLY) == ZE_RESULT_SUCCESS);
std::string enable_immediate_commmand_list =
utils::GetEnv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
if (enable_immediate_commmand_list == "0") {
utils::ImmediateCommandListDisabled();
} else if (enable_immediate_commmand_list.empty()) {
if (!IsXeHPC()) {
utils::ImmediateCommandListDisabled();
}
}

uint32_t flags = 0;
flags |= (1 << TRACE_DEVICE_TIMING);
flags |= (1 << TRACE_HOST_RUNTIME_TIMING);
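
The hunk above decides, once at profiler start-up, whether the Level Zero runtime is using immediate command lists. Below is a minimal sketch (not part of the commit) of that decision table; GetEnvOrEmpty stands in for utils::GetEnv, and the is_xehpc argument stands in for IsXeHPC() from hw_info.h.

#include <cstdlib>
#include <string>

// Stand-in for utils::GetEnv: returns the variable's value or an empty string.
static std::string GetEnvOrEmpty(const char* name) {
  const char* value = std::getenv(name);
  return value != nullptr ? std::string(value) : std::string();
}

// Mirrors EnableProfiling(): "0" disables immediate command lists, an unset
// variable falls back to the hardware default (enabled only on XeHPC), and
// any other value leaves them enabled.
static bool ResolveImmediateCommandLists(bool is_xehpc) {
  std::string env =
      GetEnvOrEmpty("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
  if (env == "0") return false;
  if (env.empty()) return is_xehpc;
  return true;
}
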
10 changes: 10 additions & 0 deletions itex/core/profiler/utils.h
@@ -46,6 +46,16 @@ limitations under the License.

namespace utils {

std::atomic<int> g_immediate_command_list_enabled(1);

inline bool IsImmediateCommandListEnabled() {
return g_immediate_command_list_enabled.load(std::memory_order_acquire);
}

inline void ImmediateCommandListDisabled() {
g_immediate_command_list_enabled.store(0, std::memory_order_release);
}

struct Comparator {
template <typename T>
bool operator()(const T& left, const T& right) const {
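
The new helpers above keep a process-wide flag that defaults to enabled; EnableProfiling() lowers it at most once during start-up, and the collector reads it from its tracing callbacks. A hedged, single-translation-unit sketch of that read/write pattern follows (the Sweep() body is illustrative, not the collector's code):

#include <atomic>

namespace utils {
std::atomic<int> g_immediate_command_list_enabled(1);  // enabled by default

inline bool IsImmediateCommandListEnabled() {
  return g_immediate_command_list_enabled.load(std::memory_order_acquire);
}

inline void ImmediateCommandListDisabled() {
  g_immediate_command_list_enabled.store(0, std::memory_order_release);
}
}  // namespace utils

void Sweep() {
  // Snapshot the flag once per pass, as ProcessCall() does below, so a single
  // sweep never mixes the immediate and batched code paths.
  const bool imm = utils::IsImmediateCommandListEnabled();
  if (imm) {
    // immediate path: deferred cleanup (see ze_kernel_collector.h below)
  } else {
    // batched path: erase the matching call in place
  }
}
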
181 changes: 158 additions & 23 deletions itex/core/profiler/ze_kernel_collector.h
@@ -73,6 +73,7 @@ struct ZeKernelCommand {
struct ZeKernelCall {
ZeKernelCommand* command;
ze_command_queue_handle_t queue;
ze_fence_handle_t fence;
uint64_t submit_time;
uint64_t device_submit_time;
uint64_t call_id;
@@ -252,7 +253,6 @@ class ZeKernelCollector {
zet_core_callbacks_t epilogue_callbacks{};

prologue_callbacks.Event.pfnDestroyCb = OnEnterEventDestroy;

prologue_callbacks.Event.pfnHostResetCb = OnEnterEventHostReset;

prologue_callbacks.EventPool.pfnCreateCb = OnEnterEventPoolCreate;
@@ -346,6 +346,10 @@ class ZeKernelCollector {

epilogue_callbacks.Event.pfnHostSynchronizeCb = OnExitEventHostSynchronize;

epilogue_callbacks.Fence.pfnHostSynchronizeCb = OnExitFenceHostSynchronize;

epilogue_callbacks.Event.pfnQueryStatusCb = OnExitEventQueryStatus;

ze_result_t status = ZE_RESULT_SUCCESS;
status = zelTracerSetPrologues(tracer_, &prologue_callbacks);
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
@@ -393,25 +397,109 @@ class ZeKernelCollector {

void ProcessCall(ze_event_handle_t event) {
PTI_ASSERT(event != nullptr);
const std::lock_guard<std::mutex> lock(lock_);

bool isImmEnabled = utils::IsImmediateCommandListEnabled();
ze_result_t status = ZE_RESULT_SUCCESS;
status = zeEventQueryStatus(event);
if (status != ZE_RESULT_SUCCESS) {
return;
}

for (auto it = kernel_call_list_.begin(); it != kernel_call_list_.end();
++it) {
ZeKernelCall* call = *it;
PTI_ASSERT(call != nullptr);
ZeKernelCommand* command = call->command;
PTI_ASSERT(command != nullptr);
if (isImmEnabled) {
bool done = false;
for (auto it = kernel_call_list_.begin();
it != kernel_call_list_.end();) {
ZeKernelCall* call = *it;
PTI_ASSERT(call != nullptr);
ZeKernelCommand* command = call->command;
if (command->event != nullptr) {
if (zeEventQueryStatus(command->event) == ZE_RESULT_SUCCESS) {
if (command->event == event) {
ProcessCall(call);
done = true;
} else {
ProcessCall(call);
}
}
}

if (command->event == event) {
ProcessCall(call);
kernel_call_list_.erase(it);
break;
if (command->event == nullptr) {
delete command;
delete call;
it = kernel_call_list_.erase(it);
} else {
it++;
}

if (done) {
break;
}
}
} else {
for (auto it = kernel_call_list_.begin(); it != kernel_call_list_.end();
++it) {
ZeKernelCall* call = *it;
PTI_ASSERT(call != nullptr);
ZeKernelCommand* command = call->command;
PTI_ASSERT(command != nullptr);

if (command->event == event) {
ProcessCall(call);
kernel_call_list_.erase(it);
break;
}
}
}
}

void ProcessCall(std::string callname, ze_fence_handle_t fence) {
PTI_ASSERT(fence != nullptr);
bool isImmEnabled = utils::IsImmediateCommandListEnabled();
ze_result_t status = ZE_RESULT_SUCCESS;
status = zeFenceQueryStatus(fence);
if (status != ZE_RESULT_SUCCESS) {
return;
}

if (isImmEnabled) {
bool done = false;
for (auto it = kernel_call_list_.begin();
it != kernel_call_list_.end();) {
ZeKernelCall* call = *it;
PTI_ASSERT(call != nullptr);
ZeKernelCommand* command = call->command;
PTI_ASSERT(command != nullptr);
if ((call->fence != nullptr) && (call->fence == fence)) {
PTI_ASSERT(zeEventQueryStatus(command->event) == ZE_RESULT_SUCCESS);
ProcessCall(call);
done = true;
} else if ((command->event != nullptr) &&
(zeEventQueryStatus(command->event) == ZE_RESULT_SUCCESS)) {
ProcessCall(call);
}

if (command->event == nullptr) {
delete command;
it = kernel_call_list_.erase(it);
} else {
it++;
}
if (done) {
break;
}
}
} else {
for (auto it = kernel_call_list_.begin(); it != kernel_call_list_.end();
++it) {
ZeKernelCall* call = *it;
PTI_ASSERT(call != nullptr);
ZeKernelCommand* command = call->command;
PTI_ASSERT(command != nullptr);

if ((call->fence != nullptr) && (call->fence == fence)) {
ProcessCall(call);
kernel_call_list_.erase(it);
break;
}
}
}
}
@@ -462,7 +550,6 @@ class ZeKernelCollector {
uint64_t end_ns = start_ns + duration;
AddKernelInterval(command->props.name, start_ns, end_ns);
}

ZeKernelCollector::GetzePluggableTracerEventMap()[call->queue].emplace_back(
itex::port::MaybeAbiDemangle(command->props.name.c_str()),
command->props.annotation, command->append_time, call->submit_time,
@@ -480,12 +567,15 @@ class ZeKernelCollector {
command->append_time, call->submit_time, host_start, host_end);
}

delete call;
if (utils::IsImmediateCommandListEnabled()) {
command->event = nullptr;
} else {
delete call;
}
}

void ProcessCalls() {
ze_result_t status = ZE_RESULT_SUCCESS;
const std::lock_guard<std::mutex> lock(lock_);

auto it = kernel_call_list_.begin();
while (it != kernel_call_list_.end()) {
@@ -555,6 +645,7 @@ class ZeKernelCollector {
PTI_ASSERT(command_list != nullptr);
PTI_ASSERT(context != nullptr);
const std::lock_guard<std::mutex> lock(lock_);

PTI_ASSERT(command_list_map_.count(command_list) == 0);
command_list_map_[command_list] = {std::vector<ZeKernelCommand*>(), context,
device, immediate};
@@ -607,7 +698,7 @@ class ZeKernelCollector {
}

void AddKernelCalls(ze_command_list_handle_t command_list,
ze_command_queue_handle_t queue,
ze_command_queue_handle_t queue, ze_fence_handle_t fence,
const ZeSubmitData* submit_data) {
PTI_ASSERT(command_list != nullptr);

@@ -633,7 +724,7 @@ class ZeKernelCollector {
PTI_ASSERT(command->append_time <= call->submit_time);
++(command->call_count);
call->call_id = command->call_count;

call->fence = fence;
kernel_call_list_.push_back(call);
correlator_->AddCallId(command_list, call->call_id);
}
@@ -753,7 +844,9 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);
collector->lock_.lock();
collector->ProcessCall(*(params->phEvent));
collector->lock_.unlock();
}
}

@@ -764,7 +857,9 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);
collector->lock_.lock();
collector->ProcessCall(*(params->phEvent));
collector->lock_.unlock();
}
}

@@ -776,7 +871,23 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);
collector->lock_.lock();
collector->ProcessCall(*(params->phEvent));
collector->lock_.unlock();
}
}

static void OnExitFenceHostSynchronize(
ze_fence_host_synchronize_params_t* params, ze_result_t result,
void* global_data, void** instance_data) {
if (result == ZE_RESULT_SUCCESS) {
PTI_ASSERT(*(params->phFence) != nullptr);
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);
collector->lock_.lock();
collector->ProcessCall("FenceHostSynchronize", *(params->phFence));
collector->lock_.unlock();
}
}

@@ -840,7 +951,6 @@ class ZeKernelCollector {
ze_event_handle_t& event) { // NOLINT(runtime/references)
PTI_ASSERT(context != nullptr);
ze_result_t status = ZE_RESULT_SUCCESS;

ze_event_pool_desc_t event_pool_desc = {
ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr,
ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP | ZE_EVENT_POOL_FLAG_HOST_VISIBLE,
@@ -864,7 +974,6 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);

if (command_list == nullptr) {
return;
}
@@ -903,7 +1012,6 @@ class ZeKernelCollector {
ZeKernelCollector::GetzePluggableTracerDeviceQueueMap()[device].insert(
call->queue);
}

*instance_data = static_cast<void*>(call);
}

@@ -1381,6 +1489,7 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);

collector->AddCommandList(**(params->pphCommandList),
*(params->phContext), *(params->phDevice),
true);
@@ -1395,8 +1504,12 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);
collector->lock_.lock();
collector->ProcessCalls();
collector->RemoveCommandList(*params->phCommandList);
collector->lock_.unlock();
if (!utils::IsImmediateCommandListEnabled()) {
collector->RemoveCommandList(*params->phCommandList);
}
}
}

@@ -1408,8 +1521,12 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);
collector->lock_.lock();
collector->ProcessCalls();
collector->ResetCommandList(*params->phCommandList);
collector->lock_.unlock();
if (!utils::IsImmediateCommandListEnabled()) {
collector->ResetCommandList(*params->phCommandList);
}
}
}

@@ -1464,6 +1581,7 @@ class ZeKernelCollector {
for (uint32_t i = 0; i < command_list_count; ++i) {
if (!collector->IsCommandListImmediate(command_lists[i])) {
collector->AddKernelCalls(command_lists[i], *(params->phCommandQueue),
*(params->phFence),
&submit_data_list->at(i));
}
}
@@ -1479,7 +1597,9 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);
collector->lock_.lock();
collector->ProcessCalls();
collector->lock_.unlock();
}
}

@@ -1490,7 +1610,9 @@ class ZeKernelCollector {
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
PTI_ASSERT(collector != nullptr);
collector->lock_.lock();
collector->ProcessCalls();
collector->lock_.unlock();
}
}

@@ -1519,6 +1641,19 @@ class ZeKernelCollector {
}
}

static void OnExitEventQueryStatus(ze_event_query_status_params_t* params,
ze_result_t result, void* global_data,
void** instance_data) {
if (result == ZE_RESULT_SUCCESS && utils::IsImmediateCommandListEnabled()) {
PTI_ASSERT(*(params->phEvent) != nullptr);
ZeKernelCollector* collector =
reinterpret_cast<ZeKernelCollector*>(global_data);
collector->lock_.lock();
collector->ProcessCall(*(params->phEvent));
collector->lock_.unlock();
}
}

private: // Data
zel_tracer_handle_t tracer_ = nullptr;

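
With immediate command lists there is no queue submission to hook, so the collector above matches completed kernels against their events (and, for batched queues, against the fence recorded at submission) from the synchronization and query-status callbacks, and it defers freeing a call until its timings have been recorded. The sketch below uses illustrative names and is not the collector itself: RecordAndClear() plays the role of ProcessCall(call), which in the immediate path clears command->event instead of deleting the call, and Completed() stands in for zeEventQueryStatus().

#include <list>

struct Command { void* event = nullptr; };    // stands in for ZeKernelCommand
struct Call { Command* command = nullptr; };  // stands in for ZeKernelCall

// Placeholder for zeEventQueryStatus(event) == ZE_RESULT_SUCCESS.
static bool Completed(void* event) { (void)event; return true; }

// Placeholder for ProcessCall(call): emit timings, then mark the event consumed.
static void RecordAndClear(Call* call) { call->command->event = nullptr; }

static void SweepImmediate(std::list<Call*>& kernel_calls) {
  for (auto it = kernel_calls.begin(); it != kernel_calls.end();) {
    Call* call = *it;
    Command* command = call->command;
    if (command->event != nullptr && Completed(command->event)) {
      RecordAndClear(call);         // timings recorded; event cleared
    }
    if (command->event == nullptr) {
      delete command;               // safe: nothing left to read from it
      delete call;
      it = kernel_calls.erase(it);  // reclaim only after recording
    } else {
      ++it;                         // still pending; revisit on a later sweep
    }
  }
}
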
