[Openmp-commits] [openmp] 6dd8498 - [Libomptarget] Improve next-gen AMDGPU plugin error messages
Joseph Huber via Openmp-commits
openmp-commits at lists.llvm.org
Thu Feb 2 10:56:00 PST 2023
Author: Joseph Huber
Date: 2023-02-02T12:55:53-06:00
New Revision: 6dd84983d0c131fb9fc5e130c80b51b74e397d48
URL: https://github.com/llvm/llvm-project/commit/6dd84983d0c131fb9fc5e130c80b51b74e397d48
DIFF: https://github.com/llvm/llvm-project/commit/6dd84983d0c131fb9fc5e130c80b51b74e397d48.diff
LOG: [Libomptarget] Improve next-gen AMDGPU plugin error messages
The next-gen plugin already prints errors properly. This patch improves the
error messages by including the node ID of the GPU that failed, as well
as a textual description of the fault-reason enumeration values.
Reviewed By: kevinsala
Differential Revision: https://reviews.llvm.org/D143192
Added:
Modified:
openmp/libomptarget/include/Debug.h
openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/include/Debug.h b/openmp/libomptarget/include/Debug.h
index 387dfe50659e4..91c12688789af 100644
--- a/openmp/libomptarget/include/Debug.h
+++ b/openmp/libomptarget/include/Debug.h
@@ -119,7 +119,7 @@ inline uint32_t getDebugLevel() {
/// Print fatal error message with a printf string and error identifier
#define FATAL_MESSAGE(_num, _str, ...) \
do { \
- fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d:" _str "\n", _num, \
+ fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d: " _str "\n", _num, \
__VA_ARGS__); \
abort(); \
} while (0)
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index ca313f553b041..6f57e19b343b3 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -431,8 +431,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks,
- KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// The default number of blocks is common to the whole device.
@@ -2431,38 +2430,40 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
return HSA_STATUS_SUCCESS;
- std::string Reasons;
+ SmallVector<std::string> Reasons;
uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
- Reasons += "HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT, ";
+ Reasons.emplace_back("Page not present or supervisor privilege");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
- Reasons += " HSA_AMD_MEMORY_FAULT_READ_ONLY, ";
+ Reasons.emplace_back("Write access to a read-only page");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
- Reasons += " HSA_AMD_MEMORY_FAULT_NX, ";
+ Reasons.emplace_back("Execute access to a page marked NX");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
- Reasons += " HSA_AMD_MEMORY_FAULT_HOST_ONLY, ";
+ Reasons.emplace_back("GPU attempted access to a host only page");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
- Reasons += " HSA_AMD_MEMORY_FAULT_DRAMECC, ";
+ Reasons.emplace_back("DRAM ECC failure");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
- Reasons += " HSA_AMD_MEMORY_FAULT_IMPRECISE, ";
+ Reasons.emplace_back("Can't determine the exact fault address");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
- Reasons += " HSA_AMD_MEMORY_FAULT_SRAMECC, ";
+ Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
- Reasons += " HSA_AMD_MEMORY_FAULT_HANG, ";
+ Reasons.emplace_back("GPU reset following unspecified hang");
// If we do not know the reason, say so, otherwise remove the trailing comma
// and space.
if (Reasons.empty())
- Reasons = "Unknown (Mask: " + std::to_string(ReasonsMask) + ")";
- else
- Reasons.resize(Reasons.size() - /* ', ' */ 2);
+ Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");
+
+ uint32_t Node = -1;
+ hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
// Abort the execution since we do not recover from this error.
FATAL_MESSAGE(1,
- "Found HSA_AMD_GPU_MEMORY_FAULT_EVENT in agent %" PRIu64
- " at virtual address %p and reasons: %s",
- Event->memory_fault.agent.handle,
- (void *)Event->memory_fault.virtual_address, Reasons.data());
+ "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
+ ") at virtual address %p. Reasons: %s",
+ Node, Event->memory_fault.agent.handle,
+ (void *)Event->memory_fault.virtual_address,
+ llvm::join(Reasons, ", ").c_str());
return HSA_STATUS_ERROR;
}
More information about the Openmp-commits mailing list