[llvm] [Offload] Improve error reporting on memory faults (PR #104254)

via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 14 14:33:51 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-offload

Author: Johannes Doerfert (jdoerfert)

<details>
<summary>Changes</summary>

Since we can already track allocations, we can diagnose memory faults to some degree. If the fault happens in a prior allocation (use after free) or "close but outside" one, we can provide that information to the user. Note that the fault address might be page aligned, and not all accesses trigger a fault, especially for allocations that are backed by a MemoryManager. Still, if people disable the MemoryManager or the allocation is big enough, we can sometimes provide valueable feedback.

---
Full diff: https://github.com/llvm/llvm-project/pull/104254.diff


11 Files Affected:

- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+11-1) 
- (modified) offload/plugins-nextgen/common/include/ErrorReporting.h (+61-6) 
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+41-5) 
- (modified) offload/test/sanitizer/double_free.c (+3-3) 
- (modified) offload/test/sanitizer/double_free_racy.c (+1-1) 
- (modified) offload/test/sanitizer/free_wrong_ptr_kind.c (+1-1) 
- (modified) offload/test/sanitizer/free_wrong_ptr_kind.cpp (+1-1) 
- (added) offload/test/sanitizer/ptr_outside_alloc_1.c (+40) 
- (added) offload/test/sanitizer/ptr_outside_alloc_2.c (+26) 
- (added) offload/test/sanitizer/use_after_free_1.c (+39) 
- (added) offload/test/sanitizer/use_after_free_2.c (+32) 


``````````diff
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 604683370cd27d..e7b724626f6e0f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3264,8 +3264,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
       }
       if (DeviceNode != Node)
         continue;
-
+      void *DevicePtr = (void *)Event->memory_fault.virtual_address;
+      std::string S;
+      llvm::raw_string_ostream OS(S);
+      OS << llvm::format("Memory access fault by GPU %" PRIu32
+                         " (agent 0x%" PRIx64
+                         ") at virtual address %p. Reasons: %s",
+                         Node, Event->memory_fault.agent.handle,
+                         (void *)Event->memory_fault.virtual_address,
+                         llvm::join(Reasons, ", ").c_str());
       ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
+      ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
+                                             /*Abort*/ true);
     }
 
     // Abort the execution since we do not recover from this error.
diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
index e557b32c2c24f8..8478977a8f86af 100644
--- a/offload/plugins-nextgen/common/include/ErrorReporting.h
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -157,10 +157,13 @@ class ErrorReporter {
 
     if (ATI->HostPtr)
       print(BoldLightPurple,
-            "Last allocation of size %lu for host pointer %p:\n", ATI->Size,
-            ATI->HostPtr);
+            "Last allocation of size %lu for host pointer %p -> device pointer "
+            "%p:\n",
+            ATI->Size, ATI->HostPtr, ATI->DevicePtr);
     else
-      print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size);
+      print(BoldLightPurple,
+            "Last allocation of size %lu -> device pointer %p:\n", ATI->Size,
+            ATI->DevicePtr);
     reportStackTrace(ATI->AllocationTrace);
     if (!ATI->LastAllocationInfo)
       return;
@@ -174,10 +177,13 @@ class ErrorReporter {
             ATI->Size);
       reportStackTrace(ATI->DeallocationTrace);
       if (ATI->HostPtr)
-        print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n",
-              I, ATI->HostPtr);
+        print(
+            BoldLightPurple,
+            " #%u Prior allocation for host pointer %p -> device pointer %p:\n",
+            I, ATI->HostPtr, ATI->DevicePtr);
       else
-        print(BoldLightPurple, " #%u Prior allocation:\n", I);
+        print(BoldLightPurple, " #%u Prior allocation -> device pointer %p:\n",
+              I, ATI->DevicePtr);
       reportStackTrace(ATI->AllocationTrace);
       ++I;
     }
@@ -219,6 +225,55 @@ class ErrorReporter {
 #undef DEALLOCATION_ERROR
   }
 
+  static void reportMemoryAccessError(GenericDeviceTy &Device, void *DevicePtr,
+                                      std::string &ErrorStr, bool Abort) {
+    reportError(ErrorStr.c_str());
+
+    if (!Device.OMPX_TrackAllocationTraces) {
+      print(Yellow, "Use '%s=true' to track device allocations\n",
+            Device.OMPX_TrackAllocationTraces.getName().data());
+      if (Abort)
+        abortExecution();
+      return;
+    }
+    uintptr_t Distance = false;
+    auto *ATI =
+        Device.getClosestAllocationTraceInfoForAddr(DevicePtr, Distance);
+    if (!ATI) {
+      print(Cyan,
+            "No host-issued allocations; device pointer %p might be "
+            "a global, stack, or shared location\n",
+            DevicePtr);
+      if (Abort)
+        abortExecution();
+      return;
+    }
+    if (!Distance) {
+      print(Cyan, "Device pointer %p points into%s host-issued allocation:\n",
+            DevicePtr, ATI->DeallocationTrace.empty() ? "" : " prior");
+      reportAllocationInfo(ATI);
+      if (Abort)
+        abortExecution();
+      return;
+    }
+
+    bool IsClose = Distance < (1L << 29L /*512MB=*/);
+    print(Cyan,
+          "Device pointer %p does not point into any (current or prior) "
+          "host-issued allocation%s.\n",
+          DevicePtr,
+          IsClose ? "" : " (might be a global, stack, or shared location)");
+    if (IsClose) {
+      print(Cyan,
+            "Closest host-issued allocation (distance %" PRIuPTR
+            " byte%s; might be by page):\n",
+            Distance, Distance > 1 ? "s" : "");
+      reportAllocationInfo(ATI);
+    }
+    if (Abort)
+      abortExecution();
+  }
+
   /// Report that a kernel encountered a trap instruction.
   static void reportTrapInKernel(
       GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 81823338fe2112..7e3e788fa52dc9 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -938,6 +938,42 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// been deallocated, both for error reporting purposes.
   ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
 
+  /// Return the allocation trace info for a device pointer, that is the
+  /// allocation into which this device pointer points to (or pointed into).
+  AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) {
+    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
+    for (auto &It : *AllocationTraceMap) {
+      if (It.first <= DevicePtr &&
+          advanceVoidPtr(It.first, It.second->Size) > DevicePtr)
+        return It.second;
+    }
+    return nullptr;
+  }
+
+  /// Return the allocation trace info for a device pointer, that is the
+  /// allocation into which this device pointer points to (or pointed into).
+  AllocationTraceInfoTy *
+  getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) {
+    Distance = 0;
+    if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) {
+      return ATI;
+    }
+
+    AllocationTraceInfoTy *ATI = nullptr;
+    uintptr_t DevicePtrI = uintptr_t(DevicePtr);
+    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
+    for (auto &It : *AllocationTraceMap) {
+      uintptr_t Begin = uintptr_t(It.second->DevicePtr);
+      uintptr_t End = Begin + It.second->Size - 1;
+      uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End);
+      if (ATI && ItDistance > Distance)
+        continue;
+      ATI = It.second;
+      Distance = ItDistance;
+    }
+    return ATI;
+  }
+
   /// Map to record kernel have been launchedl, for error reporting purposes.
   ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
 
@@ -946,6 +982,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   UInt32Envar OMPX_TrackNumKernelLaunches =
       UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
 
+  /// Environment variable to determine if stack traces for allocations and
+  /// deallocations are tracked.
+  BoolEnvar OMPX_TrackAllocationTraces =
+      BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
+
 private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
@@ -996,11 +1037,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   UInt32Envar OMPX_InitialNumStreams;
   UInt32Envar OMPX_InitialNumEvents;
 
-  /// Environment variable to determine if stack traces for allocations and
-  /// deallocations are tracked.
-  BoolEnvar OMPX_TrackAllocationTraces =
-      BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
-
   /// Array of images loaded into the device. Images are automatically
   /// deallocated by the allocator.
   llvm::SmallVector<DeviceImageTy *> LoadedImages;
diff --git a/offload/test/sanitizer/double_free.c b/offload/test/sanitizer/double_free.c
index ca7310e34fc9d0..a3d8b06f1c7381 100644
--- a/offload/test/sanitizer/double_free.c
+++ b/offload/test/sanitizer/double_free.c
@@ -36,7 +36,7 @@ int main(void) {
 // NDEBG:  main
 // DEBUG:  main {{.*}}double_free.c:24
 //
-// CHECK: Last allocation of size 8:
+// CHECK: Last allocation of size 8 -> device pointer
 // CHECK:  dataAlloc
 // CHECK:  omp_target_alloc
 // NDEBG:  main
@@ -49,7 +49,7 @@ int main(void) {
 // NDEBG:  main
 // DEBUG:  main {{.*}}double_free.c:22
 //
-// CHECK: #0 Prior allocation:
+// CHECK: #0 Prior allocation -> device pointer
 // CHECK:  dataAlloc
 // CHECK:  omp_target_alloc
 // NDEBG:  main
@@ -61,7 +61,7 @@ int main(void) {
 // NDEBG:  main
 // DEBUG:  main {{.*}}double_free.c:20
 //
-// CHECK: #1 Prior allocation:
+// CHECK: #1 Prior allocation -> device pointer
 // CHECK:  dataAlloc
 // CHECK:  omp_target_alloc
 // NDEBG:  main
diff --git a/offload/test/sanitizer/double_free_racy.c b/offload/test/sanitizer/double_free_racy.c
index 3b4f2d5c51571c..4ebd8f36efa10c 100644
--- a/offload/test/sanitizer/double_free_racy.c
+++ b/offload/test/sanitizer/double_free_racy.c
@@ -28,6 +28,6 @@ int main(void) {
 // CHECK:  dataDelete
 // CHECK:  omp_target_free
 
-// CHECK: Last allocation of size 8:
+// CHECK: Last allocation of size 8 -> device pointer
 // CHECK:  dataAlloc
 // CHECK:  omp_target_alloc
diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.c b/offload/test/sanitizer/free_wrong_ptr_kind.c
index 0c178541db1170..7c5a4ff7085024 100644
--- a/offload/test/sanitizer/free_wrong_ptr_kind.c
+++ b/offload/test/sanitizer/free_wrong_ptr_kind.c
@@ -28,7 +28,7 @@ int main(void) {
 // NDEBG: main
 // DEBUG:  main {{.*}}free_wrong_ptr_kind.c:22
 //
-// CHECK: Last allocation of size 8:
+// CHECK: Last allocation of size 8 -> device pointer
 // CHECK:  dataAlloc
 // CHECK:  llvm_omp_target_alloc_host
 // NDEBG:  main
diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.cpp b/offload/test/sanitizer/free_wrong_ptr_kind.cpp
index 87a52c5d4baf23..7ebb8c438433a9 100644
--- a/offload/test/sanitizer/free_wrong_ptr_kind.cpp
+++ b/offload/test/sanitizer/free_wrong_ptr_kind.cpp
@@ -31,7 +31,7 @@ int main(void) {
 // NDEBG: main
 // DEBUG:  main {{.*}}free_wrong_ptr_kind.cpp:25
 //
-// CHECK: Last allocation of size 8:
+// CHECK: Last allocation of size 8 -> device pointer
 // CHECK:  dataAlloc
 // CHECK:  llvm_omp_target_alloc_shared
 // NDEBG:  main
diff --git a/offload/test/sanitizer/ptr_outside_alloc_1.c b/offload/test/sanitizer/ptr_outside_alloc_1.c
new file mode 100644
index 00000000000000..ae7dbb1cfb41e2
--- /dev/null
+++ b/offload/test/sanitizer/ptr_outside_alloc_1.c
@@ -0,0 +1,40 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
+void llvm_omp_target_free_host(void *Ptr, int DeviceNum);
+
+int main() {
+  int N = (1 << 30);
+  char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device());
+  char *P;
+#pragma omp target map(from : P)
+  {
+    P = &A[0];
+    *P = 3;
+  }
+// clang-format off
+// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
+// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations
+// TRACE: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation.
+// TRACE: Closest host-issued allocation (distance 4096 bytes; might be by page):
+// TRACE: Last allocation of size 1073741824
+// clang-format on
+#pragma omp target
+  { P[-4] = 5; }
+
+  llvm_omp_target_free_host(A, omp_get_default_device());
+}
diff --git a/offload/test/sanitizer/ptr_outside_alloc_2.c b/offload/test/sanitizer/ptr_outside_alloc_2.c
new file mode 100644
index 00000000000000..af2460149b0d40
--- /dev/null
+++ b/offload/test/sanitizer/ptr_outside_alloc_2.c
@@ -0,0 +1,26 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main() {
+  int N = (1 << 30);
+  char *A = (char *)malloc(N);
+#pragma omp target map(A[ : N])
+  { A[N] = 3; }
+  // clang-format off
+// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
+// CHECK: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation.
+// CHECK: Closest host-issued allocation (distance 1 byte; might be by page):
+// CHECK: Last allocation of size 1073741824
+  // clang-format on
+}
diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c
new file mode 100644
index 00000000000000..3e3b130873fa60
--- /dev/null
+++ b/offload/test/sanitizer/use_after_free_1.c
@@ -0,0 +1,39 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
+void llvm_omp_target_free_host(void *Ptr, int DeviceNum);
+
+int main() {
+  int N = (1 << 30);
+  char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device());
+  char *P;
+#pragma omp target map(from : P)
+  {
+    P = &A[N / 2];
+    *P = 3;
+  }
+  llvm_omp_target_free_host(A, omp_get_default_device());
+// clang-format off
+// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
+// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations
+// TRACE: Device pointer [[PTR]] points into prior host-issued allocation:
+// TRACE: Last deallocation:
+// TRACE: Last allocation of size 1073741824
+// clang-format on
+#pragma omp target
+  { *P = 5; }
+}
diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c
new file mode 100644
index 00000000000000..581cf1abc917cb
--- /dev/null
+++ b/offload/test/sanitizer/use_after_free_2.c
@@ -0,0 +1,32 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main() {
+  int N = (1 << 30);
+  char *A = (char *)malloc(N);
+  char *P;
+#pragma omp target map(A[ : N]) map(from : P)
+  {
+    P = &A[N / 2];
+    *P = 3;
+  }
+// clang-format off
+// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
+// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
+// CHECK: Last deallocation:
+// CHECK: Last allocation of size 1073741824
+// clang-format on
+#pragma omp target
+  { *P = 5; }
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/104254


More information about the llvm-commits mailing list