[Openmp-commits] [llvm] [openmp] [Offload] Allow to record kernel launch stack traces (PR #100472)

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Thu Jul 25 12:54:01 PDT 2024


https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/100472

>From f5bf4f21afa02a1139e8e04765d319c0d8eee073 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 23 Jul 2024 15:15:11 -0700
Subject: [PATCH 1/2] [Offload] Implement double free (and other allocation
 error) reporting

As a first step towards a GPU sanitizer we now can track allocations
and deallocations in order to report double frees, and other problems
during deallocation.
---
 .../common/include/ErrorReporting.h           | 223 ++++++++++++++++++
 .../common/include/PluginInterface.h          |  39 +++
 .../common/src/PluginInterface.cpp            |  53 +++++
 offload/src/omptarget.cpp                     |   4 +-
 offload/test/sanitizer/double_free.c          |  68 ++++++
 offload/test/sanitizer/double_free_racy.c     |  33 +++
 offload/test/sanitizer/free_host_ptr.c        |  25 ++
 offload/test/sanitizer/free_wrong_ptr_kind.c  |  35 +++
 .../test/sanitizer/free_wrong_ptr_kind.cpp    |  38 +++
 openmp/docs/design/Runtimes.rst               |   7 +
 10 files changed, 524 insertions(+), 1 deletion(-)
 create mode 100644 offload/plugins-nextgen/common/include/ErrorReporting.h
 create mode 100644 offload/test/sanitizer/double_free.c
 create mode 100644 offload/test/sanitizer/double_free_racy.c
 create mode 100644 offload/test/sanitizer/free_host_ptr.c
 create mode 100644 offload/test/sanitizer/free_wrong_ptr_kind.c
 create mode 100644 offload/test/sanitizer/free_wrong_ptr_kind.cpp

diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
new file mode 100644
index 0000000000000..84e689dd80fed
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -0,0 +1,223 @@
+//===- ErrorReporting.h - Helper to provide nice error messages ----- c++ -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
+#define OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
+
+#include "PluginInterface.h"
+#include "Shared/EnvironmentVar.h"
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <optional>
+#include <string>
+#include <unistd.h>
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+class ErrorReporter {
+
+  enum ColorTy {
+    Yellow = int(HighlightColor::Address),
+    Green = int(HighlightColor::String),
+    DarkBlue = int(HighlightColor::Tag),
+    Cyan = int(HighlightColor::Attribute),
+    DarkPurple = int(HighlightColor::Enumerator),
+    DarkRed = int(HighlightColor::Macro),
+    BoldRed = int(HighlightColor::Error),
+    BoldLightPurple = int(HighlightColor::Warning),
+    BoldDarkGrey = int(HighlightColor::Note),
+    BoldLightBlue = int(HighlightColor::Remark),
+  };
+
+  /// The banner printed at the beginning of an error report.
+  static constexpr auto ErrorBanner = "OFFLOAD ERROR: ";
+
+  /// Return the device id as string, or n/a if not available.
+  static std::string getDeviceIdStr(GenericDeviceTy *Device) {
+    return Device ? std::to_string(Device->getDeviceId()) : "n/a";
+  }
+
+  /// Return a nice name for an TargetAllocTy.
+  static StringRef getAllocTyName(TargetAllocTy Kind) {
+    switch (Kind) {
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+      return "device memory";
+    case TARGET_ALLOC_HOST:
+      return "pinned host memory";
+    case TARGET_ALLOC_SHARED:
+      return "managed memory";
+      break;
+    }
+    llvm_unreachable("Unknown target alloc kind");
+  }
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wgcc-compat"
+#pragma clang diagnostic ignored "-Wformat-security"
+  /// Print \p Format, instantiated with \p Args to stderr.
+  /// TODO: Allow redirection into a file stream.
+  template <typename... ArgsTy>
+  [[gnu::format(__printf__, 1, 2)]] static void print(const char *Format,
+                                                      ArgsTy &&...Args) {
+    raw_fd_ostream OS(STDERR_FILENO, false);
+    OS << llvm::format(Format, Args...);
+  }
+
+  /// Print \p Format, instantiated with \p Args to stderr, but colored.
+  /// TODO: Allow redirection into a file stream.
+  template <typename... ArgsTy>
+  [[gnu::format(__printf__, 2, 3)]] static void
+  print(ColorTy Color, const char *Format, ArgsTy &&...Args) {
+    raw_fd_ostream OS(STDERR_FILENO, false);
+    WithColor(OS, HighlightColor(Color)) << llvm::format(Format, Args...);
+  }
+
+  /// Print \p Format, instantiated with \p Args to stderr, but colored and with
+  /// a banner.
+  /// TODO: Allow redirection into a file stream.
+  template <typename... ArgsTy>
+  [[gnu::format(__printf__, 1, 2)]] static void reportError(const char *Format,
+                                                            ArgsTy &&...Args) {
+    raw_fd_ostream OS(STDERR_FILENO, false);
+    WithColor(OS, HighlightColor::Error)
+        << ErrorBanner << llvm::format(Format, Args...) << "\n";
+  }
+#pragma clang diagnostic pop
+
+  static void reportError(const char *Str) { reportError("%s", Str); }
+  static void print(const char *Str) { print("%s", Str); }
+  static void print(StringRef Str) { print("%s", Str.str().c_str()); }
+  static void print(ColorTy Color, const char *Str) { print(Color, "%s", Str); }
+  static void print(ColorTy Color, StringRef Str) {
+    print(Color, "%s", Str.str().c_str());
+  }
+
+  /// Pretty print a stack trace.
+  static void reportStackTrace(StringRef StackTrace) {
+    if (StackTrace.empty())
+      return;
+
+    SmallVector<StringRef> Lines, Parts;
+    StackTrace.split(Lines, "\n", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+    int Start = Lines.empty() || !Lines[0].contains("PrintStackTrace") ? 0 : 1;
+    unsigned NumDigits =
+        (int)(floor(log10(Lines.size() - Start - /*0*/ 1)) + 1);
+    for (int I = Start, E = Lines.size(); I < E; ++I) {
+      auto Line = Lines[I];
+      Parts.clear();
+      Line = Line.drop_while([](char C) { return std::isspace(C); });
+      Line.split(Parts, " ", /*MaxSplit=*/2);
+      if (Parts.size() != 3 || Parts[0].size() < 2 || Parts[0][0] != '#') {
+        print("%s\n", Line.str().c_str());
+        continue;
+      }
+      unsigned FrameIdx = std::stoi(Parts[0].drop_front(1).str());
+      if (Start)
+        FrameIdx -= 1;
+      print(DarkPurple, "    %s", Parts[0].take_front().str().c_str());
+      print(Green, "%*u", NumDigits, FrameIdx);
+      print(BoldLightBlue, " %s", Parts[1].str().c_str());
+      print(" %s\n", Parts[2].str().c_str());
+    }
+    print("\n");
+  }
+
+  /// Report information about an allocation associated with \p ATI.
+  static void reportAllocationInfo(AllocationTraceInfoTy *ATI) {
+    if (!ATI)
+      return;
+
+    if (!ATI->DeallocationTrace.empty()) {
+      print(BoldLightPurple, "Last deallocation:\n");
+      reportStackTrace(ATI->DeallocationTrace);
+    }
+
+    if (ATI->HostPtr)
+      print(BoldLightPurple,
+            "Last allocation of size %lu for host pointer %p:\n", ATI->Size,
+            ATI->HostPtr);
+    else
+      print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size);
+    reportStackTrace(ATI->AllocationTrace);
+    if (!ATI->LastAllocationInfo)
+      return;
+
+    unsigned I = 0;
+    print(BoldLightPurple, "Prior allocations with the same base pointer:");
+    while (ATI->LastAllocationInfo) {
+      print("\n");
+      ATI = ATI->LastAllocationInfo;
+      print(BoldLightPurple, " #%u Prior deallocation of size %lu:\n", I,
+            ATI->Size);
+      reportStackTrace(ATI->DeallocationTrace);
+      if (ATI->HostPtr)
+        print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n",
+              I, ATI->HostPtr);
+      else
+        print(BoldLightPurple, " #%u Prior allocation:\n", I);
+      reportStackTrace(ATI->AllocationTrace);
+      ++I;
+    }
+  }
+
+public:
+#define DEALLOCATION_ERROR(Format, ...)                                        \
+  reportError(Format, __VA_ARGS__);                                            \
+  reportStackTrace(StackTrace);                                                \
+  reportAllocationInfo(ATI);                                                   \
+  abort();
+
+  static void reportDeallocationOfNonAllocatedPtr(void *DevicePtr,
+                                                  TargetAllocTy Kind,
+                                                  AllocationTraceInfoTy *ATI,
+                                                  std::string &StackTrace) {
+    DEALLOCATION_ERROR("deallocation of non-allocated %s: %p",
+                       getAllocTyName(Kind).data(), DevicePtr);
+  }
+
+  static void reportDeallocationOfDeallocatedPtr(void *DevicePtr,
+                                                 TargetAllocTy Kind,
+                                                 AllocationTraceInfoTy *ATI,
+                                                 std::string &StackTrace) {
+    DEALLOCATION_ERROR("double-free of %s: %p", getAllocTyName(Kind).data(),
+                       DevicePtr);
+  }
+
+  static void reportDeallocationOfWrongPtrKind(void *DevicePtr,
+                                               TargetAllocTy Kind,
+                                               AllocationTraceInfoTy *ATI,
+                                               std::string &StackTrace) {
+    DEALLOCATION_ERROR("deallocation requires %s but allocation was %s: %p",
+                       getAllocTyName(Kind).data(),
+                       getAllocTyName(ATI->Kind).data(), DevicePtr);
+#undef DEALLOCATION_ERROR
+  }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 973add0ba1000..49a3943a27429 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -19,6 +19,7 @@
 #include <shared_mutex>
 #include <vector>
 
+#include "ExclusiveAccess.h"
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
@@ -382,6 +383,35 @@ struct GenericKernelTy {
   bool IsBareKernel = false;
 };
 
+/// Information about an allocation, when it has been allocated, and when/if it
+/// has been deallocated, for error reporting purposes.
+struct AllocationTraceInfoTy {
+
+  /// The stack trace of the allocation itself.
+  std::string AllocationTrace;
+
+  /// The stack trace of the deallocation, or empty.
+  std::string DeallocationTrace;
+
+  /// The allocated device pointer.
+  void *DevicePtr = nullptr;
+
+  /// The corresponding host pointer (can be null).
+  void *HostPtr = nullptr;
+
+  /// The size of the allocation.
+  uint64_t Size = 0;
+
+  /// The kind of the allocation.
+  TargetAllocTy Kind = TargetAllocTy::TARGET_ALLOC_DEFAULT;
+
+  /// Information about the last allocation at this address, if any.
+  AllocationTraceInfoTy *LastAllocationInfo = nullptr;
+
+  /// Lock to keep accesses race free.
+  std::mutex Lock;
+};
+
 /// Class representing a map of host pinned allocations. We track these pinned
 /// allocations, so memory tranfers invloving these buffers can be optimized.
 class PinnedAllocationMapTy {
@@ -866,6 +896,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Reference to the underlying plugin that created this device.
   GenericPluginTy &Plugin;
 
+  /// Map to record when allocations have been performed, and when they have
+  /// been deallocated, both for error reporting purposes.
+  ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
+
 private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
@@ -916,6 +950,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   UInt32Envar OMPX_InitialNumStreams;
   UInt32Envar OMPX_InitialNumEvents;
 
+  /// Environment variable to determine if stack traces for allocations and
+  /// deallocations are tracked.
+  BoolEnvar OMPX_TrackAllocationTraces =
+      BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
+
   /// Array of images loaded into the device. Images are automatically
   /// deallocated by the allocator.
   llvm::SmallVector<DeviceImageTy *> LoadedImages;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 118265973f327..3b6f545fe91fa 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -14,6 +14,7 @@
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
 
+#include "ErrorReporting.h"
 #include "GlobalHandler.h"
 #include "JIT.h"
 #include "Utils/ELF.h"
@@ -30,6 +31,8 @@
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
 
 #include <cstdint>
 #include <limits>
@@ -1337,6 +1340,25 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
     if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
       return std::move(Err);
 
+  // Keep track of the allocation stack if we track allocation traces.
+  if (OMPX_TrackAllocationTraces) {
+    std::string StackTrace;
+    llvm::raw_string_ostream OS(StackTrace);
+    llvm::sys::PrintStackTrace(OS);
+
+    AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
+    ATI->AllocationTrace = std::move(StackTrace);
+    ATI->DevicePtr = Alloc;
+    ATI->HostPtr = HostPtr;
+    ATI->Size = Size;
+    ATI->Kind = Kind;
+
+    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
+    auto *&MapATI = (*AllocationTraceMap)[Alloc];
+    ATI->LastAllocationInfo = MapATI;
+    MapATI = ATI;
+  }
+
   return Alloc;
 }
 
@@ -1345,6 +1367,37 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
   if (Plugin.getRecordReplay().isRecordingOrReplaying())
     return Plugin::success();
 
+  // Keep track of the deallocation stack if we track allocation traces.
+  if (OMPX_TrackAllocationTraces) {
+    AllocationTraceInfoTy *ATI = nullptr;
+    {
+      auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
+      ATI = (*AllocationTraceMap)[TgtPtr];
+    }
+
+    std::string StackTrace;
+    llvm::raw_string_ostream OS(StackTrace);
+    llvm::sys::PrintStackTrace(OS);
+
+    if (!ATI)
+      ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
+                                                         StackTrace);
+
+    // ATI is not null, thus we can lock it to inspect and modify it further.
+    std::lock_guard<std::mutex> LG(ATI->Lock);
+    if (!ATI->DeallocationTrace.empty())
+      ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
+                                                        StackTrace);
+
+    if (ATI->Kind != Kind)
+      ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
+                                                      StackTrace);
+
+    ATI->DeallocationTrace = StackTrace;
+
+#undef DEALLOCATION_ERROR
+  }
+
   int Res;
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 9bca8529c5ee3..3b627d257a069 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -462,7 +462,9 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
 
   if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
-    FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr");
+    FATAL_MESSAGE(DeviceNum, "%s",
+                  "Failed to deallocate device ptr. Set "
+                  "OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations.");
 
   DP("omp_target_free deallocated device ptr\n");
 }
diff --git a/offload/test/sanitizer/double_free.c b/offload/test/sanitizer/double_free.c
new file mode 100644
index 0000000000000..ca7310e34fc9d
--- /dev/null
+++ b/offload/test/sanitizer/double_free.c
@@ -0,0 +1,68 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  void *Ptr1 = omp_target_alloc(8, 0);
+  omp_target_free(Ptr1, 0);
+  void *Ptr2 = omp_target_alloc(8, 0);
+  omp_target_free(Ptr2, 0);
+  void *Ptr3 = omp_target_alloc(8, 0);
+  omp_target_free(Ptr3, 0);
+  omp_target_free(Ptr2, 0);
+}
+
+// CHECK: OFFLOAD ERROR: double-free of device memory: 0x
+// CHECK:   dataDelete
+// CHECK:   omp_target_free
+// NDEBG:   main
+// DEBUG:   main {{.*}}double_free.c:25
+//
+// CHECK: Last deallocation:
+// CHECK:  dataDelete
+// CHECK:  omp_target_free
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:24
+//
+// CHECK: Last allocation of size 8:
+// CHECK:  dataAlloc
+// CHECK:  omp_target_alloc
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:23
+//
+// CHECK: Prior allocations with the same base pointer:
+// CHECK: #0 Prior deallocation of size 8:
+// CHECK:  dataDelete
+// CHECK:  omp_target_free
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:22
+//
+// CHECK: #0 Prior allocation:
+// CHECK:  dataAlloc
+// CHECK:  omp_target_alloc
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:20
+//
+// CHECK: #1 Prior deallocation of size 8:
+// CHECK:  dataDelete
+// CHECK:  omp_target_free
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:20
+//
+// CHECK: #1 Prior allocation:
+// CHECK:  dataAlloc
+// CHECK:  omp_target_alloc
+// NDEBG:  main
+// DEBUG:  main {{.*}}double_free.c:19
diff --git a/offload/test/sanitizer/double_free_racy.c b/offload/test/sanitizer/double_free_racy.c
new file mode 100644
index 0000000000000..ae6be26ddb380
--- /dev/null
+++ b/offload/test/sanitizer/double_free_racy.c
@@ -0,0 +1,33 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  void *Ptr1 = omp_target_alloc(8, 0);
+#pragma omp parallel
+  omp_target_free(Ptr1, 0);
+}
+
+// CHECK: OFFLOAD ERROR: double-free of device memory: 0x
+// CHECK   dataDelete
+// CHECK:  omp_target_free
+//
+// CHECK: Last deallocation:
+// CHECK:  dataDelete
+// CHECK:  omp_target_free
+
+// CHECK: Last allocation of size 8:
+// CHECK:  dataAlloc
+// CHECK:  omp_target_alloc
diff --git a/offload/test/sanitizer/free_host_ptr.c b/offload/test/sanitizer/free_host_ptr.c
new file mode 100644
index 0000000000000..1ec68635499a9
--- /dev/null
+++ b/offload/test/sanitizer/free_host_ptr.c
@@ -0,0 +1,25 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int X;
+  omp_target_free(&X, 0);
+}
+
+// CHECK:  OFFLOAD ERROR: deallocation of non-allocated device memory: 0x
+// CHECK:   dataDelete
+// NDEBG:   main
+// DEBUG:   main {{.*}}free_host_ptr.c:20
diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.c b/offload/test/sanitizer/free_wrong_ptr_kind.c
new file mode 100644
index 0000000000000..0c178541db117
--- /dev/null
+++ b/offload/test/sanitizer/free_wrong_ptr_kind.c
@@ -0,0 +1,35 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
+
+int main(void) {
+  void *P = llvm_omp_target_alloc_host(8, 0);
+  omp_target_free(P, 0);
+}
+
+// clang-format off
+// CHECK: OFFLOAD ERROR: deallocation requires device memory but allocation was pinned host memory: 0x 
+// CHECK:  dataDelete 
+// CHECK:  omp_target_free
+// NDEBG: main
+// DEBUG:  main {{.*}}free_wrong_ptr_kind.c:22
+//
+// CHECK: Last allocation of size 8:
+// CHECK:  dataAlloc
+// CHECK:  llvm_omp_target_alloc_host
+// NDEBG:  main
+// DEBUG:  main {{.*}}free_wrong_ptr_kind.c:21
diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.cpp b/offload/test/sanitizer/free_wrong_ptr_kind.cpp
new file mode 100644
index 0000000000000..87a52c5d4baf2
--- /dev/null
+++ b/offload/test/sanitizer/free_wrong_ptr_kind.cpp
@@ -0,0 +1,38 @@
+// clang-format off
+// RUN: %libomptarget-compileoptxx-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compileoptxx-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_host(void *Ptr, int DeviceNum);
+}
+
+int main(void) {
+  void *P = llvm_omp_target_alloc_shared(8, 0);
+  llvm_omp_target_free_host(P, 0);
+}
+
+// clang-format off
+// CHECK: OFFLOAD ERROR: deallocation requires pinned host memory but allocation was managed memory: 0x
+// CHECK:  dataDelete 
+// CHECK:  llvm_omp_target_free_host
+// NDEBG: main
+// DEBUG:  main {{.*}}free_wrong_ptr_kind.cpp:25
+//
+// CHECK: Last allocation of size 8:
+// CHECK:  dataAlloc
+// CHECK:  llvm_omp_target_alloc_shared
+// NDEBG:  main
+// DEBUG:  main {{.*}}free_wrong_ptr_kind.cpp:24
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 98dd984fd4b0c..20e2fa715eaac 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -743,6 +743,7 @@ variables is defined below.
     * ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
     * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
     * ``LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=[TRUE/FALSE] (default TRUE)``
+    * ``OFFLOAD_TRACK_ALLOCATION_TRACES=[TRUE/FALSE] (default FALSE)``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""
@@ -1170,6 +1171,12 @@ This environment variable can be used to control how the OpenMP runtime assigns
 blocks to loops with high trip counts. By default we reuse existing blocks
 rather than spawning new blocks.
 
+OFFLOAD_TRACK_ALLOCATION_TRACES
+"""""""""""""""""""""""""""""""
+
+This environment variable determines if the stack traces of allocations and
+deallocations are tracked to aid in error reporting, e.g., in case of
+double-free.
 
 .. _libomptarget_plugin:
 

>From e018174b2f996dde258324c085e84cc64aedd34c Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 23 Jul 2024 20:45:39 -0700
Subject: [PATCH 2/2] [Offload] Allow to record kernel launch stack traces

Similar to (de)allocation traces, we can record kernel launch stack
traces and display them in case of an error. However, the AMD GPU plugin
signal handler, which is invoked on memroy faults, cannot pinpoint the
offending kernel. Insteade print `<NUM>`, set via
`OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=<NUM>`, many traces. The
recoding/record uses a ring buffer of fixed size (for now 8).
For `trap` errors, we print the actual kernel name, and trace if
recorded.
---
 offload/include/Shared/EnvironmentVar.h       |  6 +-
 .../plugins-nextgen/amdgpu/dynamic_hsa/hsa.h  |  1 +
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 74 +++++++++++++---
 .../common/include/ErrorReporting.h           | 84 +++++++++++++++++++
 .../common/include/PluginInterface.h          | 46 ++++++++++
 .../common/src/PluginInterface.cpp            | 12 +++
 offload/test/sanitizer/kernel_crash.c         | 44 ++++++++++
 offload/test/sanitizer/kernel_crash_async.c   | 38 +++++++++
 offload/test/sanitizer/kernel_crash_many.c    | 70 ++++++++++++++++
 offload/test/sanitizer/kernel_crash_single.c  | 34 ++++++++
 offload/test/sanitizer/kernel_trap.c          | 39 +++++++++
 offload/test/sanitizer/kernel_trap_async.c    | 38 +++++++++
 offload/test/sanitizer/kernel_trap_many.c     | 33 ++++++++
 openmp/docs/design/Runtimes.rst               |  7 ++
 14 files changed, 514 insertions(+), 12 deletions(-)
 create mode 100644 offload/test/sanitizer/kernel_crash.c
 create mode 100644 offload/test/sanitizer/kernel_crash_async.c
 create mode 100644 offload/test/sanitizer/kernel_crash_many.c
 create mode 100644 offload/test/sanitizer/kernel_crash_single.c
 create mode 100644 offload/test/sanitizer/kernel_trap.c
 create mode 100644 offload/test/sanitizer/kernel_trap_async.c
 create mode 100644 offload/test/sanitizer/kernel_trap_many.c

diff --git a/offload/include/Shared/EnvironmentVar.h b/offload/include/Shared/EnvironmentVar.h
index 4cbdad695a0ee..82f434e91a85b 100644
--- a/offload/include/Shared/EnvironmentVar.h
+++ b/offload/include/Shared/EnvironmentVar.h
@@ -28,6 +28,7 @@ struct StringParser {
 /// Class for reading and checking environment variables. Currently working with
 /// integer, floats, std::string and bool types.
 template <typename Ty> class Envar {
+  llvm::StringRef Name;
   Ty Data;
   bool IsPresent;
   bool Initialized;
@@ -53,7 +54,7 @@ template <typename Ty> class Envar {
   /// take the value read from the environment variable, or the default if it
   /// was not set or not correct. This constructor is not fallible.
   Envar(llvm::StringRef Name, Ty Default = Ty())
-      : Data(Default), IsPresent(false), Initialized(true) {
+      : Name(Name), Data(Default), IsPresent(false), Initialized(true) {
 
     if (const char *EnvStr = getenv(Name.data())) {
       // Check whether the envar is defined and valid.
@@ -84,6 +85,9 @@ template <typename Ty> class Envar {
   /// Get the definitive value.
   operator Ty() const { return get(); }
 
+  /// Return the environment variable name.
+  llvm::StringRef getName() const { return Name; }
+
   /// Indicate whether the environment variable was defined and valid.
   bool isPresent() const { return IsPresent; }
 
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 64a1d3308aed0..5d9fb5d7dc7cd 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -31,6 +31,7 @@ typedef enum {
   HSA_STATUS_ERROR = 0x1000,
   HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
   HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
+  HSA_STATUS_ERROR_EXCEPTION = 0x1016,
 } hsa_status_t;
 
 hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e6643d3260eb4..64454fce4bc1b 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -13,13 +13,16 @@
 #include <atomic>
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <deque>
+#include <functional>
 #include <mutex>
 #include <string>
 #include <system_error>
 #include <unistd.h>
 #include <unordered_map>
 
+#include "ErrorReporting.h"
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
@@ -43,6 +46,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
 
 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
@@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
   AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
 
   /// Lazily initialize a new queue belonging to a specific agent.
-  Error init(hsa_agent_t Agent, int32_t QueueSize) {
+  Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
     if (Queue)
       return Plugin::success();
     hsa_status_t Status =
         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
-                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
+                         &Device, UINT32_MAX, UINT32_MAX, &Queue);
     return Plugin::check(Status, "Error in hsa_queue_create: %s");
   }
 
@@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
   }
 
   /// Callack that will be called when an error is detected on the HSA queue.
-  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
-    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
-    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
-  }
+  static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                            void *Data);
 
   /// The HSA queue.
   hsa_queue_t *Queue;
@@ -1484,6 +1486,8 @@ struct AMDGPUStreamTy {
     return true;
   }
 
+  const AMDGPUQueueTy *getQueue() const { return Queue; }
+
   /// Record the state of the stream on an event.
   Error recordEvent(AMDGPUEventTy &Event) const;
 
@@ -1594,7 +1598,7 @@ struct AMDGPUStreamManagerTy final
   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
 
   AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
-      : GenericDeviceResourceManagerTy(Device),
+      : GenericDeviceResourceManagerTy(Device), Device(Device),
         OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
         NextQueue(0), Agent(HSAAgent) {}
 
@@ -1603,7 +1607,7 @@ struct AMDGPUStreamManagerTy final
     QueueSize = HSAQueueSize;
     MaxNumQueues = NumHSAQueues;
     // Initialize one queue eagerly
-    if (auto Err = Queues.front().init(Agent, QueueSize))
+    if (auto Err = Queues.front().init(Device, Agent, QueueSize))
       return Err;
 
     return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,7 +1664,7 @@ struct AMDGPUStreamManagerTy final
     }
 
     // Make sure the queue is initialized, then add user & assign.
-    if (auto Err = Queues[Index].init(Agent, QueueSize))
+    if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
       return Err;
     Queues[Index].addUser();
     Stream->Queue = &Queues[Index];
@@ -1668,6 +1672,9 @@ struct AMDGPUStreamManagerTy final
     return Plugin::success();
   }
 
+  /// The device associated with this stream.
+  GenericDeviceTy &Device;
+
   /// Envar for controlling the tracking of busy HSA queues.
   BoolEnvar OMPX_QueueTracking;
 
@@ -3074,7 +3081,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     Initialized = true;
 
     // Register event handler to detect memory errors on the devices.
-    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
+    Status = hsa_amd_register_system_event_handler(eventHandler, this);
     if (auto Err = Plugin::check(
             Status, "Error in hsa_amd_register_system_event_handler: %s"))
       return std::move(Err);
@@ -3209,7 +3216,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
 
 private:
   /// Event handler that will be called by ROCr if an event is detected.
-  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
+  static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
+                                   void *PluginPtr) {
     if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
       return HSA_STATUS_SUCCESS;
 
@@ -3240,6 +3248,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     uint32_t Node = -1;
     hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
 
+    AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
+    for (uint32_t I = 0, E = Plugin.getNumDevices();
+         Node != uint32_t(-1) && I < E; ++I) {
+      AMDGPUDeviceTy &AMDGPUDevice =
+          reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
+      auto KernelTraceInfoRecord =
+          AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+
+      uint32_t DeviceNode = -1;
+      if (auto Err =
+              AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
+        consumeError(std::move(Err));
+        continue;
+      }
+      if (DeviceNode != Node)
+        continue;
+
+      ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
+    }
+
     // Abort the execution since we do not recover from this error.
     FATAL_MESSAGE(1,
                   "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
@@ -3480,6 +3508,30 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   return Alloc;
 }
 
+void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                                  void *Data) {
+  auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
+
+  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
+    auto KernelTraceInfoRecord =
+        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
+        [=](__tgt_async_info &AsyncInfo) {
+          auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
+          if (!Stream)
+            return false;
+          if (!Stream->getQueue())
+            return false;
+          return Stream->getQueue()->Queue == Source;
+        };
+    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
+                                      AsyncInfoWrapperMatcher);
+  }
+
+  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
+  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
+}
+
 } // namespace plugin
 } // namespace target
 } // namespace omp
diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
index 84e689dd80fed..221b59f5274d6 100644
--- a/offload/plugins-nextgen/common/include/ErrorReporting.h
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -213,6 +213,90 @@ class ErrorReporter {
                        getAllocTyName(ATI->Kind).data(), DevicePtr);
 #undef DEALLOCATION_ERROR
   }
+
+  /// Report that a kernel encountered a trap instruction.
+  static void reportTrapInKernel(
+      GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
+      std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher) {
+    assert(AsyncInfoWrapperMatcher && "A matcher is required");
+
+    uint32_t Idx = 0;
+    for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
+      auto KTI = KTIR.getKernelTraceInfo(I);
+      if (KTI.Kernel == nullptr)
+        break;
+      // Skip kernels issued in other queues.
+      if (KTI.AsyncInfo && !(AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
+        continue;
+      Idx = I;
+      break;
+    }
+
+    auto KTI = KTIR.getKernelTraceInfo(Idx);
+    if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
+      reportError("Kernel '%s'", KTI.Kernel->getName());
+    reportError("execution interrupted by hardware trap instruction");
+    if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo))) {
+      if (!KTI.LaunchTrace.empty())
+        reportStackTrace(KTI.LaunchTrace);
+      else
+        print(Yellow, "Use '%s=1' to show the stack trace of the kernel\n",
+              Device.OMPX_TrackNumKernelLaunches.getName().data());
+    }
+    abort();
+  }
+
+  /// Report the kernel traces taken from \p KTIR, up to
+  /// OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES many.
+  static void reportKernelTraces(GenericDeviceTy &Device,
+                                 KernelTraceInfoRecordTy &KTIR) {
+    uint32_t NumKTIs = 0;
+    for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
+      auto KTI = KTIR.getKernelTraceInfo(I);
+      if (KTI.Kernel == nullptr)
+        break;
+      ++NumKTIs;
+    }
+    if (NumKTIs == 0) {
+      print(BoldRed, "No kernel launches known\n");
+      return;
+    }
+
+    uint32_t TracesToShow =
+        std::min(Device.OMPX_TrackNumKernelLaunches.get(), NumKTIs);
+    if (TracesToShow == 0) {
+      if (NumKTIs == 1)
+        print(BoldLightPurple, "Display only launched kernel:\n");
+      else
+        print(BoldLightPurple, "Display last %u kernels launched:\n", NumKTIs);
+    } else {
+      if (NumKTIs == 1)
+        print(BoldLightPurple, "Display kernel launch trace:\n");
+      else
+        print(BoldLightPurple,
+              "Display %u of the %u last kernel launch traces:\n", TracesToShow,
+              NumKTIs);
+    }
+
+    for (uint32_t Idx = 0, I = 0; I < NumKTIs; ++Idx) {
+      auto KTI = KTIR.getKernelTraceInfo(Idx);
+      if (NumKTIs == 1)
+        print(BoldLightPurple, "Kernel '%s'\n", KTI.Kernel->getName());
+      else
+        print(BoldLightPurple, "Kernel %d: '%s'\n", I, KTI.Kernel->getName());
+      reportStackTrace(KTI.LaunchTrace);
+      ++I;
+    }
+
+    if (NumKTIs != 1) {
+      print(Yellow,
+            "Use '%s=<num>' to adjust the number of shown stack traces (%u "
+            "now, up to %zu)\n",
+            Device.OMPX_TrackNumKernelLaunches.getName().data(),
+            Device.OMPX_TrackNumKernelLaunches.get(), KTIR.size());
+    }
+    // TODO: Let users know how to serialize kernels
+  }
 };
 
 } // namespace plugin
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 49a3943a27429..81823338fe211 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -412,6 +412,44 @@ struct AllocationTraceInfoTy {
   std::mutex Lock;
 };
 
+/// Information about an allocation, when it has been allocated, and when/if it
+/// has been deallocated, for error reporting purposes.
+struct KernelTraceInfoTy {
+
+  /// The launched kernel.
+  GenericKernelTy *Kernel;
+
+  /// The stack trace of the launch itself.
+  std::string LaunchTrace;
+
+  /// The async info the kernel was launched in.
+  __tgt_async_info *AsyncInfo;
+};
+
+struct KernelTraceInfoRecordTy {
+  KernelTraceInfoRecordTy() { KTIs.fill({}); }
+
+  /// Return the (maximal) record size.
+  auto size() const { return KTIs.size(); }
+
+  /// Create a new kernel trace info and add it into the record.
+  void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
+               __tgt_async_info *AsyncInfo) {
+    KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
+    Idx = (Idx + 1) % size();
+  }
+
+  /// Return the \p I'th last kernel trace info.
+  auto getKernelTraceInfo(int32_t I) const {
+    // Note that kernel trace infos "grow forward", so lookup is backwards.
+    return KTIs[(Idx - I - 1 + size()) % size()];
+  }
+
+private:
+  std::array<KernelTraceInfoTy, 8> KTIs;
+  unsigned Idx = 0;
+};
+
 /// Class representing a map of host pinned allocations. We track these pinned
 /// allocations, so memory tranfers invloving these buffers can be optimized.
 class PinnedAllocationMapTy {
@@ -900,6 +938,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// been deallocated, both for error reporting purposes.
   ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
 
+  /// Map to record kernel have been launchedl, for error reporting purposes.
+  ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
+
+  /// Environment variable to determine if stack traces for kernel launches are
+  /// tracked.
+  UInt32Envar OMPX_TrackNumKernelLaunches =
+      UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
+
 private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 3b6f545fe91fa..c3ecbcc62f71f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1468,6 +1468,18 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
   GenericKernelTy &GenericKernel =
       *reinterpret_cast<GenericKernelTy *>(EntryPtr);
 
+  {
+    std::string StackTrace;
+    if (OMPX_TrackNumKernelLaunches) {
+      llvm::raw_string_ostream OS(StackTrace);
+      llvm::sys::PrintStackTrace(OS);
+    }
+
+    auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
+    (*KernelTraceInfoRecord)
+        .emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
+  }
+
   auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
                                   AsyncInfoWrapper);
 
diff --git a/offload/test/sanitizer/kernel_crash.c b/offload/test/sanitizer/kernel_crash.c
new file mode 100644
index 0000000000000..167494e2ea706
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash.c
@@ -0,0 +1,44 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG 
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int *A = 0;
+#pragma omp target
+  {}
+#pragma omp target
+  {}
+#pragma omp target
+  {
+    *A = 42;
+  }
+#pragma omp target
+  {}
+}
+// TRACE: Display 1 of the 3 last kernel launch traces
+// TRACE: Kernel 0: '__omp_offloading_{{.*}}_main_l28'
+// TRACE:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash.c:28
+//
+// CHECK: Display last 3 kernels launched:
+// CHECK: Kernel 0: '__omp_offloading_{{.*}}_main_l28'
+// CHECK: Kernel 1: '__omp_offloading_{{.*}}_main_l26'
+// CHECK: Kernel 2: '__omp_offloading_{{.*}}_main_l24'
diff --git a/offload/test/sanitizer/kernel_crash_async.c b/offload/test/sanitizer/kernel_crash_async.c
new file mode 100644
index 0000000000000..6823b489e55eb
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_async.c
@@ -0,0 +1,38 @@
+
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int *A = 0;
+#pragma omp target nowait
+  {}
+#pragma omp target nowait
+  {}
+#pragma omp target nowait
+  {
+    *A = 42;
+  }
+#pragma omp taskwait
+}
+
+// TRACE: Kernel {{.*}}'__omp_offloading_{{.*}}_main_
+// TRACE:     launchKernel
+//
+// CHECK-DAG: Kernel {{[0-9]}}: '__omp_offloading_{{.*}}_main_l28'
diff --git a/offload/test/sanitizer/kernel_crash_many.c b/offload/test/sanitizer/kernel_crash_many.c
new file mode 100644
index 0000000000000..9bea095eaff7b
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_many.c
@@ -0,0 +1,70 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=24 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG 
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=16 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int *A = 0;
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target
+    {}
+  }
+#pragma omp target
+  { *A = 42; }
+}
+// CHECK: Display 8 of the 8 last kernel launch traces
+// CHECK: Kernel 0: '__omp_offloading_{{.*}}_main_l26'
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:26
+//
+// CHECK: Kernel 1: '__omp_offloading_{{.*}}_main_l23'
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 2: '__omp_offloading_{{.*}}_main_l23'
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 3: '__omp_offloading_{{.*}}_main_l23'
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 4: '__omp_offloading_{{.*}}_main_l23'
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 5: '__omp_offloading_{{.*}}_main_l23'
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 6: '__omp_offloading_{{.*}}_main_l23'
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 7: '__omp_offloading_{{.*}}_main_l23'
+// CHECK:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_many.c:
+//
+// CHECK-NOT: Kernel {{[[0-9]]+}}:
diff --git a/offload/test/sanitizer/kernel_crash_single.c b/offload/test/sanitizer/kernel_crash_single.c
new file mode 100644
index 0000000000000..0dfe93b19d76c
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_single.c
@@ -0,0 +1,34 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG 
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+  int *A = 0;
+#pragma omp target
+  { *A = 42; }
+}
+// TRACE: Display kernel launch trace
+// TRACE: Kernel '__omp_offloading_{{.*}}_main_l24'
+// TRACE:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_crash_single.c:24
+//
+// CHECK: Display only launched kernel:
+// CHECK: Kernel '__omp_offloading_{{.*}}_main_l24'
diff --git a/offload/test/sanitizer/kernel_trap.c b/offload/test/sanitizer/kernel_trap.c
new file mode 100644
index 0000000000000..afa517175b2a9
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap.c
@@ -0,0 +1,39 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,NDEBG 
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+#pragma omp target
+  {}
+#pragma omp target
+  {}
+#pragma omp target
+  {
+    __builtin_trap();
+  }
+#pragma omp target
+  {}
+}
+// CHECK: OFFLOAD ERROR: Kernel '__omp_offloading_{{.*}}_main_l28'
+// CHECK: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_trap.c:
diff --git a/offload/test/sanitizer/kernel_trap_async.c b/offload/test/sanitizer/kernel_trap_async.c
new file mode 100644
index 0000000000000..2f37c052083dd
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap_async.c
@@ -0,0 +1,38 @@
+
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+#pragma omp target nowait
+  {}
+#pragma omp target nowait
+  {}
+#pragma omp target nowait
+  {
+    __builtin_trap();
+  }
+#pragma omp taskwait
+}
+
+// CHECK: OFFLOAD ERROR: Kernel '__omp_offloading_{{.*}}_main_l28'
+// CHECK: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE:     launchKernel
+// DEBUG:     kernel_trap_async.c:
diff --git a/offload/test/sanitizer/kernel_trap_many.c b/offload/test/sanitizer/kernel_trap_many.c
new file mode 100644
index 0000000000000..9c79f8942299e
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap_many.c
@@ -0,0 +1,33 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=24 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG 
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=16 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target
+    {}
+  }
+#pragma omp target
+  { __builtin_trap(); }
+}
+// TRACE: OFFLOAD ERROR: Kernel '__omp_offloading_{{.*}}_main_l26'
+// TRACE: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE:     launchKernel
+// NDEBG:     main
+// DEBUG:     main {{.*}}kernel_trap_many.c:
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 20e2fa715eaac..ed002c8cf0f80 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -744,6 +744,7 @@ variables is defined below.
     * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
     * ``LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=[TRUE/FALSE] (default TRUE)``
     * ``OFFLOAD_TRACK_ALLOCATION_TRACES=[TRUE/FALSE] (default FALSE)``
+    * ``OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=<Num> (default 0)``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""
@@ -1178,6 +1179,12 @@ This environment variable determines if the stack traces of allocations and
 deallocations are tracked to aid in error reporting, e.g., in case of
 double-free.
 
+OFFLOAD_TRACK_KERNEL_LAUNCH_TRACES
+""""""""""""""""""""""""""""""""""
+
+This environment variable determines how manytstack traces of kernel launches
+are tracked to aid in error reporting, e.g., what asynchronous kernel failed.
+
 .. _libomptarget_plugin:
 
 LLVM/OpenMP Target Host Runtime Plugins (``libomptarget.rtl.XXXX``)



More information about the Openmp-commits mailing list