[Openmp-commits] [clang] [llvm] [openmp] Check multiple pointers in one call (PR #100671)

Vidush Singhal via Openmp-commits openmp-commits at lists.llvm.org
Thu Jul 25 16:51:52 PDT 2024


https://github.com/vidsinghal created https://github.com/llvm/llvm-project/pull/100671

None

>From d06585044bd6d2dd76d6110bce933e01fd4b333e Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 3 Jun 2024 19:52:12 -0700
Subject: [PATCH 01/31] [Offload][CUDA] Allow CUDA kernels to use LLVM/Offload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Through the new `-foffload-via-llvm` flag, CUDA kernels can now be
lowered to the LLVM/Offload API. On the Clang side, this is simply done
by using the OpenMP offload toolchain and emitting calls to `llvm*`
functions to orchestrate the kernel launch rather than `cuda*`
functions. These `llvm*` functions are implemented on top of the
existing LLVM/Offload API.

As we are about to redefine the Offload API, this will help us in the
design process as a second offload language.

We do not support any CUDA APIs yet, however, we could:
  https://www.osti.gov/servlets/purl/1892137

For proper host execution we need to resurrect/rebase
  https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf
(which was designed for debugging).

```
❯❯❯ cat test.cu
extern "C" {
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}

__global__ void square(int *A) { *A = 42; }

int main(int argc, char **argv) {
  int DevNo = 0;
  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
  *Ptr = 7;
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  square<<<1, 1>>>(Ptr);
  printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
  llvm_omp_target_free_shared(Ptr, DevNo);
}

❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native

❯❯❯ llvm-objdump --offloading test123

test123:        file format elf64-x86-64

OFFLOADING IMAGE [0]:
kind            elf
arch            gfx90a
triple          amdgcn-amd-amdhsa
producer        openmp

❯❯❯ LIBOMPTARGET_INFO=16 ./test123
Ptr 0x155448ac8000, *Ptr 7
Ptr 0x155448ac8000, *Ptr 42
```
---
 clang/include/clang/Basic/LangOptions.def     |  1 +
 clang/include/clang/Driver/Options.td         |  6 ++
 clang/lib/CodeGen/CGCUDANV.cpp                | 97 ++++++++++++++++---
 clang/lib/Driver/Driver.cpp                   | 19 ++--
 clang/lib/Driver/ToolChains/Clang.cpp         | 27 +++++-
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  7 +-
 clang/lib/Driver/ToolChains/Cuda.cpp          | 27 +++---
 clang/lib/Headers/CMakeLists.txt              | 18 +++-
 .../llvm_offload_wrappers/__llvm_offload.h    | 31 ++++++
 .../__llvm_offload_device.h                   | 10 ++
 .../__llvm_offload_host.h                     | 15 +++
 .../__clang_openmp_device_functions.h         |  9 +-
 clang/lib/Sema/SemaCUDA.cpp                   |  3 +
 clang/test/CodeGenCUDA/offload_via_llvm.cu    | 97 +++++++++++++++++++
 clang/test/Driver/cuda-via-liboffload.cu      | 23 +++++
 offload/include/Shared/APITypes.h             |  5 +-
 offload/include/omptarget.h                   |  2 +-
 .../common/src/PluginInterface.cpp            | 13 ++-
 offload/src/CMakeLists.txt                    |  1 +
 offload/src/KernelLanguage/API.cpp            | 76 +++++++++++++++
 offload/src/exports                           |  3 +
 offload/test/lit.cfg                          |  2 +-
 offload/test/offloading/CUDA/basic_launch.cu  | 31 ++++++
 .../CUDA/basic_launch_blocks_and_threads.cu   | 32 ++++++
 .../offloading/CUDA/basic_launch_multi_arg.cu | 41 ++++++++
 offload/test/offloading/CUDA/kernel_tu.cu.inc |  1 +
 offload/test/offloading/CUDA/launch_tu.cu     | 32 ++++++
 27 files changed, 576 insertions(+), 53 deletions(-)
 create mode 100644 clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
 create mode 100644 clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
 create mode 100644 clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
 create mode 100644 clang/test/CodeGenCUDA/offload_via_llvm.cu
 create mode 100644 clang/test/Driver/cuda-via-liboffload.cu
 create mode 100644 offload/src/KernelLanguage/API.cpp
 create mode 100644 offload/test/offloading/CUDA/basic_launch.cu
 create mode 100644 offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
 create mode 100644 offload/test/offloading/CUDA/basic_launch_multi_arg.cu
 create mode 100644 offload/test/offloading/CUDA/kernel_tu.cu.inc
 create mode 100644 offload/test/offloading/CUDA/launch_tu.cu

diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 2dea3cd4d795b..e8d3be7e89dbb 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
 LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
 LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
+LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")
 
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
 LANGOPT(SYCLIsHost        , 1, 0, "SYCL host compilation")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d44faa55c456f..b77fd063f5519 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1293,6 +1293,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
 def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
   Flags<[HelpHidden]>,
   HelpText<"Compression level for offload device binaries (HIP only)">;
+
+defm offload_via_llvm : BoolFOption<"offload-via-llvm",
+  LangOpts<"OffloadViaLLVM">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
+  NegFlag<SetFalse, [], [ClangOption], "Don't use">,
+  BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
 }
 
 // CUDA options
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 43dfbbb90dd52..2ebe0bf802dfa 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,10 +15,12 @@
 #include "CGCXXABI.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
+#include "clang/AST/CharUnits.h"
 #include "clang/AST/Decl.h"
 #include "clang/Basic/Cuda.h"
 #include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Frontend/Offloading/Utility.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
 
 class CGNVCUDARuntime : public CGCUDARuntime {
 
+  /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
+  StringRef Prefix;
+  /// TODO: We should transition the OpenMP section to LLVM/Offload
+  StringRef SectionPrefix;
+
 private:
   llvm::IntegerType *IntTy, *SizeTy;
   llvm::Type *VoidTy;
@@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
     return DummyFunc;
   }
 
+  Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
+  Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
+                                       FunctionArgList &Args);
   void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
   void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
   std::string getDeviceSideName(const NamedDecl *ND) override;
@@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
 } // end anonymous namespace
 
 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
-  if (CGM.getLangOpts().HIP)
-    return ((Twine("hip") + Twine(FuncName)).str());
-  return ((Twine("cuda") + Twine(FuncName)).str());
+  return (Prefix + FuncName).str();
 }
 std::string
 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
-  if (CGM.getLangOpts().HIP)
-    return ((Twine("__hip") + Twine(FuncName)).str());
-  return ((Twine("__cuda") + Twine(FuncName)).str());
+  return ("__" + Prefix + FuncName).str();
 }
 
 static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
@@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
   SizeTy = CGM.SizeTy;
   VoidTy = CGM.VoidTy;
   PtrTy = CGM.UnqualPtrTy;
+
+  if (CGM.getLangOpts().OffloadViaLLVM) {
+    Prefix = "llvm";
+    SectionPrefix = "omp";
+  } else if (CGM.getLangOpts().HIP)
+    SectionPrefix = Prefix = "hip";
+  else
+    SectionPrefix = Prefix = "cuda";
 }
 
 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
   }
   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
-      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
+      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
+      (CGF.getLangOpts().OffloadViaLLVM))
     emitDeviceStubBodyNew(CGF, Args);
   else
     emitDeviceStubBodyLegacy(CGF, Args);
 }
 
-// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
-// array and kernels are launched using cudaLaunchKernel().
-void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
-                                            FunctionArgList &Args) {
-  // Build the shadow stack entry at the very start of the function.
+/// CUDA passes the arguments with a level of indirection. For example, a
+/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
+/// function. For the LLVM/offload launch we flatten the arguments into the
+/// struct directly. In addition, we include the size of the arguments, thus
+/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
+/// nullptr}. The last nullptr needs to be initialized to an array of pointers
+/// pointing to the arguments if we want to offload to the host.
+Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
+                                                      FunctionArgList &Args) {
+  SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
+  for (auto &Arg : Args)
+    ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
+  llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);
+
+  auto *Int64Ty = CGF.Builder.getInt64Ty();
+  KernelLaunchParamsTypes.push_back(Int64Ty);
+  KernelLaunchParamsTypes.push_back(PtrTy);
+  KernelLaunchParamsTypes.push_back(PtrTy);
+
+  llvm::StructType *KernelLaunchParamsTy =
+      llvm::StructType::create(KernelLaunchParamsTypes);
+  Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
+      KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args");
+  Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
+      KernelLaunchParamsTy, CharUnits::fromQuantity(16),
+      "kernel_launch_params");
+
+  auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
+  CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
+  CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
+  CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
+
+  for (unsigned i = 0; i < Args.size(); ++i) {
+    auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
+    CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
+  }
 
+  return KernelLaunchParams;
+}
+
+Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
+                                           FunctionArgList &Args) {
   // Calculate amount of space we will need for all arguments.  If we have no
   // args, allocate a single pointer so we still have a valid pointer to the
   // argument array that we can pass to runtime, even if it will be unused.
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
         VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                         PtrTy, KernelArgs.emitRawPointer(CGF), i));
   }
+  return KernelArgs;
+}
+
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+                                            FunctionArgList &Args) {
+  // Build the shadow stack entry at the very start of the function.
+  Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
+                           ? prepareKernelArgsLLVMOffload(CGF, Args)
+                           : prepareKernelArgs(CGF, Args);
 
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 
@@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() {
 // registered. The linker will provide a pointer to this section so we can
 // register the symbols with the linked device image.
 void CGNVCUDARuntime::createOffloadingEntries() {
-  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
-                                            : "cuda_offloading_entries";
+  SmallVector<char, 32> Out;
+  StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);
+
   llvm::Module &M = CGM.getModule();
   for (KernelInfo &I : EmittedKernels)
     llvm::offloading::emitOffloadingEntry(
@@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
     }
     return nullptr;
   }
-  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
+  if (CGM.getLangOpts().OffloadViaLLVM ||
+      (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
     createOffloadingEntries();
   else
     return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 67bf0604acd6e..0f55b429512c2 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
                    }) ||
       C.getInputArgs().hasArg(options::OPT_hip_link) ||
       C.getInputArgs().hasArg(options::OPT_hipstdpar);
+  bool UseLLVMOffload = C.getInputArgs().hasArg(
+      options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false);
   if (IsCuda && IsHIP) {
     Diag(clang::diag::err_drv_mix_cuda_hip);
     return;
   }
-  if (IsCuda) {
+  if (IsCuda && !UseLLVMOffload) {
     const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
     const llvm::Triple &HostTriple = HostTC->getTriple();
     auto OFK = Action::OFK_Cuda;
@@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
         CudaInstallation.WarnIfUnsupportedVersion();
     }
     C.addOffloadDeviceToolChain(CudaTC.get(), OFK);
-  } else if (IsHIP) {
+  } else if (IsHIP && !UseLLVMOffload) {
     if (auto *OMPTargetArg =
             C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
       Diag(clang::diag::err_drv_unsupported_opt_for_language_mode)
@@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
   // We need to generate an OpenMP toolchain if the user specified targets with
   // the -fopenmp-targets option or used --offload-arch with OpenMP enabled.
   bool IsOpenMPOffloading =
-      C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
-                               options::OPT_fno_openmp, false) &&
-      (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
-       C.getInputArgs().hasArg(options::OPT_offload_arch_EQ));
+      ((IsCuda || IsHIP) && UseLLVMOffload) ||
+      (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
+                                options::OPT_fno_openmp, false) &&
+       (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
+        C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)));
   if (IsOpenMPOffloading) {
     // We expect that -fopenmp-targets is always used in conjunction with the
     // option -fopenmp specifying a valid runtime with offloading support, i.e.
@@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
       for (StringRef T : OpenMPTargets->getValues())
         OpenMPTriples.insert(T);
     } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) &&
-               !IsHIP && !IsCuda) {
+               ((!IsHIP && !IsCuda) || UseLLVMOffload)) {
       const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
       auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
       auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
@@ -4146,6 +4149,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
 
   bool UseNewOffloadingDriver =
       C.isOffloadingHostKind(Action::OFK_OpenMP) ||
+      Args.hasFlag(options::OPT_foffload_via_llvm,
+                   options::OPT_fno_offload_via_llvm, false) ||
       Args.hasFlag(options::OPT_offload_new_driver,
                    options::OPT_no_offload_new_driver, false);
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b8d8ff3db5d1f..77d365bad229f 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1125,6 +1125,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
+  if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+    // Add llvm_offload_wrappers/* to our system include path.  This lets us
+    // wrap standard library headers and other headers.
+    SmallString<128> P(D.ResourceDir);
+    llvm::sys::path::append(P, "include", "llvm_offload_wrappers");
+    CmdArgs.append({"-internal-isystem", Args.MakeArgString(P), "-include"});
+    if (JA.isDeviceOffloading(Action::OFK_OpenMP))
+      CmdArgs.push_back("__llvm_offload_device.h");
+    else
+      CmdArgs.push_back("__llvm_offload_host.h");
+  }
+
   // Add -i* options, and automatically translate to
   // -include-pch/-include-pth for transparent PCH support. It's
   // wonky, but we include looking for .gch so we can support seamless
@@ -6598,6 +6610,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // device offloading action other than OpenMP.
   if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
                    options::OPT_fno_openmp, false) &&
+      !Args.hasFlag(options::OPT_foffload_via_llvm,
+                    options::OPT_fno_offload_via_llvm, false) &&
       (JA.isDeviceOffloading(Action::OFK_None) ||
        JA.isDeviceOffloading(Action::OFK_OpenMP))) {
     switch (D.getOpenMPRuntime(Args)) {
@@ -6675,11 +6689,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
                        options::OPT_fno_openmp_extensions);
   }
-
-  // Forward the new driver to change offloading code generation.
-  if (Args.hasFlag(options::OPT_offload_new_driver,
-                   options::OPT_no_offload_new_driver, false))
+  // Forward the offload runtime change to code generation, liboffload implies
+  // new driver. Otherwise, check if we should forward the new driver to change
+  // offloading code generation.
+  if (Args.hasFlag(options::OPT_foffload_via_llvm,
+                   options::OPT_fno_offload_via_llvm, false)) {
+    CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"});
+  } else if (Args.hasFlag(options::OPT_offload_new_driver,
+                          options::OPT_no_offload_new_driver, false)) {
     CmdArgs.push_back("--offload-new-driver");
+  }
 
   SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
 
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 2a4c1369f5a73..d142cc791925d 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1205,8 +1205,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
                              bool ForceStaticHostRuntime, bool IsOffloadingHost,
                              bool GompNeedsRT) {
   if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
-                    options::OPT_fno_openmp, false))
+                    options::OPT_fno_openmp, false)) {
+    // We need libomptarget (liboffload) if it's the chosen offloading runtime.
+    if (Args.hasFlag(options::OPT_foffload_via_llvm,
+                     options::OPT_fno_offload_via_llvm, false))
+      CmdArgs.push_back("-lomptarget");
     return false;
+  }
 
   Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args);
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 2dfc7457b0ac7..7d154865ce3f2 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -861,17 +861,15 @@ void CudaToolChain::addClangTargetOptions(
           DeviceOffloadingKind == Action::OFK_Cuda) &&
          "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
 
-  if (DeviceOffloadingKind == Action::OFK_Cuda) {
-    CC1Args.append(
-        {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
-
-    // Unsized function arguments used for variadics were introduced in CUDA-9.0
-    // We still do not support generating code that actually uses variadic
-    // arguments yet, but we do need to allow parsing them as recent CUDA
-    // headers rely on that. https://github.com/llvm/llvm-project/issues/58410
-    if (CudaInstallation.version() >= CudaVersion::CUDA_90)
-      CC1Args.push_back("-fcuda-allow-variadic-functions");
-  }
+  CC1Args.append(
+      {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
+
+  // Unsized function arguments used for variadics were introduced in CUDA-9.0
+  // We still do not support generating code that actually uses variadic
+  // arguments yet, but we do need to allow parsing them as recent CUDA
+  // headers rely on that. https://github.com/llvm/llvm-project/issues/58410
+  if (CudaInstallation.version() >= CudaVersion::CUDA_90)
+    CC1Args.push_back("-fcuda-allow-variadic-functions");
 
   if (DriverArgs.hasArg(options::OPT_nogpulib))
     return;
@@ -889,6 +887,13 @@ void CudaToolChain::addClangTargetOptions(
   CC1Args.push_back("-mlink-builtin-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
+  // For now, we don't use any Offload/OpenMP device runtime when we offload
+  // CUDA via LLVM/Offload. We should split the Offload/OpenMP device runtime
+  // and include the "generic" (or CUDA-specific) parts.
+  if (DriverArgs.hasFlag(options::OPT_foffload_via_llvm,
+                         options::OPT_fno_offload_via_llvm, false))
+    return;
+
   clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
 
   if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index d3090e488306f..9e0eb0f4cde89 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -325,6 +325,12 @@ set(openmp_wrapper_files
   openmp_wrappers/new
 )
 
+set(llvm_offload_wrapper_files
+  llvm_offload_wrappers/__llvm_offload.h
+  llvm_offload_wrappers/__llvm_offload_host.h
+  llvm_offload_wrappers/__llvm_offload_device.h
+)
+
 set(llvm_libc_wrapper_files
   llvm_libc_wrappers/assert.h
   llvm_libc_wrappers/stdio.h
@@ -375,7 +381,7 @@ endfunction(clang_generate_header)
 # Copy header files from the source directory to the build directory
 foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files}
            ${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files}
-           ${llvm_libc_wrapper_files})
+	   ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files})
   copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f})
 endforeach( f )
 
@@ -501,6 +507,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files})
 add_header_target("opencl-resource-headers" ${opencl_files})
 add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files})
 add_header_target("openmp-resource-headers" ${openmp_wrapper_files})
+add_header_target("llvm-offload-resource-headers" ${llvm_libc_wrapper_files})
 add_header_target("windows-resource-headers" ${windows_only_files})
 add_header_target("utility-resource-headers" ${utility_files})
 
@@ -542,6 +549,11 @@ install(
   DESTINATION ${header_install_dir}/openmp_wrappers
   COMPONENT clang-resource-headers)
 
+install(
+  FILES ${llvm_offload_wrapper_files}
+  DESTINATION ${header_install_dir}/llvm_offload_wrappers
+  COMPONENT clang-resource-headers)
+
 install(
   FILES ${zos_wrapper_files}
   DESTINATION ${header_install_dir}/zos_wrappers
@@ -704,8 +716,8 @@ install(
   COMPONENT openmp-resource-headers)
 
 install(
-  FILES ${openmp_wrapper_files}
-  DESTINATION ${header_install_dir}/openmp_wrappers
+  FILES ${llvm_offload_wrapper_files}
+  DESTINATION ${header_install_dir}/llvm_offload_wrappers
   EXCLUDE_FROM_ALL
   COMPONENT openmp-resource-headers)
 
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
new file mode 100644
index 0000000000000..2898898904e29
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
@@ -0,0 +1,31 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include <stddef.h>
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))
+
+extern "C" {
+
+typedef struct dim3 {
+  dim3() {}
+  dim3(unsigned x) : x(x) {}
+  unsigned x = 0, y = 0, z = 0;
+} dim3;
+
+// TODO: For some reason the CUDA device compilation requires this declaration
+// to be present on the device while it is only used on the host.
+unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                     size_t sharedMem = 0, void *stream = 0);
+}
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
new file mode 100644
index 0000000000000..1a813b331515b
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
@@ -0,0 +1,10 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
new file mode 100644
index 0000000000000..160289d169b55
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
@@ -0,0 +1,15 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
+
+extern "C" {
+unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                          void **args, size_t sharedMem = 0, void *stream = 0);
+}
diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
index d5b6846b03488..3e354c63efc66 100644
--- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
+++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
@@ -10,17 +10,15 @@
 #ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
 #define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
 
-#ifndef _OPENMP
-#error "This file is for OpenMP compilation only."
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#ifdef __NVPTX__
 #pragma omp begin declare variant match(                                       \
     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
 
+#pragma push_macro("__CUDA__")
 #define __CUDA__
 #define __OPENMP_NVPTX__
 
@@ -31,9 +29,10 @@ extern "C" {
 #include <__clang_cuda_device_functions.h>
 
 #undef __OPENMP_NVPTX__
-#undef __CUDA__
+#pragma pop_macro("__CUDA__")
 
 #pragma omp end declare variant
+#endif
 
 #ifdef __AMDGCN__
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 580b9872c6a1d..ec37c0df56c67 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -1068,6 +1068,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD,
 }
 
 std::string SemaCUDA::getConfigureFuncName() const {
+  if (getLangOpts().OffloadViaLLVM)
+    return "__llvmPushCallConfiguration";
+
   if (getLangOpts().HIP)
     return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration"
                                             : "hipConfigureCall";
diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu b/clang/test/CodeGenCUDA/offload_via_llvm.cu
new file mode 100644
index 0000000000000..3eb580850fc48
--- /dev/null
+++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu
@@ -0,0 +1,97 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang -Xclang -triple -Xclang "x86_64-unknown-linux-gnu" -S -c -foffload-via-llvm -emit-llvm -o - %s | FileCheck %s
+
+// Check that we generate LLVM/Offload calls, including the KERNEL_LAUNCH_PARAMS argument.
+
+// CHECK-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_(
+// CHECK-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16
+// CHECK-NEXT:    [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16
+// CHECK-NEXT:    [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8
+// CHECK-NEXT:    [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8
+// CHECK-NEXT:    [[SHMEM_SIZE:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[STREAM:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[GRID_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8
+// CHECK-NEXT:    [[BLOCK_DIM_COERCE:%.*]] = alloca { i64, i32 }, align 8
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
+// CHECK-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
+// CHECK-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0
+// CHECK-NEXT:    store i64 24, ptr [[TMP4]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1
+// CHECK-NEXT:    store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2
+// CHECK-NEXT:    store ptr null, ptr [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK-NEXT:    store i32 [[TMP7]], ptr [[TMP8]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[DOTADDR1]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK-NEXT:    store i16 [[TMP9]], ptr [[TMP10]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK-NEXT:    store ptr [[TMP11]], ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK-NEXT:    store ptr [[TMP13]], ptr [[TMP14]], align 16
+// CHECK-NEXT:    [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr [[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]])
+// CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[SHMEM_SIZE]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 8
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[GRID_DIM_COERCE]], ptr align 8 [[GRID_DIM]], i64 12, i1 false)
+// CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[GRID_DIM_COERCE]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 8
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[BLOCK_DIM_COERCE]], ptr align 8 [[BLOCK_DIM]], i64 12, i1 false)
+// CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP22]], align 8
+// CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[BLOCK_DIM_COERCE]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef @_Z18__device_stub__fooisPvS_, i64 [[TMP19]], i32 [[TMP21]], i64 [[TMP23]], i32 [[TMP25]], ptr noundef [[KERNEL_LAUNCH_PARAMS]], i64 noundef [[TMP16]], ptr noundef [[TMP17]])
+// CHECK-NEXT:    br label %[[SETUP_END:.*]]
+// CHECK:       [[SETUP_END]]:
+// CHECK-NEXT:    ret void
+//
+__global__ void foo(int, short, void *, void *) {}
+
+// CHECK-LABEL: define dso_local void @_Z5test1Pv(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4
+// CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4
+// CHECK-NEXT:    [[AGG_TMP_COERCE:%.*]] = alloca { i64, i32 }, align 4
+// CHECK-NEXT:    [[AGG_TMP1_COERCE:%.*]] = alloca { i64, i32 }, align 4
+// CHECK-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 3)
+// CHECK-NEXT:    call void @_ZN4dim3C2Ej(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 7)
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_COERCE]], ptr align 4 [[AGG_TMP]], i64 12, i1 false)
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP_COERCE]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP1_COERCE]], ptr align 4 [[AGG_TMP1]], i64 12, i1 false)
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds { i64, i32 }, ptr [[AGG_TMP1_COERCE]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @__llvmPushCallConfiguration(i64 [[TMP1]], i32 [[TMP3]], i64 [[TMP5]], i32 [[TMP7]], i64 noundef 0, ptr noundef null)
+// CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0
+// CHECK-NEXT:    br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label %[[KCALL_CONFIGOK:.*]]
+// CHECK:       [[KCALL_CONFIGOK]]:
+// CHECK-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 noundef signext 1, ptr noundef [[TMP8]], ptr noundef [[TMP9]]) #[[ATTR5:[0-9]+]]
+// CHECK-NEXT:    br label %[[KCALL_END]]
+// CHECK:       [[KCALL_END]]:
+// CHECK-NEXT:    ret void
+//
+void test1(void *Ptr) {
+  foo<<<3, 7>>>(13, 1, Ptr, Ptr);
+}
diff --git a/clang/test/Driver/cuda-via-liboffload.cu b/clang/test/Driver/cuda-via-liboffload.cu
new file mode 100644
index 0000000000000..68dc963e906b2
--- /dev/null
+++ b/clang/test/Driver/cuda-via-liboffload.cu
@@ -0,0 +1,23 @@
+// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
+// RUN:        --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS %s
+
+//      BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
+// RUN:        --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s
+
+// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]"
+// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]"
+
+// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s
+
+// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out"
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 5b22bbaac144f..4c1f7712249a3 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -102,8 +102,9 @@ struct KernelArgsTy {
       0; // Tripcount for the teams / distribute loop, 0 otherwise.
   struct {
     uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
-    uint64_t Unused : 63;
-  } Flags = {0, 0};
+    uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
+    uint64_t Unused : 62;
+  } Flags = {0, 0, 0};
   // The number of teams (for x,y,z dimension).
   uint32_t NumTeams[3] = {0, 0, 0};
    // The number of threads (for x,y,z dimension).
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 323dee41630f2..2b6445e9fbe55 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
 
 inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr,   nullptr,
 	     nullptr, nullptr, nullptr,   nullptr,
-	     0,      {0,0},       {1, 0, 0}, {1, 0, 0}, 0};
+	     0,      {0,0,0},       {1, 0, 0}, {1, 0, 0}, 0};
 
 struct DeviceTy;
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 94f9d4670b672..2608af016284b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -549,9 +549,16 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
   if (!KernelLaunchEnvOrErr)
     return KernelLaunchEnvOrErr.takeError();
 
-  KernelLaunchParamsTy LaunchParams =
-      prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
-                  Ptrs, *KernelLaunchEnvOrErr);
+  KernelLaunchParamsTy LaunchParams;
+
+  // Kernel languages don't use indirection.
+  if (KernelArgs.Flags.IsCUDA) {
+    LaunchParams = *reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs);
+  } else {
+    LaunchParams =
+        prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
+                    Args, Ptrs, *KernelLaunchEnvOrErr);
+  }
 
   uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit);
   uint64_t NumBlocks =
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index efa5cdab33ec9..b442df45deaa5 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_library(omptarget
   OpenMP/InteropAPI.cpp
   OpenMP/OMPT/Callback.cpp
 
+  KernelLanguage/API.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LIBOMPTARGET_INCLUDE_DIR}
diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp
new file mode 100644
index 0000000000000..9ffc199b5da7d
--- /dev/null
+++ b/offload/src/KernelLanguage/API.cpp
@@ -0,0 +1,76 @@
+//===------ API.cpp - Kernel Language (CUDA/HIP) entry points ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "Shared/APITypes.h"
+
+#include <cstdio>
+
+struct dim3 {
+  unsigned x = 0, y = 0, z = 0;
+};
+
+struct __omp_kernel_t {
+  dim3 __grid_size;
+  dim3 __block_size;
+  size_t __shared_memory;
+
+  void *__stream;
+};
+
+static __omp_kernel_t __current_kernel = {};
+#pragma omp threadprivate(__current_kernel);
+
+extern "C" {
+
+// TODO: There is little reason we need to keep these names or the way calls are
+// issued. For now we do to avoid modifying Clang's CUDA codegen. Unclear when
+// we actually need to push/pop configurations.
+unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 __block_size,
+                                     size_t __shared_memory, void *__stream) {
+  __omp_kernel_t &__kernel = __current_kernel;
+  __kernel.__grid_size = __grid_size;
+  __kernel.__block_size = __block_size;
+  __kernel.__shared_memory = __shared_memory;
+  __kernel.__stream = __stream;
+  return 0;
+}
+
+unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
+                                    size_t *__shared_memory, void *__stream) {
+   __omp_kernel_t &__kernel = __current_kernel;
+  *__grid_size = __kernel.__grid_size;
+  *__block_size = __kernel.__block_size;
+  *__shared_memory = __kernel.__shared_memory;
+  *((void **)__stream) = __kernel.__stream;
+  return 0;
+}
+
+int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
+                        int32_t ThreadLimit, const void *HostPtr,
+                        KernelArgsTy *Args);
+
+unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                          void *args, size_t sharedMem, void *stream) {
+  KernelArgsTy Args = {};
+  Args.DynCGroupMem = sharedMem;
+  Args.NumTeams[0] = gridDim.x;
+  Args.NumTeams[1] = gridDim.y;
+  Args.NumTeams[2] = gridDim.z;
+  Args.ThreadLimit[0] = blockDim.x;
+  Args.ThreadLimit[1] = blockDim.y;
+  Args.ThreadLimit[2] = blockDim.z;
+  Args.ArgPtrs = reinterpret_cast<void **>(args);
+  Args.Flags.IsCUDA = true;
+  int rv = __tgt_target_kernel(nullptr, 0, gridDim.x,
+                               blockDim.x, func, &Args);
+  return rv;
+}
+}
diff --git a/offload/src/exports b/offload/src/exports
index f95544ec8329c..7bdc7d2a531bb 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -71,6 +71,9 @@ VERS1.0 {
     __tgt_interop_use;
     __tgt_interop_destroy;
     ompt_libomptarget_connect;
+    __llvmPushCallConfiguration;
+    __llvmPopCallConfiguration;
+    llvmLaunchKernel;
   local:
     *;
 };
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 6c590603079c4..9053151e44a78 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -66,7 +66,7 @@ def evaluate_bool_env(env):
 config.name = 'libomptarget :: ' + config.libomptarget_current_target
 
 # suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.cc', '.f90']
+config.suffixes = ['.c', '.cpp', '.cc', '.f90', '.cu']
 
 # excludes: A list of directories to exclude from the testuites.
 config.excludes = ['Inputs']
diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu
new file mode 100644
index 0000000000000..2915a7c216ab5
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch.cu
@@ -0,0 +1,31 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *A) { *A = 42; }
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 7;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  square<<<1, 1>>>(Ptr);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
new file mode 100644
index 0000000000000..615cae6f7b233
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
@@ -0,0 +1,32 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *A) {
+  __scoped_atomic_fetch_add(A, 1, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 0;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
+  square<<<7, 6>>>(Ptr);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
new file mode 100644
index 0000000000000..f95f1dbacc79c
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -0,0 +1,41 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *Dst, short Q, int *Src, short P) {
+  *Dst = (Src[0] + Src[1]) * (Q + P);
+  Src[0] = Q;
+  Src[1] = P;
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  int *Src = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(8, DevNo));
+  *Ptr = 7;
+  Src[0] = -2;
+  Src[1] = 8;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  printf("Src: %i : %i\n", Src[0], Src[1]);
+  // CHECK: Src: -2 : 8
+  square<<<1, 1>>>(Ptr, 3, Src, 4);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  printf("Src: %i : %i\n", Src[0], Src[1]);
+  // CHECK: Src: 3 : 4
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/kernel_tu.cu.inc b/offload/test/offloading/CUDA/kernel_tu.cu.inc
new file mode 100644
index 0000000000000..d7d28a109dfc5
--- /dev/null
+++ b/offload/test/offloading/CUDA/kernel_tu.cu.inc
@@ -0,0 +1 @@
+__global__ void square(int *A) { *A = 42; }
diff --git a/offload/test/offloading/CUDA/launch_tu.cu b/offload/test/offloading/CUDA/launch_tu.cu
new file mode 100644
index 0000000000000..c45f40b115b56
--- /dev/null
+++ b/offload/test/offloading/CUDA/launch_tu.cu
@@ -0,0 +1,32 @@
+// clang-format off
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t.launch_tu.o -c
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native -x cuda %S/kernel_tu.cu.inc -o %t.kernel_tu.o -c
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %t.launch_tu.o %t.kernel_tu.o -o %t
+// RUN: %t | %fcheck-generic
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+extern __global__ void square(int *A);
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 7;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  square<<<1, 1>>>(Ptr);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}

>From 710445500e47cc1ac6b611d7583690e881135da9 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 7 Jun 2024 17:06:02 -0700
Subject: [PATCH 02/31] [Offload][CUDA] Add initial cuda_runtime.h overlay

This provides the header overlay for cuda_runtime.h which is found
before any CUDA installation (none is necessary). Some basic APIs are
defined in terms of the omp_target_* ones, but with the API redesign
the requirements of CUDA should be taken into account.

Based on: https://github.com/llvm/llvm-project/pull/94549
---
 clang/lib/Headers/CMakeLists.txt              |   1 +
 .../llvm_offload_wrappers/cuda_runtime.h      | 131 ++++++++++++++++++
 .../offloading/CUDA/basic_api_malloc_free.cu  |  41 ++++++
 .../test/offloading/CUDA/basic_api_memcpy.cu  |  46 ++++++
 .../test/offloading/CUDA/basic_api_memset.cu  |  43 ++++++
 5 files changed, 262 insertions(+)
 create mode 100644 clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
 create mode 100644 offload/test/offloading/CUDA/basic_api_malloc_free.cu
 create mode 100644 offload/test/offloading/CUDA/basic_api_memcpy.cu
 create mode 100644 offload/test/offloading/CUDA/basic_api_memset.cu

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 9e0eb0f4cde89..251e5b0ba2381 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -329,6 +329,7 @@ set(llvm_offload_wrapper_files
   llvm_offload_wrappers/__llvm_offload.h
   llvm_offload_wrappers/__llvm_offload_host.h
   llvm_offload_wrappers/__llvm_offload_device.h
+  llvm_offload_wrappers/cuda_runtime.h
 )
 
 set(llvm_libc_wrapper_files
diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
new file mode 100644
index 0000000000000..8718e462a82d3
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
@@ -0,0 +1,131 @@
+/*===- cuda_runtime.h - LLVM/Offload wrappers for CUDA runtime API ---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_RUNTIME_API__
+#define __CUDA_RUNTIME_API__
+
+#include <cstddef>
+#include <optional>
+
+extern "C" {
+int omp_get_initial_device(void);
+void omp_target_free(void *Ptr, int Device);
+void *omp_target_alloc(size_t Size, int Device);
+int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
+                      size_t DstOffset, size_t SrcOffset, int DstDevice,
+                      int SrcDevice);
+void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
+}
+
+// TODO: There are many fields missing in this enumeration.
+typedef enum cudaError {
+  cudaSuccess = 0,
+  cudaErrorInvalidValue = 1,
+  cudaErrorMemoryAllocation = 2,
+  cudaErrorNoDevice = 100,
+  cudaErrorInvalidDevice = 101,
+  cudaErrorOTHER = -1,
+} cudaError_t;
+
+enum cudaMemcpyKind {
+  cudaMemcpyHostToHost = 0,
+  cudaMemcpyHostToDevice = 1,
+  cudaMemcpyDeviceToHost = 2,
+  cudaMemcpyDeviceToDevice = 3,
+  cudaMemcpyDefault = 4
+};
+
+typedef void *cudaStream_t;
+
+static thread_local cudaError_t __cudaomp_last_error = cudaSuccess;
+
+// Returns the last error that has been produced and resets it to cudaSuccess.
+inline cudaError_t cudaGetLastError() {
+  cudaError_t TempError = __cudaomp_last_error;
+  __cudaomp_last_error = cudaSuccess;
+  return TempError;
+}
+
+// Returns the last error that has been produced without resetting it.
+inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; }
+
+inline cudaError_t __cudaMalloc(void **devPtr, size_t size) {
+  int DeviceNum = 0;
+  *devPtr = omp_target_alloc(size, DeviceNum);
+  if (*devPtr == NULL)
+    return __cudaomp_last_error = cudaErrorMemoryAllocation;
+
+  return __cudaomp_last_error = cudaSuccess;
+}
+
+template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) {
+  return __cudaMalloc((void **)devPtr, size);
+}
+
+inline cudaError_t __cudaFree(void *devPtr) {
+  int DeviceNum = 0;
+  omp_target_free(devPtr, DeviceNum);
+  return __cudaomp_last_error = cudaSuccess;
+}
+
+template <class T> inline cudaError_t cudaFree(T *ptr) {
+  return __cudaFree((void *)ptr);
+}
+
+inline cudaError_t __cudaMemcpy(void *dst, const void *src, size_t count,
+                                cudaMemcpyKind kind) {
+  // get the host device number (which is the initial device)
+  int HostDeviceNum = omp_get_initial_device();
+
+  // use the default device for gpu
+  int GPUDeviceNum = 0;
+
+  // default to copy from host to device
+  int DstDeviceNum = GPUDeviceNum;
+  int SrcDeviceNum = HostDeviceNum;
+
+  if (kind == cudaMemcpyDeviceToHost)
+    std::swap(DstDeviceNum, SrcDeviceNum);
+
+  // omp_target_memcpy returns 0 on success and non-zero on failure
+  if (omp_target_memcpy(dst, src, count, 0, 0, DstDeviceNum, SrcDeviceNum))
+    return __cudaomp_last_error = cudaErrorInvalidValue;
+  return __cudaomp_last_error = cudaSuccess;
+}
+
+template <class T>
+inline cudaError_t cudaMemcpy(T *dst, const T *src, size_t count,
+                              cudaMemcpyKind kind) {
+  return __cudaMemcpy((void *)dst, (const void *)src, count, kind);
+}
+
+inline cudaError_t __cudaMemset(void *devPtr, int value, size_t count,
+                                cudaStream_t stream = 0) {
+  int DeviceNum = 0;
+  if (!omp_target_memset(devPtr, value, count, DeviceNum))
+    return __cudaomp_last_error = cudaErrorInvalidValue;
+  return __cudaomp_last_error = cudaSuccess;
+}
+
+template <class T>
+inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) {
+  return __cudaMemset((void *)devPtr, value, count);
+}
+
+inline cudaError_t cudaDeviceSynchronize() {
+  // TODO: not implemented, not async yet.
+  return __cudaomp_last_error = cudaSuccess;
+}
+
+inline cudaError_t cudaDeviceReset(void) {
+  // TODO: not implemented.
+  return __cudaomp_last_error = cudaSuccess;
+}
+
+#endif
diff --git a/offload/test/offloading/CUDA/basic_api_malloc_free.cu b/offload/test/offloading/CUDA/basic_api_malloc_free.cu
new file mode 100644
index 0000000000000..86dc5f8fef68e
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_api_malloc_free.cu
@@ -0,0 +1,41 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void kernel(int *A, int *DevPtr, int N) {
+  for (int i = 0; i < N; ++i)
+    DevPtr[i] = 1;
+  for (int i = 0; i < N; ++i)
+    *A += DevPtr[i];
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  int *DevPtr;
+  auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
+  if (Err != cudaSuccess)
+    return -1;
+  *Ptr = 0;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
+  kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  Err = cudaFree(DevPtr);
+  if (Err != cudaSuccess)
+    return -1;
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_api_memcpy.cu b/offload/test/offloading/CUDA/basic_api_memcpy.cu
new file mode 100644
index 0000000000000..d5c0929abe43c
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_api_memcpy.cu
@@ -0,0 +1,46 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+__global__ void kernel(int *DevPtr, int N) {
+  for (int i = 0; i < N; ++i)
+    DevPtr[i]--;
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int Res = 0;
+  int *DevPtr;
+  auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
+  if (Err != cudaSuccess)
+    return -1;
+  int HstPtr[42];
+  for (int i = 0; i < 42; ++i) {
+    HstPtr[i] = 2;
+  }
+  Err = cudaMemcpy(DevPtr, HstPtr, 42 * sizeof(int), cudaMemcpyHostToDevice);
+  if (Err != cudaSuccess)
+    return -1;
+  printf("Res: %i\n", Res);
+  // CHECK: Res: 0
+  kernel<<<1, 1>>>(DevPtr, 42);
+  Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost);
+  if (Err != cudaSuccess)
+    return -1;
+  for (int i = 0; i < 42; ++i) {
+    printf("%i : %i\n", i, HstPtr[i]);
+    Res += HstPtr[i];
+  }
+  printf("Res: %i\n", Res);
+  // CHECK: Res: 42
+  Err = cudaFree(DevPtr);
+  if (Err != cudaSuccess)
+    return -1;
+}
diff --git a/offload/test/offloading/CUDA/basic_api_memset.cu b/offload/test/offloading/CUDA/basic_api_memset.cu
new file mode 100644
index 0000000000000..36dcc729a3a9e
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_api_memset.cu
@@ -0,0 +1,43 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void kernel(int *A, int *DevPtr, int N) {
+  for (int i = 0; i < N; ++i)
+    *A += DevPtr[i];
+  *A *= -1;
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  int *DevPtr;
+  auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
+  if (Err != cudaSuccess)
+    return -1;
+  Err = cudaMemset(DevPtr, -1, 42 * sizeof(int));
+  if (Err != cudaSuccess)
+    return -1;
+  *Ptr = 0;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
+  kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  Err = cudaFree(DevPtr);
+  if (Err != cudaSuccess)
+    return -1;
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}

>From 4340516b6a4b0af012c399c8ae5fa80065e96f2e Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 12 Jun 2024 06:37:09 -0700
Subject: [PATCH 03/31] [Offload] Introduce the concept of "default streams"

The offload APIs, and the CUDA wrappers in clang, now support "default
streams" per thread (and per device). It should be per context but we
don't really expose that concept yet. The KernelArguments allow an
LLVM/Offload user to provide an "AsyncInfoQueue", which is
plugin-dependent and can later also be created outside or queried from
the runtime. User-managed "queues" are kept persistent and are thus not
returned to the pool once synchronized.

The CUDA tests will synchronize via `cudaDeviceSynchronize` before
checking the results.

Based on: https://github.com/llvm/llvm-project/pull/94821
---
 .../llvm_offload_wrappers/cuda_runtime.h      | 16 ++++--
 .../llvm/Frontend/OpenMP/OMPConstants.h       |  2 +-
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |  2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  4 +-
 offload/include/Shared/APITypes.h             |  9 +++-
 offload/include/omptarget.h                   | 18 ++++++-
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 21 +++++---
 .../common/src/PluginInterface.cpp            |  2 +
 offload/plugins-nextgen/cuda/src/rtl.cpp      | 16 ++++--
 offload/src/KernelLanguage/API.cpp            | 14 +++++-
 offload/src/exports                           |  2 +
 offload/src/interface.cpp                     | 49 ++++++++++++++++++-
 offload/src/omptarget.cpp                     |  2 +-
 .../offloading/CUDA/basic_api_malloc_free.cu  |  1 +
 .../test/offloading/CUDA/basic_api_memcpy.cu  |  1 +
 .../test/offloading/CUDA/basic_api_memset.cu  |  1 +
 offload/test/offloading/CUDA/basic_launch.cu  |  3 +-
 .../CUDA/basic_launch_blocks_and_threads.cu   |  2 +
 .../offloading/CUDA/basic_launch_multi_arg.cu |  2 +
 19 files changed, 140 insertions(+), 27 deletions(-)

diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
index 8718e462a82d3..2d698e1c14e49 100644
--- a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
+++ b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
@@ -11,6 +11,7 @@
 #define __CUDA_RUNTIME_API__
 
 #include <cstddef>
+#include <cstdint>
 #include <optional>
 
 extern "C" {
@@ -21,6 +22,8 @@ int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                       size_t DstOffset, size_t SrcOffset, int DstDevice,
                       int SrcDevice);
 void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
+int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum,
+                                              void *AsyncInfoQueue);
 }
 
 // TODO: There are many fields missing in this enumeration.
@@ -55,6 +58,13 @@ inline cudaError_t cudaGetLastError() {
 // Returns the last error that has been produced without reseting it.
 inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; }
 
+inline cudaError_t cudaDeviceSynchronize() {
+  int DeviceNum = 0;
+  return __cudaomp_last_error =
+             (cudaError_t)__tgt_target_synchronize_async_info_queue(
+                 /*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr);
+}
+
 inline cudaError_t __cudaMalloc(void **devPtr, size_t size) {
   int DeviceNum = 0;
   *devPtr = omp_target_alloc(size, DeviceNum);
@@ -118,12 +128,8 @@ inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) {
   return __cudaMemset((void *)devPtr, value, count);
 }
 
-inline cudaError_t cudaDeviceSynchronize() {
-  // TODO: not implemented, not async yet.
-  return __cudaomp_last_error = cudaSuccess;
-}
-
 inline cudaError_t cudaDeviceReset(void) {
+  cudaDeviceSynchronize();
   // TODO: not implemented.
   return __cudaomp_last_error = cudaSuccess;
 }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 338b56226f204..a7be3f51fac7d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -72,7 +72,7 @@ enum class IdentFlag {
 #include "llvm/Frontend/OpenMP/OMPKinds.def"
 
 // Version of the kernel argument format used by the omp runtime.
-#define OMP_KERNEL_ARG_VERSION 3
+#define OMP_KERNEL_ARG_VERSION 4
 
 // Minimum version of the compiler that generates a kernel dynamic pointer.
 #define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index fe09bb8177c28..0be3827185e2e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3)
 __OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr)
 __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr,
 		  VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr,
-		  Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32)
+		  Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr)
 __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
 __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
 __OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 92213e19c9d9d..ba73d48bc9a8b 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -496,6 +496,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
   auto Int32Ty = Type::getInt32Ty(Builder.getContext());
   Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
   Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
+  Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy());
 
   Value *NumTeams3D =
       Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
@@ -514,7 +515,8 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                 Flags,
                 NumTeams3D,
                 NumThreads3D,
-                KernelArgs.DynCGGroupMem};
+                KernelArgs.DynCGGroupMem,
+                AsyncInfoQueue};
 }
 
 void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 4c1f7712249a3..f96b2f9ca259d 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -85,6 +85,9 @@ struct __tgt_async_info {
   /// ensure it is a valid location while the transfer to the device is
   /// happening.
   KernelLaunchEnvironmentTy KernelLaunchEnvironment;
+
+  /// Flag to indicate the Queue should be persistent.
+  bool PersistentQueue = false;
 };
 
 /// This struct contains all of the arguments to a target kernel region launch.
@@ -110,12 +113,16 @@ struct KernelArgsTy {
    // The number of threads (for x,y,z dimension).
   uint32_t ThreadLimit[3] = {0, 0, 0};
   uint32_t DynCGroupMem = 0;     // Amount of dynamic cgroup memory requested.
+  // A __tgt_async_info queue pointer to be used for the kernel and all
+  // associated device interactions. The operations are implicitly made
+  // non-blocking.
+  void *AsyncInfoQueue = nullptr;
 };
 static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
               "Invalid struct size");
 static_assert(sizeof(KernelArgsTy) ==
                   (8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
-                   4 * sizeof(void **) + 2 * sizeof(int64_t *)),
+                   5 * sizeof(void **) + 2 * sizeof(int64_t *)),
               "Invalid struct size");
 
 /// Flat array of kernel launch parameters and their total size.
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 2b6445e9fbe55..8730879905984 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -136,8 +136,19 @@ class AsyncInfoTy {
   /// Synchronization method to be used.
   SyncTy SyncType;
 
-  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
+  AsyncInfoTy(DeviceTy &Device,  SyncTy SyncType = SyncTy::BLOCKING) 
       : Device(Device), SyncType(SyncType) {}
+  AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue)
+      : Device(Device), SyncType(AsyncInfoQueue ? SyncTy::NON_BLOCKING : SyncTy::BLOCKING) {
+    AsyncInfo.Queue = AsyncInfoQueue;
+    AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
+  }
+  AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType)
+      : Device(Device), SyncType(SyncType) {
+    AsyncInfo.Queue = AsyncInfoQueue;
+    AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
+  }
+
   ~AsyncInfoTy() { synchronize(); }
 
   /// Implicit conversion to the __tgt_async_info which is used in the
@@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy {
   void **TaskAsyncInfoPtr = nullptr;
 
 public:
-  TaskAsyncInfoWrapperTy(DeviceTy &Device)
+  TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue=  nullptr) 
       : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
+    assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!");
     // If we failed to acquired the current global thread id, we cannot
     // re-enqueue the current task. Thus we should use the local blocking async
     // info.
@@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                  void *VAddr, bool IsRecord, bool SaveOutput,
                                  uint64_t &ReqPtrArgOffset);
 
+void *__tgt_target_get_default_queue(void *Loc, int64_t DeviceId);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e678213df18ce..b37c3e23d44ab 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2208,8 +2208,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       return Err;
 
     // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     return AMDGPUStreamManager.returnResource(Stream);
   }
@@ -2228,9 +2231,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (!(*CompletedOrErr))
       return Plugin::success();
 
-    // Once the stream is completed, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // Once the stream is synchronized, return it to stream pool and reset
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     return AMDGPUStreamManager.returnResource(Stream);
   }
@@ -2443,7 +2449,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    // TODO: Implement this function.
+    AMDGPUStreamTy *Stream;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
     return Plugin::success();
   }
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 2608af016284b..ff0f6edfcd693 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1435,8 +1435,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
 
 Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
   assert(AsyncInfoPtr && "Invalid async info");
+  assert(!(*AsyncInfoPtr) && "Already initialized async info");
 
   *AsyncInfoPtr = new __tgt_async_info();
+  (*AsyncInfoPtr)->PersistentQueue = true;
 
   AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);
 
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b6465d61bd033..bfbc101529e18 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
     }
 
     // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     if (auto Err = CUDAStreamManager.returnResource(Stream))
       return Err;
@@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_ERROR_NOT_READY)
       return Plugin::success();
 
-    // Once the stream is synchronized and the operations completed (or an error
-    // occurs), return it to stream pool and reset AsyncInfo. This is to make
-    // sure the synchronization only works for its own tasks.
+    // Once the stream is synchronized, return it to stream pool and reset
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     if (auto Err = CUDAStreamManager.returnResource(Stream))
       return Err;
diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp
index 9ffc199b5da7d..779751deed661 100644
--- a/offload/src/KernelLanguage/API.cpp
+++ b/offload/src/KernelLanguage/API.cpp
@@ -8,9 +8,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
 
 #include "Shared/APITypes.h"
 
+#include <cstdint>
 #include <cstdio>
 
 struct dim3 {
@@ -56,10 +58,13 @@ unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
 int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
                         int32_t ThreadLimit, const void *HostPtr,
                         KernelArgsTy *Args);
+void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId);
 
 unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                           void *args, size_t sharedMem, void *stream) {
+  int64_t DeviceNo = 0;
   KernelArgsTy Args = {};
+  Args.Version = OMP_KERNEL_ARG_VERSION;
   Args.DynCGroupMem = sharedMem;
   Args.NumTeams[0] = gridDim.x;
   Args.NumTeams[1] = gridDim.y;
@@ -69,8 +74,13 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
   Args.ThreadLimit[2] = blockDim.z;
   Args.ArgPtrs = reinterpret_cast<void **>(args);
   Args.Flags.IsCUDA = true;
-  int rv = __tgt_target_kernel(nullptr, 0, gridDim.x,
-                               blockDim.x, func, &Args);
+  if (stream)
+    Args.AsyncInfoQueue = stream;
+  else
+    Args.AsyncInfoQueue =
+        __tgt_target_get_default_async_info_queue(nullptr, DeviceNo);
+  int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func,
+                               &Args);
   return rv;
 }
 }
diff --git a/offload/src/exports b/offload/src/exports
index 7bdc7d2a531bb..11830f62af388 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -29,6 +29,8 @@ VERS1.0 {
     __tgt_target_kernel;
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
+    __tgt_target_get_default_async_info_queue;
+    __tgt_target_synchronize_async_info_queue;
     __tgt_target_kernel_replay;
     __tgt_activate_record_replay;
     __tgt_mapper_num_components;
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
index 763b051cc6d77..759fb54a11262 100644
--- a/offload/src/interface.cpp
+++ b/offload/src/interface.cpp
@@ -14,6 +14,8 @@
 #include "OpenMP/OMPT/Interface.h"
 #include "OpenMP/OMPT/Callback.h"
 #include "PluginManager.h"
+#include "Shared/APITypes.h"
+#include "omptarget.h"
 #include "private.h"
 
 #include "Shared/EnvironmentVar.h"
@@ -312,7 +314,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
 
-  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
+  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr, KernelArgs->AsyncInfoQueue);
   AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
   /// RAII to establish tool anchors before and after target region
   OMPT_IF_BUILT(InterfaceRAII TargetRAII(
@@ -510,3 +512,48 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
   delete AsyncInfo;
   *AsyncHandle = nullptr;
 }
+
+EXTERN void *__tgt_target_get_default_async_info_queue(void *Loc,
+                                                       int64_t DeviceId) {
+  assert(PM && "Runtime not initialized");
+
+  static thread_local void **AsyncInfoQueue = nullptr;
+
+  if (!AsyncInfoQueue)
+    AsyncInfoQueue = reinterpret_cast<void **>(
+        calloc(PM->getNumDevices(), sizeof(AsyncInfoQueue[0])));
+
+  if (!AsyncInfoQueue[DeviceId]) {
+    auto DeviceOrErr = PM->getDevice(DeviceId);
+    if (!DeviceOrErr)
+      FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+    __tgt_async_info *AsyncInfo = nullptr;
+    DeviceOrErr->RTL->init_async_info(DeviceId, &AsyncInfo);
+    AsyncInfoQueue[DeviceId] = AsyncInfo->Queue;
+  }
+
+  return AsyncInfoQueue[DeviceId];
+}
+
+EXTERN int __tgt_target_synchronize_async_info_queue(void *Loc,
+                                                     int64_t DeviceId,
+                                                     void *AsyncInfoQueue) {
+  assert(PM && "Runtime not initialized");
+
+  auto DeviceOrErr = PM->getDevice(DeviceId);
+  if (!DeviceOrErr)
+    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+  if (!AsyncInfoQueue)
+    AsyncInfoQueue = __tgt_target_get_default_async_info_queue(Loc, DeviceId);
+  AsyncInfoTy AsyncInfo(*DeviceOrErr, AsyncInfoQueue,
+                        AsyncInfoTy::SyncTy::BLOCKING);
+
+  if (AsyncInfo.synchronize())
+    FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
+  [[maybe_unused]] __tgt_async_info *ASI = AsyncInfo;
+  assert(ASI->Queue);
+  assert(ASI->Queue && ASI->PersistentQueue);
+
+  return 0;
+}
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 9bca8529c5ee3..73e26c0def6da 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -49,7 +49,7 @@ int AsyncInfoTy::synchronize() {
     case SyncTy::BLOCKING:
       // If we have a queue we need to synchronize it now.
       Result = Device.synchronize(*this);
-      assert(AsyncInfo.Queue == nullptr &&
+      assert((AsyncInfo.PersistentQueue || !AsyncInfo.Queue) &&
              "The device plugin should have nulled the queue to indicate there "
              "are no outstanding actions!");
       break;
diff --git a/offload/test/offloading/CUDA/basic_api_malloc_free.cu b/offload/test/offloading/CUDA/basic_api_malloc_free.cu
index 86dc5f8fef68e..60a51e33a5af9 100644
--- a/offload/test/offloading/CUDA/basic_api_malloc_free.cu
+++ b/offload/test/offloading/CUDA/basic_api_malloc_free.cu
@@ -32,6 +32,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
   kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   Err = cudaFree(DevPtr);
diff --git a/offload/test/offloading/CUDA/basic_api_memcpy.cu b/offload/test/offloading/CUDA/basic_api_memcpy.cu
index d5c0929abe43c..088e20ffa9e2b 100644
--- a/offload/test/offloading/CUDA/basic_api_memcpy.cu
+++ b/offload/test/offloading/CUDA/basic_api_memcpy.cu
@@ -31,6 +31,7 @@ int main(int argc, char **argv) {
   printf("Res: %i\n", Res);
   // CHECK: Res: 0
   kernel<<<1, 1>>>(DevPtr, 42);
+  cudaDeviceSynchronize();
   Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost);
   if (Err != cudaSuccess)
     return -1;
diff --git a/offload/test/offloading/CUDA/basic_api_memset.cu b/offload/test/offloading/CUDA/basic_api_memset.cu
index 36dcc729a3a9e..474eb2a46f0a2 100644
--- a/offload/test/offloading/CUDA/basic_api_memset.cu
+++ b/offload/test/offloading/CUDA/basic_api_memset.cu
@@ -34,6 +34,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
   kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   Err = cudaFree(DevPtr);
diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu
index 2915a7c216ab5..298aa7db83bad 100644
--- a/offload/test/offloading/CUDA/basic_launch.cu
+++ b/offload/test/offloading/CUDA/basic_launch.cu
@@ -8,7 +8,7 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 
-
+#include <cuda_runtime.h>
 #include <stdio.h>
 
 extern "C" {
@@ -25,6 +25,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
   square<<<1, 1>>>(Ptr);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   llvm_omp_target_free_shared(Ptr, DevNo);
diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
index 615cae6f7b233..c47b1a1b83bde 100644
--- a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
+++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
@@ -8,6 +8,7 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 
+#include <cuda_runtime.h>
 #include <stdio.h>
 
 extern "C" {
@@ -26,6 +27,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
   square<<<7, 6>>>(Ptr);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   llvm_omp_target_free_shared(Ptr, DevNo);
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index f95f1dbacc79c..58ff89dcd4aac 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -8,6 +8,7 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 
+#include <cuda_runtime.h>
 #include <stdio.h>
 
 extern "C" {
@@ -33,6 +34,7 @@ int main(int argc, char **argv) {
   printf("Src: %i : %i\n", Src[0], Src[1]);
   // CHECK: Src: -2 : 8
   square<<<1, 1>>>(Ptr, 3, Src, 4);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   printf("Src: %i : %i\n", Src[0], Src[1]);

>From 930970a9eaea23a15fe16336d4e6406f6df93033 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Sat, 15 Jun 2024 16:16:33 -0700
Subject: [PATCH 04/31] [WIP] Playing with ideas

---
 .../llvm/Transforms/Instrumentation/GPUSan.h  |  26 ++
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  10 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 .../Transforms/Instrumentation/CMakeLists.txt |   1 +
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 237 ++++++++++++++++++
 offload/DeviceRTL/CMakeLists.txt              |   1 +
 offload/DeviceRTL/src/AllocationTracker.cpp   | 134 ++++++++++
 8 files changed, 411 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/Instrumentation/GPUSan.h
 create mode 100644 llvm/lib/Transforms/Instrumentation/GPUSan.cpp
 create mode 100644 offload/DeviceRTL/src/AllocationTracker.cpp

diff --git a/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h b/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h
new file mode 100644
index 0000000000000..1201b3c47ac06
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Instrumentation/GPUSan.h
@@ -0,0 +1,26 @@
+//===- Transforms/Instrumentation/GPUSan.h ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides the interface for LLVM's GPU sanitizer (GPUSan)
+/// instrumentation pass.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class GPUSanPass : public PassInfoMixin<GPUSanPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_INSTRUMENTATION_GPUSAN_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 19e8a8ab68a73..42fdb2415c2fe 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -174,6 +174,7 @@
 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
 #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
+#include "llvm/Transforms/Instrumentation/GPUSan.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 926515c9508a9..37dcf59a0eccf 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -71,6 +71,7 @@
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Instrumentation/CGProfile.h"
 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
+#include "llvm/Transforms/Instrumentation/GPUSan.h"
 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
@@ -163,6 +164,9 @@ static cl::opt<bool>
                             cl::Hidden,
                             cl::desc("Enable inline deferral during PGO"));
 
+static cl::opt<bool> EnableGPUSan("enable-gpu-san", cl::init(false), cl::Hidden,
+                                  cl::desc("Enable gpu san"));
+
 static cl::opt<bool> EnableModuleInliner("enable-module-inliner",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Enable module inliner"));
@@ -1103,6 +1107,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
           PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
   }
 
+  if (EnableGPUSan)
+    MPM.addPass(GPUSanPass());
+
   // Try to perform OpenMP specific optimizations on the module. This is a
   // (quick!) no-op if there are no OpenMP runtime calls present in the module.
   MPM.addPass(OpenMPOptPass());
@@ -1888,6 +1895,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   // Optimize globals again after we ran the inliner.
   MPM.addPass(GlobalOptPass());
 
+  if (EnableGPUSan)
+    MPM.addPass(GPUSanPass());
+
   // Run the OpenMPOpt pass again after global optimizations.
   MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 60c517790bcab..741c1d3e1842a 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -78,6 +78,7 @@ MODULE_PASS("inliner-wrapper-no-mandatory-first",
 MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass())
 MODULE_PASS("instrorderfile", InstrOrderFilePass())
 MODULE_PASS("instrprof", InstrProfilingLoweringPass())
+MODULE_PASS("gpusan", GPUSanPass())
 MODULE_PASS("ctx-instr-lower", PGOCtxProfLoweringPass())
 MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass())
 MODULE_PASS("iroutliner", IROutlinerPass())
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 8d345d394b51a..9aa530229ab7e 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMInstrumentation
   DataFlowSanitizer.cpp
   GCOVProfiling.cpp
   BlockCoverageInference.cpp
+  GPUSan.cpp
   MemProfiler.cpp
   MemorySanitizer.cpp
   IndirectCallPromotion.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
new file mode 100644
index 0000000000000..5302cb14ed22c
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -0,0 +1,237 @@
+//===-- GPUSan.cpp - GPU sanitizer ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/GPUSan.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gpusan"
+
+// Command-line switch for tag-based checking.
+// NOTE(review): the description presumably refers to use-after-free
+// detection (a stale pointer into a recycled allocation slot) -- confirm.
+// Nothing in this file reads the option yet.
+cl::opt<bool> UseTags(
+    "gpusan-use-tags",
+    cl::desc("Use tags to detect use-after-free if the number of allocations "
+             "is large"),
+    cl::init(false));
+
+namespace {
+/// Module-level driver for the GPU sanitizer instrumentation.
+///
+/// Holds the module, a few commonly used types, and lazily created
+/// declarations of the `ompx_*` runtime functions the instrumentation calls.
+class GPUSanImpl final {
+public:
+  explicit GPUSanImpl(Module &M) : M(M), Ctx(M.getContext()) {}
+
+  /// Instrument the module; returns true if the IR was changed.
+  bool instrument();
+
+private:
+  bool instrumentGlobals();
+  bool instrumentFunction(Function &Fn);
+  void instrumentAllocation(Instruction &I, Value &Size);
+  void instrumentAllocaInst(AllocaInst &AI);
+  void instrumentAccess(Instruction &I, int PtrIdx, Type &AccessTy);
+  void instrumentLoadInst(LoadInst &LI);
+  void instrumentStoreInst(StoreInst &SI);
+  void instrumentGEPInst(GetElementPtrInst &GEP);
+  bool instrumentCallInst(CallInst &CI);
+
+  /// Fill \p FC with a declaration of \p Name (returning \p RetTy, taking
+  /// \p ArgTys, not variadic) unless \p FC has already been set.
+  void getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy,
+                     ArrayRef<Type *> ArgTys) {
+    if (!FC) {
+      auto *FnTy = FunctionType::get(RetTy, ArgTys, false);
+      FC = M.getOrInsertFunction(Name, FnTy);
+    }
+  }
+
+  /// `ptr ompx_new_allocation(ptr, i64)`
+  FunctionCallee getNewAllocationFn() {
+    getOrCreateFn(NewAllocationFn, "ompx_new_allocation", PtrTy,
+                  {PtrTy, Int64Ty});
+    return NewAllocationFn;
+  }
+  /// `ptr ompx_check_access(ptr, i64)`
+  FunctionCallee getAccessFn() {
+    getOrCreateFn(AccessFn, "ompx_check_access", PtrTy, {PtrTy, Int64Ty});
+    return AccessFn;
+  }
+  /// `ptr ompx_gep(ptr, i64)`
+  FunctionCallee getGEPFn() {
+    getOrCreateFn(GEPFn, "ompx_gep", PtrTy, {PtrTy, Int64Ty});
+    return GEPFn;
+  }
+  /// `ptr ompx_unpack(ptr)`
+  FunctionCallee getUnpackFn() {
+    getOrCreateFn(UnpackFn, "ompx_unpack", PtrTy, {PtrTy});
+    return UnpackFn;
+  }
+
+  Module &M;
+  LLVMContext &Ctx;
+
+  // Frequently used types. These initializers read M and Ctx, which are
+  // declared (and therefore initialized) before them.
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *IntptrTy = M.getDataLayout().getIntPtrType(Ctx);
+  PointerType *PtrTy = PointerType::getUnqual(Ctx);
+  Type *Int8Ty = Type::getInt8Ty(Ctx);
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+  Type *Int64Ty = Type::getInt64Ty(Ctx);
+
+  const DataLayout &DL = M.getDataLayout();
+
+  // Cached runtime function declarations; filled in on first use by the
+  // getters above.
+  FunctionCallee GEPFn;
+  FunctionCallee UnpackFn;
+  FunctionCallee AccessFn;
+  FunctionCallee NewAllocationFn;
+};
+
+} // end anonymous namespace
+
+/// Instrument global variables; returns true if the module was changed.
+///
+/// Global instrumentation is not implemented yet, so this is a no-op that
+/// always reports "unchanged".
+/// TODO: once globals are supported, create an "ompx.ctor"/"ompx.init" pair
+/// via getOrCreateSanitizerCtorAndInitFunctions and append the constructor to
+/// the global ctors list with appendToGlobalCtors.
+bool GPUSanImpl::instrumentGlobals() { return false; }
+
+/// Register the allocation produced by \p I (of \p Size bytes) with the
+/// runtime and redirect the users of \p I to the value the runtime returns.
+void GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size) {
+  // Everything is emitted directly behind the allocating instruction.
+  IRBuilder<> Builder(I.getNextNode());
+
+  // The first argument starts out as a placeholder: if the call already used
+  // `I`, the replacement below would rewrite the call's own argument as well.
+  Value *Placeholder = UndefValue::get(I.getType());
+  CallInst *RegisterCall = Builder.CreateCall(
+      getNewAllocationFn(), {Placeholder, &Size}, I.getName() + ".san");
+  Value *SanitizedPtr =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(RegisterCall, I.getType());
+
+  // Lifetime markers keep referring to the original allocation.
+  auto IsNotLifetimeUser = [](Use &U) {
+    return !isa<LifetimeIntrinsic>(U.getUser());
+  };
+  I.replaceUsesWithIf(SanitizedPtr, IsNotLifetimeUser);
+
+  // With the uses rewritten, hook up the real allocation.
+  RegisterCall->setArgOperand(0, &I);
+}
+
+/// Instrument a stack allocation by registering it, together with its size,
+/// through instrumentAllocation.
+///
+/// Only allocas with a known, fixed size are supported; anything else is
+/// reported as a fatal error.
+void GPUSanImpl::instrumentAllocaInst(AllocaInst &AI) {
+  auto SizeOrNone = AI.getAllocationSize(DL);
+  // llvm_unreachable may be compiled to a mere optimizer hint in builds
+  // without assertions, so an unsupported alloca is diagnosed explicitly.
+  if (!SizeOrNone || SizeOrNone->isScalable())
+    report_fatal_error(
+        "GPUSan: allocas of dynamic or scalable size are not supported yet");
+  Value *Size = ConstantInt::get(Int64Ty, SizeOrNone->getFixedValue());
+  instrumentAllocation(AI, *Size);
+}
+
+/// Route operand \p PtrIdx of the memory access \p I through the runtime
+/// access check, passing the store size of \p AccessTy as the access size.
+void GPUSanImpl::instrumentAccess(Instruction &I, int PtrIdx, Type &AccessTy) {
+  auto AccessSize = DL.getTypeStoreSize(&AccessTy);
+  assert(!AccessSize.isScalable());
+
+  Value *OrigPtr = I.getOperand(PtrIdx);
+  Value *SizeArg = ConstantInt::get(Int64Ty, AccessSize.getFixedValue());
+
+  // The check is emitted immediately in front of the access ...
+  IRBuilder<> Builder(&I);
+  CallInst *Check = Builder.CreateCall(getAccessFn(), {OrigPtr, SizeArg},
+                                       I.getName() + ".san");
+  // ... and the access then uses the pointer returned by the check.
+  Value *CheckedPtr =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(Check, OrigPtr->getType());
+  I.setOperand(PtrIdx, CheckedPtr);
+}
+
+/// Instrument the address operand of a load; the loaded type is the access
+/// type.
+void GPUSanImpl::instrumentLoadInst(LoadInst &LI) {
+  Type *LoadedTy = LI.getType();
+  instrumentAccess(LI, LoadInst::getPointerOperandIndex(), *LoadedTy);
+}
+
+/// Instrument the address operand of a store; the type of the stored value is
+/// the access type.
+void GPUSanImpl::instrumentStoreInst(StoreInst &SI) {
+  Type *StoredTy = SI.getValueOperand()->getType();
+  instrumentAccess(SI, StoreInst::getPointerOperandIndex(), *StoredTy);
+}
+
+/// Rewrite \p GEP so that the runtime computes the resulting pointer.
+///
+/// The GEP is rebased onto a null pointer, so converting its result to an
+/// integer yields just the offset. That offset and the original base pointer
+/// are passed to the runtime GEP function, whose result replaces all uses of
+/// the GEP.
+void GPUSanImpl::instrumentGEPInst(GetElementPtrInst &GEP) {
+  Value *PtrOp = GEP.getPointerOperand();
+  GEP.setOperand(GetElementPtrInst::getPointerOperandIndex(),
+                 Constant::getNullValue(PtrOp->getType()));
+  // With a null base, any non-zero offset makes an `inbounds` GEP produce
+  // poison. The flag must be dropped now that the GEP only computes an offset.
+  GEP.setIsInBounds(false);
+
+  IRBuilder<> IRB(GEP.getNextNode());
+  // The offset argument is a placeholder until the GEP's uses are replaced.
+  auto *CB = IRB.CreateCall(getGEPFn(), {PtrOp, UndefValue::get(Int64Ty)},
+                            GEP.getName() + ".san");
+  GEP.replaceAllUsesWith(CB);
+  // Created only after the replacement above, so this new use of the GEP is
+  // not redirected to the call. It is inserted right in front of the call.
+  Value *Offset =
+      new PtrToIntInst(&GEP, Int64Ty, GEP.getName() + ".san.offset", CB);
+  CB->setArgOperand(1, Offset);
+}
+
+/// Pass every pointer argument of \p CI through the runtime unpack function
+/// when the callee is a declaration whose name does not start with "ompx".
+/// Returns true if at least one argument was rewritten.
+bool GPUSanImpl::instrumentCallInst(CallInst &CI) {
+  Function *Callee = CI.getCalledFunction();
+  // Calls without a known callee function are left untouched.
+  if (!Callee)
+    return false;
+  // Defined functions and the "ompx" runtime itself are left untouched too.
+  if (!Callee->isDeclaration() || Callee->getName().starts_with("ompx"))
+    return false;
+
+  bool Changed = false;
+  // The unpack calls are emitted directly in front of the call.
+  IRBuilder<> Builder(&CI);
+  int NumArgs = CI.arg_size();
+  for (int ArgNo = 0; ArgNo < NumArgs; ++ArgNo) {
+    Value *Arg = CI.getArgOperand(ArgNo);
+    if (Arg->getType()->isPointerTy()) {
+      CallInst *Unpacked =
+          Builder.CreateCall(getUnpackFn(), {Arg}, Arg->getName() + ".unpack");
+      CI.setArgOperand(ArgNo, Unpacked);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+/// Instrument every alloca, load, store, GEP and call in \p Fn.
+///
+/// \returns true if the function was modified.
+bool GPUSanImpl::instrumentFunction(Function &Fn) {
+  bool Changed = false;
+
+  for (auto &I : instructions(Fn)) {
+    switch (I.getOpcode()) {
+    case Instruction::Alloca:
+      instrumentAllocaInst(cast<AllocaInst>(I));
+      Changed = true;
+      break;
+    case Instruction::Load:
+      instrumentLoadInst(cast<LoadInst>(I));
+      Changed = true;
+      break;
+    case Instruction::Store:
+      instrumentStoreInst(cast<StoreInst>(I));
+      Changed = true;
+      break;
+    case Instruction::GetElementPtr:
+      instrumentGEPInst(cast<GetElementPtrInst>(I));
+      Changed = true;
+      break;
+    case Instruction::Call:
+      // Accumulate rather than assign: a call that needs no rewriting must
+      // not reset the flag set by an earlier instruction.
+      Changed |= instrumentCallInst(cast<CallInst>(I));
+      break;
+    default:
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+/// Instrument the globals and then every function of the module; returns
+/// true if any of these steps changed the IR.
+bool GPUSanImpl::instrument() {
+  bool AnyChange = instrumentGlobals();
+  for (Function &F : M) {
+    if (instrumentFunction(F))
+      AnyChange = true;
+  }
+  return AnyChange;
+}
+
+/// Pass entry point: instrument \p M and report which analyses survive.
+///
+/// No analysis results are needed, so \p AM is not queried. All analyses are
+/// preserved when nothing changed, none otherwise.
+PreservedAnalyses GPUSanPass::run(Module &M, ModuleAnalysisManager &AM) {
+  GPUSanImpl Lowerer(M);
+  if (!Lowerer.instrument())
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index d88430a52b8b7..2027725abd40e 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -85,6 +85,7 @@ set(include_files
 
 set(src_files
   ${source_directory}/Allocator.cpp
+  ${source_directory}/AllocationTracker.cpp
   ${source_directory}/Configuration.cpp
   ${source_directory}/Debug.cpp
   ${source_directory}/Kernel.cpp
diff --git a/offload/DeviceRTL/src/AllocationTracker.cpp b/offload/DeviceRTL/src/AllocationTracker.cpp
new file mode 100644
index 0000000000000..c6a01b19fed43
--- /dev/null
+++ b/offload/DeviceRTL/src/AllocationTracker.cpp
@@ -0,0 +1,134 @@
+//===------ AllocationTracker.cpp - Track allocation for sanitizers -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "LibC.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace ompx;
+using namespace utils;
+
+#pragma omp begin declare target device_type(nohost)
+
+// Uncomment to reserve TAG_BITS bits for a tag.
+// #define USE_TAGS
+
+#ifdef USE_TAGS
+static constexpr uint32_t TAG_BITS = 8;
+#else
+static constexpr uint32_t TAG_BITS = 0;
+#endif
+
+// Integer type whose width determines the number of bits of an allocation id.
+#define _OBJECT_TY unsigned short
+static constexpr uint32_t OBJECT_BITS = sizeof(_OBJECT_TY) * 8;
+
+// Bit budget: 64 bits are split into tag + length on the one hand, and into
+// tag + object id + offset on the other.
+static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS;
+static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
+
+static_assert(LENGTH_BITS + TAG_BITS == 64,
+              "Length and tag bits should cover 64 bits");
+static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS == 64,
+              "Offset, tag, and object bits should cover 64 bits");
+
+/// Bookkeeping record for one tracked allocation: its start address, its
+/// length and, when tags are enabled, its tag.
+struct AllocationTy {
+  void *Start;
+  uint64_t Length : LENGTH_BITS;
+#ifdef USE_TAGS
+  uint64_t Tag : TAG_BITS;
+#endif
+
+  /// True if the range of \p Size starting at \p Ptr does not begin before
+  /// Start and does not end behind Start advanced by Length.
+  bool contains(void *Ptr, uint64_t Size) const {
+    if (Ptr < Start)
+      return false;
+    auto AccessEnd = advance(Ptr, Size);
+    auto AllocationEnd = advance(Start, Length);
+    return AccessEnd <= AllocationEnd;
+  }
+};
+static_assert(sizeof(AllocationTy) == sizeof(void *) * 2,
+              "AllocationTy should not exceed two pointers");
+
+// One slot per representable allocation id. An id is OBJECT_BITS wide, so
+// 2^OBJECT_BITS slots are needed; sizing the table with the largest id value
+// (~0) would leave exactly that id without a slot.
+static AllocationTy Allocations[1UL << OBJECT_BITS];
+// Id handed out by the next ompx_new_allocation call. It starts at 1, so the
+// counter does not hand out id 0 initially.
+unsigned short NumAllocs = 1;
+
+/// Pointer-sized value made of an offset, an optional tag, and an allocation
+/// id. The static_assert below pins its size to that of a `void *`.
+struct AllocationPtrTy {
+  // Conversions from a raw pointer / integer go through convertViaPun.
+  // NOTE(review): presumably a bit-preserving reinterpretation -- confirm
+  // against the helper's definition.
+  static AllocationPtrTy get(void *P) {
+    return convertViaPun<AllocationPtrTy>(P);
+  }
+  static AllocationPtrTy get(intptr_t V) {
+    return convertViaPun<AllocationPtrTy>(V);
+  }
+  // Implicit conversions back to a raw pointer / integer, again through
+  // convertViaPun.
+  operator void *() const { return convertViaPun<void *>(*this); }
+  operator intptr_t() const { return convertViaPun<intptr_t>(*this); }
+
+  // Offset that the runtime adds to the allocation's Start.
+  uint64_t PtrOffset : OFFSET_BITS;
+#ifdef USE_TAGS
+  // Compared against the allocation's Tag by the access check.
+  uint64_t AllocationTag : TAG_BITS;
+#endif
+  // Index into the Allocations table.
+  uint64_t AllocationId : OBJECT_BITS;
+};
+static_assert(sizeof(AllocationPtrTy) == sizeof(void *),
+              "AllocationTy pointers should be pointer sized");
+
+extern "C" {
+
+/// Record a new allocation of \p Length starting at \p Start and return the
+/// encoded pointer (offset 0, fresh allocation id) that refers to it.
+///
+/// Traps if \p Length does not fit into the Length bit-field or if no
+/// allocation id is left.
+/// NOTE(review): the id counter is incremented without synchronization;
+/// confirm whether concurrent callers are possible.
+[[gnu::flatten, gnu::always_inline]] void *
+ompx_new_allocation(void *Start, uint64_t Length) {
+  // The Length bit-field holds values below 2^LENGTH_BITS; anything larger
+  // would be silently truncated when stored.
+  if constexpr (LENGTH_BITS < 64)
+    if (Length >= (1UL << LENGTH_BITS))
+      __builtin_trap();
+  auto No = NumAllocs++;
+  // Id 0 only shows up after the counter wrapped around, and an id beyond the
+  // table would write out of bounds.
+  constexpr uint64_t NumSlots = sizeof(Allocations) / sizeof(Allocations[0]);
+  if (No == 0 || No >= NumSlots)
+    __builtin_trap();
+  AllocationTy &A = Allocations[No];
+  A.Start = Start;
+  A.Length = Length;
+  AllocationPtrTy AP;
+  AP.PtrOffset = 0;
+  AP.AllocationId = No;
+#ifdef USE_TAGS
+  A.Tag = 0;
+  AP.AllocationTag = A.Tag;
+#endif
+  return AP;
+}
+
+/// Reset the table slot referenced by the encoded pointer \p P to a
+/// value-initialized record.
+[[gnu::flatten, gnu::always_inline]] void ompx_free_allocation(void *P) {
+  const AllocationPtrTy Encoded = AllocationPtrTy::get(P);
+  AllocationTy &Slot = Allocations[Encoded.AllocationId];
+  Slot = AllocationTy();
+}
+
+/// Add \p Offset to the offset stored in the encoded pointer \p P and return
+/// the updated encoded pointer. No bounds check happens here.
+[[gnu::flatten, gnu::always_inline]] void *ompx_gep(void *P, uint64_t Offset) {
+  AllocationPtrTy Encoded = AllocationPtrTy::get(P);
+  Encoded.PtrOffset = Encoded.PtrOffset + Offset;
+  return Encoded;
+}
+
+/// Check an access of \p Size through the encoded pointer \p P and return the
+/// real address to access.
+///
+/// Prints a diagnostic and traps if the access is not contained in the
+/// allocation. With tags enabled it traps, without a message, if the tags of
+/// pointer and allocation differ.
+[[gnu::flatten, gnu::always_inline]] void *ompx_check_access(void *P,
+                                                             uint64_t Size) {
+  const AllocationPtrTy Encoded = AllocationPtrTy::get(P);
+  AllocationTy &Alloc = Allocations[Encoded.AllocationId];
+#ifdef USE_TAGS
+  if (Alloc.Tag != Encoded.AllocationTag)
+    __builtin_trap();
+#endif
+  uint64_t Offset = Encoded.PtrOffset;
+  void *RealPtr = advance(Alloc.Start, Offset);
+  // Common case first: the access is fine, hand out the real address.
+  if (Alloc.contains(RealPtr, Size))
+    return RealPtr;
+
+  printf("Out of bounds, access: %lu inside of %lu allocation @ %p\n", Offset,
+         Alloc.Length, Alloc.Start);
+  __builtin_trap();
+}
+
+/// Translate the encoded pointer \p P into the real address it refers to,
+/// without performing any check.
+[[gnu::flatten, gnu::always_inline]] void *ompx_unpack(void *P) {
+  const AllocationPtrTy Encoded = AllocationPtrTy::get(P);
+  AllocationTy &Alloc = Allocations[Encoded.AllocationId];
+  uint64_t Offset = Encoded.PtrOffset;
+  return advance(Alloc.Start, Offset);
+}
+}
+
+#pragma omp end declare target

>From 628bb35e9ecf7786754aef0d2265ed075124ee51 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 17 Jun 2024 19:09:14 -0700
Subject: [PATCH 05/31] Trying

---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  9 +-
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 58 ++++++++----
 offload/DeviceRTL/src/AllocationTracker.cpp   | 66 +++++++++++---
 offload/include/OpenMP/Mapping.h              |  4 +-
 offload/include/Shared/Environment.h          |  8 ++
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 50 +++++++++--
 .../common/include/PluginInterface.h          |  2 +
 .../common/src/PluginInterface.cpp            | 11 +++
 offload/src/OpenMP/Mapping.cpp                | 29 ++++--
 offload/src/exports                           |  1 +
 offload/src/omptarget.cpp                     | 10 ++-
 preload.cpp                                   | 88 +++++++++++++++++++
 12 files changed, 283 insertions(+), 53 deletions(-)
 create mode 100644 preload.cpp

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 37dcf59a0eccf..81bdfac3b0e79 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1107,9 +1107,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
           PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
   }
 
-  if (EnableGPUSan)
-    MPM.addPass(GPUSanPass());
-
   // Try to perform OpenMP specific optimizations on the module. This is a
   // (quick!) no-op if there are no OpenMP runtime calls present in the module.
   MPM.addPass(OpenMPOptPass());
@@ -1869,6 +1866,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
                                                 PTO.EagerlyInvalidateAnalyses));
 
+  if (EnableGPUSan)
+    MPM.addPass(GPUSanPass());
+
   // Note: historically, the PruneEH pass was run first to deduce nounwind and
   // generally clean up exception handling overhead. It isn't clear this is
   // valuable as the inliner doesn't currently care whether it is inlining an
@@ -1895,9 +1895,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   // Optimize globals again after we ran the inliner.
   MPM.addPass(GlobalOptPass());
 
-  if (EnableGPUSan)
-    MPM.addPass(GPUSanPass());
-
   // Run the OpenMPOpt pass again after global optimizations.
   MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
 
diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 5302cb14ed22c..97c8cd8e6705f 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -12,9 +12,11 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalObject.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
@@ -64,11 +66,12 @@ class GPUSanImpl final {
 
   FunctionCallee getNewAllocationFn() {
     getOrCreateFn(NewAllocationFn, "ompx_new_allocation", PtrTy,
-                  {PtrTy, Int64Ty});
+                  {PtrTy, Int64Ty, Int64Ty});
     return NewAllocationFn;
   }
   FunctionCallee getAccessFn() {
-    getOrCreateFn(AccessFn, "ompx_check_access", PtrTy, {PtrTy, Int64Ty});
+    getOrCreateFn(AccessFn, "ompx_check_access", PtrTy,
+                  {PtrTy, Int64Ty, Int64Ty});
     return AccessFn;
   }
   FunctionCallee getGEPFn() {
@@ -117,13 +120,16 @@ bool GPUSanImpl::instrumentGlobals() {
 
 void GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size) {
   IRBuilder<> IRB(I.getNextNode());
+  Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, PtrTy);
+  static int X = 1;
   auto *CB = IRB.CreateCall(getNewAllocationFn(),
-                            {UndefValue::get(I.getType()), &Size},
+                            {PlainI, &Size, ConstantInt::get(Int64Ty, X++)},
                             I.getName() + ".san");
-  I.replaceUsesWithIf(
-      IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()),
-      [](Use &U) { return !isa<LifetimeIntrinsic>(U.getUser()); });
-  CB->setArgOperand(0, &I);
+  I.replaceUsesWithIf(IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()),
+                      [=](Use &U) {
+                        return U.getUser() != PlainI && U.getUser() != CB &&
+                               !isa<LifetimeIntrinsic>(U.getUser());
+                      });
 }
 
 void GPUSanImpl::instrumentAllocaInst(AllocaInst &AI) {
@@ -140,7 +146,14 @@ void GPUSanImpl::instrumentAccess(Instruction &I, int PtrIdx, Type &AccessTy) {
   Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue());
   IRBuilder<> IRB(&I);
   Value *PtrOp = I.getOperand(PtrIdx);
-  auto *CB = IRB.CreateCall(getAccessFn(), {PtrOp, Size}, I.getName() + ".san");
+  auto *UO = getUnderlyingObject(PtrOp);
+  if (isa<GlobalObject>(UO))
+    return;
+  Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
+  static int X = 1;
+  auto *CB = IRB.CreateCall(getAccessFn(),
+                            {PlainPtrOp, Size, ConstantInt::get(Int64Ty, X++)},
+                            I.getName() + ".san");
   I.setOperand(PtrIdx,
                IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType()));
 }
@@ -156,13 +169,18 @@ void GPUSanImpl::instrumentStoreInst(StoreInst &SI) {
 
 void GPUSanImpl::instrumentGEPInst(GetElementPtrInst &GEP) {
   Value *PtrOp = GEP.getPointerOperand();
+  auto *UO = getUnderlyingObject(PtrOp);
+  if (isa<GlobalObject>(UO))
+    return;
   GEP.setOperand(GetElementPtrInst::getPointerOperandIndex(),
                  Constant::getNullValue(PtrOp->getType()));
 
   IRBuilder<> IRB(GEP.getNextNode());
-  auto *CB = IRB.CreateCall(getGEPFn(), {PtrOp, UndefValue::get(Int64Ty)},
+  Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
+  auto *CB = IRB.CreateCall(getGEPFn(), {PlainPtrOp, UndefValue::get(Int64Ty)},
                             GEP.getName() + ".san");
-  GEP.replaceAllUsesWith(CB);
+  GEP.replaceAllUsesWith(
+      IRB.CreatePointerBitCastOrAddrSpaceCast(CB, GEP.getType()));
   Value *Offset =
       new PtrToIntInst(&GEP, Int64Ty, GEP.getName() + ".san.offset", CB);
   CB->setArgOperand(1, Offset);
@@ -170,16 +188,24 @@ void GPUSanImpl::instrumentGEPInst(GetElementPtrInst &GEP) {
 
 bool GPUSanImpl::instrumentCallInst(CallInst &CI) {
   bool Changed = false;
+  if (isa<LifetimeIntrinsic>(CI))
+    return Changed;
   if (auto *Fn = CI.getCalledFunction()) {
-    if (Fn->isDeclaration() && !Fn->getName().starts_with("ompx")) {
+    if ((Fn->isDeclaration() || Fn->getName().starts_with("__kmpc")) &&
+        !Fn->getName().starts_with("ompx")) {
       IRBuilder<> IRB(&CI);
       for (int I = 0, E = CI.arg_size(); I != E; ++I) {
-        auto *Op = CI.getArgOperand(I);
+        Value *Op = CI.getArgOperand(I);
         if (!Op->getType()->isPointerTy())
           continue;
+        auto *UO = getUnderlyingObject(Op);
+        if (isa<GlobalObject>(UO))
+          continue;
+        Value *PlainOp = IRB.CreatePointerBitCastOrAddrSpaceCast(Op, PtrTy);
         auto *CB =
-            IRB.CreateCall(getUnpackFn(), {Op}, Op->getName() + ".unpack");
-        CI.setArgOperand(I, CB);
+            IRB.CreateCall(getUnpackFn(), {PlainOp}, Op->getName() + ".unpack");
+        CI.setArgOperand(
+            I, IRB.CreatePointerBitCastOrAddrSpaceCast(CB, Op->getType()));
         Changed = true;
       }
     }
@@ -222,7 +248,9 @@ bool GPUSanImpl::instrumentFunction(Function &Fn) {
 bool GPUSanImpl::instrument() {
   bool Changed = instrumentGlobals();
   for (Function &Fn : M)
-    Changed |= instrumentFunction(Fn);
+    if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc"))
+      Changed |= instrumentFunction(Fn);
+  M.dump();
   return Changed;
 }
 
diff --git a/offload/DeviceRTL/src/AllocationTracker.cpp b/offload/DeviceRTL/src/AllocationTracker.cpp
index c6a01b19fed43..e5b2c782fa663 100644
--- a/offload/DeviceRTL/src/AllocationTracker.cpp
+++ b/offload/DeviceRTL/src/AllocationTracker.cpp
@@ -9,6 +9,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "LibC.h"
+#include "Shared/Environment.h"
+#include "Synchronization.h"
 #include "Types.h"
 #include "Utils.h"
 
@@ -17,6 +19,9 @@ using namespace utils;
 
 #pragma omp begin declare target device_type(nohost)
 
+[[gnu::used, gnu::retain, gnu::weak,
+  gnu::visibility("protected")]] OMPXTrapIDTy *__ompx_trap_id;
+
 // #define USE_TAGS
 
 #ifdef USE_TAGS
@@ -25,15 +30,16 @@ static constexpr uint32_t TAG_BITS = 8;
 static constexpr uint32_t TAG_BITS = 0;
 #endif
 
-#define _OBJECT_TY unsigned short
+#define _OBJECT_TY uint16_t
 static constexpr uint32_t OBJECT_BITS = sizeof(_OBJECT_TY) * 8;
+static constexpr uint32_t SID_BITS = 16;
 
-static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS;
+static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - SID_BITS;
 static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
 
-static_assert(LENGTH_BITS + TAG_BITS == 64,
+static_assert(LENGTH_BITS + TAG_BITS + SID_BITS == 64,
               "Length and tag bits should cover 64 bits");
-static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS == 64,
+static_assert(OFFSET_BITS + TAG_BITS + SID_BITS + OBJECT_BITS == 64,
               "Length, tag, and object bits should cover 64 bits");
 
 struct AllocationTy {
@@ -42,6 +48,7 @@ struct AllocationTy {
 #ifdef USE_TAGS
   uint64_t Tag : TAG_BITS;
 #endif
+  uint64_t SID : SID_BITS;
 
   bool contains(void *Ptr, uint64_t Size) const {
     return Ptr >= Start && advance(Ptr, Size) <= advance(Start, Length);
@@ -74,8 +81,8 @@ static_assert(sizeof(AllocationPtrTy) == sizeof(void *),
 
 extern "C" {
 
-[[gnu::flatten, gnu::always_inline]] void *
-ompx_new_allocation(void *Start, uint64_t Length) {
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_new_allocation(void *Start, uint64_t Length, int64_t Id) {
   if constexpr (LENGTH_BITS < 64)
     if (Length >= (1UL << (LENGTH_BITS + 1)))
       __builtin_trap();
@@ -83,6 +90,12 @@ ompx_new_allocation(void *Start, uint64_t Length) {
   AllocationTy &A = Allocations[No];
   A.Start = Start;
   A.Length = Length;
+  A.SID = Id;
+  if (Id != A.SID) {
+    __ompx_trap_id->ID = -2UL;
+    __builtin_trap();
+  }
+
   AllocationPtrTy AP;
   AP.PtrOffset = 0;
   AP.AllocationId = No;
@@ -93,19 +106,21 @@ ompx_new_allocation(void *Start, uint64_t Length) {
   return AP;
 }
 
-[[gnu::flatten, gnu::always_inline]] void ompx_free_allocation(void *P) {
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
+ompx_free_allocation(void *P) {
   AllocationPtrTy AP = AllocationPtrTy::get(P);
   Allocations[AP.AllocationId] = AllocationTy();
 }
 
-[[gnu::flatten, gnu::always_inline]] void *ompx_gep(void *P, uint64_t Offset) {
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_gep(void *P, uint64_t Offset) {
   AllocationPtrTy AP = AllocationPtrTy::get(P);
   AP.PtrOffset += Offset;
   return AP;
 }
 
-[[gnu::flatten, gnu::always_inline]] void *ompx_check_access(void *P,
-                                                             uint64_t Size) {
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_check_access(void *P, uint64_t Size, uint64_t AccessNo) {
   AllocationPtrTy AP = AllocationPtrTy::get(P);
   AllocationTy &A = Allocations[AP.AllocationId];
 #ifdef USE_TAGS
@@ -115,20 +130,45 @@ ompx_new_allocation(void *Start, uint64_t Length) {
   uint64_t Offset = AP.PtrOffset;
   void *Ptr = advance(A.Start, Offset);
   if (!A.contains(Ptr, Size)) {
-    printf("Out of bounds, access: %lu inside of %lu allocation @ %p\n", Offset,
-           A.Length, A.Start);
+    //    printf("Out of bounds, access: %lu inside of %lu allocation @ %p\n",
+    //    Offset, A.Length, A.Start);
+    __ompx_trap_id->Start = A.Start;
+    __ompx_trap_id->Length = A.Length;
+    __ompx_trap_id->Offset = AP.PtrOffset;
+    __ompx_trap_id->ID = AP.AllocationId;
+    __ompx_trap_id->AccessID = AccessNo;
     __builtin_trap();
   }
   return Ptr;
 }
 
-[[gnu::flatten, gnu::always_inline]] void *ompx_unpack(void *P) {
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_unpack(void *P) {
   AllocationPtrTy AP = AllocationPtrTy::get(P);
   AllocationTy &A = Allocations[AP.AllocationId];
   uint64_t Offset = AP.PtrOffset;
   void *Ptr = advance(A.Start, Offset);
   return Ptr;
 }
+
+/// Record an allocation of \p Length starting at \p Start in the table slot
+/// selected by the caller through \p AllocationId.
+///
+/// Traps if \p Length does not fit into the Length bit-field or if
+/// \p AllocationId lies outside the table.
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
+ompx_new_host_allocation(void *Start, uint64_t Length, uint16_t AllocationId) {
+  // The Length bit-field holds values below 2^LENGTH_BITS; anything larger
+  // would be silently truncated when stored.
+  if constexpr (LENGTH_BITS < 64)
+    if (Length >= (1UL << LENGTH_BITS))
+      __builtin_trap();
+  // The id comes from the caller, so make sure it names an existing slot.
+  constexpr uint64_t NumSlots = sizeof(Allocations) / sizeof(Allocations[0]);
+  if (AllocationId >= NumSlots)
+    __builtin_trap();
+  AllocationTy &A = Allocations[AllocationId];
+  A.Start = Start;
+  A.Length = Length;
+  A.SID = AllocationId;
+#ifdef USE_TAGS
+  A.Tag = 0;
+#endif
+}
+
+// Counterpart of ompx_new_host_allocation; simply forwards \p P to
+// ompx_free_allocation.
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
+ompx_free_host_allocation(void *P) {
+  ompx_free_allocation(P);
+}
 }
 
 #pragma omp end declare target
diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h
index b9f5c16582931..d310c6c707f8f 100644
--- a/offload/include/OpenMP/Mapping.h
+++ b/offload/include/OpenMP/Mapping.h
@@ -71,6 +71,7 @@ struct HostDataToTargetTy {
 
   const uintptr_t TgtAllocBegin; // allocated target memory
   const uintptr_t TgtPtrBegin; // mapped target memory = TgtAllocBegin + padding
+  void *FakeTgtPtrBegin = 0;   // mapped target memory = TgtAllocBegin + padding
 
 private:
   static const uint64_t INFRefCount = ~(uint64_t)0;
@@ -125,9 +126,10 @@ struct HostDataToTargetTy {
   HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E,
                      uintptr_t TgtAllocBegin, uintptr_t TgtPtrBegin,
                      bool UseHoldRefCount, map_var_info_t Name = nullptr,
-                     bool IsINF = false)
+                     bool IsINF = false, void *FakeTgtPtrBegin = nullptr)
       : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name),
         TgtAllocBegin(TgtAllocBegin), TgtPtrBegin(TgtPtrBegin),
+        FakeTgtPtrBegin(FakeTgtPtrBegin),
         States(std::make_unique<StatesTy>(UseHoldRefCount ? 0
                                           : IsINF         ? INFRefCount
                                                           : 1,
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index d141146b6bd5a..f35570dcee53c 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -105,4 +105,12 @@ struct KernelLaunchEnvironmentTy {
   void *ReductionBuffer = nullptr;
 };
 
+/// Record describing a sanitizer trap, passed from the device to the host.
+/// In this patch the device access check fills it in right before trapping
+/// and the AMDGPU queue error callback prints it.
+struct OMPXTrapIDTy {
+  void *Start;       // Start address of the allocation that was accessed.
+  uint64_t Length;   // Length of that allocation.
+  uint64_t Offset;   // Offset of the faulting access within the allocation.
+  uint64_t ID;       // Allocation id (or -2UL when recording a site id fails).
+  uint64_t AccessID; // Number the instrumentation assigned to the access.
+};
+
 #endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index b37c3e23d44ab..c64b3ccf2c8f9 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -13,6 +13,8 @@
 #include <atomic>
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
+#include <cstdio>
 #include <deque>
 #include <mutex>
 #include <string>
@@ -685,12 +687,12 @@ struct AMDGPUQueueTy {
   AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
 
   /// Lazily initialize a new queue belonging to a specific agent.
-  Error init(hsa_agent_t Agent, int32_t QueueSize) {
+  Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
     if (Queue)
       return Plugin::success();
     hsa_status_t Status =
         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
-                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
+                         &Device, UINT32_MAX, UINT32_MAX, &Queue);
     return Plugin::check(Status, "Error in hsa_queue_create: %s");
   }
 
@@ -875,10 +877,8 @@ struct AMDGPUQueueTy {
   }
 
   /// Callback that will be called when an error is detected on the HSA queue.
-  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
-    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
-    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
-  }
+  static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                            void *Data);
 
   /// The HSA queue.
   hsa_queue_t *Queue;
@@ -1593,8 +1593,9 @@ struct AMDGPUStreamManagerTy final
   using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
 
+  GenericDeviceTy &Device;
   AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
-      : GenericDeviceResourceManagerTy(Device),
+      : GenericDeviceResourceManagerTy(Device), Device(Device),
         OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
         NextQueue(0), Agent(HSAAgent) {}
 
@@ -1603,7 +1604,7 @@ struct AMDGPUStreamManagerTy final
     QueueSize = HSAQueueSize;
     MaxNumQueues = NumHSAQueues;
     // Initialize one queue eagerly
-    if (auto Err = Queues.front().init(Agent, QueueSize))
+    if (auto Err = Queues.front().init(Device, Agent, QueueSize))
       return Err;
 
     return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,7 +1661,7 @@ struct AMDGPUStreamManagerTy final
     }
 
     // Make sure the queue is initialized, then add user & assign.
-    if (auto Err = Queues[Index].init(Agent, QueueSize))
+    if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
       return Err;
     Queues[Index].addUser();
     Stream->Queue = &Queues[Index];
@@ -3489,6 +3490,37 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   return Alloc;
 }
 
+/// Error callback registered with every HSA queue created by
+/// AMDGPUQueueTy::init.
+///
+/// \param Status HSA status that triggered the callback.
+/// \param Source Queue on which the error was detected.
+/// \param Data   The GenericDeviceTy that init() registered for the queue.
+///
+/// Prints the sanitizer trap record of the device, if one was set up, and
+/// then reports the HSA error as fatal.
+void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                                  void *Data) {
+  // init() handed HSA a `GenericDeviceTy *`, so cast back to exactly that
+  // type; the trap record is a member of the generic device.
+  auto *Device = static_cast<GenericDeviceTy *>(Data);
+
+  // The trap record starts out as nullptr and is only allocated while the
+  // device memory pool is set up, so it may be missing.
+  if (Device && Device->OmpxTrapId) {
+    const OMPXTrapIDTy &Trap = *Device->OmpxTrapId;
+    // The fields are uint64_t; cast them so the arguments match the
+    // conversion specifiers on every host.
+    printf("Trap ID[%llu:%p] Acc[%llu] : %p[:%llu] vs %llu\n",
+           (unsigned long long)Trap.ID, (void *)Trap.ID,
+           (unsigned long long)Trap.AccessID, Trap.Start,
+           (unsigned long long)Trap.Length, (unsigned long long)Trap.Offset);
+    fflush(stdout);
+  }
+
+  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
+  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
+}
+
 } // namespace plugin
 } // namespace target
 } // namespace omp
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 0d2a36a42d5fa..1b32ccfc1a291 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -857,6 +857,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Allocate and construct a kernel object.
   virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
 
+  OMPXTrapIDTy *OmpxTrapId = nullptr;
+
   /// Reference to the underlying plugin that created this device.
   GenericPluginTy &Plugin;
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index ff0f6edfcd693..5d19b01d3da5b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1001,6 +1001,17 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
   if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
     return Err;
 
+  OmpxTrapId = reinterpret_cast<OMPXTrapIDTy *>(
+      allocate(sizeof(*OmpxTrapId), &OmpxTrapId, TARGET_ALLOC_HOST));
+  OmpxTrapId->Start = 0;
+  OmpxTrapId->Length = -1;
+  OmpxTrapId->Offset = -1;
+  OmpxTrapId->ID = -1;
+
+  GlobalTy TrapId("__ompx_trap_id", sizeof(OmpxTrapId), &OmpxTrapId);
+  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrapId))
+    return Err;
+
   // Create the metainfo of the device environment global.
   GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
                         sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
diff --git a/offload/src/OpenMP/Mapping.cpp b/offload/src/OpenMP/Mapping.cpp
index 595e3456ab54c..10fc76aef12e6 100644
--- a/offload/src/OpenMP/Mapping.cpp
+++ b/offload/src/OpenMP/Mapping.cpp
@@ -15,6 +15,13 @@
 #include "Shared/Requirements.h"
 #include "device.h"
 
+extern "C" {
+[[gnu::weak]] void ompx_free_allocation_host(void *P) {}
+[[gnu::weak]] void *ompx_new_allocation_host(void *P, uint64_t) {
+  return nullptr;
+}
+}
+
 /// Dump a table of all the host-target pointer pairs on failure
 void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
                                bool toStdOut) {
@@ -68,6 +75,9 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin,
     return OFFLOAD_FAIL;
   }
 
+  void *FakeTgtPtrBegin = ompx_new_allocation_host(TgtPtrBegin, Size);
+  printf("FP %p -> %p \n", (void *)TgtPtrBegin, FakeTgtPtrBegin);
+
   // Mapping does not exist, allocate it with refCount=INF
   const HostDataToTargetTy &NewEntry =
       *HDTTMap
@@ -78,7 +88,7 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin,
                /*TgtAllocBegin=*/(uintptr_t)TgtPtrBegin,
                /*TgtPtrBegin=*/(uintptr_t)TgtPtrBegin,
                /*UseHoldRefCount=*/false, /*Name=*/nullptr,
-               /*IsRefCountINF=*/true))
+               /*IsRefCountINF=*/true, FakeTgtPtrBegin))
            .first->HDTT;
   DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD
      ", HstEnd=" DPxMOD ", TgtBegin=" DPxMOD ", DynRefCount=%s, "
@@ -292,14 +302,16 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
     uintptr_t TgtAllocBegin =
         (uintptr_t)Device.allocData(TgtPadding + Size, HstPtrBegin);
     uintptr_t TgtPtrBegin = TgtAllocBegin + TgtPadding;
+    void *FakeTgtPtrBegin = ompx_new_allocation_host((void *)TgtPtrBegin, Size);
     // Release the mapping table lock only after the entry is locked by
     // attaching it to TPR.
-    LR.TPR.setEntry(HDTTMap
-                        ->emplace(new HostDataToTargetTy(
-                            (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
-                            (uintptr_t)HstPtrBegin + Size, TgtAllocBegin,
-                            TgtPtrBegin, HasHoldModifier, HstPtrName))
-                        .first->HDTT);
+    LR.TPR.setEntry(
+        HDTTMap
+            ->emplace(new HostDataToTargetTy(
+                (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
+                (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, TgtPtrBegin,
+                HasHoldModifier, HstPtrName, /*IsINF=*/false, FakeTgtPtrBegin))
+            .first->HDTT);
     INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
          "Creating new map entry with HstPtrBase=" DPxMOD
          ", HstPtrBegin=" DPxMOD ", TgtAllocBegin=" DPxMOD
@@ -492,6 +504,9 @@ int MappingInfoTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry,
     return OFFLOAD_FAIL;
   }
 
+  if (Entry->FakeTgtPtrBegin)
+    ompx_free_allocation_host(Entry->FakeTgtPtrBegin);
+
   int Ret = Device.deleteData((void *)Entry->TgtAllocBegin);
 
   // Notify the plugin about the unmapped memory.
diff --git a/offload/src/exports b/offload/src/exports
index 11830f62af388..5a2f8dd6cdd41 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -76,6 +76,7 @@ VERS1.0 {
     __llvmPushCallConfiguration;
     __llvmPopCallConfiguration;
     llvmLaunchKernel;
+    ompx_new_allocation_host;
   local:
     *;
 };
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 73e26c0def6da..f7715a422a5b0 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -670,6 +670,8 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
         HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
     void *TgtPtrBegin = TPR.TargetPointer;
+    if (auto *FakeTgtPtrBegin = TPR.getEntry()->FakeTgtPtrBegin)
+      TgtPtrBegin = FakeTgtPtrBegin;
     IsHostPtr = TPR.Flags.IsHostPointer;
     // If data_size==0, then the argument could be a zero-length pointer to
     // NULL, so getOrAlloc() returning NULL is not an error.
@@ -1523,11 +1525,15 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
           /*UpdateRefCount=*/false,
           /*UseHoldRefCount=*/false);
       TgtPtrBegin = TPR.TargetPointer;
+      if (auto *FakeTgtPtrBegin = TPR.getEntry()->FakeTgtPtrBegin)
+        TgtPtrBegin = FakeTgtPtrBegin;
       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
 #ifdef OMPTARGET_DEBUG
       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
-      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
-         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
+      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD
+         " %s\n",
+         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin),
+         TgtPtrBegin != TPR.TargetPointer ? "fake" : "");
 #endif
     }
     TgtArgsPositions[I] = TgtArgs.size();
diff --git a/preload.cpp b/preload.cpp
new file mode 100644
index 0000000000000..dd21404938351
--- /dev/null
+++ b/preload.cpp
@@ -0,0 +1,88 @@
+//===------ AllocationTracker.cpp - Track allocation for sanitizers -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <cstdio>
+
+#ifdef USE_TAGS
+static constexpr uint32_t TAG_BITS = 8;
+#else
+static constexpr uint32_t TAG_BITS = 0;
+#endif
+
+#define _OBJECT_TY unsigned short
+static constexpr uint32_t OBJECT_BITS = sizeof(_OBJECT_TY) * 8;
+static constexpr uint32_t SID_BITS = 16;
+
+static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - SID_BITS;
+static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
+
+static_assert(LENGTH_BITS + TAG_BITS + SID_BITS == 64,
+              "Length and tag bits should cover 64 bits");
+static_assert(OFFSET_BITS + TAG_BITS + SID_BITS + OBJECT_BITS == 64,
+              "Length, tag, and object bits should cover 64 bits");
+
+template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
+  return *((DstTy *)(&V));
+}
+
+struct AllocationPtrTy {
+  static AllocationPtrTy get(void *P) {
+    return convertViaPun<AllocationPtrTy>(P);
+  }
+  static AllocationPtrTy get(intptr_t V) {
+    return convertViaPun<AllocationPtrTy>(V);
+  }
+  operator void *() const { return convertViaPun<void *>(*this); }
+  operator intptr_t() const { return convertViaPun<intptr_t>(*this); }
+
+  uint64_t PtrOffset : OFFSET_BITS;
+#ifdef USE_TAGS
+  uint64_t AllocationTag : TAG_BITS;
+#endif
+  uint64_t AllocationId : OBJECT_BITS;
+};
+static_assert(sizeof(AllocationPtrTy) == sizeof(void *),
+              "AllocationTy pointers should be pointer sized");
+
+extern "C" {
+
+[[gnu::flatten, gnu::always_inline]] void *ompx_new(uint16_t &AllocationId) {
+  static uint16_t NumHostAllocs = static_cast<_OBJECT_TY>(~0) - 1;
+  AllocationId = NumHostAllocs--;
+  AllocationPtrTy AP;
+  AP.PtrOffset = 0;
+  AP.AllocationId = AllocationId;
+  return AP;
+}
+
+#pragma omp begin declare target
+void ompx_new_host_allocation(void *P, uint64_t Bytes, uint16_t AllocationId);
+void ompx_free_host_allocation(void *P);
+#pragma omp end declare target
+
+void *ompx_new_allocation_host(void *P, uint64_t Bytes) {
+  uint16_t AllocationId;
+  void *NewP = ompx_new(AllocationId);
+#pragma omp target is_device_ptr(P)
+  ompx_new_host_allocation(P, Bytes, AllocationId);
+  printf("registered %p[:%10zu] -> %zu:%p\n", P, Bytes, (uint64_t)AllocationId,
+         NewP);
+  fflush(stdout);
+  return NewP;
+}
+
+void ompx_free_allocation_host(void *P) {
+  printf("unregister   %p\n", P);
+  fflush(stdout);
+#pragma omp target is_device_ptr(P)
+  ompx_free_host_allocation(P);
+}
+}

>From 3a147be5dc1d8fa070accfe2d11c99c5875cbd6a Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 18 Jun 2024 21:17:03 -0700
Subject: [PATCH 06/31] Rewrite

---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   6 +-
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 293 ++++++++++----
 offload/DeviceRTL/include/Utils.h             |   3 +
 offload/DeviceRTL/src/AllocationTracker.cpp   | 381 +++++++++++++-----
 offload/DeviceRTL/src/Utils.cpp               |  10 +
 offload/DeviceRTL/src/exports                 |   2 +
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |   4 +-
 offload/src/OpenMP/Mapping.cpp                |   3 +
 offload/src/exports                           |   1 +
 preload.cpp                                   |  80 ++--
 10 files changed, 564 insertions(+), 219 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 81bdfac3b0e79..c1e924145c931 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1866,9 +1866,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
                                                 PTO.EagerlyInvalidateAnalyses));
 
-  if (EnableGPUSan)
-    MPM.addPass(GPUSanPass());
-
   // Note: historically, the PruneEH pass was run first to deduce nounwind and
   // generally clean up exception handling overhead. It isn't clear this is
   // valuable as the inliner doesn't currently care whether it is inlining an
@@ -2049,6 +2046,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 
   invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
 
+  if (EnableGPUSan)
+    MPM.addPass(GPUSanPass());
+
   // Emit annotation remarks.
   addAnnotationRemarksPass(MPM);
 
diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 97c8cd8e6705f..738b75bd3bce4 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -11,17 +11,23 @@
 #include "llvm/Transforms/Instrumentation/GPUSan.h"
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
@@ -39,52 +45,99 @@ cl::opt<bool> UseTags(
     cl::init(false));
 
 namespace {
+
+enum PtrOrigin {
+  UNKNOWN,
+  LOCAL,
+  GLOBAL,
+  SYSTEM,
+  NONE,
+};
+
+static std::string getSuffix(PtrOrigin PO) {
+  switch (PO) {
+  case UNKNOWN:
+    return "";
+  case LOCAL:
+    return "_local";
+  case GLOBAL:
+    return "_global";
+  default:
+    break;
+  }
+  llvm_unreachable("Bad pointer origin!");
+}
+
 class GPUSanImpl final {
 public:
-  GPUSanImpl(Module &M) : M(M), Ctx(M.getContext()) {}
+  GPUSanImpl(Module &M, FunctionAnalysisManager &FAM)
+      : M(M), FAM(FAM), Ctx(M.getContext()) {}
 
   bool instrument();
 
 private:
   bool instrumentGlobals();
   bool instrumentFunction(Function &Fn);
-  void instrumentAllocation(Instruction &I, Value &Size);
-  void instrumentAllocaInst(AllocaInst &AI);
-  void instrumentAccess(Instruction &I, int PtrIdx, Type &AccessTy);
-  void instrumentLoadInst(LoadInst &LI);
-  void instrumentStoreInst(StoreInst &SI);
-  void instrumentGEPInst(GetElementPtrInst &GEP);
-  bool instrumentCallInst(CallInst &CI);
-
-  void getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy,
-                     ArrayRef<Type *> ArgTys) {
+  Value *instrumentAllocation(Instruction &I, Value &Size, FunctionCallee Fn);
+  Value *instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI);
+  void instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
+                        Type &AccessTy, bool IsRead);
+  void instrumentLoadInst(LoopInfo &LI, LoadInst &LoadI);
+  void instrumentStoreInst(LoopInfo &LI, StoreInst &StoreI);
+  void instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP);
+  bool instrumentCallInst(LoopInfo &LI, CallInst &CI);
+  void
+  instrumentReturns(SmallVectorImpl<std::pair<AllocaInst *, Value *>> &Allocas,
+                    SmallVectorImpl<ReturnInst *> &Returns);
+
+  PtrOrigin getPtrOrigin(LoopInfo &LI, Value *Ptr);
+
+  FunctionCallee getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy,
+                               ArrayRef<Type *> ArgTys) {
     if (!FC) {
       auto *NewAllocationFnTy = FunctionType::get(RetTy, ArgTys, false);
       FC = M.getOrInsertFunction(Name, NewAllocationFnTy);
     }
+    return FC;
   }
 
-  FunctionCallee getNewAllocationFn() {
-    getOrCreateFn(NewAllocationFn, "ompx_new_allocation", PtrTy,
-                  {PtrTy, Int64Ty, Int64Ty});
-    return NewAllocationFn;
+  FunctionCallee getNewFn(PtrOrigin PO) {
+    assert(PO <= GLOBAL && "Origin does not need handling.");
+    return getOrCreateFn(NewFn[PO], "ompx_new" + getSuffix(PO), PtrTy,
+                         {PtrTy, Int64Ty, Int64Ty});
+  }
+  FunctionCallee getFreeFn(PtrOrigin PO) {
+    assert(PO <= GLOBAL && "Origin does not need handling.");
+    return getOrCreateFn(FreeFn[PO], "ompx_free" + getSuffix(PO), VoidTy,
+                         {PtrTy});
   }
-  FunctionCallee getAccessFn() {
-    getOrCreateFn(AccessFn, "ompx_check_access", PtrTy,
-                  {PtrTy, Int64Ty, Int64Ty});
-    return AccessFn;
+  FunctionCallee getFreeNLocalFn() {
+    return getOrCreateFn(FreeNLocal, "ompx_free_local_n", VoidTy, {Int32Ty});
   }
-  FunctionCallee getGEPFn() {
-    getOrCreateFn(GEPFn, "ompx_gep", PtrTy, {PtrTy, Int64Ty});
-    return GEPFn;
+  FunctionCallee getCheckFn(PtrOrigin PO) {
+    assert(PO <= GLOBAL && "Origin does not need handling.");
+    return getOrCreateFn(CheckFn[PO], "ompx_check" + getSuffix(PO), PtrTy,
+                         {PtrTy, Int64Ty, Int64Ty});
   }
-  FunctionCallee getUnpackFn() {
-    getOrCreateFn(UnpackFn, "ompx_unpack", PtrTy, {PtrTy});
-    return UnpackFn;
+  FunctionCallee getGEPFn(PtrOrigin PO) {
+    assert(PO <= GLOBAL && "Origin does not need handling.");
+    return getOrCreateFn(GEPFn[PO], "ompx_gep" + getSuffix(PO), PtrTy,
+                         {PtrTy, Int64Ty});
+  }
+  FunctionCallee getUnpackFn(PtrOrigin PO) {
+    assert(PO <= GLOBAL && "Origin does not need handling.");
+    return getOrCreateFn(UnpackFn[PO], "ompx_unpack" + getSuffix(PO), PtrTy,
+                         {PtrTy});
+  }
+  FunctionCallee getLeakCheckFn() {
+    FunctionCallee LeakCheckFn;
+    return getOrCreateFn(LeakCheckFn, "ompx_leak_check", VoidTy, {});
   }
 
   Module &M;
+  FunctionAnalysisManager &FAM;
   LLVMContext &Ctx;
+  bool HasAllocas;
 
   Type *VoidTy = Type::getVoidTy(Ctx);
   Type *IntptrTy = M.getDataLayout().getIntPtrType(Ctx);
@@ -95,18 +148,64 @@ class GPUSanImpl final {
 
   const DataLayout &DL = M.getDataLayout();
 
-  FunctionCallee GEPFn;
-  FunctionCallee UnpackFn;
-  FunctionCallee AccessFn;
-  FunctionCallee NewAllocationFn;
+  FunctionCallee NewFn[3];
+  FunctionCallee GEPFn[3];
+  FunctionCallee FreeFn[3];
+  FunctionCallee CheckFn[3];
+  FunctionCallee UnpackFn[3];
+  FunctionCallee FreeNLocal;
 };
 
 } // end anonymous namespace
 
+PtrOrigin GPUSanImpl::getPtrOrigin(LoopInfo &LI, Value *Ptr) {
+  SmallVector<const Value *> Objects;
+  getUnderlyingObjects(Ptr, Objects, &LI);
+  PtrOrigin PO = NONE;
+  for (auto *Obj : Objects) {
+    PtrOrigin ObjPO = HasAllocas ? UNKNOWN : GLOBAL;
+    if (isa<AllocaInst>(Obj)) {
+      ObjPO = LOCAL;
+    } else if (isa<GlobalVariable>(Obj)) {
+      ObjPO = GLOBAL;
+    } else if (auto *II = dyn_cast<IntrinsicInst>(Obj)) {
+      if (II->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr ||
+          II->getIntrinsicID() == Intrinsic::amdgcn_dispatch_ptr)
+        return SYSTEM;
+    } else if (auto *CI = dyn_cast<CallInst>(Obj)) {
+      if (auto *Callee = CI->getCalledFunction())
+        if (Callee->getName().starts_with("ompx_")) {
+          if (Callee->getName().ends_with("_global"))
+            ObjPO = GLOBAL;
+          else if (Callee->getName().ends_with("_local"))
+            ObjPO = LOCAL;
+        }
+    } else if (auto *Arg = dyn_cast<Argument>(Obj)) {
+      if (Arg->getParent()->hasFnAttribute("kernel"))
+        ObjPO = GLOBAL;
+    }
+    if (PO == NONE || PO == ObjPO) {
+      PO = ObjPO;
+    } else {
+      return UNKNOWN;
+    }
+  }
+  return PO;
+}
+
 bool GPUSanImpl::instrumentGlobals() {
-  return false;
-  Function *CTorFn;
-  std::tie(CTorFn, std::ignore) = getOrCreateSanitizerCtorAndInitFunctions(
+  Function *DtorFn =
+      Function::Create(FunctionType::get(VoidTy, false),
+                       GlobalValue::PrivateLinkage, "san.dtor", &M);
+  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", DtorFn);
+  IRBuilder<> IRB(Entry);
+  IRB.CreateCall(getLeakCheckFn());
+  IRB.CreateRetVoid();
+  appendToGlobalDtors(M, DtorFn, 0, nullptr);
+  return true;
+
+  Function *DTorFn;
+  std::tie(DTorFn, std::ignore) = getOrCreateSanitizerCtorAndInitFunctions(
       M, "ompx.ctor", "ompx.init",
       /*InitArgTypes=*/{},
       /*InitArgs=*/{},
@@ -118,67 +217,77 @@ bool GPUSanImpl::instrumentGlobals() {
   return true;
 }
 
-void GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size) {
-  IRBuilder<> IRB(I.getNextNode());
+Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size,
+                                        FunctionCallee Fn) {
+  IRBuilder<> IRB(&*I.getParent()->getFirstNonPHIOrDbgOrAlloca());
   Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, PtrTy);
-  static int X = 1;
-  auto *CB = IRB.CreateCall(getNewAllocationFn(),
-                            {PlainI, &Size, ConstantInt::get(Int64Ty, X++)},
-                            I.getName() + ".san");
+  static int AllocationId = 1;
+  auto *CB = IRB.CreateCall(
+      Fn, {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++)},
+      I.getName() + ".san");
   I.replaceUsesWithIf(IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()),
                       [=](Use &U) {
                         return U.getUser() != PlainI && U.getUser() != CB &&
                                !isa<LifetimeIntrinsic>(U.getUser());
                       });
+  return CB;
 }
 
-void GPUSanImpl::instrumentAllocaInst(AllocaInst &AI) {
+Value *GPUSanImpl::instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI) {
   auto SizeOrNone = AI.getAllocationSize(DL);
   if (!SizeOrNone)
     llvm_unreachable("TODO");
   Value *Size = ConstantInt::get(Int64Ty, *SizeOrNone);
-  instrumentAllocation(AI, *Size);
+  return instrumentAllocation(AI, *Size, getNewFn(LOCAL));
 }
 
-void GPUSanImpl::instrumentAccess(Instruction &I, int PtrIdx, Type &AccessTy) {
+void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
+                                  Type &AccessTy, bool IsRead) {
+  Value *PtrOp = I.getOperand(PtrIdx);
+  PtrOrigin PO = getPtrOrigin(LI, PtrOp);
+  if (PO > GLOBAL)
+    return;
+
+  static int32_t ReadAccessId = -1;
+  static int32_t WriteAccessId = 1;
+  const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++;
+
   auto TySize = DL.getTypeStoreSize(&AccessTy);
   assert(!TySize.isScalable());
   Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue());
   IRBuilder<> IRB(&I);
-  Value *PtrOp = I.getOperand(PtrIdx);
-  auto *UO = getUnderlyingObject(PtrOp);
-  if (isa<GlobalObject>(UO))
-    return;
   Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
-  static int X = 1;
-  auto *CB = IRB.CreateCall(getAccessFn(),
-                            {PlainPtrOp, Size, ConstantInt::get(Int64Ty, X++)},
-                            I.getName() + ".san");
+  auto *CB = IRB.CreateCall(
+      getCheckFn(PO), {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId)},
+      I.getName() + ".san");
   I.setOperand(PtrIdx,
                IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType()));
 }
 
-void GPUSanImpl::instrumentLoadInst(LoadInst &LI) {
-  instrumentAccess(LI, LoadInst::getPointerOperandIndex(), *LI.getType());
+void GPUSanImpl::instrumentLoadInst(LoopInfo &LI, LoadInst &LoadI) {
+  instrumentAccess(LI, LoadI, LoadInst::getPointerOperandIndex(),
+                   *LoadI.getType(),
+                   /*IsRead=*/true);
 }
 
-void GPUSanImpl::instrumentStoreInst(StoreInst &SI) {
-  instrumentAccess(SI, StoreInst::getPointerOperandIndex(),
-                   *SI.getValueOperand()->getType());
+void GPUSanImpl::instrumentStoreInst(LoopInfo &LI, StoreInst &StoreI) {
+  instrumentAccess(LI, StoreI, StoreInst::getPointerOperandIndex(),
+                   *StoreI.getValueOperand()->getType(), /*IsRead=*/false);
 }
 
-void GPUSanImpl::instrumentGEPInst(GetElementPtrInst &GEP) {
+void GPUSanImpl::instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP) {
   Value *PtrOp = GEP.getPointerOperand();
-  auto *UO = getUnderlyingObject(PtrOp);
-  if (isa<GlobalObject>(UO))
+  PtrOrigin PO = getPtrOrigin(LI, PtrOp);
+  if (PO > GLOBAL)
     return;
+
   GEP.setOperand(GetElementPtrInst::getPointerOperandIndex(),
                  Constant::getNullValue(PtrOp->getType()));
-
   IRBuilder<> IRB(GEP.getNextNode());
   Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
-  auto *CB = IRB.CreateCall(getGEPFn(), {PlainPtrOp, UndefValue::get(Int64Ty)},
-                            GEP.getName() + ".san");
+  auto *CB =
+      IRB.CreateCall(getGEPFn(PO), {PlainPtrOp, UndefValue::get(Int64Ty)},
+                     GEP.getName() + ".san");
   GEP.replaceAllUsesWith(
       IRB.CreatePointerBitCastOrAddrSpaceCast(CB, GEP.getType()));
   Value *Offset =
@@ -186,24 +295,25 @@ void GPUSanImpl::instrumentGEPInst(GetElementPtrInst &GEP) {
   CB->setArgOperand(1, Offset);
 }
 
-bool GPUSanImpl::instrumentCallInst(CallInst &CI) {
+bool GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) {
   bool Changed = false;
   if (isa<LifetimeIntrinsic>(CI))
     return Changed;
   if (auto *Fn = CI.getCalledFunction()) {
-    if ((Fn->isDeclaration() || Fn->getName().starts_with("__kmpc")) &&
+    if ((Fn->isDeclaration() || Fn->getName().starts_with("__kmpc") ||
+         Fn->getName().starts_with("rpc_")) &&
         !Fn->getName().starts_with("ompx")) {
       IRBuilder<> IRB(&CI);
       for (int I = 0, E = CI.arg_size(); I != E; ++I) {
         Value *Op = CI.getArgOperand(I);
         if (!Op->getType()->isPointerTy())
           continue;
-        auto *UO = getUnderlyingObject(Op);
-        if (isa<GlobalObject>(UO))
+        PtrOrigin PO = getPtrOrigin(LI, Op);
+        if (PO > GLOBAL)
           continue;
         Value *PlainOp = IRB.CreatePointerBitCastOrAddrSpaceCast(Op, PtrTy);
-        auto *CB =
-            IRB.CreateCall(getUnpackFn(), {PlainOp}, Op->getName() + ".unpack");
+        auto *CB = IRB.CreateCall(getUnpackFn(PO), {PlainOp},
+                                  Op->getName() + ".unpack");
         CI.setArgOperand(
             I, IRB.CreatePointerBitCastOrAddrSpaceCast(CB, Op->getType()));
         Changed = true;
@@ -214,41 +324,74 @@ bool GPUSanImpl::instrumentCallInst(CallInst &CI) {
 }
 
 bool GPUSanImpl::instrumentFunction(Function &Fn) {
+  if (Fn.isDeclaration())
+    return false;
   bool Changed = false;
-
+  LoopInfo &LI = FAM.getResult<LoopAnalysis>(Fn);
+  SmallVector<std::pair<AllocaInst *, Value *>> Allocas;
+  SmallVector<ReturnInst *> Returns;
   for (auto &I : instructions(Fn)) {
     switch (I.getOpcode()) {
-    case Instruction::Alloca:
-      instrumentAllocaInst(cast<AllocaInst>(I));
+    case Instruction::Alloca: {
+      AllocaInst &AI = cast<AllocaInst>(I);
+      Value *FakePtr = instrumentAllocaInst(LI, AI);
+      Allocas.push_back({&AI, FakePtr});
       Changed = true;
       break;
+    }
     case Instruction::Load:
-      instrumentLoadInst(cast<LoadInst>(I));
+      instrumentLoadInst(LI, cast<LoadInst>(I));
       Changed = true;
       break;
     case Instruction::Store:
-      instrumentStoreInst(cast<StoreInst>(I));
+      instrumentStoreInst(LI, cast<StoreInst>(I));
       Changed = true;
       break;
     case Instruction::GetElementPtr:
-      instrumentGEPInst(cast<GetElementPtrInst>(I));
+      instrumentGEPInst(LI, cast<GetElementPtrInst>(I));
       Changed = true;
       break;
     case Instruction::Call:
-      Changed = instrumentCallInst(cast<CallInst>(I));
+      Changed = instrumentCallInst(LI, cast<CallInst>(I));
+      break;
+    case Instruction::Ret:
+      Returns.push_back(&cast<ReturnInst>(I));
       break;
     default:
       break;
     }
   }
 
+  instrumentReturns(Allocas, Returns);
+
   return Changed;
 }
 
+void GPUSanImpl::instrumentReturns(
+    SmallVectorImpl<std::pair<AllocaInst *, Value *>> &Allocas,
+    SmallVectorImpl<ReturnInst *> &Returns) {
+  if (Allocas.empty())
+    return;
+  for (auto *RI : Returns) {
+    IRBuilder<> IRB(RI);
+    IRB.CreateCall(getFreeNLocalFn(),
+                   {ConstantInt::get(Int32Ty, Allocas.size())}, ".free");
+  }
+}
+
 bool GPUSanImpl::instrument() {
   bool Changed = instrumentGlobals();
+  HasAllocas = [&]() {
+    for (Function &Fn : M)
+      for (auto &I : instructions(Fn))
+        if (isa<AllocaInst>(I))
+          return true;
+    return false;
+  }();
+
   for (Function &Fn : M)
-    if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc"))
+    if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") &&
+        !Fn.getName().starts_with("rpc_"))
       Changed |= instrumentFunction(Fn);
   M.dump();
   return Changed;
@@ -257,7 +400,7 @@ bool GPUSanImpl::instrument() {
 PreservedAnalyses GPUSanPass::run(Module &M, ModuleAnalysisManager &AM) {
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  GPUSanImpl Lowerer(M);
+  GPUSanImpl Lowerer(M, FAM);
   if (!Lowerer.instrument())
     return PreservedAnalyses::all();
 
diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h
index 82e2397b5958b..5797229e40a78 100644
--- a/offload/DeviceRTL/include/Utils.h
+++ b/offload/DeviceRTL/include/Utils.h
@@ -81,6 +81,9 @@ template <typename Ty1, typename Ty2> inline Ty1 align_down(Ty1 V, Ty2 Align) {
 /// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
 bool isSharedMemPtr(void *Ptr);
 
+/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)).
+bool isThreadLocalMemPtr(void *Ptr);
+
 /// Return \p V typed punned as \p DstTy.
 template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
   return *((DstTy *)(&V));
diff --git a/offload/DeviceRTL/src/AllocationTracker.cpp b/offload/DeviceRTL/src/AllocationTracker.cpp
index e5b2c782fa663..0b3332d47dad8 100644
--- a/offload/DeviceRTL/src/AllocationTracker.cpp
+++ b/offload/DeviceRTL/src/AllocationTracker.cpp
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Interface.h"
 #include "LibC.h"
 #include "Shared/Environment.h"
 #include "Synchronization.h"
@@ -19,155 +20,323 @@ using namespace utils;
 
 #pragma omp begin declare target device_type(nohost)
 
+extern "C" int ompx_block_id(int Dim);
+
 [[gnu::used, gnu::retain, gnu::weak,
   gnu::visibility("protected")]] OMPXTrapIDTy *__ompx_trap_id;
 
-// #define USE_TAGS
+#define _OBJECT_TY uint16_t
 
-#ifdef USE_TAGS
-static constexpr uint32_t TAG_BITS = 8;
-#else
-static constexpr uint32_t TAG_BITS = 0;
-#endif
+enum class AllocationKind { GLOBAL, LOCAL, LAST = LOCAL };
 
-#define _OBJECT_TY uint16_t
-static constexpr uint32_t OBJECT_BITS = sizeof(_OBJECT_TY) * 8;
-static constexpr uint32_t SID_BITS = 16;
+template <AllocationKind AK> struct Config {
+  static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 3;
+  static constexpr uint32_t NUM_ALLOCATION_ARRAYS =
+      AK == AllocationKind::GLOBAL ? 1 : (256 * 8 * 4);
+  static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8;
 
-static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - SID_BITS;
-static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
+  static constexpr uint32_t OBJECT_BITS =
+      AK == AllocationKind::GLOBAL ? 10 : (sizeof(_OBJECT_TY) * 8);
+  static constexpr uint32_t SLOTS =
+      (1 << (OBJECT_BITS)) / NUM_ALLOCATION_ARRAYS;
+  static constexpr uint32_t KIND_BITS = 1;
+  static constexpr uint32_t SID_BITS = 16 - KIND_BITS;
 
-static_assert(LENGTH_BITS + TAG_BITS + SID_BITS == 64,
-              "Length and tag bits should cover 64 bits");
-static_assert(OFFSET_BITS + TAG_BITS + SID_BITS + OBJECT_BITS == 64,
-              "Length, tag, and object bits should cover 64 bits");
+  static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - SID_BITS - KIND_BITS;
+  static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
 
-struct AllocationTy {
+  static constexpr bool useTags() { return TAG_BITS > 1; }
+
+  static_assert(LENGTH_BITS + TAG_BITS + KIND_BITS + SID_BITS == 64,
+                "Length and tag bits should cover 64 bits");
+  static_assert(OFFSET_BITS + TAG_BITS + KIND_BITS + SID_BITS + OBJECT_BITS ==
+                    64,
+                "Length, tag, and object bits should cover 64 bits");
+  static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1),
+                "Kind bits should match allocation kinds");
+};
+
+template <AllocationKind AK> struct AllocationTy {
   void *Start;
-  uint64_t Length : LENGTH_BITS;
-#ifdef USE_TAGS
-  uint64_t Tag : TAG_BITS;
-#endif
-  uint64_t SID : SID_BITS;
+  uint64_t Length : Config<AK>::LENGTH_BITS;
+  uint64_t Tag : Config<AK>::TAG_BITS;
+  uint64_t SID : Config<AK>::SID_BITS;
 
   bool contains(void *Ptr, uint64_t Size) const {
     return Ptr >= Start && advance(Ptr, Size) <= advance(Start, Length);
   }
 };
-static_assert(sizeof(AllocationTy) == sizeof(void *) * 2,
-              "AllocationTy should not exceed two pointers");
 
-static AllocationTy Allocations[static_cast<_OBJECT_TY>(~0)];
-unsigned short NumAllocs = 1;
+/// One table of allocation records for allocation kind \p AK, holding
+/// Config<AK>::SLOTS entries plus a counter. AllocationTracker keeps
+/// Config<AK>::NUM_ALLOCATION_ARRAYS of these tables.
+template <AllocationKind AK> struct AllocationArrayTy {
+  // Allocation records; indexed by the AllocationId stored in a fake pointer.
+  AllocationTy<AK> Arr[Config<AK>::SLOTS];
+  // Counter used by the LOCAL tracker: pre-incremented to pick the next slot
+  // on creation and decremented again on removal.
+  uint32_t Cnt;
+};
 
-struct AllocationPtrTy {
-  static AllocationPtrTy get(void *P) {
-    return convertViaPun<AllocationPtrTy>(P);
-  }
-  static AllocationPtrTy get(intptr_t V) {
-    return convertViaPun<AllocationPtrTy>(V);
+template <AllocationKind AK> struct AllocationPtrTy {
+  static AllocationPtrTy<AK> get(void *P) {
+    return convertViaPun<AllocationPtrTy<AK>>(P);
   }
+
   operator void *() const { return convertViaPun<void *>(*this); }
   operator intptr_t() const { return convertViaPun<intptr_t>(*this); }
-
-  uint64_t PtrOffset : OFFSET_BITS;
-#ifdef USE_TAGS
-  uint64_t AllocationTag : TAG_BITS;
-#endif
-  uint64_t AllocationId : OBJECT_BITS;
+  uint64_t PtrOffset : Config<AK>::OFFSET_BITS;
+  uint64_t AllocationTag : Config<AK>::TAG_BITS;
+  uint64_t AllocationId : Config<AK>::OBJECT_BITS;
+  // Must be last, TODO: merge into TAG
+  uint64_t Kind : Config<AK>::KIND_BITS;
 };
-static_assert(sizeof(AllocationPtrTy) == sizeof(void *),
-              "AllocationTy pointers should be pointer sized");
 
-extern "C" {
+template <AllocationKind AK> struct AllocationTracker {
+  static_assert(sizeof(AllocationTy<AK>) == sizeof(void *) * 2,
+                "AllocationTy should not exceed two pointers");
+  static_assert(sizeof(AllocationPtrTy<AK>) == sizeof(void *),
+                "AllocationTy pointers should be pointer sized");
 
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_new_allocation(void *Start, uint64_t Length, int64_t Id) {
-  if constexpr (LENGTH_BITS < 64)
-    if (Length >= (1UL << (LENGTH_BITS + 1)))
+  static AllocationArrayTy<AK> Allocations[Config<AK>::NUM_ALLOCATION_ARRAYS];
+
+  static void *create(void *Start, uint64_t Length, int64_t AllocationId,
+                      uint32_t Slot) {
+    // printf("New alloc %p, %lu, %li\n", Start, Length, AllocationId);
+
+    if constexpr (Config<AK>::OFFSET_BITS < 64)
+      if (Length >= (1UL << (Config<AK>::OFFSET_BITS))) {
+        __ompx_trap_id->ID = AllocationId;
+        __ompx_trap_id->AccessID = -4;
+        __builtin_trap();
+      }
+
+    uint32_t ThreadId = 0, BlockId = 0;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+
+    // Reserve the 0 element for the null pointer in global space.
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &Cnt = AllocArr.Cnt;
+    if constexpr (AK == AllocationKind::LOCAL)
+      Slot = ++Cnt;
+
+    uint32_t NumSlots = Config<AK>::SLOTS;
+    if (Slot >= NumSlots) {
+      __ompx_trap_id->Offset = Slot;
+      __ompx_trap_id->Length = NumSlots;
+      __ompx_trap_id->ID = AllocationId;
+      __ompx_trap_id->AccessID = -5;
       __builtin_trap();
-  auto No = NumAllocs++;
-  AllocationTy &A = Allocations[No];
-  A.Start = Start;
-  A.Length = Length;
-  A.SID = Id;
-  if (Id != A.SID) {
-    __ompx_trap_id->ID = -2UL;
-    __builtin_trap();
+    }
+
+    auto &A = AllocArr.Arr[Slot];
+
+    A.Start = Start;
+    A.Length = Length;
+    A.SID = AllocationId;
+
+    AllocationPtrTy<AK> AP;
+    AP.PtrOffset = 0;
+    AP.AllocationId = Slot;
+    AP.Kind = (uint64_t)AK;
+    if constexpr (Config<AK>::useTags()) {
+      AP.AllocationTag = ++A.Tag;
+    }
+    return AP;
+  }
+
+  /// Mark the allocation that the fake pointer \p P refers to as released.
+  ///
+  /// The record's length is reset to zero. For LOCAL allocations the counter
+  /// of the selected table is decremented too, but only when the released
+  /// slot is the one the counter currently designates.
+  static void remove(void *P) {
+    const uint32_t SlotIdx = AllocationPtrTy<AK>::get(P).AllocationId;
+
+    // Only the LOCAL tracker selects a table by thread and block id; for the
+    // GLOBAL tracker both ids stay zero and table 0 is used.
+    uint32_t Tid = 0, Bid = 0;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      Tid = __kmpc_get_hardware_thread_id_in_block();
+      Bid = ompx_block_id(0);
+    }
+    const auto ArrIdx = Tid + Bid * __kmpc_get_hardware_num_threads_in_block();
+    auto &Tracked = Allocations[ArrIdx];
+
+    Tracked.Arr[SlotIdx].Length = 0;
+
+    if constexpr (AK == AllocationKind::LOCAL)
+      if (Tracked.Cnt == SlotIdx)
+        --Tracked.Cnt;
+  }
+
+  /// Release up to \p N of the most recently created LOCAL allocations of the
+  /// table selected by the calling thread's thread and block id, newest first.
+  ///
+  /// Each released record has its length reset to zero and the table's
+  /// counter is decremented. The loop stops early once the counter reaches
+  /// zero: slots are handed out via a pre-increment of the counter, so a zero
+  /// counter means nothing is left to pop. Without this guard the unsigned
+  /// counter would wrap around and the next iteration would index far outside
+  /// of the record array.
+  static void remove_n(int32_t N) {
+    static_assert(AK == AllocationKind::LOCAL, "");
+    uint32_t ThreadId = __kmpc_get_hardware_thread_id_in_block();
+    uint32_t BlockId = ompx_block_id(0);
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &Cnt = AllocArr.Cnt;
+    for (int32_t I = 0; I < N && Cnt > 0; ++I) {
+      auto &A = AllocArr.Arr[Cnt--];
+      A.Length = 0;
+    }
   }
 
-  AllocationPtrTy AP;
-  AP.PtrOffset = 0;
-  AP.AllocationId = No;
-#ifdef USE_TAGS
-  A.Tag = 0;
-  AP.AllocationTag = A.Tag;
-#endif
-  return AP;
+  /// Return the fake pointer \p P with \p Offset added to its PtrOffset
+  /// field. \p P is reinterpreted as an AllocationPtrTy<AK>; the other fields
+  /// are left untouched. PtrOffset is a bit-field, so the sum is truncated to
+  /// Config<AK>::OFFSET_BITS bits.
+  static void *advance(void *P, uint64_t Offset) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    AP.PtrOffset += Offset;
+    return AP;
+  }
+
+  /// Validate an access of \p Size through the fake pointer \p P and return
+  /// the real address to use for it.
+  ///
+  /// The allocation record is looked up via the AllocationId encoded in \p P.
+  /// The access is rejected if [PtrOffset, PtrOffset + Size) is not fully
+  /// inside [0, Length) of the record or, when tags are enabled for \p AK, if
+  /// the tag in the pointer differs from the tag in the record. On rejection
+  /// the details (including \p AccessId) are written to __ompx_trap_id and the
+  /// function traps.
+  static void *check(void *P, uint64_t Size, int64_t AccessId) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    uint32_t ThreadId = 0, BlockId = 0;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &A = AllocArr.Arr[AP.AllocationId];
+    uint64_t Offset = AP.PtrOffset;
+    uint64_t Length = A.Length;
+    // Test `Size > Length` first: all operands are unsigned, so `Length - Size`
+    // would wrap around for an access larger than the allocation and the
+    // offset comparison alone would accept it.
+    if (Size > Length || Offset > Length - Size ||
+        (Config<AK>::useTags() && A.Tag != AP.AllocationTag)) {
+      __ompx_trap_id->ID = AP.AllocationId;
+      __ompx_trap_id->Start = A.Start;
+      __ompx_trap_id->Length = A.Length;
+      __ompx_trap_id->Offset = AP.PtrOffset;
+      __ompx_trap_id->AccessID = AccessId;
+      __builtin_trap();
+    }
+    // Use plain byte arithmetic on the real start address. An unqualified
+    // `advance(...)` resolves to the member `advance` of this class, which
+    // treats its argument as a fake pointer and only adds within the low
+    // OFFSET_BITS bits, dropping any carry into the upper address bits.
+    return static_cast<char *>(A.Start) + Offset;
+  }
+
+  /// Translate the fake pointer \p P into the real address it stands for,
+  /// i.e. the start address of the referenced allocation record plus the
+  /// PtrOffset encoded in \p P. No bounds or tag check is performed.
+  static void *unpack(void *P) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    uint32_t ThreadId = 0, BlockId = 0;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &A = AllocArr.Arr[AP.AllocationId];
+    uint64_t Offset = AP.PtrOffset;
+    // Use plain byte arithmetic on the real start address. An unqualified
+    // `advance(...)` resolves to the member `advance` of this class, which
+    // treats its argument as a fake pointer and only adds within the low
+    // OFFSET_BITS bits, dropping any carry into the upper address bits.
+    return static_cast<char *>(A.Start) + Offset;
+  }
+
+  /// Scan every slot of the single GLOBAL allocation table and trap on the
+  /// first record whose length is still non-zero. The record's slot index,
+  /// start and length are stored in __ompx_trap_id together with the access
+  /// id -6 before trapping.
+  static void leak_check() {
+    static_assert(AK == AllocationKind::GLOBAL, "");
+    auto &GlobalTable = Allocations[0];
+    uint32_t Idx = 0;
+    while (Idx < Config<AK>::SLOTS) {
+      auto &Entry = GlobalTable.Arr[Idx];
+      if (Entry.Length) {
+        __ompx_trap_id->ID = Idx;
+        __ompx_trap_id->Start = Entry.Start;
+        __ompx_trap_id->Length = Entry.Length;
+        __ompx_trap_id->AccessID = -6;
+        __builtin_trap();
+      }
+      ++Idx;
+    }
+  }
+};
+
+template <AllocationKind AK>
+AllocationArrayTy<AK>
+    AllocationTracker<AK>::Allocations[Config<AK>::NUM_ALLOCATION_ARRAYS];
+
+extern "C" {
+
+// Dispatch FUNCTION to the LOCAL or GLOBAL AllocationTracker based on the
+// address space of the *real* pointer PTR, as reported by
+// isThreadLocalMemPtr. Expands to two return statements, so it has to be the
+// last thing in the enclosing function; PTR is evaluated more than once.
+#define PTR_CHECK(FUNCTION, PTR, ...)                                          \
+  if (isThreadLocalMemPtr(PTR))                                                \
+    return AllocationTracker<AllocationKind::LOCAL>::FUNCTION(                 \
+        PTR __VA_OPT__(, ) __VA_ARGS__);                                       \
+  return AllocationTracker<AllocationKind::GLOBAL>::FUNCTION(                  \
+      PTR __VA_OPT__(, ) __VA_ARGS__);
+// Dispatch FUNCTION to the LOCAL or GLOBAL AllocationTracker based on the
+// Kind field encoded in the *fake* pointer PTR. The field is read through the
+// GLOBAL pointer layout. Expands to two return statements, so it has to be
+// the last thing in the enclosing function; PTR is evaluated more than once.
+#define FAKE_PTR_CHECK(FUNCTION, PTR, ...)                                     \
+  if (AllocationPtrTy<AllocationKind::GLOBAL>::get(PTR).Kind ==                \
+      (uint32_t)AllocationKind::LOCAL)                                         \
+    return AllocationTracker<AllocationKind::LOCAL>::FUNCTION(                 \
+        PTR __VA_OPT__(, ) __VA_ARGS__);                                       \
+  return AllocationTracker<AllocationKind::GLOBAL>::FUNCTION(                  \
+      PTR __VA_OPT__(, ) __VA_ARGS__);
+
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_new(void *Start, uint64_t Length, int64_t AllocationId, uint32_t Slot) {
+  PTR_CHECK(create, Start, Length, AllocationId, Slot);
+}
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_new_local(void *Start, uint64_t Length, int64_t AllocationId,
+               uint32_t Slot) {
+  return AllocationTracker<AllocationKind::LOCAL>::create(Start, Length,
+                                                          AllocationId, Slot);
+}
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
+ompx_new_global(void *Start, uint64_t Length, uint16_t AllocationId,
+                uint32_t Slot) {
+  AllocationTracker<AllocationKind::GLOBAL>::create(Start, Length, AllocationId,
+                                                    Slot);
 }
 
 [[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
-ompx_free_allocation(void *P) {
-  AllocationPtrTy AP = AllocationPtrTy::get(P);
-  Allocations[AP.AllocationId] = AllocationTy();
+ompx_free(void *P) {
+  FAKE_PTR_CHECK(remove, P);
+}
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
+ompx_free_local_n(int32_t N) {
+  return AllocationTracker<AllocationKind::LOCAL>::remove_n(N);
+}
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
+ompx_free_global(void *P) {
+  AllocationTracker<AllocationKind::GLOBAL>::remove(P);
 }
 
 [[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
 ompx_gep(void *P, uint64_t Offset) {
-  AllocationPtrTy AP = AllocationPtrTy::get(P);
-  AP.PtrOffset += Offset;
-  return AP;
+  FAKE_PTR_CHECK(advance, P, Offset);
+}
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_gep_local(void *P, uint64_t Offset) {
+  return AllocationTracker<AllocationKind::LOCAL>::advance(P, Offset);
+}
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_gep_global(void *P, uint64_t Offset) {
+  return AllocationTracker<AllocationKind::GLOBAL>::advance(P, Offset);
 }
 
 [[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_check_access(void *P, uint64_t Size, uint64_t AccessNo) {
-  AllocationPtrTy AP = AllocationPtrTy::get(P);
-  AllocationTy &A = Allocations[AP.AllocationId];
-#ifdef USE_TAGS
-  if (A.Tag != AP.AllocationTag)
-    __builtin_trap();
-#endif
-  uint64_t Offset = AP.PtrOffset;
-  void *Ptr = advance(A.Start, Offset);
-  if (!A.contains(Ptr, Size)) {
-    //    printf("Out of bounds, access: %lu inside of %lu allocation @ %p\n",
-    //    Offset, A.Length, A.Start);
-    __ompx_trap_id->Start = A.Start;
-    __ompx_trap_id->Length = A.Length;
-    __ompx_trap_id->Offset = AP.PtrOffset;
-    __ompx_trap_id->ID = AP.AllocationId;
-    __ompx_trap_id->AccessID = AccessNo;
-    __builtin_trap();
-  }
-  return Ptr;
+ompx_check(void *P, uint64_t Size, uint64_t AccessId) {
+  FAKE_PTR_CHECK(check, P, Size, AccessId);
 }
 
 [[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_unpack(void *P) {
-  AllocationPtrTy AP = AllocationPtrTy::get(P);
-  AllocationTy &A = Allocations[AP.AllocationId];
-  uint64_t Offset = AP.PtrOffset;
-  void *Ptr = advance(A.Start, Offset);
-  return Ptr;
+ompx_check_local(void *P, uint64_t Size, uint64_t AccessId) {
+  return AllocationTracker<AllocationKind::LOCAL>::check(P, Size, AccessId);
 }
 
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
-ompx_new_host_allocation(void *Start, uint64_t Length, uint16_t AllocationId) {
-  if constexpr (LENGTH_BITS < 64)
-    if (Length >= (1UL << (LENGTH_BITS + 1)))
-      __builtin_trap();
-  AllocationTy &A = Allocations[AllocationId];
-  A.Start = Start;
-  A.Length = Length;
-  A.SID = AllocationId;
-#ifdef USE_TAGS
-  A.Tag = 0;
-#endif
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_check_global(void *P, uint64_t Size, uint64_t AccessId) {
+  return AllocationTracker<AllocationKind::GLOBAL>::check(P, Size, AccessId);
+}
+
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_unpack(void *P) {
+  FAKE_PTR_CHECK(unpack, P);
+}
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_unpack_local(void *P) {
+  return AllocationTracker<AllocationKind::LOCAL>::unpack(P);
+}
+[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
+ompx_unpack_global(void *P) {
+  return AllocationTracker<AllocationKind::GLOBAL>::unpack(P);
 }
 
 [[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
-ompx_free_host_allocation(void *P) {
-  ompx_free_allocation(P);
+ompx_leak_check() {
+  AllocationTracker<AllocationKind::GLOBAL>::leak_check();
 }
 }
 
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp
index 53cc803234867..9ca99bd20f861 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/Utils.cpp
@@ -22,6 +22,7 @@ using namespace ompx;
 namespace impl {
 
 bool isSharedMemPtr(const void *Ptr) { return false; }
+bool isThreadLocalMemPtr(const void *Ptr) { return false; }
 
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
   static_assert(sizeof(unsigned long) == 8, "");
@@ -67,6 +68,10 @@ bool isSharedMemPtr(const void *Ptr) {
   return __builtin_amdgcn_is_shared(
       (const __attribute__((address_space(0))) void *)Ptr);
 }
+/// Return the result of __builtin_amdgcn_is_private for \p Ptr, after casting
+/// it to an address space 0 pointer as done for isSharedMemPtr.
+bool isThreadLocalMemPtr(const void *Ptr) {
+  return __builtin_amdgcn_is_private(
+      (const __attribute__((address_space(0))) void *)Ptr);
+}
 #pragma omp end declare variant
 ///}
 
@@ -92,6 +97,8 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
 
 bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
 
+bool isThreadLocalMemPtr(const void *Ptr) { return __nvvm_isspacep_local(Ptr); }
+
 #pragma omp end declare variant
 ///}
 } // namespace impl
@@ -127,6 +134,9 @@ uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
 }
 
 bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+/// Public entry point; forwards to the impl::isThreadLocalMemPtr variant.
+bool utils::isThreadLocalMemPtr(void *Ptr) {
+  return impl::isThreadLocalMemPtr(Ptr);
+}
 
 extern "C" {
 int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
diff --git a/offload/DeviceRTL/src/exports b/offload/DeviceRTL/src/exports
index 288ddf90b4a9f..b725cca7e1c1d 100644
--- a/offload/DeviceRTL/src/exports
+++ b/offload/DeviceRTL/src/exports
@@ -16,3 +16,5 @@ free
 memcmp
 printf
 __assert_fail
+LocalAllocs
+LocalCnt
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index c64b3ccf2c8f9..c3d9d09f21f0f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3512,8 +3512,8 @@ void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
   //  }
   //
   //  printf("Trap ID: %li\n", OmpxTrapId);
-  printf("Trap ID[%zu:%p] Acc[%zu] : %p[:%lu] vs %lu\n",
-         (uint64_t)Device->OmpxTrapId->ID, (void *)Device->OmpxTrapId->ID,
+  printf("Trap ID[%li:%p] Acc[%li] : %p[:%li] vs %li\n",
+         (int64_t)Device->OmpxTrapId->ID, (void *)Device->OmpxTrapId->ID,
          Device->OmpxTrapId->AccessID, Device->OmpxTrapId->Start,
          Device->OmpxTrapId->Length, Device->OmpxTrapId->Offset);
   fflush(stdout);
diff --git a/offload/src/OpenMP/Mapping.cpp b/offload/src/OpenMP/Mapping.cpp
index 10fc76aef12e6..f198489aef046 100644
--- a/offload/src/OpenMP/Mapping.cpp
+++ b/offload/src/OpenMP/Mapping.cpp
@@ -312,6 +312,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
                 (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, TgtPtrBegin,
                 HasHoldModifier, HstPtrName, /*IsINF=*/false, FakeTgtPtrBegin))
             .first->HDTT);
+    printf("ENTRY %p -> %p\n", LR.TPR.getEntry(),
+           LR.TPR.getEntry()->FakeTgtPtrBegin);
     INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
          "Creating new map entry with HstPtrBase=" DPxMOD
          ", HstPtrBegin=" DPxMOD ", TgtAllocBegin=" DPxMOD
@@ -504,6 +506,7 @@ int MappingInfoTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry,
     return OFFLOAD_FAIL;
   }
 
+  printf("DEL %p -> %p\n", Entry, Entry->FakeTgtPtrBegin);
   if (Entry->FakeTgtPtrBegin)
     ompx_free_allocation_host(Entry->FakeTgtPtrBegin);
 
diff --git a/offload/src/exports b/offload/src/exports
index 5a2f8dd6cdd41..f8d69c3814961 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -77,6 +77,7 @@ VERS1.0 {
     __llvmPopCallConfiguration;
     llvmLaunchKernel;
     ompx_new_allocation_host;
+    ompx_free_allocation_host;
   local:
     *;
 };
diff --git a/preload.cpp b/preload.cpp
index dd21404938351..432a1116a6a90 100644
--- a/preload.cpp
+++ b/preload.cpp
@@ -8,71 +8,85 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <cassert>
 #include <cstdint>
 #include <cstdio>
 
-#ifdef USE_TAGS
-static constexpr uint32_t TAG_BITS = 8;
-#else
-static constexpr uint32_t TAG_BITS = 0;
-#endif
+#define _OBJECT_TY uint16_t
 
-#define _OBJECT_TY unsigned short
-static constexpr uint32_t OBJECT_BITS = sizeof(_OBJECT_TY) * 8;
-static constexpr uint32_t SID_BITS = 16;
+enum class AllocationKind { GLOBAL, LOCAL, LAST = LOCAL };
 
-static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - SID_BITS;
-static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
+template <AllocationKind AK> struct Config {
+  static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 3;
+  static constexpr uint32_t NUM_ALLOCATION_ARRAYS =
+      AK == AllocationKind::GLOBAL ? 1 : (256 * 8 * 4);
+  static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8;
 
-static_assert(LENGTH_BITS + TAG_BITS + SID_BITS == 64,
-              "Length and tag bits should cover 64 bits");
-static_assert(OFFSET_BITS + TAG_BITS + SID_BITS + OBJECT_BITS == 64,
-              "Length, tag, and object bits should cover 64 bits");
+  static constexpr uint32_t OBJECT_BITS =
+      AK == AllocationKind::GLOBAL ? 10 : (sizeof(_OBJECT_TY) * 8);
+  static constexpr uint32_t SLOTS =
+      (1 << (OBJECT_BITS)) / NUM_ALLOCATION_ARRAYS;
+  static constexpr uint32_t KIND_BITS = 1;
+  static constexpr uint32_t SID_BITS = 16 - KIND_BITS;
+
+  static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - SID_BITS - KIND_BITS;
+  static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
+
+  static constexpr bool useTags() { return TAG_BITS > 1; }
+
+  static_assert(LENGTH_BITS + TAG_BITS + KIND_BITS + SID_BITS == 64,
+                "Length and tag bits should cover 64 bits");
+  static_assert(OFFSET_BITS + TAG_BITS + KIND_BITS + SID_BITS + OBJECT_BITS ==
+                    64,
+                "Length, tag, and object bits should cover 64 bits");
+  static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1),
+                "Kind bits should match allocation kinds");
+};
 
 template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
   return *((DstTy *)(&V));
 }
 
-struct AllocationPtrTy {
-  static AllocationPtrTy get(void *P) {
-    return convertViaPun<AllocationPtrTy>(P);
-  }
-  static AllocationPtrTy get(intptr_t V) {
-    return convertViaPun<AllocationPtrTy>(V);
+template <AllocationKind AK> struct AllocationPtrTy {
+  static AllocationPtrTy<AK> get(void *P) {
+    return convertViaPun<AllocationPtrTy<AK>>(P);
   }
+
   operator void *() const { return convertViaPun<void *>(*this); }
   operator intptr_t() const { return convertViaPun<intptr_t>(*this); }
-
-  uint64_t PtrOffset : OFFSET_BITS;
-#ifdef USE_TAGS
-  uint64_t AllocationTag : TAG_BITS;
-#endif
-  uint64_t AllocationId : OBJECT_BITS;
+  uint64_t PtrOffset : Config<AK>::OFFSET_BITS;
+  uint64_t AllocationTag : Config<AK>::TAG_BITS;
+  uint64_t AllocationId : Config<AK>::OBJECT_BITS;
+  // Must be last, TODO: merge into TAG
+  uint64_t Kind : Config<AK>::KIND_BITS;
 };
-static_assert(sizeof(AllocationPtrTy) == sizeof(void *),
+
+static_assert(sizeof(AllocationPtrTy<AllocationKind::GLOBAL>) == sizeof(void *),
               "AllocationTy pointers should be pointer sized");
 
 extern "C" {
 
 [[gnu::flatten, gnu::always_inline]] void *ompx_new(uint16_t &AllocationId) {
-  static uint16_t NumHostAllocs = static_cast<_OBJECT_TY>(~0) - 1;
+  static uint16_t NumHostAllocs = Config<AllocationKind::GLOBAL>::SLOTS - 1;
   AllocationId = NumHostAllocs--;
-  AllocationPtrTy AP;
+  AllocationPtrTy<AllocationKind::GLOBAL> AP;
   AP.PtrOffset = 0;
   AP.AllocationId = AllocationId;
+  AP.Kind = (uint32_t)AllocationKind::GLOBAL;
   return AP;
 }
 
 #pragma omp begin declare target
-void ompx_new_host_allocation(void *P, uint64_t Bytes, uint16_t AllocationId);
-void ompx_free_host_allocation(void *P);
+void ompx_new_global(void *P, uint64_t Bytes, uint16_t AllocationId,
+                     uint32_t Slot);
+void ompx_free_global(void *P);
 #pragma omp end declare target
 
 void *ompx_new_allocation_host(void *P, uint64_t Bytes) {
   uint16_t AllocationId;
   void *NewP = ompx_new(AllocationId);
 #pragma omp target is_device_ptr(P)
-  ompx_new_host_allocation(P, Bytes, AllocationId);
+  ompx_new_global(P, Bytes, AllocationId, AllocationId);
   printf("registered %p[:%10zu] -> %zu:%p\n", P, Bytes, (uint64_t)AllocationId,
          NewP);
   fflush(stdout);
@@ -83,6 +97,6 @@ void ompx_free_allocation_host(void *P) {
   printf("unregister   %p\n", P);
   fflush(stdout);
 #pragma omp target is_device_ptr(P)
-  ompx_free_host_allocation(P);
+  ompx_free_global(P);
 }
 }

>From a0065100c2a0ea7ea02ecaac4c0253a5c18f7b0a Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 21 Jun 2024 17:12:19 -0700
Subject: [PATCH 07/31] Restructuring

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp |  80 +++-
 offload/DeviceRTL/CMakeLists.txt              |   3 +-
 offload/DeviceRTL/include/Allocator.h         |   2 +-
 offload/DeviceRTL/include/Configuration.h     |   2 +-
 .../include/{Types.h => DeviceTypes.h}        |   2 +-
 offload/DeviceRTL/include/DeviceUtils.h       |  55 +++
 offload/DeviceRTL/include/Interface.h         |   2 +-
 offload/DeviceRTL/include/LibC.h              |   2 +-
 offload/DeviceRTL/include/Mapping.h           |   2 +-
 offload/DeviceRTL/include/State.h             |   4 +-
 offload/DeviceRTL/include/Synchronization.h   |   2 +-
 offload/DeviceRTL/include/Utils.h             | 103 ------
 offload/DeviceRTL/src/AllocationTracker.cpp   | 343 ------------------
 offload/DeviceRTL/src/Allocator.cpp           |   4 +-
 offload/DeviceRTL/src/Configuration.cpp       |   2 +-
 offload/DeviceRTL/src/Debug.cpp               |   2 +-
 offload/DeviceRTL/src/Kernel.cpp              |   2 +-
 offload/DeviceRTL/src/Mapping.cpp             |   4 +-
 offload/DeviceRTL/src/Misc.cpp                |   2 +-
 offload/DeviceRTL/src/Parallelism.cpp         |   4 +-
 offload/DeviceRTL/src/Reduction.cpp           |   4 +-
 offload/DeviceRTL/src/Sanitizer.cpp           | 300 +++++++++++++++
 offload/DeviceRTL/src/State.cpp               |  16 +-
 offload/DeviceRTL/src/Synchronization.cpp     |   4 +-
 offload/DeviceRTL/src/Tasking.cpp             |   6 +-
 offload/DeviceRTL/src/Utils.cpp               |   2 +-
 offload/DeviceRTL/src/Workshare.cpp           |   4 +-
 offload/include/Shared/Environment.h          |   8 -
 offload/include/Shared/RefCnt.h               |  56 +++
 offload/include/Shared/Sanitizer.h            | 244 +++++++++++++
 offload/include/Shared/Types.h                |  22 ++
 offload/include/Shared/Utils.h                |  97 +++--
 offload/include/device.h                      |   8 +-
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 125 +++----
 .../amdgpu/utils/UtilitiesRTL.h               |   4 +-
 .../common/include/PluginInterface.h          |  56 ++-
 .../common/src/GlobalHandler.cpp              |   4 +-
 offload/plugins-nextgen/common/src/JIT.cpp    |   4 +-
 .../common/src/PluginInterface.cpp            | 222 ++++++++++--
 offload/plugins-nextgen/cuda/src/rtl.cpp      |   2 +-
 offload/src/CMakeLists.txt                    |  18 +
 offload/src/DeviceImage.cpp                   |   5 +-
 offload/src/Kernels/Sanitizer.cpp             |  27 ++
 offload/src/OpenMP/Mapping.cpp                |  34 +-
 offload/src/device.cpp                        |  13 +-
 offload/src/exports                           |   1 +
 offload/src/omptarget.cpp                     |  27 +-
 offload/test/sanitizer/global_null.c          |  28 ++
 offload/test/sanitizer/heap_null.c            |  26 ++
 offload/test/sanitizer/volatile_stack_null.c  |  26 ++
 preload.cpp                                   | 102 ------
 51 files changed, 1296 insertions(+), 821 deletions(-)
 rename offload/DeviceRTL/include/{Types.h => DeviceTypes.h} (99%)
 create mode 100644 offload/DeviceRTL/include/DeviceUtils.h
 delete mode 100644 offload/DeviceRTL/include/Utils.h
 delete mode 100644 offload/DeviceRTL/src/AllocationTracker.cpp
 create mode 100644 offload/DeviceRTL/src/Sanitizer.cpp
 create mode 100644 offload/include/Shared/RefCnt.h
 create mode 100644 offload/include/Shared/Sanitizer.h
 create mode 100644 offload/include/Shared/Types.h
 create mode 100644 offload/src/Kernels/Sanitizer.cpp
 create mode 100644 offload/test/sanitizer/global_null.c
 create mode 100644 offload/test/sanitizer/heap_null.c
 create mode 100644 offload/test/sanitizer/volatile_stack_null.c
 delete mode 100644 preload.cpp

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 738b75bd3bce4..7b726378f7d6b 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -90,6 +91,10 @@ class GPUSanImpl final {
   instrumentReturns(SmallVectorImpl<std::pair<AllocaInst *, Value *>> &Allocas,
                     SmallVectorImpl<ReturnInst *> &Returns);
 
+  Value *getPC(IRBuilder<> &IRB);
+  Value *getFunctionName(IRBuilder<> &IRB);
+  Value *getFileName(IRBuilder<> &IRB);
+  Value *getLineNo(IRBuilder<> &IRB);
   PtrOrigin getPtrOrigin(LoopInfo &LI, Value *Ptr);
 
   FunctionCallee getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy,
@@ -104,30 +109,31 @@ class GPUSanImpl final {
   FunctionCallee getNewFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(NewFn[PO], "ompx_new" + getSuffix(PO), PtrTy,
-                         {PtrTy, Int64Ty, Int64Ty});
+                         {PtrTy, Int64Ty, Int64Ty, Int64Ty});
   }
   FunctionCallee getFreeFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(FreeFn[PO], "ompx_free" + getSuffix(PO), VoidTy,
-                         {PtrTy});
+                         {PtrTy, Int64Ty});
   }
   FunctionCallee getFreeNLocalFn() {
     return getOrCreateFn(FreeNLocal, "ompx_free_local_n", VoidTy, {Int32Ty});
   }
   FunctionCallee getCheckFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
-    return getOrCreateFn(CheckFn[PO], "ompx_check" + getSuffix(PO), PtrTy,
-                         {PtrTy, Int64Ty, Int64Ty});
+    return getOrCreateFn(
+        CheckFn[PO], "ompx_check" + getSuffix(PO), PtrTy,
+        {PtrTy, Int64Ty, Int64Ty, Int64Ty, PtrTy, PtrTy, Int64Ty});
   }
   FunctionCallee getGEPFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(GEPFn[PO], "ompx_gep" + getSuffix(PO), PtrTy,
-                         {PtrTy, Int64Ty});
+                         {PtrTy, Int64Ty, Int64Ty});
   }
   FunctionCallee getUnpackFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(UnpackFn[PO], "ompx_unpack" + getSuffix(PO), PtrTy,
-                         {PtrTy});
+                         {PtrTy, Int64Ty});
   }
   FunctionCallee getLeakCheckFn() {
     FunctionCallee LeakCheckFn;
@@ -154,10 +160,53 @@ class GPUSanImpl final {
   FunctionCallee CheckFn[3];
   FunctionCallee UnpackFn[3];
   FunctionCallee FreeNLocal;
+
+  StringMap<Value *> GlobalStringMap; // Cache of already emitted string pointers, keyed by text; shared by getFunctionName and getFileName.
 };
 
 } // end anonymous namespace
 
+Value *GPUSanImpl::getPC(IRBuilder<> &IRB) { // Emits, at IRB's insert point, an intrinsic call whose result (named "PC") has type Int64Ty.
+  return IRB.CreateIntrinsic(Int64Ty, Intrinsic::amdgcn_s_getpc, {}, nullptr,
+                             "PC"); // NOTE(review): amdgcn_* intrinsic; presumably not usable when instrumenting for other GPU targets - confirm.
+}
+Value *GPUSanImpl::getFunctionName(IRBuilder<> &IRB) { // Returns a PtrTy pointer to a string naming the function at IRB's insert point.
+  const auto &DLoc = IRB.getCurrentDebugLocation();
+  StringRef FnName = IRB.GetInsertPoint()->getFunction()->getName(); // Fallback: the IR-level function name.
+  if (DLoc && DLoc.get()) {
+    StringRef SubprogramName = DLoc.get()->getSubprogramLinkageName();
+    if (!SubprogramName.empty()) // Prefer the name recorded in debug info when there is one.
+      FnName = SubprogramName;
+  }
+  StringRef Name = FnName.take_back(255); // Keep at most the last 255 characters of the name.
+  Value *&NameVal = GlobalStringMap[Name]; // Reuse the pointer if this exact text was emitted before.
+  if (!NameVal)
+    NameVal = IRB.CreateAddrSpaceCast(
+        IRB.CreateGlobalStringPtr(Name, "", DL.getDefaultGlobalsAddressSpace(),
+                                  &M),
+        PtrTy); // Cast to PtrTy, the pointer type the check function is declared with.
+  return NameVal;
+}
+Value *GPUSanImpl::getFileName(IRBuilder<> &IRB) { // Returns a PtrTy pointer to the file name of IRB's current debug location.
+  const auto &DLoc = IRB.getCurrentDebugLocation();
+  if (!DLoc || DLoc->getFilename().empty())
+    return ConstantPointerNull::get(PtrTy); // No debug location or no file name: hand back a null pointer.
+  StringRef Name = DLoc->getFilename().take_back(255); // Keep at most the last 255 characters of the path.
+  Value *&NameVal = GlobalStringMap[Name]; // Same cache as getFunctionName; identical text maps to one entry.
+  if (!NameVal)
+    NameVal = IRB.CreateAddrSpaceCast(
+        IRB.CreateGlobalStringPtr(Name, "", DL.getDefaultGlobalsAddressSpace(),
+                                  &M),
+        PtrTy); // Cast to PtrTy, the pointer type the check function is declared with.
+  return NameVal;
+}
+Value *GPUSanImpl::getLineNo(IRBuilder<> &IRB) { // Returns the line of IRB's current debug location as an Int64Ty constant.
+  const auto &DLoc = IRB.getCurrentDebugLocation();
+  if (!DLoc)
+    return Constant::getNullValue(Int64Ty); // Without a debug location the reported line is 0.
+  return ConstantInt::get(Int64Ty, DLoc.getLine());
+}
+
 PtrOrigin GPUSanImpl::getPtrOrigin(LoopInfo &LI, Value *Ptr) {
   SmallVector<const Value *> Objects;
   getUnderlyingObjects(Ptr, Objects, &LI);
@@ -223,7 +272,8 @@ Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size,
   Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, PtrTy);
   static int AllocationId = 1;
   auto *CB = IRB.CreateCall(
-      Fn, {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++)},
+      Fn,
+      {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++), getPC(IRB)},
       I.getName() + ".san");
   I.replaceUsesWithIf(IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()),
                       [=](Use &U) {
@@ -258,7 +308,9 @@ void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
   IRBuilder<> IRB(&I);
   Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
   auto *CB = IRB.CreateCall(
-      getCheckFn(PO), {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId)},
+      getCheckFn(PO),
+      {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId), getPC(IRB),
+       getFunctionName(IRB), getFileName(IRB), getLineNo(IRB)},
       I.getName() + ".san");
   I.setOperand(PtrIdx,
                IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType()));
@@ -285,9 +337,9 @@ void GPUSanImpl::instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP) {
                  Constant::getNullValue(PtrOp->getType()));
   IRBuilder<> IRB(GEP.getNextNode());
   Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
-  auto *CB =
-      IRB.CreateCall(getGEPFn(PO), {PlainPtrOp, UndefValue::get(Int64Ty)},
-                     GEP.getName() + ".san");
+  auto *CB = IRB.CreateCall(getGEPFn(PO),
+                            {PlainPtrOp, UndefValue::get(Int64Ty), getPC(IRB)},
+                            GEP.getName() + ".san");
   GEP.replaceAllUsesWith(
       IRB.CreatePointerBitCastOrAddrSpaceCast(CB, GEP.getType()));
   Value *Offset =
@@ -312,7 +364,7 @@ bool GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) {
         if (PO > GLOBAL)
           continue;
         Value *PlainOp = IRB.CreatePointerBitCastOrAddrSpaceCast(Op, PtrTy);
-        auto *CB = IRB.CreateCall(getUnpackFn(PO), {PlainOp},
+        auto *CB = IRB.CreateCall(getUnpackFn(PO), {PlainOp, getPC(IRB)},
                                   Op->getName() + ".unpack");
         CI.setArgOperand(
             I, IRB.CreatePointerBitCastOrAddrSpaceCast(CB, Op->getType()));
@@ -392,8 +444,8 @@ bool GPUSanImpl::instrument() {
   for (Function &Fn : M)
     if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") &&
         !Fn.getName().starts_with("rpc_"))
-      Changed |= instrumentFunction(Fn);
-  M.dump();
+      if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
+        Changed |= instrumentFunction(Fn);
   return Changed;
 }
 
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 2027725abd40e..644befc532ab2 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
       "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
 endif()
 list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
+set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE) # Publish the de-duplicated architecture list to the parent scope as well.
 
 set(include_files
   ${include_directory}/Allocator.h
@@ -85,7 +86,7 @@ set(include_files
 
 set(src_files
   ${source_directory}/Allocator.cpp
-  ${source_directory}/AllocationTracker.cpp
+  ${source_directory}/Sanitizer.cpp
   ${source_directory}/Configuration.cpp
   ${source_directory}/Debug.cpp
   ${source_directory}/Kernel.cpp
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
index a28eb0fb2977e..6bb1cafac720f 100644
--- a/offload/DeviceRTL/include/Allocator.h
+++ b/offload/DeviceRTL/include/Allocator.h
@@ -12,7 +12,7 @@
 #ifndef OMPTARGET_ALLOCATOR_H
 #define OMPTARGET_ALLOCATOR_H
 
-#include "Types.h"
+#include "DeviceTypes.h"
 
 // Forward declaration.
 struct KernelEnvironmentTy;
diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h
index 8e6f5c89cbf24..f8b7a6c3c6c9d 100644
--- a/offload/DeviceRTL/include/Configuration.h
+++ b/offload/DeviceRTL/include/Configuration.h
@@ -15,7 +15,7 @@
 
 #include "Shared/Environment.h"
 
-#include "Types.h"
+#include "DeviceTypes.h"
 
 namespace ompx {
 namespace config {
diff --git a/offload/DeviceRTL/include/Types.h b/offload/DeviceRTL/include/DeviceTypes.h
similarity index 99%
rename from offload/DeviceRTL/include/Types.h
rename to offload/DeviceRTL/include/DeviceTypes.h
index 2e12d9da0353b..bf30ba31260f5 100644
--- a/offload/DeviceRTL/include/Types.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -1,4 +1,4 @@
-//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===//
+//===---- DeviceTypes.h - OpenMP types ---------------------------- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
new file mode 100644
index 0000000000000..7b8871a766161
--- /dev/null
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -0,0 +1,55 @@
+//===--- DeviceUtils.h - OpenMP device runtime utility functions -- C++ -*-===//
+//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Device-only utilities: warp shuffle/ballot, pack/unpack, memory-space tests.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_DEVICE_UTILS_H
+#define OMPTARGET_DEVICERTL_DEVICE_UTILS_H
+
+#include "Shared/Utils.h"
+#include "DeviceTypes.h"
+
+#pragma omp begin declare target device_type(nohost)
+
+namespace utils {
+
+/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
+/// is identified by \p Mask.
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
+
+int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width);
+
+uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+
+/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
+uint64_t pack(uint32_t LowBits, uint32_t HighBits);
+
+/// Unpack \p Val into \p LowBits and \p HighBits.
+void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
+
+/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
+bool isSharedMemPtr(void *Ptr);
+
+/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)).
+bool isThreadLocalMemPtr(void *Ptr);
+
+/// A pointer variable that by design has an `undef` value. Use with care.
+[[clang::loader_uninitialized]] static void *const UndefPtr;
+
+#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
+
+} // namespace utils
+
+#pragma omp end declare target
+
+#endif // OMPTARGET_DEVICERTL_DEVICE_UTILS_H
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index f4854ed3d1678..12244a0971f49 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -14,7 +14,7 @@
 
 #include "Shared/Environment.h"
 
-#include "Types.h"
+#include "DeviceTypes.h"
 
 /// External API
 ///
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
index dde86af783af9..6e02b4aca462a 100644
--- a/offload/DeviceRTL/include/LibC.h
+++ b/offload/DeviceRTL/include/LibC.h
@@ -12,7 +12,7 @@
 #ifndef OMPTARGET_LIBC_H
 #define OMPTARGET_LIBC_H
 
-#include "Types.h"
+#include "DeviceTypes.h"
 
 extern "C" {
 
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 165904644dbb9..2fb87abe5418c 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -12,7 +12,7 @@
 #ifndef OMPTARGET_MAPPING_H
 #define OMPTARGET_MAPPING_H
 
-#include "Types.h"
+#include "DeviceTypes.h"
 
 namespace ompx {
 
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index 1a3490394458f..37699529e726f 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -16,8 +16,8 @@
 
 #include "Debug.h"
 #include "Mapping.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
 
 // Forward declaration.
 struct KernelEnvironmentTy;
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index af9e1a673e6a2..874974cc861df 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -12,7 +12,7 @@
 #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
 #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
 
-#include "Types.h"
+#include "DeviceTypes.h"
 
 namespace ompx {
 
diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h
deleted file mode 100644
index 5797229e40a78..0000000000000
--- a/offload/DeviceRTL/include/Utils.h
+++ /dev/null
@@ -1,103 +0,0 @@
-//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_UTILS_H
-#define OMPTARGET_DEVICERTL_UTILS_H
-
-#include "Types.h"
-
-#pragma omp begin declare target device_type(nohost)
-
-namespace ompx {
-namespace utils {
-
-/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
-/// is identified by \p Mask.
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
-
-int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
-uint64_t pack(uint32_t LowBits, uint32_t HighBits);
-
-/// Unpack \p Val into \p LowBits and \p HighBits.
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
-
-/// Round up \p V to a \p Boundary.
-template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
-  return (V + Boundary - 1) / Boundary * Boundary;
-}
-
-/// Advance \p Ptr by \p Bytes bytes.
-template <typename Ty1, typename Ty2> inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) {
-  return reinterpret_cast<Ty1 *>(reinterpret_cast<char *>(Ptr) + Bytes);
-}
-
-/// Return the first bit set in \p V.
-inline uint32_t ffs(uint32_t V) {
-  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
-  return __builtin_ffs(V);
-}
-
-/// Return the first bit set in \p V.
-inline uint32_t ffs(uint64_t V) {
-  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
-  return __builtin_ffsl(V);
-}
-
-/// Return the number of bits set in \p V.
-inline uint32_t popc(uint32_t V) {
-  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
-  return __builtin_popcount(V);
-}
-
-/// Return the number of bits set in \p V.
-inline uint32_t popc(uint64_t V) {
-  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
-  return __builtin_popcountl(V);
-}
-
-/// Return \p V aligned "upwards" according to \p Align.
-template <typename Ty1, typename Ty2> inline Ty1 align_up(Ty1 V, Ty2 Align) {
-  return ((V + Ty1(Align) - 1) / Ty1(Align)) * Ty1(Align);
-}
-/// Return \p V aligned "downwards" according to \p Align.
-template <typename Ty1, typename Ty2> inline Ty1 align_down(Ty1 V, Ty2 Align) {
-  return V - V % Align;
-}
-
-/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
-bool isSharedMemPtr(void *Ptr);
-
-/// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)).
-bool isThreadLocalMemPtr(void *Ptr);
-
-/// Return \p V typed punned as \p DstTy.
-template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
-  return *((DstTy *)(&V));
-}
-
-/// A  pointer variable that has by design an `undef` value. Use with care.
-[[clang::loader_uninitialized]] static void *const UndefPtr;
-
-#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
-#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
-
-} // namespace utils
-} // namespace ompx
-
-#pragma omp end declare target
-
-#endif
diff --git a/offload/DeviceRTL/src/AllocationTracker.cpp b/offload/DeviceRTL/src/AllocationTracker.cpp
deleted file mode 100644
index 0b3332d47dad8..0000000000000
--- a/offload/DeviceRTL/src/AllocationTracker.cpp
+++ /dev/null
@@ -1,343 +0,0 @@
-//===------ AllocationTracker.cpp - Track allocation for sanitizers -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Interface.h"
-#include "LibC.h"
-#include "Shared/Environment.h"
-#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
-
-using namespace ompx;
-using namespace utils;
-
-#pragma omp begin declare target device_type(nohost)
-
-extern "C" int ompx_block_id(int Dim);
-
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] OMPXTrapIDTy *__ompx_trap_id;
-
-#define _OBJECT_TY uint16_t
-
-enum class AllocationKind { GLOBAL, LOCAL, LAST = LOCAL };
-
-template <AllocationKind AK> struct Config {
-  static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 3;
-  static constexpr uint32_t NUM_ALLOCATION_ARRAYS =
-      AK == AllocationKind::GLOBAL ? 1 : (256 * 8 * 4);
-  static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8;
-
-  static constexpr uint32_t OBJECT_BITS =
-      AK == AllocationKind::GLOBAL ? 10 : (sizeof(_OBJECT_TY) * 8);
-  static constexpr uint32_t SLOTS =
-      (1 << (OBJECT_BITS)) / NUM_ALLOCATION_ARRAYS;
-  static constexpr uint32_t KIND_BITS = 1;
-  static constexpr uint32_t SID_BITS = 16 - KIND_BITS;
-
-  static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - SID_BITS - KIND_BITS;
-  static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
-
-  static constexpr bool useTags() { return TAG_BITS > 1; }
-
-  static_assert(LENGTH_BITS + TAG_BITS + KIND_BITS + SID_BITS == 64,
-                "Length and tag bits should cover 64 bits");
-  static_assert(OFFSET_BITS + TAG_BITS + KIND_BITS + SID_BITS + OBJECT_BITS ==
-                    64,
-                "Length, tag, and object bits should cover 64 bits");
-  static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1),
-                "Kind bits should match allocation kinds");
-};
-
-template <AllocationKind AK> struct AllocationTy {
-  void *Start;
-  uint64_t Length : Config<AK>::LENGTH_BITS;
-  uint64_t Tag : Config<AK>::TAG_BITS;
-  uint64_t SID : Config<AK>::SID_BITS;
-
-  bool contains(void *Ptr, uint64_t Size) const {
-    return Ptr >= Start && advance(Ptr, Size) <= advance(Start, Length);
-  }
-};
-
-template <AllocationKind AK> struct AllocationArrayTy {
-  AllocationTy<AK> Arr[Config<AK>::SLOTS];
-  uint32_t Cnt;
-};
-
-template <AllocationKind AK> struct AllocationPtrTy {
-  static AllocationPtrTy<AK> get(void *P) {
-    return convertViaPun<AllocationPtrTy<AK>>(P);
-  }
-
-  operator void *() const { return convertViaPun<void *>(*this); }
-  operator intptr_t() const { return convertViaPun<intptr_t>(*this); }
-  uint64_t PtrOffset : Config<AK>::OFFSET_BITS;
-  uint64_t AllocationTag : Config<AK>::TAG_BITS;
-  uint64_t AllocationId : Config<AK>::OBJECT_BITS;
-  // Must be last, TODO: merge into TAG
-  uint64_t Kind : Config<AK>::KIND_BITS;
-};
-
-template <AllocationKind AK> struct AllocationTracker {
-  static_assert(sizeof(AllocationTy<AK>) == sizeof(void *) * 2,
-                "AllocationTy should not exceed two pointers");
-  static_assert(sizeof(AllocationPtrTy<AK>) == sizeof(void *),
-                "AllocationTy pointers should be pointer sized");
-
-  static AllocationArrayTy<AK> Allocations[Config<AK>::NUM_ALLOCATION_ARRAYS];
-
-  static void *create(void *Start, uint64_t Length, int64_t AllocationId,
-                      uint32_t Slot) {
-    // printf("New alloc %p, %lu, %li\n", Start, Length, AllocationId);
-
-    if constexpr (Config<AK>::OFFSET_BITS < 64)
-      if (Length >= (1UL << (Config<AK>::OFFSET_BITS))) {
-        __ompx_trap_id->ID = AllocationId;
-        __ompx_trap_id->AccessID = -4;
-        __builtin_trap();
-      }
-
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-
-    // Reserve the 0 element for the null pointer in global space.
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
-    auto &Cnt = AllocArr.Cnt;
-    if constexpr (AK == AllocationKind::LOCAL)
-      Slot = ++Cnt;
-
-    uint32_t NumSlots = Config<AK>::SLOTS;
-    if (Slot >= NumSlots) {
-      __ompx_trap_id->Offset = Slot;
-      __ompx_trap_id->Length = NumSlots;
-      __ompx_trap_id->ID = AllocationId;
-      __ompx_trap_id->AccessID = -5;
-      __builtin_trap();
-    }
-
-    auto &A = AllocArr.Arr[Slot];
-
-    A.Start = Start;
-    A.Length = Length;
-    A.SID = AllocationId;
-
-    AllocationPtrTy<AK> AP;
-    AP.PtrOffset = 0;
-    AP.AllocationId = Slot;
-    AP.Kind = (uint64_t)AK;
-    if constexpr (Config<AK>::useTags()) {
-      AP.AllocationTag = ++A.Tag;
-    }
-    return AP;
-  }
-
-  static void remove(void *P) {
-    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    uint32_t AllocationId = AP.AllocationId;
-
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
-    auto &A = AllocArr.Arr[AllocationId];
-    A.Length = 0;
-
-    auto &Cnt = AllocArr.Cnt;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      if (Cnt == AllocationId)
-        --Cnt;
-    }
-  }
-
-  static void remove_n(int32_t N) {
-    static_assert(AK == AllocationKind::LOCAL, "");
-    uint32_t ThreadId = __kmpc_get_hardware_thread_id_in_block();
-    uint32_t BlockId = ompx_block_id(0);
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
-    auto &Cnt = AllocArr.Cnt;
-    for (int32_t I = 0; I < N; ++I) {
-      auto &A = AllocArr.Arr[Cnt--];
-      A.Length = 0;
-    }
-  }
-
-  static void *advance(void *P, uint64_t Offset) {
-    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    AP.PtrOffset += Offset;
-    return AP;
-  }
-
-  static void *check(void *P, uint64_t Size, int64_t AccessId) {
-    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
-    auto &A = AllocArr.Arr[AP.AllocationId];
-    uint64_t Offset = AP.PtrOffset;
-    uint64_t Length = A.Length;
-    if (Offset > Length - Size ||
-        (Config<AK>::useTags() && A.Tag != AP.AllocationTag)) {
-      __ompx_trap_id->ID = AP.AllocationId;
-      __ompx_trap_id->Start = A.Start;
-      __ompx_trap_id->Length = A.Length;
-      __ompx_trap_id->Offset = AP.PtrOffset;
-      __ompx_trap_id->AccessID = AccessId;
-      __builtin_trap();
-    }
-    return advance(A.Start, Offset);
-  }
-
-  static void *unpack(void *P) {
-    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
-    auto &A = AllocArr.Arr[AP.AllocationId];
-    uint64_t Offset = AP.PtrOffset;
-    void *Ptr = advance(A.Start, Offset);
-    return Ptr;
-  }
-
-  static void leak_check() {
-    static_assert(AK == AllocationKind::GLOBAL, "");
-    auto &AllocArr = Allocations[0];
-    for (uint32_t I = 0; I < Config<AK>::SLOTS; ++I) {
-      auto &A = AllocArr.Arr[I];
-      if (!A.Length)
-        continue;
-      __ompx_trap_id->ID = I;
-      __ompx_trap_id->Start = A.Start;
-      __ompx_trap_id->Length = A.Length;
-      __ompx_trap_id->AccessID = -6;
-      __builtin_trap();
-    }
-  }
-};
-
-template <AllocationKind AK>
-AllocationArrayTy<AK>
-    AllocationTracker<AK>::Allocations[Config<AK>::NUM_ALLOCATION_ARRAYS];
-
-extern "C" {
-
-#define PTR_CHECK(FUNCTION, PTR, ...)                                          \
-  if (isThreadLocalMemPtr(PTR))                                                \
-    return AllocationTracker<AllocationKind::LOCAL>::FUNCTION(                 \
-        PTR __VA_OPT__(, ) __VA_ARGS__);                                       \
-  return AllocationTracker<AllocationKind::GLOBAL>::FUNCTION(                  \
-      PTR __VA_OPT__(, ) __VA_ARGS__);
-#define FAKE_PTR_CHECK(FUNCTION, PTR, ...)                                     \
-  if (AllocationPtrTy<AllocationKind::GLOBAL>::get(PTR).Kind ==                \
-      (uint32_t)AllocationKind::LOCAL)                                         \
-    return AllocationTracker<AllocationKind::LOCAL>::FUNCTION(                 \
-        PTR __VA_OPT__(, ) __VA_ARGS__);                                       \
-  return AllocationTracker<AllocationKind::GLOBAL>::FUNCTION(                  \
-      PTR __VA_OPT__(, ) __VA_ARGS__);
-
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_new(void *Start, uint64_t Length, int64_t AllocationId, uint32_t Slot) {
-  PTR_CHECK(create, Start, Length, AllocationId, Slot);
-}
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_new_local(void *Start, uint64_t Length, int64_t AllocationId,
-               uint32_t Slot) {
-  return AllocationTracker<AllocationKind::LOCAL>::create(Start, Length,
-                                                          AllocationId, Slot);
-}
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
-ompx_new_global(void *Start, uint64_t Length, uint16_t AllocationId,
-                uint32_t Slot) {
-  AllocationTracker<AllocationKind::GLOBAL>::create(Start, Length, AllocationId,
-                                                    Slot);
-}
-
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
-ompx_free(void *P) {
-  FAKE_PTR_CHECK(remove, P);
-}
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
-ompx_free_local_n(int32_t N) {
-  return AllocationTracker<AllocationKind::LOCAL>::remove_n(N);
-}
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
-ompx_free_global(void *P) {
-  AllocationTracker<AllocationKind::GLOBAL>::remove(P);
-}
-
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_gep(void *P, uint64_t Offset) {
-  FAKE_PTR_CHECK(advance, P, Offset);
-}
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_gep_local(void *P, uint64_t Offset) {
-  return AllocationTracker<AllocationKind::LOCAL>::advance(P, Offset);
-}
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_gep_global(void *P, uint64_t Offset) {
-  return AllocationTracker<AllocationKind::GLOBAL>::advance(P, Offset);
-}
-
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_check(void *P, uint64_t Size, uint64_t AccessId) {
-  FAKE_PTR_CHECK(check, P, Size, AccessId);
-}
-
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_check_local(void *P, uint64_t Size, uint64_t AccessId) {
-  return AllocationTracker<AllocationKind::LOCAL>::check(P, Size, AccessId);
-}
-
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_check_global(void *P, uint64_t Size, uint64_t AccessId) {
-  return AllocationTracker<AllocationKind::GLOBAL>::check(P, Size, AccessId);
-}
-
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_unpack(void *P) {
-  FAKE_PTR_CHECK(unpack, P);
-}
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_unpack_local(void *P) {
-  return AllocationTracker<AllocationKind::LOCAL>::unpack(P);
-}
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void *
-ompx_unpack_global(void *P) {
-  return AllocationTracker<AllocationKind::GLOBAL>::unpack(P);
-}
-
-[[gnu::flatten, gnu::always_inline, gnu::used, gnu::retain]] void
-ompx_leak_check() {
-  AllocationTracker<AllocationKind::GLOBAL>::leak_check();
-}
-}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
index c9c940de62c1a..2a85a34d32f6e 100644
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ b/offload/DeviceRTL/src/Allocator.cpp
@@ -14,8 +14,8 @@
 #include "Configuration.h"
 #include "Mapping.h"
 #include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
 
 using namespace ompx;
 
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index ef0c3663536f5..4d97ad67313aa 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -13,7 +13,7 @@
 
 #include "Configuration.h"
 #include "State.h"
-#include "Types.h"
+#include "DeviceTypes.h"
 
 using namespace ompx;
 
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index 31cd54e3de35c..bf228ec1b32cc 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -17,7 +17,7 @@
 #include "Interface.h"
 #include "Mapping.h"
 #include "State.h"
-#include "Types.h"
+#include "DeviceTypes.h"
 
 using namespace ompx;
 
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 95d4c728016d2..afee129c2589c 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -18,7 +18,7 @@
 #include "Mapping.h"
 #include "State.h"
 #include "Synchronization.h"
-#include "Types.h"
+#include "DeviceTypes.h"
 
 #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
 
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index c1ce878746a69..8287312c74e4e 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -12,8 +12,8 @@
 #include "Mapping.h"
 #include "Interface.h"
 #include "State.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
 
 #pragma omp begin declare target device_type(nohost)
 
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index c24af9442d16e..ca8b549b28dbf 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -10,7 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Configuration.h"
-#include "Types.h"
+#include "DeviceTypes.h"
 
 #include "Debug.h"
 
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 031a5ced25518..2a510e9531657 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -37,8 +37,8 @@
 #include "Mapping.h"
 #include "State.h"
 #include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
 
 using namespace ompx;
 
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 744d1a3a231c8..f4e2e0d25bde9 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -15,8 +15,8 @@
 #include "Mapping.h"
 #include "State.h"
 #include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
 
 using namespace ompx;
 
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
new file mode 100644
index 0000000000000..d080214390982
--- /dev/null
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -0,0 +1,300 @@
+//===------ Sanitizer.cpp - Track allocation for sanitizer checks ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
+#include "Interface.h"
+#include "LibC.h"
+#include "Shared/Environment.h"
+#include "Synchronization.h"
+
+using namespace ompx;
+using namespace utils;
+
+#pragma omp begin declare target device_type(nohost)
+
+#include "Shared/Sanitizer.h"
+
+[[gnu::used, gnu::retain, gnu::weak,
+  gnu::visibility("protected")]] SanitizerTrapInfoTy *__sanitizer_trap_info_ptr;
+
+template <AllocationKind AK> struct AllocationTracker {
+  static_assert(sizeof(AllocationTy<AK>) == sizeof(void *) * 2,
+                "AllocationTy should not exceed two pointers");
+  static_assert(sizeof(AllocationPtrTy<AK>) == sizeof(void *),
+                "AllocationTy pointers should be pointer sized");
+
+  static AllocationArrayTy<AK>
+      Allocations[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
+
+  [[clang::disable_sanitizer_instrumentation]] static void *
+  create(void *Start, uint64_t Length, int64_t AllocationId, uint64_t Slot,
+         uint64_t PC) {
+    if constexpr (SanitizerConfig<AK>::OFFSET_BITS < 64)
+      if (Length >= (1UL << (SanitizerConfig<AK>::OFFSET_BITS)))
+        __sanitizer_trap_info_ptr->exceedsAllocationLength<AK>(
+            Start, Length, AllocationId, Slot, PC);
+
+    uint32_t ThreadId = 0, BlockId = 0;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+
+    // Reserve the 0 element for the null pointer in global space.
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &Cnt = AllocArr.Cnt;
+    if constexpr (AK == AllocationKind::LOCAL)
+      Slot = ++Cnt;
+
+    uint64_t NumSlots = SanitizerConfig<AK>::SLOTS;
+    if (Slot >= NumSlots)
+      __sanitizer_trap_info_ptr->exceedsAllocationSlots<AK>(
+          Start, Length, AllocationId, Slot, PC);
+
+    auto &A = AllocArr.Arr[Slot];
+
+    A.Start = Start;
+    A.Length = Length;
+    A.Id = AllocationId;
+
+    AllocationPtrTy<AK> AP;
+    AP.Offset = 0;
+    AP.AllocationId = Slot;
+    AP.Kind = (uint64_t)AK;
+    if constexpr (SanitizerConfig<AK>::useTags()) {
+      AP.AllocationTag = ++A.Tag;
+    }
+    return AP;
+  }
+
+  [[clang::disable_sanitizer_instrumentation]] static void remove(void *P,
+                                                                  uint64_t PC) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    uint32_t AllocationId = AP.AllocationId;
+
+    uint32_t ThreadId = 0, BlockId = 0;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &A = AllocArr.Arr[AllocationId];
+    A.Length = 0;
+
+    auto &Cnt = AllocArr.Cnt;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      if (Cnt == AllocationId)
+        --Cnt;
+    }
+  }
+
+  [[clang::disable_sanitizer_instrumentation]] static void remove_n(int32_t N) {
+    static_assert(AK == AllocationKind::LOCAL, "");
+    uint32_t ThreadId = __kmpc_get_hardware_thread_id_in_block();
+    uint32_t BlockId = ompx_block_id(0);
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &Cnt = AllocArr.Cnt;
+    for (int32_t I = 0; I < N; ++I) {
+      auto &A = AllocArr.Arr[Cnt--];
+      A.Length = 0;
+    }
+  }
+
+  [[clang::disable_sanitizer_instrumentation]] static void *
+  advance(void *P, uint64_t Offset, uint64_t PC) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    AP.Offset += Offset;
+    return AP;
+  }
+
+  [[clang::disable_sanitizer_instrumentation]] static void *
+  check(void *P, int64_t Size, int64_t AccessId, uint64_t PC,
+        const char *FunctionName, const char *FileName, uint64_t LineNo) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    uint32_t ThreadId = 0, BlockId = 0;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &A = AllocArr.Arr[AP.AllocationId];
+    int64_t Offset = AP.Offset;
+    int64_t Length = A.Length;
+    if (Offset > Length - Size ||
+        (SanitizerConfig<AK>::useTags() && A.Tag != AP.AllocationTag)) {
+      if (Offset > Length - Size)
+        __sanitizer_trap_info_ptr->outOfBoundAccess<AK>(
+            A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
+      else
+        __sanitizer_trap_info_ptr->useAfterFree<AK>(
+            A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
+    }
+    return utils::advancePtr(A.Start, Offset);
+  }
+
+  [[clang::disable_sanitizer_instrumentation]] static void *
+  unpack(void *P, uint64_t PC = 0) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    uint32_t ThreadId = 0, BlockId = 0;
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &A = AllocArr.Arr[AP.AllocationId];
+    uint64_t Offset = AP.Offset;
+    void *Ptr = utils::advancePtr(A.Start, Offset);
+    return Ptr;
+  }
+
+  [[clang::disable_sanitizer_instrumentation]] static void leakCheck() {
+    static_assert(AK == AllocationKind::GLOBAL, "");
+    auto &AllocArr = Allocations[0];
+    for (uint64_t Slot = 0; Slot < SanitizerConfig<AK>::SLOTS; ++Slot) {
+      auto &A = AllocArr.Arr[Slot];
+      if (A.Length)
+        __sanitizer_trap_info_ptr->memoryLeak<AK>(A, Slot);
+    }
+  }
+};
+
+template <AllocationKind AK>
+AllocationArrayTy<AK> AllocationTracker<
+    AK>::Allocations[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
+
+extern "C" {
+
+#define PTR_CHECK(FUNCTION, PTR, ...)                                          \
+  if (isThreadLocalMemPtr(PTR))                                                \
+    return AllocationTracker<AllocationKind::LOCAL>::FUNCTION(                 \
+        PTR __VA_OPT__(, ) __VA_ARGS__);                                       \
+  return AllocationTracker<AllocationKind::GLOBAL>::FUNCTION(                  \
+      PTR __VA_OPT__(, ) __VA_ARGS__);
+#define FAKE_PTR_CHECK(FUNCTION, PTR, ...)                                     \
+  if (AllocationPtrTy<AllocationKind::GLOBAL>::get(PTR).Kind ==                \
+      (uint32_t)AllocationKind::LOCAL)                                         \
+    return AllocationTracker<AllocationKind::LOCAL>::FUNCTION(                 \
+        PTR __VA_OPT__(, ) __VA_ARGS__);                                       \
+  return AllocationTracker<AllocationKind::GLOBAL>::FUNCTION(                  \
+      PTR __VA_OPT__(, ) __VA_ARGS__);
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_new(void *Start, uint64_t Length, int64_t AllocationId, uint32_t Slot,
+         uint64_t PC) {
+  PTR_CHECK(create, Start, Length, AllocationId, Slot, PC);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_new_local(void *Start, uint64_t Length, int64_t AllocationId,
+               uint32_t Slot, uint64_t PC) {
+  return AllocationTracker<AllocationKind::LOCAL>::create(
+      Start, Length, AllocationId, Slot, PC);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void
+__sanitizer_register_host(void *Start, uint64_t Length, uint64_t Slot,
+                          uint64_t PC) {
+  AllocationTracker<AllocationKind::GLOBAL>::create(Start, Length, Slot, Slot,
+                                                    PC);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void
+ompx_free(void *P, uint64_t PC) {
+  FAKE_PTR_CHECK(remove, P, PC);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void
+ompx_free_local_n(int32_t N) {
+  return AllocationTracker<AllocationKind::LOCAL>::remove_n(N);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void
+__sanitizer_unregister_host(void *P) {
+  AllocationTracker<AllocationKind::GLOBAL>::remove(P, /*PC=*/0);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_gep(void *P, uint64_t Offset, uint64_t PC) {
+  FAKE_PTR_CHECK(advance, P, Offset, PC);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_gep_local(void *P, uint64_t Offset, uint64_t PC) {
+  return AllocationTracker<AllocationKind::LOCAL>::advance(P, Offset, PC);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_gep_global(void *P, uint64_t Offset, uint64_t PC) {
+  return AllocationTracker<AllocationKind::GLOBAL>::advance(P, Offset, PC);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_check(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
+           const char *FunctionName, const char *FileName, uint64_t LineNo) {
+  FAKE_PTR_CHECK(check, P, Size, AccessId, PC, FunctionName, FileName, LineNo);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_check_local(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
+                 const char *FunctionName, const char *FileName,
+                 uint64_t LineNo) {
+  return AllocationTracker<AllocationKind::LOCAL>::check(
+      P, Size, AccessId, PC, FunctionName, FileName, LineNo);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_check_global(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
+                  const char *FunctionName, const char *FileName,
+                  uint64_t LineNo) {
+  return AllocationTracker<AllocationKind::GLOBAL>::check(
+      P, Size, AccessId, PC, FunctionName, FileName, LineNo);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_unpack(void *P, uint64_t PC) {
+  FAKE_PTR_CHECK(unpack, P, PC);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_unpack_local(void *P, uint64_t PC) {
+  return AllocationTracker<AllocationKind::LOCAL>::unpack(P, PC);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_unpack_global(void *P, uint64_t PC) {
+  return AllocationTracker<AllocationKind::GLOBAL>::unpack(P, PC);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void
+ompx_leak_check() {
+  AllocationTracker<AllocationKind::GLOBAL>::leakCheck();
+}
+}
+
+#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index a1e4fa2449d9a..cb83b7839b5b2 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -13,13 +13,13 @@
 #include "Allocator.h"
 #include "Configuration.h"
 #include "Debug.h"
+#include "DeviceUtils.h"
 #include "Interface.h"
 #include "LibC.h"
 #include "Mapping.h"
 #include "State.h"
 #include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
 
 using namespace ompx;
 
@@ -84,14 +84,14 @@ struct SharedMemorySmartStackTy {
 
   /// Deallocate the last allocation made by the encountering thread and pointed
   /// to by \p Ptr from the stack. Each thread can call this function.
-  void pop(void *Ptr, uint32_t Bytes);
+  void pop(void *Ptr, uint64_t Bytes);
 
 private:
   /// Compute the size of the storage space reserved for a thread.
   uint32_t computeThreadStorageTotal() {
     uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
-    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
-                             allocator::ALIGNMENT);
+    return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock),
+                            allocator::ALIGNMENT);
   }
 
   /// Return the top address of the warp data stack, that is the first address
@@ -121,7 +121,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
   // First align the number of requested bytes.
   /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
   /// be passed in as an argument and the stack rewritten to support it.
-  uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
+  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
 
   uint32_t StorageTotal = computeThreadStorageTotal();
 
@@ -148,8 +148,8 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
   return GlobalMemory;
 }
 
-void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
-  uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
+void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
+  uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT);
   if (utils::isSharedMemPtr(Ptr)) {
     int TId = mapping::getThreadIdInBlock();
     Usage[TId] -= AlignedBytes;
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 80ba87b300bcd..97a6b080169ad 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -16,8 +16,8 @@
 #include "Interface.h"
 #include "Mapping.h"
 #include "State.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
 
 #pragma omp begin declare target device_type(nohost)
 
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp
index 2dc33562e6d79..34cb67cb1a351 100644
--- a/offload/DeviceRTL/src/Tasking.cpp
+++ b/offload/DeviceRTL/src/Tasking.cpp
@@ -13,10 +13,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "DeviceUtils.h"
 #include "Interface.h"
 #include "State.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
 
 using namespace ompx;
 
@@ -34,7 +34,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
   TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
       TaskSizeTotal, "explicit task descriptor");
   TaskDescriptor->Payload =
-      utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded);
+      utils::advancePtr(TaskDescriptor, TaskSizeInclPrivateValuesPadded);
   TaskDescriptor->TaskFn = TaskFn;
 
   return TaskDescriptor;
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp
index 9ca99bd20f861..956e6200ffd5c 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/Utils.cpp
@@ -9,7 +9,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Utils.h"
+#include "DeviceUtils.h"
 
 #include "Debug.h"
 #include "Interface.h"
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a18..fd835f2af677e 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -17,8 +17,8 @@
 #include "Mapping.h"
 #include "State.h"
 #include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
+#include "DeviceTypes.h"
+#include "DeviceUtils.h"
 
 using namespace ompx;
 
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index f35570dcee53c..d141146b6bd5a 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -105,12 +105,4 @@ struct KernelLaunchEnvironmentTy {
   void *ReductionBuffer = nullptr;
 };
 
-struct OMPXTrapIDTy {
-  void *Start;
-  uint64_t Length;
-  uint64_t Offset;
-  uint64_t ID;
-  uint64_t AccessID;
-};
-
 #endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/include/Shared/RefCnt.h b/offload/include/Shared/RefCnt.h
new file mode 100644
index 0000000000000..7c615ba167a3d
--- /dev/null
+++ b/offload/include/Shared/RefCnt.h
@@ -0,0 +1,56 @@
+//===-- Shared/RefCnt.h - Helper to keep track of references --- C++ ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_SHARED_REF_CNT_H
+#define OMPTARGET_SHARED_REF_CNT_H
+
+#include <atomic>
+#include <cassert>
+#include <limits>
+#include <memory>
+
+namespace llvm {
+namespace omp {
+namespace target {
+
+/// Utility class for thread-safe reference counting. Any class that needs
+/// objects' reference counting can inherit from this entity or have it as a
+/// class data member.
+template <typename Ty = uint32_t,
+          std::memory_order MemoryOrder = std::memory_order_relaxed>
+struct RefCountTy {
+  /// Create a refcount object initialized to zero.
+  RefCountTy() : Refs(0) {}
+
+  ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); }
+
+  /// Increase the reference count atomically.
+  void increase() { Refs.fetch_add(1, MemoryOrder); }
+
+  /// Decrease the reference count and return whether it became zero. Decreasing
+  /// the counter in more units than it was previously increased results in
+  /// undefined behavior.
+  bool decrease() {
+    Ty Prev = Refs.fetch_sub(1, MemoryOrder);
+    assert(Prev > 0 && "Invalid refcount");
+    return (Prev == 1);
+  }
+
+  Ty get() const { return Refs.load(MemoryOrder); }
+
+private:
+  /// The atomic reference counter.
+  std::atomic<Ty> Refs;
+};
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
new file mode 100644
index 0000000000000..01fba05c3ce05
--- /dev/null
+++ b/offload/include/Shared/Sanitizer.h
@@ -0,0 +1,244 @@
+//===-- Shared/Sanitizer.h - Offload sanitizer host logic --------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_SHARED_SANITIZER_HOST_H
+#define OMPTARGET_SHARED_SANITIZER_HOST_H
+
+#include "Types.h"
+#include "Utils.h"
+
+extern "C" int ompx_block_id(int Dim);
+extern "C" int ompx_thread_id(int Dim);
+
+enum class AllocationKind { GLOBAL, LOCAL, LAST = LOCAL };
+
+#define _OBJECT_TY uint16_t
+
+template <AllocationKind AK> struct SanitizerConfig {
+  static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 3;
+  static constexpr uint32_t NUM_ALLOCATION_ARRAYS =
+      AK == AllocationKind::GLOBAL ? 1 : (256 * 8 * 4);
+  static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8;
+
+  static constexpr uint32_t OBJECT_BITS =
+      AK == AllocationKind::GLOBAL ? 10 : (sizeof(_OBJECT_TY) * 8);
+  static constexpr uint32_t SLOTS =
+      (1 << (OBJECT_BITS)) / NUM_ALLOCATION_ARRAYS;
+  static constexpr uint32_t KIND_BITS = 1;
+  static constexpr uint32_t ID_BITS = 16 - KIND_BITS;
+
+  static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - ID_BITS - KIND_BITS;
+  static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
+
+  static constexpr bool useTags() { return TAG_BITS > 1; }
+
+  static_assert(LENGTH_BITS + TAG_BITS + KIND_BITS + ID_BITS == 64,
+                "Length and tag bits should cover 64 bits");
+  static_assert(OFFSET_BITS + TAG_BITS + KIND_BITS + ID_BITS + OBJECT_BITS ==
+                    64,
+                "Length, tag, and object bits should cover 64 bits");
+  static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1),
+                "Kind bits should match allocation kinds");
+};
+
+template <AllocationKind AK> struct AllocationTy {
+  void *Start;
+  uint64_t Length : SanitizerConfig<AK>::LENGTH_BITS;
+  uint64_t Tag : SanitizerConfig<AK>::TAG_BITS;
+  uint64_t Id : SanitizerConfig<AK>::ID_BITS;
+};
+
+template <AllocationKind AK> struct AllocationArrayTy {
+  AllocationTy<AK> Arr[SanitizerConfig<AK>::SLOTS];
+  uint32_t Cnt;
+};
+
+template <AllocationKind AK> struct AllocationPtrTy {
+  static AllocationPtrTy<AK> get(void *P) {
+    return utils::convertViaPun<AllocationPtrTy<AK>>(P);
+  }
+
+  operator void *() const { return utils::convertViaPun<void *>(*this); }
+  operator intptr_t() const { return utils::convertViaPun<intptr_t>(*this); }
+  uint64_t Offset : SanitizerConfig<AK>::OFFSET_BITS;
+  uint64_t AllocationTag : SanitizerConfig<AK>::TAG_BITS;
+  uint64_t AllocationId : SanitizerConfig<AK>::OBJECT_BITS;
+  // Must be last, TODO: merge into TAG
+  uint64_t Kind : SanitizerConfig<AK>::KIND_BITS;
+};
+
+static inline void *__offload_get_new_sanitizer_ptr(int32_t Slot) {
+  AllocationPtrTy<AllocationKind::GLOBAL> AP;
+  AP.Offset = 0;
+  AP.AllocationId = Slot;
+  AP.Kind = (uint32_t)AllocationKind::GLOBAL;
+  return AP;
+}
+
+struct SanitizerTrapInfoTy {
+  /// AllocationTy
+  /// {
+  void *AllocationStart;
+  uint64_t AllocationLength;
+  int32_t AllocationId;
+  uint32_t AllocationTag;
+  uint8_t AllocationKind;
+  ///}
+
+  enum ErrorCodeTy : uint8_t {
+    None = 0,
+    ExceedsLength,
+    ExceedsSlots,
+    OutOfBounds,
+    UseAfterFree,
+    MemoryLeak,
+  } ErrorCode;
+
+  /// AllocationPtrTy
+  /// {
+  uint64_t PtrOffset;
+  uint64_t PtrSlot;
+  uint16_t PtrTag;
+  uint16_t PtrKind;
+  ///}
+
+  /// Access
+  /// {
+  uint32_t AccessSize;
+  int64_t AccessId;
+  /// }
+
+  /// Thread
+  /// {
+  uint64_t BlockId[3];
+  uint32_t ThreadId[3];
+  uint64_t PC;
+  uint64_t LineNo;
+  char FunctionName[256];
+  char FileName[256];
+  /// }
+
+  [[clang::disable_sanitizer_instrumentation]] void
+  setCoordinates(uint64_t PC, const char *FnName, const char *FlName,
+                 uint64_t LineNo) {
+    for (int32_t Dim = 0; Dim < 3; ++Dim) {
+      BlockId[Dim] = ompx_block_id(Dim);
+      ThreadId[Dim] = ompx_thread_id(Dim);
+    }
+    this->PC = PC;
+    this->LineNo = LineNo;
+
+    auto CopyName = [](char *Dst, const char *Src, int32_t Length) {
+      if (!Src)
+        return;
+      for (int32_t I = 0; I < Length; ++I) {
+        Dst[I] = Src[I];
+        if (!Src[I])
+          break;
+      }
+    };
+    CopyName(FunctionName, FnName, sizeof(FunctionName));
+    CopyName(FileName, FlName, sizeof(FileName));
+  }
+
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation]] void
+  allocationError(ErrorCodeTy EC, void *Start, uint64_t Length, int64_t Id,
+                  int64_t Tag, uint64_t Slot, uint64_t PC) {
+    AllocationStart = Start;
+    AllocationLength = Length;
+    AllocationId = Id;
+    AllocationTag = Tag;
+    PtrSlot = Slot;
+    AllocationKind = (decltype(AllocationKind))AK;
+
+    ErrorCode = EC;
+    setCoordinates(PC, nullptr, nullptr, 0);
+  }
+
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation]] void
+  accessError(ErrorCodeTy EC, const AllocationTy<AK> &A,
+              const AllocationPtrTy<AK> &AP, uint64_t Size, int64_t Id,
+              uint64_t PC, const char *FunctionName, const char *FileName,
+              uint64_t LineNo) {
+    AllocationStart = A.Start;
+    AllocationLength = A.Length;
+    AllocationId = A.Id;
+    AllocationTag = A.Tag;
+
+    ErrorCode = EC;
+
+    PtrOffset = AP.Offset;
+    PtrSlot = AP.AllocationId;
+    PtrTag = AP.AllocationTag;
+    PtrKind = AP.Kind;
+
+    AccessSize = Size;
+    AccessId = Id;
+
+    setCoordinates(PC, FunctionName, FileName, LineNo);
+  }
+
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
+    gnu::always_inline]] void
+  exceedsAllocationLength(void *Start, uint64_t Length, int64_t AllocationId,
+                          uint64_t Slot, uint64_t PC) {
+    allocationError<AK>(ExceedsLength, Start, Length, AllocationId, /*Tag=*/0,
+                        Slot, PC);
+    __builtin_trap();
+  }
+
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
+    gnu::always_inline]] void
+  exceedsAllocationSlots(void *Start, uint64_t Length, int64_t AllocationId,
+                         uint64_t Slot, uint64_t PC) {
+    allocationError<AK>(ExceedsSlots, Start, Length, AllocationId, /*Tag=*/0,
+                        Slot, PC);
+    __builtin_trap();
+  }
+
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
+    gnu::always_inline]] void
+  outOfBoundAccess(const AllocationTy<AK> &A, const AllocationPtrTy<AK> &AP,
+                   uint64_t Size, int64_t AccessId, uint64_t PC,
+                   const char *FunctionName, const char *FileName,
+                   uint64_t LineNo) {
+    accessError(OutOfBounds, A, AP, Size, AccessId, PC, FunctionName, FileName,
+                LineNo);
+    __builtin_trap();
+  }
+
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
+    gnu::always_inline]] void
+  useAfterFree(const AllocationTy<AK> &A, const AllocationPtrTy<AK> &AP,
+               uint64_t Size, int64_t AccessId, uint64_t PC,
+               const char *FunctionName, const char *FileName,
+               uint64_t LineNo) {
+    accessError(UseAfterFree, A, AP, Size, AccessId, PC, FunctionName, FileName,
+                LineNo);
+    __builtin_trap();
+  }
+
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
+    gnu::always_inline]] void
+  memoryLeak(const AllocationTy<AK> &A, uint64_t Slot) {
+    allocationError<AK>(MemoryLeak, A.Start, A.Length, A.Id, A.Tag, Slot,
+                        /*PC=*/0);
+    __builtin_trap();
+  }
+};
+
+#endif
diff --git a/offload/include/Shared/Types.h b/offload/include/Shared/Types.h
new file mode 100644
index 0000000000000..1503a4b2a1437
--- /dev/null
+++ b/offload/include/Shared/Types.h
@@ -0,0 +1,22 @@
+//===-- Shared/Types.h - Type defs shared between host and device - C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Type definitions shared between host and device.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_SHARED_TYPES_H
+#define OMPTARGET_SHARED_TYPES_H
+
+#ifdef OMPTARGET_DEVICE_RUNTIME
+#include "DeviceTypes.h"
+#else
+#include <cstdint>
+#endif
+
+#endif // OMPTARGET_SHARED_TYPES_H
diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h
index fce14b54edb98..9b2b2df2f3aff 100644
--- a/offload/include/Shared/Utils.h
+++ b/offload/include/Shared/Utils.h
@@ -14,75 +14,66 @@
 #ifndef OMPTARGET_SHARED_UTILS_H
 #define OMPTARGET_SHARED_UTILS_H
 
-#include "llvm/ADT/StringRef.h"
+#include "Types.h"
 
-#include "Debug.h"
-
-#include <atomic>
-#include <cassert>
-#include <limits>
-#include <memory>
-
-namespace llvm {
-namespace omp {
-namespace target {
-
-/// Utility class for thread-safe reference counting. Any class that needs
-/// objects' reference counting can inherit from this entity or have it as a
-/// class data member.
-template <typename Ty = uint32_t,
-          std::memory_order MemoryOrder = std::memory_order_relaxed>
-struct RefCountTy {
-  /// Create a refcount object initialized to zero.
-  RefCountTy() : Refs(0) {}
-
-  ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); }
-
-  /// Increase the reference count atomically.
-  void increase() { Refs.fetch_add(1, MemoryOrder); }
-
-  /// Decrease the reference count and return whether it became zero. Decreasing
-  /// the counter in more units than it was previously increased results in
-  /// undefined behavior.
-  bool decrease() {
-    Ty Prev = Refs.fetch_sub(1, MemoryOrder);
-    assert(Prev > 0 && "Invalid refcount");
-    return (Prev == 1);
-  }
-
-  Ty get() const { return Refs.load(MemoryOrder); }
-
-private:
-  /// The atomic reference counter.
-  std::atomic<Ty> Refs;
-};
+namespace utils {
 
 /// Return the difference (in bytes) between \p Begin and \p End.
 template <typename Ty = char>
-ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
+auto getPtrDiff(const void *End, const void *Begin) {
   return reinterpret_cast<const Ty *>(End) -
          reinterpret_cast<const Ty *>(Begin);
 }
 
 /// Return \p Ptr advanced by \p Offset bytes.
-template <typename Ty> Ty *advanceVoidPtr(Ty *Ptr, int64_t Offset) {
-  static_assert(std::is_void<Ty>::value);
-  return const_cast<char *>(reinterpret_cast<const char *>(Ptr) + Offset);
+template <typename Ty1, typename Ty2> Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) {
+  return reinterpret_cast<Ty1 *>(
+      const_cast<char *>(reinterpret_cast<const char *>(Ptr)) + Offset);
 }
 
-/// Return \p Ptr aligned to \p Alignment bytes.
-template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
-  size_t Space = std::numeric_limits<size_t>::max();
-  return std::align(Alignment, sizeof(char), Ptr, Space);
+/// Return \p V aligned "upwards" according to \p Align.
+template <typename Ty1, typename Ty2> inline Ty1 alignPtr(Ty1 V, Ty2 Align) {
+  return reinterpret_cast<Ty1>(((uintptr_t(V) + Align - 1) / Align) * Align);
+}
+/// Return \p V aligned "downwards" according to \p Align.
+template <typename Ty1, typename Ty2> inline Ty1 alignDown(Ty1 V, Ty2 Align) {
+  return V - V % Align;
 }
 
 /// Round up \p V to a \p Boundary.
 template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
-  return (V + Boundary - 1) / Boundary * Boundary;
+  return alignPtr(V, Boundary);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint32_t V) {
+  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+  return __builtin_ffs(V);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint64_t V) {
+  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
+  return __builtin_ffsl(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint32_t V) {
+  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+  return __builtin_popcount(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint64_t V) {
+  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
+  return __builtin_popcountl(V);
+}
+
+template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
+  static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion");
+  return *((DstTy *)(&V));
 }
 
-} // namespace target
-} // namespace omp
-} // namespace llvm
+} // namespace utils
 
 #endif // OMPTARGET_SHARED_UTILS_H
diff --git a/offload/include/device.h b/offload/include/device.h
index fd6e5fba5fc53..69954254666b8 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -100,11 +100,15 @@ struct DeviceTy {
 
   /// Notify the plugin about a new mapping starting at the host address
   /// \p HstPtr and \p Size bytes.
-  int32_t notifyDataMapped(void *HstPtr, int64_t Size);
+  /// If GPUSan is enabled, \p DevicePtr is registered in each image and
+  /// \p FakeHstPtr is updated.
+  int32_t notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size,
+                           void *&FakeHstPtr);
 
   /// Notify the plugin about an existing mapping being unmapped starting at
   /// the host address \p HstPtr.
-  int32_t notifyDataUnmapped(void *HstPtr);
+  /// If GPUSan is enabled, \p FakeHstPtr is unregistered.
+  int32_t notifyDataUnmapped(void *HstPtr, void *FakeHstPtr);
 
   // Launch the kernel identified by \p TgtEntryPtr with the given arguments.
   int32_t launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index c3d9d09f21f0f..d9edfb0e8eab6 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -25,6 +25,7 @@
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
+#include "Shared/RefCnt.h"
 #include "Shared/Utils.h"
 #include "Utils/ELF.h"
 
@@ -89,7 +90,7 @@ struct AMDGPUDeviceImageTy;
 struct AMDGPUMemoryManagerTy;
 struct AMDGPUMemoryPoolTy;
 
-namespace utils {
+namespace hsa_utils {
 
 /// Iterate elements using an HSA iterate function. Do not use this function
 /// directly but the specialized ones below instead.
@@ -189,7 +190,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
 
 Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
   std::string Target;
-  auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
+  auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
     uint32_t Length;
     hsa_status_t Status;
     Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
@@ -210,7 +211,7 @@ Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
     return Err;
   return Target;
 }
-} // namespace utils
+} // namespace hsa_utils
 
 /// Utility class representing generic resource references to AMDGPU resources.
 template <typename ResourceTy>
@@ -483,7 +484,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
 
   /// Get additional info for kernel, e.g., register spill counts
-  std::optional<utils::KernelMetaDataTy>
+  std::optional<hsa_utils::KernelMetaDataTy>
   getKernelInfo(StringRef Identifier) const {
     auto It = KernelInfoMap.find(Identifier);
 
@@ -497,7 +498,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
   /// The executable loaded on the agent.
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
-  StringMap<utils::KernelMetaDataTy> KernelInfoMap;
+  StringMap<hsa_utils::KernelMetaDataTy> KernelInfoMap;
   uint16_t ELFABIVersion;
 };
 
@@ -547,7 +548,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
     // TODO: Read the kernel descriptor for the max threads per block. May be
     // read from the image.
 
-    ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
+    ImplicitArgsSize =
+        hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
     DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
 
     // Get additional kernel info read from image
@@ -598,7 +600,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
   uint32_t ImplicitArgsSize;
 
   /// Additional Info for the AMD GPU Kernel
-  std::optional<utils::KernelMetaDataTy> KernelInfo;
+  std::optional<hsa_utils::KernelMetaDataTy> KernelInfo;
 };
 
 /// Class representing an HSA signal. Signals are used to define dependencies
@@ -1268,13 +1270,14 @@ struct AMDGPUStreamTy {
     // Issue the async memory copy.
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
-                                 CopySize, 1, &InputSignalRaw,
-                                 OutputSignal->get());
+      return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
+                                     Agent, CopySize, 1, &InputSignalRaw,
+                                     OutputSignal->get());
     }
 
-    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
-                               CopySize, 0, nullptr, OutputSignal->get());
+    return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
+                                   Agent, CopySize, 0, nullptr,
+                                   OutputSignal->get());
   }
 
   /// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1308,14 +1311,14 @@ struct AMDGPUStreamTy {
     // dependency if already satisfied.
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      if (auto Err = utils::asyncMemCopy(
+      if (auto Err = hsa_utils::asyncMemCopy(
               UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
               &InputSignalRaw, OutputSignals[0]->get()))
         return Err;
     } else {
-      if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
-                                         Src, Agent, CopySize, 0, nullptr,
-                                         OutputSignals[0]->get()))
+      if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter,
+                                             Agent, Src, Agent, CopySize, 0,
+                                             nullptr, OutputSignals[0]->get()))
         return Err;
     }
 
@@ -1406,12 +1409,13 @@ struct AMDGPUStreamTy {
     // dependency if already satisfied.
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                 Agent, CopySize, 1, &InputSignalRaw,
-                                 OutputSignal->get());
+      return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
+                                     Agent, CopySize, 1, &InputSignalRaw,
+                                     OutputSignal->get());
     }
-    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
-                               CopySize, 0, nullptr, OutputSignal->get());
+    return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
+                                   Agent, CopySize, 0, nullptr,
+                                   OutputSignal->get());
   }
 
   // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1435,13 +1439,13 @@ struct AMDGPUStreamTy {
 
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
-                                 SrcAgent, CopySize, 1, &InputSignalRaw,
-                                 OutputSignal->get());
+      return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+                                     SrcAgent, CopySize, 1, &InputSignalRaw,
+                                     OutputSignal->get());
     }
-    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
-                               SrcAgent, CopySize, 0, nullptr,
-                               OutputSignal->get());
+    return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+                                   SrcAgent, CopySize, 0, nullptr,
+                                   OutputSignal->get());
   }
 
   /// Synchronize with the stream. The current thread waits until all operations
@@ -1800,7 +1804,7 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy {
   Error retrieveAllMemoryPools() override {
     // Iterate through the available pools across the host agents.
     for (hsa_agent_t Agent : Agents) {
-      Error Err = utils::iterateAgentMemoryPools(
+      Error Err = hsa_utils::iterateAgentMemoryPools(
           Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
             AMDGPUMemoryPoolTy *MemoryPool =
                 new AMDGPUMemoryPoolTy(HSAMemoryPool);
@@ -1965,7 +1969,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     // Detect if XNACK is enabled
     auto TargeTripleAndFeaturesOrError =
-        utils::getTargetTripleAndFeatures(Agent);
+        hsa_utils::getTargetTripleAndFeatures(Agent);
     if (!TargeTripleAndFeaturesOrError)
       return TargeTripleAndFeaturesOrError.takeError();
     if (static_cast<StringRef>(*TargeTripleAndFeaturesOrError)
@@ -2323,9 +2327,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       if (auto Err = Signal.init())
         return Err;
 
-      if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
-                                         Agent, PinnedPtr, Agent, Size, 0,
-                                         nullptr, Signal.get()))
+      if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
+                                             Agent, PinnedPtr, Agent, Size, 0,
+                                             nullptr, Signal.get()))
         return Err;
 
       if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2383,9 +2387,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       if (auto Err = Signal.init())
         return Err;
 
-      if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
-                                         Agent, TgtPtr, Agent, Size, 0, nullptr,
-                                         Signal.get()))
+      if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(),
+                                             PinnedPtr, Agent, TgtPtr, Agent,
+                                             Size, 0, nullptr, Signal.get()))
         return Err;
 
       if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2427,7 +2431,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       if (auto Err = Signal.init())
         return Err;
 
-      if (auto Err = utils::asyncMemCopy(
+      if (auto Err = hsa_utils::asyncMemCopy(
               useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
               getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
         return Err;
@@ -2696,7 +2700,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     }
 
     Info.add("ISAs");
-    auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
+    auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
       Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
       if (Status == HSA_STATUS_SUCCESS)
         Info.add<InfoLevel2>("Name", TmpChar);
@@ -2778,7 +2782,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Retrieve and construct all memory pools of the device agent.
   Error retrieveAllMemoryPools() override {
     // Iterate through the available pools of the device agent.
-    return utils::iterateAgentMemoryPools(
+    return hsa_utils::iterateAgentMemoryPools(
         Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
           AMDGPUMemoryPoolTy *MemoryPool =
               Plugin.allocate<AMDGPUMemoryPoolTy>();
@@ -2964,7 +2968,7 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
   if (Result)
     return Plugin::error("Loaded HSA executable does not validate");
 
-  if (auto Err = utils::readAMDGPUMetaDataFromImage(
+  if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage(
           getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
     return Err;
 
@@ -3093,7 +3097,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     llvm::SmallVector<hsa_agent_t> HostAgents;
 
     // Count the number of available agents.
-    auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
+    auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
       // Get the device type of the agent.
       hsa_device_type_t DeviceType;
       hsa_status_t Status =
@@ -3188,12 +3192,12 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
       return false;
 
     auto TargeTripleAndFeaturesOrError =
-        utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId));
+        hsa_utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId));
     if (!TargeTripleAndFeaturesOrError)
       return TargeTripleAndFeaturesOrError.takeError();
-    return utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
-                                           ElfOrErr->getPlatformFlags(),
-                                           *TargeTripleAndFeaturesOrError);
+    return hsa_utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
+                                               ElfOrErr->getPlatformFlags(),
+                                               *TargeTripleAndFeaturesOrError);
   }
 
   bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
@@ -3305,11 +3309,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
     return Err;
 
-  utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
+  hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
   if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
     // Initialize implicit arguments.
-    ImplArgs = reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
-        advanceVoidPtr(AllArgs, LaunchParams.Size));
+    ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
+        utils::advancePtr(AllArgs, LaunchParams.Size));
 
     // Initialize the implicit arguments to zero.
     std::memset(ImplArgs, 0, getImplicitArgsSize());
@@ -3333,7 +3337,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
 
   // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
   if (ImplArgs &&
-      getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
+      getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
     ImplArgs->BlockCountX = NumBlocks;
     ImplArgs->BlockCountY = 1;
     ImplArgs->BlockCountZ = 1;
@@ -3494,29 +3498,8 @@ void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
                                   void *Data) {
 
   auto *Device = reinterpret_cast<AMDGPUDeviceTy *>(Data);
-  //
-  //  int64_t OmpxTrapId = -1;
-  //  GlobalTy TrapId("__ompx_trap_id", sizeof(int64_t), &OmpxTrapId);
-  //
-  //  printf("Check for trap id\n");
-  //  fflush(stdout);
-  //  // Write device environment values to the device.
-  //  GenericGlobalHandlerTy &GHandler = Device->Plugin.getGlobalHandler();
-  //  for (auto *Image : Device->images()) {
-  //    if (auto Err = GHandler.readGlobalFromDevice(*Device, *Image, TrapId)) {
-  //      REPORT("%s\n", toString(std::move(Err)).data());
-  //      continue;
-  //    }
-  //    if (OmpxTrapId != 0)
-  //      break;
-  //  }
-  //
-  //  printf("Trap ID: %li\n", OmpxTrapId);
-  printf("Trap ID[%li:%p] Acc[%li] : %p[:%li] vs %li\n",
-         (int64_t)Device->OmpxTrapId->ID, (void *)Device->OmpxTrapId->ID,
-         Device->OmpxTrapId->AccessID, Device->OmpxTrapId->Start,
-         Device->OmpxTrapId->Length, Device->OmpxTrapId->Offset);
-  fflush(stdout);
+  Device->reportSanitizerError();
+
   auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
   FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
 }
diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 58a3b5df00fac..1e99d0a30bdf2 100644
--- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -33,7 +33,7 @@ namespace llvm {
 namespace omp {
 namespace target {
 namespace plugin {
-namespace utils {
+namespace hsa_utils {
 
 // The implicit arguments of COV5 AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
@@ -310,7 +310,7 @@ readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
   return Error::success();
 }
 
-} // namespace utils
+} // namespace hsa_utils
 } // namespace plugin
 } // namespace target
 } // namespace omp
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 1b32ccfc1a291..d3cafc788b60b 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -24,6 +24,7 @@
 #include "Shared/Environment.h"
 #include "Shared/EnvironmentVar.h"
 #include "Shared/Requirements.h"
+#include "Shared/Sanitizer.h"
 #include "Shared/Utils.h"
 
 #include "GlobalHandler.h"
@@ -231,7 +232,7 @@ class DeviceImageTy {
 
   /// Get the image size.
   size_t getSize() const {
-    return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
+    return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
   }
 
   /// Get a memory buffer reference to the whole image.
@@ -471,7 +472,7 @@ class PinnedAllocationMapTy {
     --It;
 
     // The buffer is not contained in the pinned allocation.
-    if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr)
+    if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr)
       return &(*It);
 
     // None found.
@@ -498,15 +499,15 @@ class PinnedAllocationMapTy {
 
   /// Indicate whether the first range A fully contains the second range B.
   static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
-    void *EndA = advanceVoidPtr(PtrA, SizeA);
-    void *EndB = advanceVoidPtr(PtrB, SizeB);
+    void *EndA = utils::advancePtr(PtrA, SizeA);
+    void *EndB = utils::advancePtr(PtrB, SizeB);
     return (PtrB >= PtrA && EndB <= EndA);
   }
 
   /// Indicate whether the first range A intersects with the second range B.
   static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
-    void *EndA = advanceVoidPtr(PtrA, SizeA);
-    void *EndB = advanceVoidPtr(PtrB, SizeB);
+    void *EndA = utils::advancePtr(PtrA, SizeA);
+    void *EndB = utils::advancePtr(PtrB, SizeB);
     return (PtrA < EndB && PtrB < EndA);
   }
 
@@ -588,8 +589,8 @@ class PinnedAllocationMapTy {
     if (!Entry)
       return nullptr;
 
-    return advanceVoidPtr(Entry->DevAccessiblePtr,
-                          getPtrDiff(HstPtr, Entry->HstPtr));
+    return utils::advancePtr(Entry->DevAccessiblePtr,
+                             utils::getPtrDiff(HstPtr, Entry->HstPtr));
   }
 
   /// Check whether a buffer belongs to a registered host pinned allocation.
@@ -601,6 +602,22 @@ class PinnedAllocationMapTy {
   }
 };
 
+struct GPUSanTy {
+  GPUSanTy(GenericDeviceTy &Device) : Device(Device) {}
+  Error notifyDataMapped(void *DevicePtr, uint64_t Size, void *&FakeHstPtr);
+  Error notifyDataUnmapped(void *FakeHstPtr);
+
+  void addGPUSanNewFn(GenericKernelTy &GK) { NewFns.push_back(&GK); }
+  void addGPUSanFreeFn(GenericKernelTy &GK) { FreeFns.push_back(&GK); }
+  void checkAndReportError();
+
+private:
+  uint32_t SlotCnt = SanitizerConfig<AllocationKind::GLOBAL>::SLOTS - 1;
+  GenericDeviceTy &Device;
+  SmallVector<GenericKernelTy *> NewFns;
+  SmallVector<GenericKernelTy *> FreeFns;
+};
+
 /// Class implementing common functionalities of offload devices. Each plugin
 /// should define the specific device class, derive from this generic one, and
 /// implement the necessary virtual function members.
@@ -718,14 +735,19 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// buffer (e.g., because a user OpenMP target map) and the buffer may be used
   /// as source/destination of memory transfers. We can use this information to
   /// lock the host buffer and optimize its memory transfers.
-  Error notifyDataMapped(void *HstPtr, int64_t Size) {
+  Error notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size,
+                         void *&FakeHstPtr) {
+    if (auto Err = GPUSan.notifyDataMapped(DevicePtr, Size, FakeHstPtr))
+      return Err;
     return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
   }
 
   /// Mark the host buffer with address \p HstPtr as unmapped. This means that
   /// libomptarget removed an existing mapping. If the plugin locked the buffer
   /// in notifyDataMapped, this function should unlock it.
-  Error notifyDataUnmapped(void *HstPtr) {
+  Error notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) {
+    if (auto Err = GPUSan.notifyDataUnmapped(FakeHstPtr))
+      return Err;
     return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
   }
 
@@ -736,6 +758,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                          void *&BaseDevAccessiblePtr,
                                          size_t &BaseSize) const = 0;
 
+  void addGPUSanNewFn(GenericKernelTy &GK) { GPUSan.addGPUSanNewFn(GK); }
+  void addGPUSanFreeFn(GenericKernelTy &GK) { GPUSan.addGPUSanFreeFn(GK); }
+  void reportSanitizerError() { GPUSan.checkAndReportError(); }
+
   /// Submit data to the device (host to device transfer).
   Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
                    __tgt_async_info *AsyncInfo);
@@ -857,7 +883,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Allocate and construct a kernel object.
   virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
 
-  OMPXTrapIDTy *OmpxTrapId = nullptr;
+  SanitizerTrapInfoTy *SanitizerTrapInfo = nullptr;
 
   /// Reference to the underlying plugin that created this device.
   GenericPluginTy &Plugin;
@@ -952,6 +978,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 #endif
 
 private:
+  GPUSanTy GPUSan;
+
   DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
   DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
 };
@@ -1120,10 +1148,12 @@ struct GenericPluginTy {
   int32_t data_unlock(int32_t DeviceId, void *Ptr);
 
   /// Notify the runtime about a new mapping that has been created outside.
-  int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size);
+  int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, void *DevicePtr,
+                             int64_t Size, void *&FakeHstPtr);
 
   /// Notify the runtime about a mapping that has been deleted.
-  int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
+  int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr,
+                               void *FakeHstPtr);
 
   /// Copy data to the given device.
   int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index ba0aa47f8e51c..e18f358af116c 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -152,8 +152,8 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
      HostGlobal.getPtr());
 
   assert(Image.getStart() <= ImageGlobal.getPtr() &&
-         advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) <
-             advanceVoidPtr(Image.getStart(), Image.getSize()) &&
+         utils::advancePtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) <
+             utils::advancePtr(Image.getStart(), Image.getSize()) &&
          "Attempting to read outside the image!");
 
   // Perform the copy from the image to the host memory.
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index 9dbba1459839d..9adb62b677b92 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -51,7 +51,7 @@ namespace {
 
 bool isImageBitcode(const __tgt_device_image &Image) {
   StringRef Binary(reinterpret_cast<const char *>(Image.ImageStart),
-                   target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+                   utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
 
   return identify_magic(Binary) == file_magic::bitcode;
 }
@@ -69,7 +69,7 @@ createModuleFromMemoryBuffer(std::unique_ptr<MemoryBuffer> &MB,
 Expected<std::unique_ptr<Module>>
 createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) {
   StringRef Data((const char *)Image.ImageStart,
-                 target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+                 utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
   std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(
       Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false);
   return createModuleFromMemoryBuffer(MB, Context);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 5d19b01d3da5b..2c1029af2080d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -16,14 +16,19 @@
 
 #include "GlobalHandler.h"
 #include "JIT.h"
+#include "Shared/Sanitizer.h"
+#include "Shared/Utils.h"
 #include "Utils/ELF.h"
 #include "omptarget.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <string>
 
 #ifdef OMPT_SUPPORT
 #include "OpenMP/OMPT/Callback.h"
 #include "omp-tools.h"
 #endif
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Support/Error.h"
@@ -74,7 +79,7 @@ struct RecordReplayTy {
         Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
     Device->free(Addr);
     // Align Address to MaxMemoryAllocation
-    Addr = (void *)alignPtr((Addr), MaxMemoryAllocation);
+    Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation);
     return Addr;
   }
 
@@ -207,8 +212,8 @@ struct RecordReplayTy {
     if (EC)
       report_fatal_error("Error saving image : " + StringRef(EC.message()));
     if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
-      size_t Size =
-          getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart);
+      size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd,
+                                      TgtImageBitcode->ImageStart);
       MemoryBufferRef MBR = MemoryBufferRef(
           StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
       OS << MBR.getBuffer();
@@ -241,10 +246,10 @@ struct RecordReplayTy {
 
       int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
       memcpy(BufferPtr, OffloadEntry.Name, NameLength);
-      BufferPtr = advanceVoidPtr(BufferPtr, NameLength);
+      BufferPtr = utils::advancePtr(BufferPtr, NameLength);
 
       *((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
-      BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t));
+      BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));
 
       auto Err = Plugin::success();
       {
@@ -254,11 +259,12 @@ struct RecordReplayTy {
       }
       if (Err)
         report_fatal_error("Error retrieving data for global");
-      BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size);
+      BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size);
     }
     assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
            "Buffer over/under-filled.");
-    assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) &&
+    assert(Size == utils::getPtrDiff(BufferPtr,
+                                     GlobalsMB->get()->getBufferStart()) &&
            "Buffer size mismatch");
 
     StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
@@ -730,7 +736,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
       OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
       DeviceId(DeviceId), GridValues(OMPGridValues),
       PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
-      PinnedAllocs(*this), RPCServer(nullptr) {
+      PinnedAllocs(*this), RPCServer(nullptr), GPUSan(*this) {
 #ifdef OMPT_SUPPORT
   OmptInitialized.store(false);
   // Bind the callbacks to this device's member functions
@@ -910,7 +916,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
 #ifdef OMPT_SUPPORT
   if (ompt::Initialized) {
     size_t Bytes =
-        getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
+        utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
     performOmptCallback(
         device_load, Plugin.getUserId(DeviceId),
         /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
@@ -923,6 +929,25 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   if (auto Err = callGlobalConstructors(Plugin, *Image))
     return std::move(Err);
 
+  auto GetKernel = [&](StringRef Name) -> GenericKernelTy * {
+    auto KernelOrErr = constructKernel(Name.data());
+    if (Error Err = KernelOrErr.takeError()) {
+      REPORT("Failure to look up kernel: %s\n",
+             toString(std::move(Err)).data());
+      return nullptr;
+    }
+    GenericKernelTy &Kernel = *KernelOrErr;
+    if (auto Err = Kernel.init(*this, *Image)) {
+      REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
+      return nullptr;
+    }
+    return &Kernel;
+  };
+  if (GenericKernelTy *Kernel = GetKernel("__sanitizer_register"))
+    addGPUSanNewFn(*Kernel);
+  if (GenericKernelTy *Kernel = GetKernel("__sanitizer_unregister"))
+    addGPUSanFreeFn(*Kernel);
+
   // Return the pointer to the table of entries.
   return Image;
 }
@@ -1001,14 +1026,12 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
   if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
     return Err;
 
-  OmpxTrapId = reinterpret_cast<OMPXTrapIDTy *>(
-      allocate(sizeof(*OmpxTrapId), &OmpxTrapId, TARGET_ALLOC_HOST));
-  OmpxTrapId->Start = 0;
-  OmpxTrapId->Length = -1;
-  OmpxTrapId->Offset = -1;
-  OmpxTrapId->ID = -1;
+  SanitizerTrapInfo = reinterpret_cast<SanitizerTrapInfoTy *>(allocate(
+      sizeof(*SanitizerTrapInfo), &SanitizerTrapInfo, TARGET_ALLOC_HOST));
+  memset(SanitizerTrapInfo, '\0', sizeof(SanitizerTrapInfoTy));
 
-  GlobalTy TrapId("__ompx_trap_id", sizeof(OmpxTrapId), &OmpxTrapId);
+  GlobalTy TrapId("__sanitizer_trap_info_ptr", sizeof(SanitizerTrapInfo),
+                  &SanitizerTrapInfo);
   if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrapId))
     return Err;
 
@@ -1149,8 +1172,8 @@ Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
       return std::move(Err);
 
     // Return the device accessible pointer with the correct offset.
-    return advanceVoidPtr(Entry->DevAccessiblePtr,
-                          getPtrDiff(HstPtr, Entry->HstPtr));
+    return utils::advancePtr(Entry->DevAccessiblePtr,
+                             utils::getPtrDiff(HstPtr, Entry->HstPtr));
   }
 
   // No intersecting registered allocation found in the map. First, lock the
@@ -1627,7 +1650,7 @@ int32_t GenericPluginTy::is_initialized() const { return Initialized; }
 
 int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
   StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
-                   target::getPtrDiff(Image->ImageEnd, Image->ImageStart));
+                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
 
   auto HandleError = [&](Error Err) -> bool {
     [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
@@ -1659,7 +1682,7 @@ int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
 int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId,
                                               __tgt_device_image *Image) {
   StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
-                   target::getPtrDiff(Image->ImageEnd, Image->ImageStart));
+                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
 
   auto HandleError = [&](Error Err) -> bool {
     [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
@@ -1821,8 +1844,10 @@ int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) {
 }
 
 int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr,
-                                            int64_t Size) {
-  auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size);
+                                            void *DevicePtr, int64_t Size,
+                                            void *&FakeHstPtr) {
+  auto Err =
+      getDevice(DeviceId).notifyDataMapped(HstPtr, DevicePtr, Size, FakeHstPtr);
   if (Err) {
     REPORT("Failure to notify data mapped %p: %s\n", HstPtr,
            toString(std::move(Err)).data());
@@ -1832,8 +1857,9 @@ int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr,
   return OFFLOAD_SUCCESS;
 }
 
-int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
-  auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr);
+int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr,
+                                              void *FakeHstPtr) {
+  auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr, FakeHstPtr);
   if (Err) {
     REPORT("Failure to notify data unmapped %p: %s\n", HstPtr,
            toString(std::move(Err)).data());
@@ -2108,6 +2134,177 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
   return OFFLOAD_SUCCESS;
 }
 
+/// Announce a newly mapped device allocation to the kernels in NewFns.
+///
+/// Returns success without touching \p FakeHstPtr if NewFns is empty.
+/// Otherwise the current SlotCnt value becomes the slot of this allocation
+/// (SlotCnt is decremented afterwards), \p FakeHstPtr is set to the pointer
+/// __offload_get_new_sanitizer_ptr returns for that slot, and every kernel in
+/// NewFns is launched with one team of one thread on {DevicePtr, Size, Slot}.
+/// The first launch error, if any, is returned.
+Error GPUSanTy::notifyDataMapped(void *DevicePtr, uint64_t Size,
+                                 void *&FakeHstPtr) {
+  if (NewFns.empty())
+    return Plugin::success();
+  uint64_t Slot = SlotCnt--;
+  FakeHstPtr = __offload_get_new_sanitizer_ptr(Slot);
+  KernelArgsTy Args = {};
+  Args.NumTeams[0] = 1;
+  Args.ThreadLimit[0] = 1;
+  AsyncInfoWrapperTy AsyncInfoWrapper(Device, nullptr);
+  for (GenericKernelTy *NewFP : NewFns) {
+    // Kernel arguments, passed as one buffer described by size and address.
+    struct {
+      void *Ptr;
+      uint64_t Length;
+      uint64_t Slot;
+    } KernelArgs{DevicePtr, Size, Slot};
+    KernelLaunchParamsTy ArgPtrs{sizeof(KernelArgs), &KernelArgs, nullptr};
+    Args.ArgPtrs = reinterpret_cast<void **>(&ArgPtrs);
+    // NOTE(review): IsCUDA presumably makes the launch treat ArgPtrs as a
+    // KernelLaunchParamsTy rather than an array of pointers - TODO confirm.
+    Args.Flags.IsCUDA = true;
+    if (auto Err = NewFP->launch(Device, Args.ArgPtrs, nullptr, Args,
+                                 AsyncInfoWrapper)) {
+      AsyncInfoWrapper.finalize(Err);
+      return Err;
+    }
+  }
+
+  Error Err = Plugin::success();
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
+/// Announce an unmapping to the kernels in FreeFns.
+///
+/// A null \p FakeHstPtr is ignored. Otherwise every kernel in FreeFns is
+/// launched with one team of one thread and \p FakeHstPtr as its only
+/// argument. The first launch error, if any, is returned.
+Error GPUSanTy::notifyDataUnmapped(void *FakeHstPtr) {
+  if (!FakeHstPtr)
+    return Plugin::success();
+  KernelArgsTy Args = {};
+  Args.NumTeams[0] = 1;
+  Args.ThreadLimit[0] = 1;
+  AsyncInfoWrapperTy AsyncInfoWrapper(Device, nullptr);
+  for (GenericKernelTy *FreeFn : FreeFns) {
+    KernelLaunchParamsTy ArgPtrs{sizeof(void *), &FakeHstPtr, nullptr};
+    Args.ArgPtrs = reinterpret_cast<void **>(&ArgPtrs);
+    Args.Flags.IsCUDA = true;
+    if (auto Err = FreeFn->launch(Device, Args.ArgPtrs, nullptr, Args,
+                                  AsyncInfoWrapper)) {
+      AsyncInfoWrapper.finalize(Err);
+      return Err;
+    }
+  }
+  Error Err = Plugin::success();
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
+/// Print a sanitizer report to stderr if the device recorded a trap.
+///
+/// Returns silently when the device has no SanitizerTrapInfo buffer or the
+/// buffer holds no error. OpenMP target region names are shortened to
+/// "omp target (<function>:<line>)" before they are printed.
+void GPUSanTy::checkAndReportError() {
+  if (!Device.SanitizerTrapInfo)
+    return;
+  SanitizerTrapInfoTy &STI = *Device.SanitizerTrapInfo;
+  if (STI.ErrorCode == SanitizerTrapInfoTy::None)
+    return;
+
+  // ANSI escape sequences used to colorize the report.
+  auto Green = []() { return "\033[1m\033[32m"; };
+  auto Blue = []() { return "\033[1m\033[34m"; };
+  auto Red = []() { return "\033[1m\033[31m"; };
+  auto Default = []() { return "\033[1m\033[0m"; };
+
+  std::string KernelName;
+  StringRef FunctionName =
+      STI.FunctionName[0] ? StringRef(STI.FunctionName) : "<unknown>";
+  StringRef FileName = STI.FileName[0] ? StringRef(STI.FileName) : "<unknown>";
+
+  // Target region entry points are named
+  // "__omp_offloading_<device-id>_<file-id>_<function>_l<line>". Remove the
+  // prefix and the two id fields. consume_front strips exactly the prefix;
+  // sizeof on the literal would also count its NUL terminator.
+  if (FunctionName.consume_front("__omp_offloading_")) {
+    for (int Field = 0; Field < 2; ++Field) {
+      auto It = FunctionName.find_first_of("_");
+      if (It != StringRef::npos)
+        FunctionName = FunctionName.drop_front(It + 1);
+    }
+  }
+
+  // Remove the "_debug___omp_outlined_debug__" suffix, if present.
+  FunctionName.consume_back("_debug___omp_outlined_debug__");
+
+  // Rewrite "<function>_l<line>" as "omp target (<function>:<line>)". The
+  // bounds check keeps a name that ends in '_' from being indexed past its
+  // end.
+  auto It = FunctionName.find_last_of("_");
+  if (It != StringRef::npos && It + 1 < FunctionName.size() &&
+      FunctionName[It + 1] == 'l') {
+    int64_t KernelLineNo = 0;
+    StringRef LineNoStr = FunctionName.drop_front(It + /* "_l" */ 2);
+    // getAsInteger returns true on failure and leaves KernelLineNo at 0.
+    if (!LineNoStr.getAsInteger(10, KernelLineNo) && KernelLineNo) {
+      KernelName = "omp target (" + FunctionName.take_front(It).str() + ":" +
+                   std::to_string(KernelLineNo) + ")";
+      FunctionName = KernelName;
+    }
+  }
+
+  fprintf(stderr, "============================================================"
+                  "====================\n");
+  switch (STI.ErrorCode) {
+  case SanitizerTrapInfoTy::None:
+    llvm_unreachable("Unexpected exception");
+  case SanitizerTrapInfoTy::ExceedsLength:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s%s\n", Red(), "exceeds length",
+            Default());
+    break;
+  case SanitizerTrapInfoTy::ExceedsSlots:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s%s\n", Red(), "exceeds slots",
+            Default());
+    break;
+  case SanitizerTrapInfoTy::OutOfBounds: {
+    void *PC = reinterpret_cast<void *>(STI.PC);
+    void *Addr = utils::advancePtr(STI.AllocationStart, STI.PtrOffset);
+    fprintf(stderr,
+            "%sERROR: OffloadSanitizer %s on address %p at pc "
+            "%p%s\n",
+            Red(), "out-of-bounds access", Addr, PC, Default());
+    fprintf(
+        stderr,
+        "%s%s of size %u at %p thread <%u, %u, %u> block <%lu, %lu, %lu>%s\n",
+        Blue(), STI.AccessId > 0 ? "WRITE" : "READ", STI.AccessSize, Addr,
+        STI.ThreadId[0], STI.ThreadId[1], STI.ThreadId[2], STI.BlockId[0],
+        STI.BlockId[1], STI.BlockId[2], Default());
+    fprintf(stderr, "    #0 %p %s in %s:%lu\n\n", PC,
+            FunctionName.str().c_str(), FileName.data(), STI.LineNo);
+    fprintf(stderr,
+            "%s%p is located %lu bytes inside of a %lu-byte region [%p,%p)%s\n",
+            Green(), Addr, STI.PtrOffset, STI.AllocationLength,
+            STI.AllocationStart,
+            utils::advancePtr(STI.AllocationStart, STI.AllocationLength),
+            Default());
+    break;
+  }
+  case SanitizerTrapInfoTy::UseAfterFree:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s%s\n", Red(), "use-after-free",
+            Default());
+    break;
+  case SanitizerTrapInfoTy::MemoryLeak:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s%s\n", Red(), "memory leak",
+            Default());
+    break;
+  }
+  fflush(stderr);
+}
+
 bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
 #ifdef LIBOMPTARGET_RPC_SUPPORT
   return true;
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index bfbc101529e18..6ba51ae0db565 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -708,7 +708,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
       return Plugin::error("Wrong device Page size");
 
     // Ceil to page size.
-    Size = roundUp(Size, Granularity);
+    Size = utils::roundUp(Size, Granularity);
 
     // Create a handler of our allocation
     CUmemGenericAllocationHandle AHandle;
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index b442df45deaa5..0f30f6028f103 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -61,6 +61,23 @@ endforeach()
 target_compile_options(omptarget PRIVATE ${offload_compile_flags})
 target_link_options(omptarget PRIVATE ${offload_link_flags})
 
+add_llvm_library(offload.kernels
+  STATIC
+
+  Kernels/Sanitizer.cpp
+  
+  LINK_LIBS
+  PUBLIC
+  omptarget.devicertl
+
+  NO_INSTALL_RPATH
+  BUILDTREE_ONLY
+)
+
+list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS)
+target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
+target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
+
 # libomptarget.so needs to be aware of where the plugins live as they
 # are now separated in the build directory.
 set_target_properties(omptarget PROPERTIES
@@ -68,3 +85,6 @@ set_target_properties(omptarget PROPERTIES
                       INSTALL_RPATH "$ORIGIN"
                       BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
 install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
+# offload.kernels is a STATIC library, i.e. an ARCHIVE artifact; options that
+# follow the LIBRARY keyword would not apply to it.
+install(TARGETS offload.kernels ARCHIVE COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
diff --git a/offload/src/DeviceImage.cpp b/offload/src/DeviceImage.cpp
index e42460b5cca4f..e5b4bf5526437 100644
--- a/offload/src/DeviceImage.cpp
+++ b/offload/src/DeviceImage.cpp
@@ -27,9 +27,8 @@ DeviceImageTy::DeviceImageTy(__tgt_bin_desc &BinaryDesc,
                              __tgt_device_image &TgtDeviceImage)
     : BinaryDesc(&BinaryDesc), Image(TgtDeviceImage) {
 
-  llvm::StringRef ImageStr(
-      static_cast<char *>(Image.ImageStart),
-      llvm::omp::target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
+  llvm::StringRef ImageStr(static_cast<char *>(Image.ImageStart),
+                           utils::getPtrDiff(Image.ImageEnd, Image.ImageStart));
 
   auto BinaryOrErr =
       llvm::object::OffloadBinary::create(llvm::MemoryBufferRef(ImageStr, ""));
diff --git a/offload/src/Kernels/Sanitizer.cpp b/offload/src/Kernels/Sanitizer.cpp
new file mode 100644
index 0000000000000..7b1d73a81aeba
--- /dev/null
+++ b/offload/src/Kernels/Sanitizer.cpp
@@ -0,0 +1,33 @@
+//===-- Kernels/Sanitizer.cpp - Sanitizer Kernel Definitions --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Kernel entry points that forward their arguments to the device functions
+// __sanitizer_register_host and __sanitizer_unregister_host.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+
+extern "C" {
+// Device-side sanitizer entry points; only declared in this file.
+__device__ void __sanitizer_register_host(void *P, uint64_t Bytes,
+                                          uint64_t Slot);
+__device__ void __sanitizer_unregister_host(void *P);
+
+/// Kernel wrapper: calls __sanitizer_register_host(P, Bytes, Slot).
+[[clang::disable_sanitizer_instrumentation]] __global__ void
+__sanitizer_register(void *P, uint64_t Bytes, uint64_t Slot) {
+  __sanitizer_register_host(P, Bytes, Slot);
+}
+
+/// Kernel wrapper: calls __sanitizer_unregister_host(P).
+[[clang::disable_sanitizer_instrumentation]] __global__ void
+__sanitizer_unregister(void *P) {
+  __sanitizer_unregister_host(P);
+}
+}
diff --git a/offload/src/OpenMP/Mapping.cpp b/offload/src/OpenMP/Mapping.cpp
index f198489aef046..407c6cd422ba7 100644
--- a/offload/src/OpenMP/Mapping.cpp
+++ b/offload/src/OpenMP/Mapping.cpp
@@ -15,13 +15,6 @@
 #include "Shared/Requirements.h"
 #include "device.h"
 
-extern "C" {
-[[gnu::weak]] void ompx_free_allocation_host(void *P) {}
-[[gnu::weak]] void *ompx_new_allocation_host(void *P, uint64_t) {
-  return nullptr;
-}
-}
-
 /// Dump a table of all the host-target pointer pairs on failure
 void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
                                bool toStdOut) {
@@ -75,11 +68,8 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin,
     return OFFLOAD_FAIL;
   }
 
-  void *FakeTgtPtrBegin = ompx_new_allocation_host(TgtPtrBegin, Size);
-  printf("FP %p -> %p \n", (void *)TgtPtrBegin, FakeTgtPtrBegin);
-
   // Mapping does not exist, allocate it with refCount=INF
-  const HostDataToTargetTy &NewEntry =
+  HostDataToTargetTy &NewEntry =
       *HDTTMap
            ->emplace(new HostDataToTargetTy(
                /*HstPtrBase=*/(uintptr_t)HstPtrBegin,
@@ -88,7 +78,7 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin,
                /*TgtAllocBegin=*/(uintptr_t)TgtPtrBegin,
                /*TgtPtrBegin=*/(uintptr_t)TgtPtrBegin,
                /*UseHoldRefCount=*/false, /*Name=*/nullptr,
-               /*IsRefCountINF=*/true, FakeTgtPtrBegin))
+               /*IsRefCountINF=*/true))
            .first->HDTT;
   DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD
      ", HstEnd=" DPxMOD ", TgtBegin=" DPxMOD ", DynRefCount=%s, "
@@ -99,7 +89,8 @@ int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin,
   (void)NewEntry;
 
   // Notify the plugin about the new mapping.
-  return Device.notifyDataMapped(HstPtrBegin, Size);
+  return Device.notifyDataMapped(HstPtrBegin, TgtPtrBegin, Size,
+                                 NewEntry.FakeTgtPtrBegin);
 }
 
 int MappingInfoTy::disassociatePtr(void *HstPtrBegin) {
@@ -130,7 +121,7 @@ int MappingInfoTy::disassociatePtr(void *HstPtrBegin) {
     if (Event)
       Device.destroyEvent(Event);
     HDTTMap->erase(It);
-    return Device.notifyDataUnmapped(HstPtrBegin);
+    return Device.notifyDataUnmapped(HstPtrBegin, HDTT.FakeTgtPtrBegin);
   }
 
   REPORT("Trying to disassociate a pointer which was not mapped via "
@@ -302,7 +293,6 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
     uintptr_t TgtAllocBegin =
         (uintptr_t)Device.allocData(TgtPadding + Size, HstPtrBegin);
     uintptr_t TgtPtrBegin = TgtAllocBegin + TgtPadding;
-    void *FakeTgtPtrBegin = ompx_new_allocation_host((void *)TgtPtrBegin, Size);
     // Release the mapping table lock only after the entry is locked by
     // attaching it to TPR.
     LR.TPR.setEntry(
@@ -310,10 +300,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
             ->emplace(new HostDataToTargetTy(
                 (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
                 (uintptr_t)HstPtrBegin + Size, TgtAllocBegin, TgtPtrBegin,
-                HasHoldModifier, HstPtrName, /*IsINF=*/false, FakeTgtPtrBegin))
+                HasHoldModifier, HstPtrName, /*IsINF=*/false))
             .first->HDTT);
-    printf("ENTRY %p -> %p\n", LR.TPR.getEntry(),
-           LR.TPR.getEntry()->FakeTgtPtrBegin);
     INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
          "Creating new map entry with HstPtrBase=" DPxMOD
          ", HstPtrBegin=" DPxMOD ", TgtAllocBegin=" DPxMOD
@@ -327,7 +315,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
     LR.TPR.TargetPointer = (void *)TgtPtrBegin;
 
     // Notify the plugin about the new mapping.
-    if (Device.notifyDataMapped(HstPtrBegin, Size))
+    if (Device.notifyDataMapped(HstPtrBegin, LR.TPR.TargetPointer, Size,
+                                LR.TPR.getEntry()->FakeTgtPtrBegin))
       return TargetPointerResultTy{};
   } else {
     // This entry is not present and we did not create a new entry for it.
@@ -506,14 +495,11 @@ int MappingInfoTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry,
     return OFFLOAD_FAIL;
   }
 
-  printf("DEL %p -> %p\n", Entry, Entry->FakeTgtPtrBegin);
-  if (Entry->FakeTgtPtrBegin)
-    ompx_free_allocation_host(Entry->FakeTgtPtrBegin);
-
   int Ret = Device.deleteData((void *)Entry->TgtAllocBegin);
 
   // Notify the plugin about the unmapped memory.
-  Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin);
+  Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin,
+                                   Entry->FakeTgtPtrBegin);
 
   delete Entry;
 
diff --git a/offload/src/device.cpp b/offload/src/device.cpp
index 943c778278730..fc41721e6c0a6 100644
--- a/offload/src/device.cpp
+++ b/offload/src/device.cpp
@@ -191,21 +191,24 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                                   DstPtr, Size, AsyncInfo);
 }
 
-int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
+int32_t DeviceTy::notifyDataMapped(void *HstPtr, void *DevicePtr, int64_t Size,
+                                   void *&FakeHstPtr) {
   DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n",
      DPxPTR(HstPtr), Size);
 
-  if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) {
+  if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, DevicePtr, Size,
+                              FakeHstPtr)) {
     REPORT("Notifiying about data mapping failed.\n");
     return OFFLOAD_FAIL;
   }
   return OFFLOAD_SUCCESS;
 }
 
-int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) {
-  DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr));
+int32_t DeviceTy::notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) {
+  DP("Notifying about an unmapping: HstPtr=" DPxMOD " FakeHstPtr=" DPxMOD "\n",
+     DPxPTR(HstPtr), DPxPTR(FakeHstPtr));
 
-  if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) {
+  if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr, FakeHstPtr)) {
     REPORT("Notifiying about data unmapping failed.\n");
     return OFFLOAD_FAIL;
   }
diff --git a/offload/src/exports b/offload/src/exports
index f8d69c3814961..829a55fc6398e 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -78,6 +78,7 @@ VERS1.0 {
     llvmLaunchKernel;
     ompx_new_allocation_host;
     ompx_free_allocation_host;
+    ompx_register_image_functions;
   local:
     *;
 };
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index f7715a422a5b0..fa26b21b84a3e 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -271,17 +271,22 @@ static int initLibrary(DeviceTy &Device) {
            ", name \"%s\"\n",
            DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
            CurrDeviceEntry->size, CurrDeviceEntry->name);
-        HDTTMap->emplace(new HostDataToTargetTy(
-            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
-            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
-            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
-            (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/,
-            (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/,
-            false /*UseHoldRefCount*/, CurrHostEntry->name,
-            true /*IsRefCountINF*/));
+        auto *Entry = HDTTMap
+                          ->emplace(new HostDataToTargetTy(
+                              (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
+                              (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
+                              (uintptr_t)CurrHostEntry->addr +
+                                  CurrHostEntry->size /*HstPtrEnd*/,
+                              (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/,
+                              (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/,
+                              false /*UseHoldRefCount*/, CurrHostEntry->name,
+                              true /*IsRefCountINF*/))
+                          .first->HDTT;
 
         // Notify about the new mapping.
-        if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size))
+        if (Device.notifyDataMapped(CurrHostEntry->addr, CurrDeviceEntryAddr,
+                                    CurrHostEntry->size,
+                                    Entry->FakeTgtPtrBegin))
           return OFFLOAD_FAIL;
       }
     }
@@ -323,8 +328,8 @@ void handleTargetOutcome(bool Success, ident_t *Loc) {
         for (auto &Image : PM->deviceImages()) {
           const char *Start = reinterpret_cast<const char *>(
               Image.getExecutableImage().ImageStart);
-          uint64_t Length = llvm::omp::target::getPtrDiff(
-              Start, Image.getExecutableImage().ImageEnd);
+          uint64_t Length =
+              utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd);
           llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
                                        /*Identifier=*/"");
 
diff --git a/offload/test/sanitizer/global_null.c b/offload/test/sanitizer/global_null.c
new file mode 100644
index 0000000000000..68bd3257ea788
--- /dev/null
+++ b/offload/test/sanitizer/global_null.c
@@ -0,0 +1,28 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic -loffload.kernels -mllvm -enable-gpu-san
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+int *Null = 0;
+#pragma omp declare target(Null)
+
+int main(void) {
+
+#pragma omp target
+  {
+    // clang-format off
+    // CHECK:      ERROR: AddressSanitizer out-of-bounds-access on address 0x0 at pc [[PC:0x.*]]
+    // CHECK-NEXT: WRITE of size 4 at 0x0 thread B<0,0,0> T<0,0,0>
+    // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
+    // CHECK-NEXT: 0x0 is located 0 bytes inside of 0-byte region [0x0,0x0)
+    // clang-format on
+    *Null = 42;
+  }
+}
diff --git a/offload/test/sanitizer/heap_null.c b/offload/test/sanitizer/heap_null.c
new file mode 100644
index 0000000000000..2590a4dfab7eb
--- /dev/null
+++ b/offload/test/sanitizer/heap_null.c
@@ -0,0 +1,26 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic -loffload.kernels -mllvm -enable-gpu-san
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+int main(void) {
+
+  int *Null = 0;
+#pragma omp target
+  {
+    // clang-format off
+    // CHECK:      ERROR: AddressSanitizer out-of-bounds-access on address 0x0 at pc [[PC:0x.*]]
+    // CHECK-NEXT: WRITE of size 4 at 0x0 thread B<0,0,0> T<0,0,0>
+    // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
+    // CHECK-NEXT: 0x0 is located 0 bytes inside of 0-byte region [0x0,0x0)
+    // clang-format on
+    *Null = 42;
+  }
+}
diff --git a/offload/test/sanitizer/volatile_stack_null.c b/offload/test/sanitizer/volatile_stack_null.c
new file mode 100644
index 0000000000000..9ad986414b069
--- /dev/null
+++ b/offload/test/sanitizer/volatile_stack_null.c
@@ -0,0 +1,26 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic -loffload.kernels -mllvm -enable-gpu-san
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+int main(void) {
+
+#pragma omp target
+  {
+    volatile int *Null = 0;
+    // clang-format off
+    // CHECK:      ERROR: AddressSanitizer out-of-bounds-access on address 0x0 at pc [[PC:0x.*]]
+    // CHECK-NEXT: WRITE of size 4 at 0x0 thread B<0,0,0> T<0,0,0>
+    // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
+    // CHECK-NEXT: 0x0 is located 0 bytes inside of 0-byte region [0x0,0x0)
+    // clang-format on
+    *Null = 42;
+  }
+}
diff --git a/preload.cpp b/preload.cpp
deleted file mode 100644
index 432a1116a6a90..0000000000000
--- a/preload.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-//===------ AllocationTracker.cpp - Track allocation for sanitizers -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include <cassert>
-#include <cstdint>
-#include <cstdio>
-
-#define _OBJECT_TY uint16_t
-
-enum class AllocationKind { GLOBAL, LOCAL, LAST = LOCAL };
-
-template <AllocationKind AK> struct Config {
-  static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 3;
-  static constexpr uint32_t NUM_ALLOCATION_ARRAYS =
-      AK == AllocationKind::GLOBAL ? 1 : (256 * 8 * 4);
-  static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8;
-
-  static constexpr uint32_t OBJECT_BITS =
-      AK == AllocationKind::GLOBAL ? 10 : (sizeof(_OBJECT_TY) * 8);
-  static constexpr uint32_t SLOTS =
-      (1 << (OBJECT_BITS)) / NUM_ALLOCATION_ARRAYS;
-  static constexpr uint32_t KIND_BITS = 1;
-  static constexpr uint32_t SID_BITS = 16 - KIND_BITS;
-
-  static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - SID_BITS - KIND_BITS;
-  static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
-
-  static constexpr bool useTags() { return TAG_BITS > 1; }
-
-  static_assert(LENGTH_BITS + TAG_BITS + KIND_BITS + SID_BITS == 64,
-                "Length and tag bits should cover 64 bits");
-  static_assert(OFFSET_BITS + TAG_BITS + KIND_BITS + SID_BITS + OBJECT_BITS ==
-                    64,
-                "Length, tag, and object bits should cover 64 bits");
-  static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1),
-                "Kind bits should match allocation kinds");
-};
-
-template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
-  return *((DstTy *)(&V));
-}
-
-template <AllocationKind AK> struct AllocationPtrTy {
-  static AllocationPtrTy<AK> get(void *P) {
-    return convertViaPun<AllocationPtrTy<AK>>(P);
-  }
-
-  operator void *() const { return convertViaPun<void *>(*this); }
-  operator intptr_t() const { return convertViaPun<intptr_t>(*this); }
-  uint64_t PtrOffset : Config<AK>::OFFSET_BITS;
-  uint64_t AllocationTag : Config<AK>::TAG_BITS;
-  uint64_t AllocationId : Config<AK>::OBJECT_BITS;
-  // Must be last, TODO: merge into TAG
-  uint64_t Kind : Config<AK>::KIND_BITS;
-};
-
-static_assert(sizeof(AllocationPtrTy<AllocationKind::GLOBAL>) == sizeof(void *),
-              "AllocationTy pointers should be pointer sized");
-
-extern "C" {
-
-[[gnu::flatten, gnu::always_inline]] void *ompx_new(uint16_t &AllocationId) {
-  static uint16_t NumHostAllocs = Config<AllocationKind::GLOBAL>::SLOTS - 1;
-  AllocationId = NumHostAllocs--;
-  AllocationPtrTy<AllocationKind::GLOBAL> AP;
-  AP.PtrOffset = 0;
-  AP.AllocationId = AllocationId;
-  AP.Kind = (uint32_t)AllocationKind::GLOBAL;
-  return AP;
-}
-
-#pragma omp begin declare target
-void ompx_new_global(void *P, uint64_t Bytes, uint16_t AllocationId,
-                     uint32_t Slot);
-void ompx_free_global(void *P);
-#pragma omp end declare target
-
-void *ompx_new_allocation_host(void *P, uint64_t Bytes) {
-  uint16_t AllocationId;
-  void *NewP = ompx_new(AllocationId);
-#pragma omp target is_device_ptr(P)
-  ompx_new_global(P, Bytes, AllocationId, AllocationId);
-  printf("registered %p[:%10zu] -> %zu:%p\n", P, Bytes, (uint64_t)AllocationId,
-         NewP);
-  fflush(stdout);
-  return NewP;
-}
-
-void ompx_free_allocation_host(void *P) {
-  printf("unregister   %p\n", P);
-  fflush(stdout);
-#pragma omp target is_device_ptr(P)
-  ompx_free_global(P);
-}
-}

>From 192d35d4e20e60da32be6ba0b9b0834d29fc05c0 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 26 Jun 2024 11:49:57 -0700
Subject: [PATCH 08/31] Add lifetime handling

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 41 ++++++++++++++---
 offload/DeviceRTL/src/Sanitizer.cpp           | 43 +++++++++++++++++
 .../common/include/PluginInterface.h          |  4 ++
 .../common/src/PluginInterface.cpp            | 46 ++++++++++---------
 offload/src/omptarget.cpp                     |  7 ++-
 offload/test/sanitizer/global_null.c          |  6 +--
 offload/test/sanitizer/heap_null.c            |  8 ++--
 offload/test/sanitizer/volatile_stack_null.c  |  8 ++--
 8 files changed, 125 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 7b726378f7d6b..9c01abd57f96c 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -135,6 +136,14 @@ class GPUSanImpl final {
     return getOrCreateFn(UnpackFn[PO], "ompx_unpack" + getSuffix(PO), PtrTy,
                          {PtrTy, Int64Ty});
   }
+  FunctionCallee getLifetimeStart() {
+    return getOrCreateFn(LifetimeStartFn, "ompx_lifetime_start", VoidTy,
+                         {PtrTy, Int64Ty});
+  }
+  FunctionCallee getLifetimeEnd() {
+    return getOrCreateFn(LifetimeEndFn, "ompx_lifetime_end", VoidTy,
+                         {PtrTy, Int64Ty});
+  }
   FunctionCallee getLeakCheckFn() {
     FunctionCallee LeakCheckFn;
     return getOrCreateFn(LeakCheckFn, "ompx_leak_check", VoidTy, {});
@@ -159,6 +168,8 @@ class GPUSanImpl final {
   FunctionCallee FreeFn[3];
   FunctionCallee CheckFn[3];
   FunctionCallee UnpackFn[3];
+  FunctionCallee LifetimeEndFn;
+  FunctionCallee LifetimeStartFn;
   FunctionCallee FreeNLocal;
 
   StringMap<Value *> GlobalStringMap;
@@ -268,18 +279,35 @@ bool GPUSanImpl::instrumentGlobals() {
 
 Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size,
                                         FunctionCallee Fn) {
-  IRBuilder<> IRB(&*I.getParent()->getFirstNonPHIOrDbgOrAlloca());
+  IRBuilder<> IRB(I.getNextNode());
   Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, PtrTy);
   static int AllocationId = 1;
   auto *CB = IRB.CreateCall(
       Fn,
       {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++), getPC(IRB)},
       I.getName() + ".san");
-  I.replaceUsesWithIf(IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()),
-                      [=](Use &U) {
-                        return U.getUser() != PlainI && U.getUser() != CB &&
-                               !isa<LifetimeIntrinsic>(U.getUser());
-                      });
+  SmallVector<LifetimeIntrinsic *> Lifetimes;
+  I.replaceUsesWithIf(
+      IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()), [&](Use &U) {
+        if (auto *LT = dyn_cast<LifetimeIntrinsic>(U.getUser())) {
+          Lifetimes.push_back(LT);
+          return false;
+        }
+        return U.getUser() != PlainI && U.getUser() != CB;
+      });
+  if (Lifetimes.empty())
+    return CB;
+
+  CB->setArgOperand(1, ConstantInt::get(Int64Ty, 0));
+  for (auto *LT : Lifetimes) {
+    if (LT->getIntrinsicID() == Intrinsic::lifetime_start) {
+      IRB.SetInsertPoint(LT);
+      IRB.CreateCall(getLifetimeStart(), {CB, LT->getArgOperand(0)});
+    } else {
+      IRB.SetInsertPoint(LT);
+      IRB.CreateCall(getLifetimeEnd(), {CB, LT->getArgOperand(0)});
+    }
+  }
   return CB;
 }
 
@@ -455,6 +483,7 @@ PreservedAnalyses GPUSanPass::run(Module &M, ModuleAnalysisManager &AM) {
   GPUSanImpl Lowerer(M, FAM);
   if (!Lowerer.instrument())
     return PreservedAnalyses::all();
+  LLVM_DEBUG(M.dump());
 
   return PreservedAnalyses::none();
 }
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index d080214390982..d67c85f852fce 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -165,6 +165,38 @@ template <AllocationKind AK> struct AllocationTracker {
     return Ptr;
   }
 
+  [[clang::disable_sanitizer_instrumentation]] static void
+  lifetimeStart(void *P, uint64_t Length) { // Sets the Length of the allocation record selected by P.
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P); // View of P whose AllocationId is used below.
+    uint32_t ThreadId = 0, BlockId = 0; // Stay 0 unless AK is LOCAL.
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()]; // Index 0 when AK is not LOCAL.
+    auto &A = AllocArr.Arr[AP.AllocationId];
+    // TODO: Check length (Length is currently stored without any validation).
+    A.Length = Length;
+  }
+
+  [[clang::disable_sanitizer_instrumentation]] static void
+  lifetimeEnd(void *P, uint64_t Length) { // Zeroes the Length of the allocation record selected by P.
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P); // View of P whose AllocationId is used below.
+    uint32_t ThreadId = 0, BlockId = 0; // Stay 0 unless AK is LOCAL.
+    if constexpr (AK == AllocationKind::LOCAL) {
+      ThreadId = __kmpc_get_hardware_thread_id_in_block();
+      BlockId = ompx_block_id(0);
+    }
+    auto &AllocArr =
+        Allocations[ThreadId +
+                    BlockId * __kmpc_get_hardware_num_threads_in_block()]; // Index 0 when AK is not LOCAL.
+    auto &A = AllocArr.Arr[AP.AllocationId];
+    // TODO: Check length (the Length parameter is not read anywhere in this function).
+    A.Length = 0;
+  }
+
   [[clang::disable_sanitizer_instrumentation]] static void leakCheck() {
     static_assert(AK == AllocationKind::GLOBAL, "");
     auto &AllocArr = Allocations[0];
@@ -290,6 +322,17 @@ ompx_unpack_global(void *P, uint64_t PC) {
   return AllocationTracker<AllocationKind::GLOBAL>::unpack(P, PC);
 }
 
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void
+ompx_lifetime_start(void *P, uint64_t Length) { // Forwards to the LOCAL-kind tracker's lifetimeStart.
+  AllocationTracker<AllocationKind::LOCAL>::lifetimeStart(P, Length);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void
+ompx_lifetime_end(void *P, uint64_t Length) { // Forwards to the LOCAL-kind tracker's lifetimeEnd.
+  AllocationTracker<AllocationKind::LOCAL>::lifetimeEnd(P, Length);
+}
+
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
 ompx_leak_check() {
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d3cafc788b60b..d1898021c063d 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -739,6 +739,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                          void *&FakeHstPtr) {
     if (auto Err = GPUSan.notifyDataMapped(DevicePtr, Size, FakeHstPtr))
       return Err;
+    if (!HstPtr)
+      return Error::success();
     return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
   }
 
@@ -748,6 +750,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error notifyDataUnmapped(void *HstPtr, void *FakeHstPtr) {
     if (auto Err = GPUSan.notifyDataUnmapped(FakeHstPtr))
       return Err;
+    if (!HstPtr)
+      return Error::success();
     return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
   }
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 2c1029af2080d..a680fa2f1b7c3 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2207,10 +2207,10 @@ void GPUSanTy::checkAndReportError() {
   if (FunctionName.starts_with("__omp_offloading_")) {
     FunctionName = FunctionName.drop_front(sizeof("__omp_offloading_"));
     auto It = FunctionName.find_first_of("_");
-    if (It != StringRef::npos)
+    if (It != StringRef::npos && It + 1 < FunctionName.size())
       FunctionName = FunctionName.drop_front(It + 1);
     It = FunctionName.find_first_of("_");
-    if (It != StringRef::npos)
+    if (It != StringRef::npos && It + 1 < FunctionName.size())
       FunctionName = FunctionName.drop_front(It + 1);
   }
 
@@ -2219,7 +2219,7 @@ void GPUSanTy::checkAndReportError() {
         FunctionName.drop_back(sizeof("debug___omp_outlined_debug__"));
 
   auto It = FunctionName.find_last_of("_");
-  if (It != StringRef::npos) {
+  if (It != StringRef::npos && It + 1 < FunctionName.size()) {
     if (FunctionName[It + 1] == 'l') {
       int64_t KernelLineNo = 0;
       FunctionName
@@ -2240,42 +2240,44 @@ void GPUSanTy::checkAndReportError() {
   case SanitizerTrapInfoTy::None:
     llvm_unreachable("Unexpected exception");
   case SanitizerTrapInfoTy::ExceedsLength:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s%s\n", Red(), "exceeds length",
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds length",
             Default());
     break;
   case SanitizerTrapInfoTy::ExceedsSlots:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s%s\n", Red(), "exceeds slots",
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds slots",
             Default());
     break;
   case SanitizerTrapInfoTy::OutOfBounds: {
     void *PC = reinterpret_cast<void *>(STI.PC);
     void *Addr = utils::advancePtr(STI.AllocationStart, STI.PtrOffset);
     fprintf(stderr,
-            "%sERROR: OffloadSanitizer %s on address %p at pc "
-            "%p%s\n",
-            Red(), "out-of-bounds access", Addr, PC, Default());
-    fprintf(
-        stderr,
-        "%s%s of size %u at %p thread <%u, %u, %u> block <%lu, %lu, %lu>%s\n",
-        Blue(), STI.AccessId > 0 ? "WRITE" : "READ", STI.AccessSize, Addr,
-        STI.ThreadId[0], STI.ThreadId[1], STI.ThreadId[2], STI.BlockId[0],
-        STI.BlockId[1], STI.BlockId[2], Default());
+            "%sERROR: OffloadSanitizer %s on address " DPxMOD " at pc " DPxMOD
+            "\n%s",
+            Red(), "out-of-bounds access", DPxPTR(Addr), DPxPTR(PC), Default());
+    fprintf(stderr,
+            "%s%s of size %u at " DPxMOD
+            " thread <%u, %u, %u> block <%lu, %lu, %lu>\n%s",
+            Blue(), STI.AccessId > 0 ? "WRITE" : "READ", STI.AccessSize,
+            DPxPTR(Addr), STI.ThreadId[0], STI.ThreadId[1], STI.ThreadId[2],
+            STI.BlockId[0], STI.BlockId[1], STI.BlockId[2], Default());
     fprintf(stderr, "    #0 %p %s in %s:%lu\n\n", PC,
             FunctionName.str().c_str(), FileName.data(), STI.LineNo);
-    fprintf(stderr,
-            "%s%p is located %lu bytes inside of a %lu-byte region [%p,%p)%s\n",
-            Green(), Addr, STI.PtrOffset, STI.AllocationLength,
-            STI.AllocationStart,
-            utils::advancePtr(STI.AllocationStart, STI.AllocationLength),
-            Default());
+    fprintf(
+        stderr,
+        "%s" DPxMOD " is located %lu bytes inside of a %lu-byte region [" DPxMOD
+        "," DPxMOD ")\n%s",
+        Green(), DPxPTR(Addr), STI.PtrOffset, STI.AllocationLength,
+        DPxPTR(STI.AllocationStart),
+        DPxPTR(utils::advancePtr(STI.AllocationStart, STI.AllocationLength)),
+        Default());
     break;
   }
   case SanitizerTrapInfoTy::UseAfterFree:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s%s\n", Red(), "use-after-free",
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "use-after-free",
             Default());
     break;
   case SanitizerTrapInfoTy::MemoryLeak:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s%s\n", Red(), "memory leak",
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "memory leak",
             Default());
     break;
   }
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index fa26b21b84a3e..6701e1676bda3 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -443,7 +443,10 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
 
   Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
   DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
-  return Rc;
+  void *FakeHstPtr;
+  if (DeviceOrErr->notifyDataMapped(nullptr, Rc, Size, FakeHstPtr))
+    return nullptr;
+  return FakeHstPtr;
 }
 
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
@@ -469,6 +472,8 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
   if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
     FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr");
 
+  DeviceOrErr->notifyDataUnmapped(nullptr, DevicePtr);
+
   DP("omp_target_free deallocated device ptr\n");
 }
 
diff --git a/offload/test/sanitizer/global_null.c b/offload/test/sanitizer/global_null.c
index 68bd3257ea788..3442deea81e97 100644
--- a/offload/test/sanitizer/global_null.c
+++ b/offload/test/sanitizer/global_null.c
@@ -18,10 +18,10 @@ int main(void) {
 #pragma omp target
   {
     // clang-format off
-    // CHECK:      ERROR: AddressSanitizer out-of-bounds-access on address 0x0 at pc [[PC:0x.*]]
-    // CHECK-NEXT: WRITE of size 4 at 0x0 thread B<0,0,0> T<0,0,0>
+    // CHECK:      ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]]
+    // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0>
     // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
-    // CHECK-NEXT: 0x0 is located 0 bytes inside of 0-byte region [0x0,0x0)
+    // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000)
     // clang-format on
     *Null = 42;
   }
diff --git a/offload/test/sanitizer/heap_null.c b/offload/test/sanitizer/heap_null.c
index 2590a4dfab7eb..8f1ac0bdf530f 100644
--- a/offload/test/sanitizer/heap_null.c
+++ b/offload/test/sanitizer/heap_null.c
@@ -10,16 +10,18 @@
 // UNSUPPORTED: s390x-ibm-linux-gnu
 // UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
+// Align lines.
+
 int main(void) {
 
   int *Null = 0;
 #pragma omp target
   {
     // clang-format off
-    // CHECK:      ERROR: AddressSanitizer out-of-bounds-access on address 0x0 at pc [[PC:0x.*]]
-    // CHECK-NEXT: WRITE of size 4 at 0x0 thread B<0,0,0> T<0,0,0>
+    // CHECK:      ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]]
+    // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0>
     // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
-    // CHECK-NEXT: 0x0 is located 0 bytes inside of 0-byte region [0x0,0x0)
+    // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000)
     // clang-format on
     *Null = 42;
   }
diff --git a/offload/test/sanitizer/volatile_stack_null.c b/offload/test/sanitizer/volatile_stack_null.c
index 9ad986414b069..4f66424ef5a0e 100644
--- a/offload/test/sanitizer/volatile_stack_null.c
+++ b/offload/test/sanitizer/volatile_stack_null.c
@@ -10,16 +10,18 @@
 // UNSUPPORTED: s390x-ibm-linux-gnu
 // UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
+// Align lines.
+
 int main(void) {
 
 #pragma omp target
   {
     volatile int *Null = 0;
     // clang-format off
-    // CHECK:      ERROR: AddressSanitizer out-of-bounds-access on address 0x0 at pc [[PC:0x.*]]
-    // CHECK-NEXT: WRITE of size 4 at 0x0 thread B<0,0,0> T<0,0,0>
+    // CHECK:      ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]]
+    // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0>
     // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
-    // CHECK-NEXT: 0x0 is located 0 bytes inside of 0-byte region [0x0,0x0)
+    // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000)
     // clang-format on
     *Null = 42;
   }

>From 6af55022b71004476a00af8f7363111424024461 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 26 Jun 2024 17:08:48 -0700
Subject: [PATCH 09/31] Trying to add offload sanitizer to clang

---
 clang/include/clang/Basic/Sanitizers.def            |  3 +++
 clang/lib/CodeGen/BackendUtil.cpp                   |  5 +++++
 clang/lib/CodeGen/CGDeclCXX.cpp                     |  4 ++++
 clang/lib/CodeGen/CodeGenFunction.cpp               |  9 +++++++--
 clang/lib/Driver/ToolChain.cpp                      |  3 ++-
 llvm/lib/Transforms/Instrumentation/GPUSan.cpp      |  4 +++-
 offload/DeviceRTL/src/Sanitizer.cpp                 |  5 ++++-
 offload/include/Shared/Sanitizer.h                  | 13 +++++++++++++
 .../plugins-nextgen/common/src/PluginInterface.cpp  |  4 ++++
 9 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def
index bee35e9dca7c3..0b41187f6db52 100644
--- a/clang/include/clang/Basic/Sanitizers.def
+++ b/clang/include/clang/Basic/Sanitizers.def
@@ -190,6 +190,9 @@ SANITIZER_GROUP("bounds", Bounds, ArrayBounds | LocalBounds)
 // Scudo hardened allocator
 SANITIZER("scudo", Scudo)
 
+// LLVM/Offload sanitizer
+SANITIZER("offload", Offload)
+
 // Magic group, containing all sanitizers. For example, "-fno-sanitize=all"
 // can be used to disable all the sanitizers.
 SANITIZER_GROUP("all", All, ~SanitizerMask())
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index b09680086248d..a2aae16525421 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -70,6 +70,7 @@
 #include "llvm/Transforms/Instrumentation/BoundsChecking.h"
 #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
+#include "llvm/Transforms/Instrumentation/GPUSan.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/KCFI.h"
@@ -739,6 +740,10 @@ static void addSanitizers(const Triple &TargetTriple,
     if (LangOpts.Sanitize.has(SanitizerKind::DataFlow)) {
       MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles));
     }
+    if (LangOpts.Sanitize.has(SanitizerKind::Offload) &&
+        (TargetTriple.isAMDGPU() || TargetTriple.isNVPTX())) {
+      MPM.addPass(GPUSanPass());
+    }
   };
   if (ClSanitizeOnOptimizerEarlyEP) {
     PB.registerOptimizerEarlyEPCallback(
diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp
index a88bb2af59fee..90ebc638bd45e 100644
--- a/clang/lib/CodeGen/CGDeclCXX.cpp
+++ b/clang/lib/CodeGen/CGDeclCXX.cpp
@@ -456,6 +456,10 @@ llvm::Function *CodeGenModule::CreateGlobalInitOrCleanUpFunction(
       !isInNoSanitizeList(SanitizerKind::Address, Fn, Loc))
     Fn->addFnAttr(llvm::Attribute::SanitizeAddress);
 
+  if (getLangOpts().Sanitize.has(SanitizerKind::Offload) &&
+      !isInNoSanitizeList(SanitizerKind::Offload, Fn, Loc))
+    Fn->addFnAttr(llvm::Attribute::SanitizeAddress);
+
   if (getLangOpts().Sanitize.has(SanitizerKind::KernelAddress) &&
       !isInNoSanitizeList(SanitizerKind::KernelAddress, Fn, Loc))
     Fn->addFnAttr(llvm::Attribute::SanitizeAddress);
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index cea0d84c64bc4..d11c4ed65a597 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -31,6 +31,7 @@
 #include "clang/AST/StmtObjC.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/CodeGenOptions.h"
+#include "clang/Basic/Sanitizers.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
@@ -67,7 +68,8 @@ static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts,
   // Sanitizers may use markers.
   if (CGOpts.SanitizeAddressUseAfterScope ||
       LangOpts.Sanitize.has(SanitizerKind::HWAddress) ||
-      LangOpts.Sanitize.has(SanitizerKind::Memory))
+      LangOpts.Sanitize.has(SanitizerKind::Memory) ||
+      LangOpts.Sanitize.has(SanitizerKind::Offload))
     return true;
 
   // For now, only in optimized builds.
@@ -791,6 +793,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
       SanOpts.set(SanitizerKind::KernelHWAddress, false);
     if (no_sanitize_mask & SanitizerKind::KernelHWAddress)
       SanOpts.set(SanitizerKind::HWAddress, false);
+    if (no_sanitize_mask & SanitizerKind::Offload)
+      SanOpts.set(SanitizerKind::Offload, false);
 
     if (SanitizeBounds && !SanOpts.hasOneOf(SanitizerKind::Bounds))
       Fn->addFnAttr(llvm::Attribute::NoSanitizeBounds);
@@ -809,7 +813,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
     CurFn->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation);
   } else {
     // Apply sanitizer attributes to the function.
-    if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress))
+    if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress |
+                         SanitizerKind::Offload))
       Fn->addFnAttr(llvm::Attribute::SanitizeAddress);
     if (SanOpts.hasOneOf(SanitizerKind::HWAddress |
                          SanitizerKind::KernelHWAddress))
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 40ab2e91125d1..05b53c5573a1c 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1367,7 +1367,8 @@ SanitizerMask ToolChain::getSupportedSanitizers() const {
       SanitizerKind::CFICastStrict | SanitizerKind::FloatDivideByZero |
       SanitizerKind::KCFI | SanitizerKind::UnsignedIntegerOverflow |
       SanitizerKind::UnsignedShiftBase | SanitizerKind::ImplicitConversion |
-      SanitizerKind::Nullability | SanitizerKind::LocalBounds;
+      SanitizerKind::Nullability | SanitizerKind::LocalBounds |
+      SanitizerKind::Offload;
   if (getTriple().getArch() == llvm::Triple::x86 ||
       getTriple().getArch() == llvm::Triple::x86_64 ||
       getTriple().getArch() == llvm::Triple::arm || getTriple().isWasm() ||
diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 9c01abd57f96c..7c5718cbf12ca 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -380,6 +380,8 @@ bool GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) {
   if (isa<LifetimeIntrinsic>(CI))
     return Changed;
   if (auto *Fn = CI.getCalledFunction()) {
+    if (Fn->getName().starts_with("__kmpc_target_init"))
+      return Changed;
     if ((Fn->isDeclaration() || Fn->getName().starts_with("__kmpc") ||
          Fn->getName().starts_with("rpc_")) &&
         !Fn->getName().starts_with("ompx")) {
@@ -455,7 +457,7 @@ void GPUSanImpl::instrumentReturns(
   for (auto *RI : Returns) {
     IRBuilder<> IRB(RI);
     IRB.CreateCall(getFreeNLocalFn(),
-                   {ConstantInt::get(Int32Ty, Allocas.size())}, ".free");
+                   {ConstantInt::get(Int32Ty, Allocas.size())});
   }
 }
 
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index d67c85f852fce..cc4115f106b87 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -138,7 +138,10 @@ template <AllocationKind AK> struct AllocationTracker {
     int64_t Length = A.Length;
     if (Offset > Length - Size ||
         (SanitizerConfig<AK>::useTags() && A.Tag != AP.AllocationTag)) {
-      if (Offset > Length - Size)
+      if (AK == AllocationKind::LOCAL && Length == 0)
+        __sanitizer_trap_info_ptr->useAfterScope<AK>(
+            A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
+      else if (Offset > Length - Size)
         __sanitizer_trap_info_ptr->outOfBoundAccess<AK>(
             A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
       else
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index 01fba05c3ce05..c752ebf082865 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -97,6 +97,7 @@ struct SanitizerTrapInfoTy {
     ExceedsLength,
     ExceedsSlots,
     OutOfBounds,
+    UseAfterScope,
     UseAfterFree,
     MemoryLeak,
   } ErrorCode;
@@ -219,6 +220,18 @@ struct SanitizerTrapInfoTy {
     __builtin_trap();
   }
 
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
+    gnu::always_inline]] void
+  useAfterScope(const AllocationTy<AK> &A, const AllocationPtrTy<AK> &AP,
+                uint64_t Size, int64_t AccessId, uint64_t PC,
+                const char *FunctionName, const char *FileName,
+                uint64_t LineNo) { // Reports an access with the UseAfterScope error code, then traps.
+    accessError(UseAfterScope, A, AP, Size, AccessId, PC, FunctionName,
+                FileName, LineNo); // All access details are forwarded unchanged.
+    __builtin_trap(); // The function is declared noreturn; execution stops here.
+  }
+
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
     gnu::always_inline]] void
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index a680fa2f1b7c3..e5ce6d2ab6e9e 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2272,6 +2272,10 @@ void GPUSanTy::checkAndReportError() {
         Default());
     break;
   }
+  case SanitizerTrapInfoTy::UseAfterScope:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(),
+            "use-after-scope", Default());
+    break;
   case SanitizerTrapInfoTy::UseAfterFree:
     fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "use-after-free",
             Default());

>From e4563fa0e509f0b2bec41b5e9afb94638562c193 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Thu, 27 Jun 2024 12:04:33 -0700
Subject: [PATCH 10/31] Driver

---
 clang/include/clang/Driver/SanitizerArgs.h    |  3 ++
 clang/lib/CodeGen/BackendUtil.cpp             |  4 ---
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  6 ++++
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  7 +++--
 llvm/lib/Transforms/IPO/OpenMPOpt.cpp         | 12 ++++----
 .../lib/Transforms/Instrumentation/GPUSan.cpp |  3 +-
 offload/DeviceRTL/src/Sanitizer.cpp           | 11 +++----
 offload/include/Shared/Sanitizer.h            | 30 ++++++++-----------
 offload/src/CMakeLists.txt                    |  8 ++---
 offload/test/sanitizer/volatile_stack_null.c  |  2 +-
 10 files changed, 45 insertions(+), 41 deletions(-)

diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 47ef175302679..004d5fbf4af73 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -80,6 +80,9 @@ class SanitizerArgs {
 
   bool needsMemProfRt() const { return NeedsMemProfRt; }
   bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); }
+  bool needsOffloadKernels() const { // True when Sanitizers contains SanitizerKind::Offload.
+    return Sanitizers.has(SanitizerKind::Offload);
+  }
   bool needsHwasanRt() const {
     return Sanitizers.has(SanitizerKind::HWAddress);
   }
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index a2aae16525421..b1d11b4a6497a 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -740,10 +740,6 @@ static void addSanitizers(const Triple &TargetTriple,
     if (LangOpts.Sanitize.has(SanitizerKind::DataFlow)) {
       MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles));
     }
-    if (LangOpts.Sanitize.has(SanitizerKind::Offload) &&
-        (TargetTriple.isAMDGPU() || TargetTriple.isNVPTX())) {
-      MPM.addPass(GPUSanPass());
-    }
   };
   if (ClSanitizeOnOptimizerEarlyEP) {
     PB.registerOptimizerEarlyEPCallback(
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index d142cc791925d..ecbee87ed6486 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1204,6 +1204,12 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
                              const ToolChain &TC, const ArgList &Args,
                              bool ForceStaticHostRuntime, bool IsOffloadingHost,
                              bool GompNeedsRT) {
+  const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args);
+  if (SanArgs.needsOffloadKernels()) {
+    CmdArgs.push_back("-loffload.kernels");
+    CmdArgs.append({"-mllvm", "-enable-offload-sanitizer"});
+  }
+
   if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
                     options::OPT_fno_openmp, false)) {
     // We need libomptarget (liboffload) if it's the choosen offloading runtime.
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index c1e924145c931..be893e719ff0e 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -164,8 +164,9 @@ static cl::opt<bool>
                             cl::Hidden,
                             cl::desc("Enable inline deferral during PGO"));
 
-static cl::opt<bool> EnableGPUSan("enable-gpu-san", cl::init(false), cl::Hidden,
-                                  cl::desc("Enable gpu san"));
+static cl::opt<bool>
+    EnableOffloadSanitizer("enable-offload-sanitizer", cl::init(false),
+                           cl::Hidden, cl::desc("Enable offload sanitizer"));
 
 static cl::opt<bool> EnableModuleInliner("enable-module-inliner",
                                          cl::init(false), cl::Hidden,
@@ -2046,7 +2047,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 
   invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
 
-  if (EnableGPUSan)
+  if (EnableOffloadSanitizer)
     MPM.addPass(GPUSanPass());
 
   // Emit annotation remarks.
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index e3a4821b8226b..89ce01907deb7 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -620,11 +620,13 @@ struct OMPInformationCache : public InformationCache {
     // functions, except if `optnone` is present.
     if (isOpenMPDevice(M)) {
       for (Function &F : M) {
-        for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
-          if (F.hasFnAttribute(Attribute::NoInline) &&
-              F.getName().starts_with(Prefix) &&
-              !F.hasFnAttribute(Attribute::OptimizeNone))
-            F.removeFnAttr(Attribute::NoInline);
+        for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"}) {
+          if (!F.getName().starts_with(Prefix) ||
+              F.hasFnAttribute(Attribute::OptimizeNone))
+            continue;
+          F.removeFnAttr(Attribute::NoInline);
+          F.addFnAttr(Attribute::AlwaysInline);
+        }
       }
     }
 
diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 7c5718cbf12ca..cb44267dc8284 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -173,6 +173,7 @@ class GPUSanImpl final {
   FunctionCallee FreeNLocal;
 
   StringMap<Value *> GlobalStringMap;
+  DenseMap<Value *, Value *> PtrMap;
 };
 
 } // end anonymous namespace
@@ -430,7 +431,7 @@ bool GPUSanImpl::instrumentFunction(Function &Fn) {
       Changed = true;
       break;
     case Instruction::GetElementPtr:
-      instrumentGEPInst(LI, cast<GetElementPtrInst>(I));
+      // instrumentGEPInst(LI, cast<GetElementPtrInst>(I));
       Changed = true;
       break;
     case Instruction::Call:
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index cc4115f106b87..9c9a9ac64e4ea 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -38,7 +38,7 @@ template <AllocationKind AK> struct AllocationTracker {
   create(void *Start, uint64_t Length, int64_t AllocationId, uint64_t Slot,
          uint64_t PC) {
     if constexpr (SanitizerConfig<AK>::OFFSET_BITS < 64)
-      if (Length >= (1UL << (SanitizerConfig<AK>::OFFSET_BITS)))
+      if (OMP_UNLIKELY(Length >= (1UL << (SanitizerConfig<AK>::OFFSET_BITS))))
         __sanitizer_trap_info_ptr->exceedsAllocationLength<AK>(
             Start, Length, AllocationId, Slot, PC);
 
@@ -57,7 +57,7 @@ template <AllocationKind AK> struct AllocationTracker {
       Slot = ++Cnt;
 
     uint64_t NumSlots = SanitizerConfig<AK>::SLOTS;
-    if (Slot >= NumSlots)
+    if (OMP_UNLIKELY(Slot >= NumSlots))
       __sanitizer_trap_info_ptr->exceedsAllocationSlots<AK>(
           Start, Length, AllocationId, Slot, PC);
 
@@ -136,8 +136,9 @@ template <AllocationKind AK> struct AllocationTracker {
     auto &A = AllocArr.Arr[AP.AllocationId];
     int64_t Offset = AP.Offset;
     int64_t Length = A.Length;
-    if (Offset > Length - Size ||
-        (SanitizerConfig<AK>::useTags() && A.Tag != AP.AllocationTag)) {
+    if (OMP_UNLIKELY(
+            Offset > Length - Size ||
+            (SanitizerConfig<AK>::useTags() && A.Tag != AP.AllocationTag))) {
       if (AK == AllocationKind::LOCAL && Length == 0)
         __sanitizer_trap_info_ptr->useAfterScope<AK>(
             A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
@@ -205,7 +206,7 @@ template <AllocationKind AK> struct AllocationTracker {
     auto &AllocArr = Allocations[0];
     for (uint64_t Slot = 0; Slot < SanitizerConfig<AK>::SLOTS; ++Slot) {
       auto &A = AllocArr.Arr[Slot];
-      if (A.Length)
+      if (OMP_UNLIKELY(A.Length))
         __sanitizer_trap_info_ptr->memoryLeak<AK>(A, Slot);
     }
   }
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index c752ebf082865..8c20fabd9b2cb 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -150,7 +150,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation]] void
+  [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void
   allocationError(ErrorCodeTy EC, void *Start, uint64_t Length, int64_t Id,
                   int64_t Tag, uint64_t Slot, uint64_t PC) {
     AllocationStart = Start;
@@ -165,7 +165,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation]] void
+  [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void
   accessError(ErrorCodeTy EC, const AllocationTy<AK> &A,
               const AllocationPtrTy<AK> &AP, uint64_t Size, int64_t Id,
               uint64_t PC, const char *FunctionName, const char *FileName,
@@ -189,8 +189,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
-    gnu::always_inline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   exceedsAllocationLength(void *Start, uint64_t Length, int64_t AllocationId,
                           uint64_t Slot, uint64_t PC) {
     allocationError<AK>(ExceedsLength, Start, Length, AllocationId, /*Tag=*/0,
@@ -199,8 +198,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
-    gnu::always_inline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   exceedsAllocationSlots(void *Start, uint64_t Length, int64_t AllocationId,
                          uint64_t Slot, uint64_t PC) {
     allocationError<AK>(ExceedsSlots, Start, Length, AllocationId, /*Tag=*/0,
@@ -209,9 +207,8 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
-    gnu::always_inline]] void
-  outOfBoundAccess(const AllocationTy<AK> &A, const AllocationPtrTy<AK> &AP,
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  outOfBoundAccess(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
                    uint64_t Size, int64_t AccessId, uint64_t PC,
                    const char *FunctionName, const char *FileName,
                    uint64_t LineNo) {
@@ -221,9 +218,8 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
-    gnu::always_inline]] void
-  useAfterScope(const AllocationTy<AK> &A, const AllocationPtrTy<AK> &AP,
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  useAfterScope(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
                 uint64_t Size, int64_t AccessId, uint64_t PC,
                 const char *FunctionName, const char *FileName,
                 uint64_t LineNo) {
@@ -233,9 +229,8 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
-    gnu::always_inline]] void
-  useAfterFree(const AllocationTy<AK> &A, const AllocationPtrTy<AK> &AP,
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  useAfterFree(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
                uint64_t Size, int64_t AccessId, uint64_t PC,
                const char *FunctionName, const char *FileName,
                uint64_t LineNo) {
@@ -245,9 +240,8 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::flatten,
-    gnu::always_inline]] void
-  memoryLeak(const AllocationTy<AK> &A, uint64_t Slot) {
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  memoryLeak(const AllocationTy<AK> A, uint64_t Slot) {
     allocationError<AK>(MemoryLeak, A.Start, A.Length, A.Id, A.Tag, Slot,
                         /*PC=*/0);
     __builtin_trap();
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index 0f30f6028f103..a0250e522e46e 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -61,7 +61,7 @@ endforeach()
 target_compile_options(omptarget PRIVATE ${offload_compile_flags})
 target_link_options(omptarget PRIVATE ${offload_link_flags})
 
-add_llvm_library(offload.kernels
+add_llvm_library(clang_rt.offload_kernels
   STATIC
 
   Kernels/Sanitizer.cpp
@@ -75,8 +75,8 @@ add_llvm_library(offload.kernels
 )
 
 list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS)
-target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
-target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
+target_compile_options(clang_rt.offload_kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
+target_link_options(clang_rt.offload_kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
 
 # libomptarget.so needs to be aware of where the plugins live as they
 # are now separated in the build directory.
@@ -85,4 +85,4 @@ set_target_properties(omptarget PROPERTIES
                       INSTALL_RPATH "$ORIGIN"
                       BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
 install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
-install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
+install(TARGETS clang_rt.offload_kernels LIBRARY COMPONENT clang_rt.offload_kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
diff --git a/offload/test/sanitizer/volatile_stack_null.c b/offload/test/sanitizer/volatile_stack_null.c
index 4f66424ef5a0e..2de4024daaa24 100644
--- a/offload/test/sanitizer/volatile_stack_null.c
+++ b/offload/test/sanitizer/volatile_stack_null.c
@@ -1,5 +1,5 @@
 // clang-format off
-// RUN: %libomptarget-compileopt-generic -loffload.kernels -mllvm -enable-gpu-san
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
 // clang-format on
 

>From 0e806bce8c8011102dad2ac6ce1be472a64ec923 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Thu, 27 Jun 2024 19:18:49 -0700
Subject: [PATCH 11/31] AI: add ompx_get_allocation_info and ompx_check_with_base to GPUSan

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 112 +++++++++++--
 offload/DeviceRTL/src/Sanitizer.cpp           | 149 +++++++++---------
 offload/include/Shared/Sanitizer.h            |  57 +++++--
 offload/src/CMakeLists.txt                    |   8 +-
 4 files changed, 222 insertions(+), 104 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index cb44267dc8284..591eb5dc55182 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -96,7 +96,11 @@ class GPUSanImpl final {
   Value *getFunctionName(IRBuilder<> &IRB);
   Value *getFileName(IRBuilder<> &IRB);
   Value *getLineNo(IRBuilder<> &IRB);
-  PtrOrigin getPtrOrigin(LoopInfo &LI, Value *Ptr);
+
+  void getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object,
+                         Value *&Start, Value *&Length, Value *&Tag);
+  PtrOrigin getPtrOrigin(LoopInfo &LI, Value *Ptr,
+                         const Value **Object = nullptr);
 
   FunctionCallee getOrCreateFn(FunctionCallee &FC, StringRef Name, Type *RetTy,
                                ArrayRef<Type *> ArgTys) {
@@ -126,6 +130,19 @@ class GPUSanImpl final {
         CheckFn[PO], "ompx_check" + getSuffix(PO), PtrTy,
         {PtrTy, Int64Ty, Int64Ty, Int64Ty, PtrTy, PtrTy, Int64Ty});
   }
+  FunctionCallee getCheckWithBaseFn(PtrOrigin PO) {
+    assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling.");
+    return getOrCreateFn(CheckWithBaseFn[PO],
+                         "ompx_check_with_base" + getSuffix(PO), PtrTy,
+                         {PtrTy, PtrTy, Int64Ty, Int32Ty, Int64Ty, Int64Ty,
+                          Int64Ty, PtrTy, PtrTy, Int64Ty});
+  }
+  FunctionCallee getAllocationInfoFn(PtrOrigin PO) {
+    assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling.");
+    return getOrCreateFn(
+        AllocationInfoFn[PO], "ompx_get_allocation_info" + getSuffix(PO),
+        StructType::create({PtrTy, Int64Ty, Int32Ty}), {PtrTy});
+  }
   FunctionCallee getGEPFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(GEPFn[PO], "ompx_gep" + getSuffix(PO), PtrTy,
@@ -167,13 +184,20 @@ class GPUSanImpl final {
   FunctionCallee GEPFn[3];
   FunctionCallee FreeFn[3];
   FunctionCallee CheckFn[3];
+  FunctionCallee CheckWithBaseFn[3];
+  FunctionCallee AllocationInfoFn[3];
   FunctionCallee UnpackFn[3];
   FunctionCallee LifetimeEndFn;
   FunctionCallee LifetimeStartFn;
   FunctionCallee FreeNLocal;
 
   StringMap<Value *> GlobalStringMap;
-  DenseMap<Value *, Value *> PtrMap;
+  struct AllocationInfoTy {
+    Value *Start;
+    Value *Length;
+    Value *Tag;
+  };
+  DenseMap<std::pair<Function *, Value *>, AllocationInfoTy> AllocationInfoMap;
 };
 
 } // end anonymous namespace
@@ -219,9 +243,32 @@ Value *GPUSanImpl::getLineNo(IRBuilder<> &IRB) {
   return ConstantInt::get(Int64Ty, DLoc.getLine());
 }
 
-PtrOrigin GPUSanImpl::getPtrOrigin(LoopInfo &LI, Value *Ptr) {
+void GPUSanImpl::getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object,
+                                   Value *&Start, Value *&Length, Value *&Tag) {
+  auto &It = AllocationInfoMap[{&Fn, &Object}];
+  if (!It.Start) {
+    auto *IP = dyn_cast<Instruction>(&Object);
+    if (IP)
+      IP = IP->getNextNode();
+    else
+      IP = &*Fn.getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
+    IRBuilder<> IRB(IP);
+    auto *CB = IRB.CreateCall(getAllocationInfoFn(PO), {&Object});
+    It.Start = IRB.CreateExtractValue(CB, {0});
+    It.Length = IRB.CreateExtractValue(CB, {1});
+    It.Tag = IRB.CreateExtractValue(CB, {2});
+  }
+  Start = It.Start;
+  Length = It.Length;
+  Tag = It.Tag;
+}
+
+PtrOrigin GPUSanImpl::getPtrOrigin(LoopInfo &LI, Value *Ptr,
+                                   const Value **Object) {
   SmallVector<const Value *> Objects;
   getUnderlyingObjects(Ptr, Objects, &LI);
+  if (Object && Objects.size() == 1)
+    *Object = Objects.front();
   PtrOrigin PO = NONE;
   for (auto *Obj : Objects) {
     PtrOrigin ObjPO = HasAllocas ? UNKNOWN : GLOBAL;
@@ -323,10 +370,18 @@ Value *GPUSanImpl::instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI) {
 void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
                                   Type &AccessTy, bool IsRead) {
   Value *PtrOp = I.getOperand(PtrIdx);
-  PtrOrigin PO = getPtrOrigin(LI, PtrOp);
+  const Value *Object = nullptr;
+  PtrOrigin PO = getPtrOrigin(LI, PtrOp, &Object);
   if (PO > GLOBAL)
     return;
 
+  Value *Start = nullptr;
+  Value *Length = nullptr;
+  Value *Tag = nullptr;
+  if (PO != UNKNOWN)
+    getAllocationInfo(*I.getFunction(), PO, *const_cast<Value *>(Object), Start,
+                      Length, Tag);
+
   static int32_t ReadAccessId = -1;
   static int32_t WriteAccessId = 1;
   const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++;
@@ -336,11 +391,21 @@ void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
   Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue());
   IRBuilder<> IRB(&I);
   Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
-  auto *CB = IRB.CreateCall(
-      getCheckFn(PO),
-      {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId), getPC(IRB),
-       getFunctionName(IRB), getFileName(IRB), getLineNo(IRB)},
-      I.getName() + ".san");
+  CallInst *CB;
+  if (Start) {
+    CB =
+        IRB.CreateCall(getCheckWithBaseFn(PO),
+                       {PlainPtrOp, Start, Length, Tag, Size,
+                        ConstantInt::get(Int64Ty, AccessId), getPC(IRB),
+                        getFunctionName(IRB), getFileName(IRB), getLineNo(IRB)},
+                       I.getName() + ".san");
+  } else {
+    CB = IRB.CreateCall(getCheckFn(PO),
+                        {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId),
+                         getPC(IRB), getFunctionName(IRB), getFileName(IRB),
+                         getLineNo(IRB)},
+                        I.getName() + ".san");
+  }
   I.setOperand(PtrIdx,
                IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType()));
 }
@@ -409,33 +474,38 @@ bool GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) {
 bool GPUSanImpl::instrumentFunction(Function &Fn) {
   if (Fn.isDeclaration())
     return false;
+
   bool Changed = false;
   LoopInfo &LI = FAM.getResult<LoopAnalysis>(Fn);
   SmallVector<std::pair<AllocaInst *, Value *>> Allocas;
   SmallVector<ReturnInst *> Returns;
+  SmallVector<LoadInst *> Loads;
+  SmallVector<StoreInst *> Stores;
+  SmallVector<CallInst *> Calls;
+  SmallVector<GetElementPtrInst *> GEPs;
+
   for (auto &I : instructions(Fn)) {
     switch (I.getOpcode()) {
     case Instruction::Alloca: {
       AllocaInst &AI = cast<AllocaInst>(I);
-      Value *FakePtr = instrumentAllocaInst(LI, AI);
-      Allocas.push_back({&AI, FakePtr});
+      Allocas.push_back({&AI, nullptr});
       Changed = true;
       break;
     }
     case Instruction::Load:
-      instrumentLoadInst(LI, cast<LoadInst>(I));
+      Loads.push_back(&cast<LoadInst>(I));
       Changed = true;
       break;
     case Instruction::Store:
-      instrumentStoreInst(LI, cast<StoreInst>(I));
+      Stores.push_back(&cast<StoreInst>(I));
       Changed = true;
       break;
     case Instruction::GetElementPtr:
-      // instrumentGEPInst(LI, cast<GetElementPtrInst>(I));
+      GEPs.push_back(&cast<GetElementPtrInst>(I));
       Changed = true;
       break;
     case Instruction::Call:
-      Changed = instrumentCallInst(LI, cast<CallInst>(I));
+      Calls.push_back(&cast<CallInst>(I));
       break;
     case Instruction::Ret:
       Returns.push_back(&cast<ReturnInst>(I));
@@ -445,6 +515,17 @@ bool GPUSanImpl::instrumentFunction(Function &Fn) {
     }
   }
 
+  for (auto &It : Allocas)
+    It.second = instrumentAllocaInst(LI, *It.first);
+  for (auto *Load : Loads)
+    instrumentLoadInst(LI, *Load);
+  for (auto *Store : Stores)
+    instrumentStoreInst(LI, *Store);
+  for (auto *GEP : GEPs)
+    instrumentGEPInst(LI, *GEP);
+  for (auto *Call : Calls)
+    Changed |= instrumentCallInst(LI, *Call);
+
   instrumentReturns(Allocas, Returns);
 
   return Changed;
@@ -472,6 +553,7 @@ bool GPUSanImpl::instrument() {
     return false;
   }();
 
+  M.dump();
   for (Function &Fn : M)
     if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") &&
         !Fn.getName().starts_with("rpc_"))
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index 9c9a9ac64e4ea..7905b55efe450 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -22,6 +22,12 @@ using namespace utils;
 
 #include "Shared/Sanitizer.h"
 
+struct AllocationInfoTy {
+  void *Start;
+  uint64_t Length;
+  uint32_t Tag;
+};
+
 [[gnu::used, gnu::retain, gnu::weak,
   gnu::visibility("protected")]] SanitizerTrapInfoTy *__sanitizer_trap_info_ptr;
 
@@ -31,8 +37,14 @@ template <AllocationKind AK> struct AllocationTracker {
   static_assert(sizeof(AllocationPtrTy<AK>) == sizeof(void *),
                 "AllocationTy pointers should be pointer sized");
 
-  static AllocationArrayTy<AK>
-      Allocations[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
+  [[clang::disable_sanitizer_instrumentation]] static struct AllocationInfoTy
+  getAllocationInfo(void *P) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    uint32_t AllocationId = AP.AllocationId;
+    auto &AllocArr = getAllocationArray<AK>();
+    auto &A = AllocArr.Arr[AllocationId];
+    return {A.Start, A.Length, (uint32_t)A.Tag};
+  }
 
   [[clang::disable_sanitizer_instrumentation]] static void *
   create(void *Start, uint64_t Length, int64_t AllocationId, uint64_t Slot,
@@ -42,16 +54,8 @@ template <AllocationKind AK> struct AllocationTracker {
         __sanitizer_trap_info_ptr->exceedsAllocationLength<AK>(
             Start, Length, AllocationId, Slot, PC);
 
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-
     // Reserve the 0 element for the null pointer in global space.
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &AllocArr = getAllocationArray<AK>();
     auto &Cnt = AllocArr.Cnt;
     if constexpr (AK == AllocationKind::LOCAL)
       Slot = ++Cnt;
@@ -81,15 +85,7 @@ template <AllocationKind AK> struct AllocationTracker {
                                                                   uint64_t PC) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
     uint32_t AllocationId = AP.AllocationId;
-
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &AllocArr = getAllocationArray<AK>();
     auto &A = AllocArr.Arr[AllocationId];
     A.Length = 0;
 
@@ -102,11 +98,7 @@ template <AllocationKind AK> struct AllocationTracker {
 
   [[clang::disable_sanitizer_instrumentation]] static void remove_n(int32_t N) {
     static_assert(AK == AllocationKind::LOCAL, "");
-    uint32_t ThreadId = __kmpc_get_hardware_thread_id_in_block();
-    uint32_t BlockId = ompx_block_id(0);
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &AllocArr = getAllocationArray<AK>();
     auto &Cnt = AllocArr.Cnt;
     for (int32_t I = 0; I < N; ++I) {
       auto &A = AllocArr.Arr[Cnt--];
@@ -122,47 +114,35 @@ template <AllocationKind AK> struct AllocationTracker {
   }
 
   [[clang::disable_sanitizer_instrumentation]] static void *
-  check(void *P, int64_t Size, int64_t AccessId, uint64_t PC,
-        const char *FunctionName, const char *FileName, uint64_t LineNo) {
+  checkWithBase(void *P, void *Start, int64_t Length, uint32_t Tag,
+                int64_t Size, int64_t AccessId, uint64_t PC,
+                const char *FunctionName, const char *FileName,
+                uint64_t LineNo) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
-    auto &A = AllocArr.Arr[AP.AllocationId];
     int64_t Offset = AP.Offset;
-    int64_t Length = A.Length;
     if (OMP_UNLIKELY(
             Offset > Length - Size ||
-            (SanitizerConfig<AK>::useTags() && A.Tag != AP.AllocationTag))) {
-      if (AK == AllocationKind::LOCAL && Length == 0)
-        __sanitizer_trap_info_ptr->useAfterScope<AK>(
-            A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
-      else if (Offset > Length - Size)
-        __sanitizer_trap_info_ptr->outOfBoundAccess<AK>(
-            A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
-      else
-        __sanitizer_trap_info_ptr->useAfterFree<AK>(
-            A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
+            (SanitizerConfig<AK>::useTags() && Tag != AP.AllocationTag))) {
+      __sanitizer_trap_info_ptr->accessError(AP, Size, AccessId, PC,
+                                             FunctionName, FileName, LineNo);
     }
-    return utils::advancePtr(A.Start, Offset);
+    return utils::advancePtr(Start, Offset);
+  }
+
+  [[clang::disable_sanitizer_instrumentation]] static void *
+  check(void *P, int64_t Size, int64_t AccessId, uint64_t PC,
+        const char *FunctionName, const char *FileName, uint64_t LineNo) {
+    AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    auto &AllocArr = getAllocationArray<AK>();
+    auto &Alloc = AllocArr.Arr[AP.AllocationId];
+    return checkWithBase(P, Alloc.Start, Alloc.Length, Alloc.Tag, Size,
+                         AccessId, PC, FunctionName, FileName, LineNo);
   }
 
   [[clang::disable_sanitizer_instrumentation]] static void *
   unpack(void *P, uint64_t PC = 0) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &AllocArr = getAllocationArray<AK>();
     auto &A = AllocArr.Arr[AP.AllocationId];
     uint64_t Offset = AP.Offset;
     void *Ptr = utils::advancePtr(A.Start, Offset);
@@ -172,14 +152,7 @@ template <AllocationKind AK> struct AllocationTracker {
   [[clang::disable_sanitizer_instrumentation]] static void
   lifetimeStart(void *P, uint64_t Length) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &AllocArr = getAllocationArray<AK>();
     auto &A = AllocArr.Arr[AP.AllocationId];
     // TODO: Check length
     A.Length = Length;
@@ -188,14 +161,7 @@ template <AllocationKind AK> struct AllocationTracker {
   [[clang::disable_sanitizer_instrumentation]] static void
   lifetimeEnd(void *P, uint64_t Length) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    uint32_t ThreadId = 0, BlockId = 0;
-    if constexpr (AK == AllocationKind::LOCAL) {
-      ThreadId = __kmpc_get_hardware_thread_id_in_block();
-      BlockId = ompx_block_id(0);
-    }
-    auto &AllocArr =
-        Allocations[ThreadId +
-                    BlockId * __kmpc_get_hardware_num_threads_in_block()];
+    auto &AllocArr = getAllocationArray<AK>();
     auto &A = AllocArr.Arr[AP.AllocationId];
     // TODO: Check length
     A.Length = 0;
@@ -203,7 +169,7 @@ template <AllocationKind AK> struct AllocationTracker {
 
   [[clang::disable_sanitizer_instrumentation]] static void leakCheck() {
     static_assert(AK == AllocationKind::GLOBAL, "");
-    auto &AllocArr = Allocations[0];
+    auto &AllocArr = getAllocationArray<AK>();
     for (uint64_t Slot = 0; Slot < SanitizerConfig<AK>::SLOTS; ++Slot) {
       auto &A = AllocArr.Arr[Slot];
       if (OMP_UNLIKELY(A.Length))
@@ -213,8 +179,8 @@ template <AllocationKind AK> struct AllocationTracker {
 };
 
 template <AllocationKind AK>
-AllocationArrayTy<AK> AllocationTracker<
-    AK>::Allocations[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
+AllocationArrayTy<AK>
+    Allocations<AK>::Arr[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
 
 extern "C" {
 
@@ -310,6 +276,28 @@ ompx_check_global(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
       P, Size, AccessId, PC, FunctionName, FileName, LineNo);
 }
 
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_check_with_base_local(void *P, void *Start, uint64_t Length, uint32_t Tag,
+                           uint64_t Size, uint64_t AccessId, uint64_t PC,
+                           const char *FunctionName, const char *FileName,
+                           uint64_t LineNo) {
+  return AllocationTracker<AllocationKind::LOCAL>::checkWithBase(
+      P, Start, Length, Tag, Size, AccessId, PC, FunctionName, FileName,
+      LineNo);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void *
+ompx_check_with_base_global(void *P, void *Start, uint64_t Length, uint32_t Tag,
+                            uint64_t Size, uint64_t AccessId, uint64_t PC,
+                            const char *FunctionName, const char *FileName,
+                            uint64_t LineNo) {
+  return AllocationTracker<AllocationKind::GLOBAL>::checkWithBase(
+      P, Start, Length, Tag, Size, AccessId, PC, FunctionName, FileName,
+      LineNo);
+}
+
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
 ompx_unpack(void *P, uint64_t PC) {
@@ -337,6 +325,17 @@ ompx_lifetime_end(void *P, uint64_t Length) {
   AllocationTracker<AllocationKind::LOCAL>::lifetimeEnd(P, Length);
 }
 
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] struct AllocationInfoTy
+ompx_get_allocation_info_local(void *P) {
+  return AllocationTracker<AllocationKind::LOCAL>::getAllocationInfo(P);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] struct AllocationInfoTy
+ompx_get_allocation_info_global(void *P) {
+  return AllocationTracker<AllocationKind::GLOBAL>::getAllocationInfo(P);
+}
+
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
 ompx_leak_check() {
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index 8c20fabd9b2cb..40650013b537c 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -15,6 +15,7 @@
 #include "Utils.h"
 
 extern "C" int ompx_block_id(int Dim);
+extern "C" int ompx_block_dim(int Dim);
 extern "C" int ompx_thread_id(int Dim);
 
 enum class AllocationKind { GLOBAL, LOCAL, LAST = LOCAL };
@@ -82,6 +83,23 @@ static inline void *__offload_get_new_sanitizer_ptr(int32_t Slot) {
   return AP;
 }
 
+template <AllocationKind AK> struct Allocations {
+  static AllocationArrayTy<AK> Arr[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
+};
+
+template <AllocationKind AK>
+[[clang::disable_sanitizer_instrumentation,
+  gnu::always_inline]] AllocationArrayTy<AK> &
+getAllocationArray() {
+  uint32_t ThreadId = 0, BlockId = 0;
+  if constexpr (AK == AllocationKind::LOCAL) {
+    ThreadId = ompx_thread_id(0);
+    BlockId = ompx_block_id(0);
+  }
+  auto &AllocArr = Allocations<AK>::Arr[ThreadId + BlockId * ompx_block_dim(0)];
+  return AllocArr;
+}
+
 struct SanitizerTrapInfoTy {
   /// AllocationTy
   /// {
@@ -166,10 +184,10 @@ struct SanitizerTrapInfoTy {
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void
-  accessError(ErrorCodeTy EC, const AllocationTy<AK> &A,
-              const AllocationPtrTy<AK> &AP, uint64_t Size, int64_t Id,
-              uint64_t PC, const char *FunctionName, const char *FileName,
-              uint64_t LineNo) {
+  propagateAccessError(ErrorCodeTy EC, const AllocationTy<AK> &A,
+                       const AllocationPtrTy<AK> &AP, uint64_t Size, int64_t Id,
+                       uint64_t PC, const char *FunctionName,
+                       const char *FileName, uint64_t LineNo) {
     AllocationStart = A.Start;
     AllocationLength = A.Length;
     AllocationId = A.Id;
@@ -212,8 +230,8 @@ struct SanitizerTrapInfoTy {
                    uint64_t Size, int64_t AccessId, uint64_t PC,
                    const char *FunctionName, const char *FileName,
                    uint64_t LineNo) {
-    accessError(OutOfBounds, A, AP, Size, AccessId, PC, FunctionName, FileName,
-                LineNo);
+    propagateAccessError(OutOfBounds, A, AP, Size, AccessId, PC, FunctionName,
+                         FileName, LineNo);
     __builtin_trap();
   }
 
@@ -223,8 +241,8 @@ struct SanitizerTrapInfoTy {
                 uint64_t Size, int64_t AccessId, uint64_t PC,
                 const char *FunctionName, const char *FileName,
                 uint64_t LineNo) {
-    accessError(UseAfterScope, A, AP, Size, AccessId, PC, FunctionName,
-                FileName, LineNo);
+    propagateAccessError(UseAfterScope, A, AP, Size, AccessId, PC, FunctionName,
+                         FileName, LineNo);
     __builtin_trap();
   }
 
@@ -234,11 +252,30 @@ struct SanitizerTrapInfoTy {
                uint64_t Size, int64_t AccessId, uint64_t PC,
                const char *FunctionName, const char *FileName,
                uint64_t LineNo) {
-    accessError(UseAfterFree, A, AP, Size, AccessId, PC, FunctionName, FileName,
-                LineNo);
+    propagateAccessError(UseAfterFree, A, AP, Size, AccessId, PC, FunctionName,
+                         FileName, LineNo);
     __builtin_trap();
   }
 
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  accessError(const AllocationPtrTy<AK> AP, int64_t Size, int64_t AccessId,
+              uint64_t PC, const char *FunctionName, const char *FileName,
+              uint64_t LineNo) {
+    auto &A = getAllocationArray<AK>().Arr[AP.AllocationId];
+    int64_t Offset = AP.Offset;
+    int64_t Length = A.Length;
+    if (AK == AllocationKind::LOCAL && Length == 0)
+      useAfterScope<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
+                        LineNo);
+    else if (Offset > Length - Size)
+      outOfBoundAccess<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
+                           LineNo);
+    else
+      useAfterFree<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
+                       LineNo);
+  }
+
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   memoryLeak(const AllocationTy<AK> A, uint64_t Slot) {
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index a0250e522e46e..0f30f6028f103 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -61,7 +61,7 @@ endforeach()
 target_compile_options(omptarget PRIVATE ${offload_compile_flags})
 target_link_options(omptarget PRIVATE ${offload_link_flags})
 
-add_llvm_library(clang_rt.offload_kernels
+add_llvm_library(offload.kernels
   STATIC
 
   Kernels/Sanitizer.cpp
@@ -75,8 +75,8 @@ add_llvm_library(clang_rt.offload_kernels
 )
 
 list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS)
-target_compile_options(clang_rt.offload_kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
-target_link_options(clang_rt.offload_kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
+target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
+target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -fopenmp-target-jit -foffload-via-llvm )
 
 # libomptarget.so needs to be aware of where the plugins live as they
 # are now separated in the build directory.
@@ -85,4 +85,4 @@ set_target_properties(omptarget PROPERTIES
                       INSTALL_RPATH "$ORIGIN"
                       BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
 install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
-install(TARGETS clang_rt.offload_kernels LIBRARY COMPONENT clang_rt.offload_kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
+install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")

>From 6ac086a60e877b90405cac2fd908e7d2d615d006 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Thu, 27 Jun 2024 20:45:25 -0700
Subject: [PATCH 12/31] WIP

---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   6 +
 .../Transforms/IPO/AttributorAttributes.cpp   |   7 +-
 .../lib/Transforms/Instrumentation/GPUSan.cpp |  65 +++--
 offload/DeviceRTL/src/Sanitizer.cpp           | 269 ++++++++++--------
 offload/include/Shared/Sanitizer.h            | 178 +++++++-----
 offload/include/Shared/Utils.h                |   5 +-
 .../common/src/PluginInterface.cpp            |  51 ++--
 offload/test/sanitizer/volatile_stack_null.c  |   9 +-
 8 files changed, 356 insertions(+), 234 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index be893e719ff0e..608bfa81b579b 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1755,6 +1755,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
     // in ICP.
     MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
 
+    if (EnableOffloadSanitizer)
+      MPM.addPass(GPUSanPass());
+
     invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
 
     // Emit annotation remarks.
@@ -1833,6 +1836,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
     // pipeline).
     MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
 
+    if (EnableOffloadSanitizer)
+      MPM.addPass(GPUSanPass());
+
     invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
 
     // Emit annotation remarks.
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 05a38cdd7d7b6..ebdc2c3682aab 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -7536,7 +7536,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
     // Collect all tail calls in the function as we cannot allow new allocas to
     // escape into tail recursion.
     // TODO: Be smarter about new allocas escaping into tail calls.
-    SmallVector<CallInst *, 16> TailCalls;
+    SmallVector<WeakTrackingVH, 16> TailCalls;
     bool UsedAssumedInformation = false;
     if (!A.checkForAllInstructions(
             [&](Instruction &I) {
@@ -7574,8 +7574,9 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
                 AI, Arg->getType(), "", IP);
           Arg->replaceAllUsesWith(AI);
 
-          for (CallInst *CI : TailCalls)
-            CI->setTailCall(false);
+          for (auto &CI : TailCalls)
+            if (CI)
+              cast<CallInst>(CI)->setTailCall(false);
         };
 
     // Callback to repair a call site of the associated function. The elements
diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 591eb5dc55182..d88a9b3de8b07 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -80,7 +80,8 @@ class GPUSanImpl final {
 private:
   bool instrumentGlobals();
   bool instrumentFunction(Function &Fn);
-  Value *instrumentAllocation(Instruction &I, Value &Size, FunctionCallee Fn);
+  Value *instrumentAllocation(Instruction &I, Value &Size, FunctionCallee Fn,
+                              PtrOrigin PO);
   Value *instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI);
   void instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
                         Type &AccessTy, bool IsRead);
@@ -111,15 +112,21 @@ class GPUSanImpl final {
     return FC;
   }
 
+  PointerType *getPtrTy(PtrOrigin PO) {
+    if (PO == PtrOrigin::LOCAL)
+      return PointerType::get(Ctx, 5);
+    return PtrTy;
+  }
+
   FunctionCallee getNewFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
-    return getOrCreateFn(NewFn[PO], "ompx_new" + getSuffix(PO), PtrTy,
-                         {PtrTy, Int64Ty, Int64Ty, Int64Ty});
+    return getOrCreateFn(NewFn[PO], "ompx_new" + getSuffix(PO), getPtrTy(PO),
+                         {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty});
   }
   FunctionCallee getFreeFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(FreeFn[PO], "ompx_free" + getSuffix(PO), VoidTy,
-                         {PtrTy, Int64Ty});
+                         {getPtrTy(PO), Int64Ty});
   }
   FunctionCallee getFreeNLocalFn() {
     return getOrCreateFn(FreeNLocal, "ompx_free_local_n", VoidTy, {Int32Ty});
@@ -127,39 +134,39 @@ class GPUSanImpl final {
   FunctionCallee getCheckFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(
-        CheckFn[PO], "ompx_check" + getSuffix(PO), PtrTy,
-        {PtrTy, Int64Ty, Int64Ty, Int64Ty, PtrTy, PtrTy, Int64Ty});
+        CheckFn[PO], "ompx_check" + getSuffix(PO), getPtrTy(PO),
+        {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty, PtrTy, PtrTy, Int64Ty});
   }
   FunctionCallee getCheckWithBaseFn(PtrOrigin PO) {
     assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(CheckWithBaseFn[PO],
-                         "ompx_check_with_base" + getSuffix(PO), PtrTy,
-                         {PtrTy, PtrTy, Int64Ty, Int32Ty, Int64Ty, Int64Ty,
-                          Int64Ty, PtrTy, PtrTy, Int64Ty});
+                         "ompx_check_with_base" + getSuffix(PO), getPtrTy(PO),
+                         {getPtrTy(PO), getPtrTy(PO), Int64Ty, Int32Ty, Int64Ty,
+                          Int64Ty, Int64Ty, PtrTy, PtrTy, Int64Ty});
   }
   FunctionCallee getAllocationInfoFn(PtrOrigin PO) {
     assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(
         AllocationInfoFn[PO], "ompx_get_allocation_info" + getSuffix(PO),
-        StructType::create({PtrTy, Int64Ty, Int32Ty}), {PtrTy});
+        StructType::create({getPtrTy(PO), Int64Ty, Int32Ty}), {getPtrTy(PO)});
   }
   FunctionCallee getGEPFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
-    return getOrCreateFn(GEPFn[PO], "ompx_gep" + getSuffix(PO), PtrTy,
-                         {PtrTy, Int64Ty, Int64Ty});
+    return getOrCreateFn(GEPFn[PO], "ompx_gep" + getSuffix(PO), getPtrTy(PO),
+                         {getPtrTy(PO), Int64Ty, Int64Ty});
   }
   FunctionCallee getUnpackFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
-    return getOrCreateFn(UnpackFn[PO], "ompx_unpack" + getSuffix(PO), PtrTy,
-                         {PtrTy, Int64Ty});
+    return getOrCreateFn(UnpackFn[PO], "ompx_unpack" + getSuffix(PO),
+                         getPtrTy(PO), {getPtrTy(PO), Int64Ty});
   }
   FunctionCallee getLifetimeStart() {
     return getOrCreateFn(LifetimeStartFn, "ompx_lifetime_start", VoidTy,
-                         {PtrTy, Int64Ty});
+                         {getPtrTy(LOCAL), Int64Ty});
   }
   FunctionCallee getLifetimeEnd() {
     return getOrCreateFn(LifetimeEndFn, "ompx_lifetime_end", VoidTy,
-                         {PtrTy, Int64Ty});
+                         {getPtrTy(LOCAL), Int64Ty});
   }
   FunctionCallee getLeakCheckFn() {
     FunctionCallee LeakCheckFn;
@@ -253,7 +260,8 @@ void GPUSanImpl::getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object,
     else
       IP = &*Fn.getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
     IRBuilder<> IRB(IP);
-    auto *CB = IRB.CreateCall(getAllocationInfoFn(PO), {&Object});
+    auto *CB = IRB.CreateCall(getAllocationInfoFn(PO),
+                              {IRB.CreateAddrSpaceCast(&Object, getPtrTy(PO))});
     It.Start = IRB.CreateExtractValue(CB, {0});
     It.Length = IRB.CreateExtractValue(CB, {1});
     It.Tag = IRB.CreateExtractValue(CB, {2});
@@ -326,9 +334,9 @@ bool GPUSanImpl::instrumentGlobals() {
 }
 
 Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size,
-                                        FunctionCallee Fn) {
+                                        FunctionCallee Fn, PtrOrigin PO) {
   IRBuilder<> IRB(I.getNextNode());
-  Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, PtrTy);
+  Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, getPtrTy(PO));
   static int AllocationId = 1;
   auto *CB = IRB.CreateCall(
       Fn,
@@ -364,7 +372,7 @@ Value *GPUSanImpl::instrumentAllocaInst(LoopInfo &LI, AllocaInst &AI) {
   if (!SizeOrNone)
     llvm_unreachable("TODO");
   Value *Size = ConstantInt::get(Int64Ty, *SizeOrNone);
-  return instrumentAllocation(AI, *Size, getNewFn(LOCAL));
+  return instrumentAllocation(AI, *Size, getNewFn(LOCAL), LOCAL);
 }
 
 void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
@@ -378,7 +386,7 @@ void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
   Value *Start = nullptr;
   Value *Length = nullptr;
   Value *Tag = nullptr;
-  if (PO != UNKNOWN)
+  if (PO != UNKNOWN && Object)
     getAllocationInfo(*I.getFunction(), PO, *const_cast<Value *>(Object), Start,
                       Length, Tag);
 
@@ -390,7 +398,8 @@ void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
   assert(!TySize.isScalable());
   Value *Size = ConstantInt::get(Int64Ty, TySize.getFixedValue());
   IRBuilder<> IRB(&I);
-  Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
+  Value *PlainPtrOp =
+      IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
   CallInst *CB;
   if (Start) {
     CB =
@@ -430,7 +439,8 @@ void GPUSanImpl::instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP) {
   GEP.setOperand(GetElementPtrInst::getPointerOperandIndex(),
                  Constant::getNullValue(PtrOp->getType()));
   IRBuilder<> IRB(GEP.getNextNode());
-  Value *PlainPtrOp = IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, PtrTy);
+  Value *PlainPtrOp =
+      IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
   auto *CB = IRB.CreateCall(getGEPFn(PO),
                             {PlainPtrOp, UndefValue::get(Int64Ty), getPC(IRB)},
                             GEP.getName() + ".san");
@@ -459,7 +469,8 @@ bool GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) {
         PtrOrigin PO = getPtrOrigin(LI, Op);
         if (PO > GLOBAL)
           continue;
-        Value *PlainOp = IRB.CreatePointerBitCastOrAddrSpaceCast(Op, PtrTy);
+        Value *PlainOp =
+            IRB.CreatePointerBitCastOrAddrSpaceCast(Op, getPtrTy(PO));
         auto *CB = IRB.CreateCall(getUnpackFn(PO), {PlainOp, getPC(IRB)},
                                   Op->getName() + ".unpack");
         CI.setArgOperand(
@@ -515,8 +526,6 @@ bool GPUSanImpl::instrumentFunction(Function &Fn) {
     }
   }
 
-  for (auto &It : Allocas)
-    It.second = instrumentAllocaInst(LI, *It.first);
   for (auto *Load : Loads)
     instrumentLoadInst(LI, *Load);
   for (auto *Store : Stores)
@@ -525,6 +534,8 @@ bool GPUSanImpl::instrumentFunction(Function &Fn) {
     instrumentGEPInst(LI, *GEP);
   for (auto *Call : Calls)
     Changed |= instrumentCallInst(LI, *Call);
+  for (auto &It : Allocas)
+    It.second = instrumentAllocaInst(LI, *It.first);
 
   instrumentReturns(Allocas, Returns);
 
@@ -553,12 +564,12 @@ bool GPUSanImpl::instrument() {
     return false;
   }();
 
-  M.dump();
   for (Function &Fn : M)
     if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") &&
         !Fn.getName().starts_with("rpc_"))
       if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
         Changed |= instrumentFunction(Fn);
+
   return Changed;
 }
 
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index 7905b55efe450..7c9bb1a085b24 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -22,33 +22,46 @@ using namespace utils;
 
 #include "Shared/Sanitizer.h"
 
-struct AllocationInfoTy {
-  void *Start;
+struct AllocationInfoLocalTy {
+  _AS_PTR(void, AllocationKind::LOCAL) Start;
+  uint64_t Length;
+  uint32_t Tag;
+};
+struct AllocationInfoGlobalTy {
+  _AS_PTR(void, AllocationKind::GLOBAL) Start;
   uint64_t Length;
   uint32_t Tag;
 };
 
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] SanitizerTrapInfoTy *__sanitizer_trap_info_ptr;
+template <AllocationKind AK> struct AllocationInfoTy {};
+template <> struct AllocationInfoTy<AllocationKind::GLOBAL> {
+  using ASVoidPtrTy = AllocationInfoGlobalTy;
+};
+template <> struct AllocationInfoTy<AllocationKind::LOCAL> {
+  using ASVoidPtrTy = AllocationInfoLocalTy;
+};
 
 template <AllocationKind AK> struct AllocationTracker {
-  static_assert(sizeof(AllocationTy<AK>) == sizeof(void *) * 2,
+  static_assert(sizeof(AllocationTy<AK>) == sizeof(_AS_PTR(void, AK)) * 2,
                 "AllocationTy should not exceed two pointers");
-  static_assert(sizeof(AllocationPtrTy<AK>) == sizeof(void *),
-                "AllocationTy pointers should be pointer sized");
+  //  static_assert(sizeof(AllocationPtrTy<AK>) * 8 ==
+  //                    SanitizerConfig<AK>::ADDR_SPACE_PTR_SIZE,
+  //                "AllocationTy pointers should be pointer sized");
 
-  [[clang::disable_sanitizer_instrumentation]] static struct AllocationInfoTy
-  getAllocationInfo(void *P) {
+  [[clang::disable_sanitizer_instrumentation]] static
+      typename AllocationInfoTy<AK>::ASVoidPtrTy
+      getAllocationInfo(_AS_PTR(void, AK) P) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
     uint32_t AllocationId = AP.AllocationId;
-    auto &AllocArr = getAllocationArray<AK>();
-    auto &A = AllocArr.Arr[AllocationId];
+    if (OMP_UNLIKELY(AllocationId >= SanitizerConfig<AK>::SLOTS))
+      return {P, 0, (uint32_t)-1};
+    auto &A = getAllocation<AK>(AP);
     return {A.Start, A.Length, (uint32_t)A.Tag};
   }
 
-  [[clang::disable_sanitizer_instrumentation]] static void *
-  create(void *Start, uint64_t Length, int64_t AllocationId, uint64_t Slot,
-         uint64_t PC) {
+  [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
+      create(_AS_PTR(void, AK) Start, uint64_t Length, int64_t AllocationId,
+             uint64_t Slot, uint64_t PC) {
     if constexpr (SanitizerConfig<AK>::OFFSET_BITS < 64)
       if (OMP_UNLIKELY(Length >= (1UL << (SanitizerConfig<AK>::OFFSET_BITS))))
         __sanitizer_trap_info_ptr->exceedsAllocationLength<AK>(
@@ -81,10 +94,10 @@ template <AllocationKind AK> struct AllocationTracker {
     return AP;
   }
 
-  [[clang::disable_sanitizer_instrumentation]] static void remove(void *P,
-                                                                  uint64_t PC) {
+  [[clang::disable_sanitizer_instrumentation]] static void
+  remove(_AS_PTR(void, AK) P, uint64_t PC) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    uint32_t AllocationId = AP.AllocationId;
+    uint64_t AllocationId = AP.AllocationId;
     auto &AllocArr = getAllocationArray<AK>();
     auto &A = AllocArr.Arr[AllocationId];
     A.Length = 0;
@@ -106,19 +119,22 @@ template <AllocationKind AK> struct AllocationTracker {
     }
   }
 
-  [[clang::disable_sanitizer_instrumentation]] static void *
-  advance(void *P, uint64_t Offset, uint64_t PC) {
+  [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
+      advance(_AS_PTR(void, AK) P, uint64_t Offset, uint64_t PC) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
     AP.Offset += Offset;
     return AP;
   }
 
-  [[clang::disable_sanitizer_instrumentation]] static void *
-  checkWithBase(void *P, void *Start, int64_t Length, uint32_t Tag,
-                int64_t Size, int64_t AccessId, uint64_t PC,
-                const char *FunctionName, const char *FileName,
-                uint64_t LineNo) {
+  [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
+      checkWithBase(_AS_PTR(void, AK) P, _AS_PTR(void, AK) Start,
+                    int64_t Length, uint32_t Tag, int64_t Size,
+                    int64_t AccessId, uint64_t PC, const char *FunctionName,
+                    const char *FileName, uint64_t LineNo) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
+    if constexpr (AK == AllocationKind::LOCAL)
+      if (Length == 0)
+        Length = getAllocation<AK>(AP, AccessId).Length;
     int64_t Offset = AP.Offset;
     if (OMP_UNLIKELY(
             Offset > Length - Size ||
@@ -129,40 +145,36 @@ template <AllocationKind AK> struct AllocationTracker {
     return utils::advancePtr(Start, Offset);
   }
 
-  [[clang::disable_sanitizer_instrumentation]] static void *
-  check(void *P, int64_t Size, int64_t AccessId, uint64_t PC,
-        const char *FunctionName, const char *FileName, uint64_t LineNo) {
+  [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
+      check(_AS_PTR(void, AK) P, int64_t Size, int64_t AccessId, uint64_t PC,
+            const char *FunctionName, const char *FileName, uint64_t LineNo) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    auto &AllocArr = getAllocationArray<AK>();
-    auto &Alloc = AllocArr.Arr[AP.AllocationId];
+    auto &Alloc = getAllocation<AK>(AP, AccessId);
     return checkWithBase(P, Alloc.Start, Alloc.Length, Alloc.Tag, Size,
                          AccessId, PC, FunctionName, FileName, LineNo);
   }
 
-  [[clang::disable_sanitizer_instrumentation]] static void *
-  unpack(void *P, uint64_t PC = 0) {
+  [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
+      unpack(_AS_PTR(void, AK) P, uint64_t PC = 0) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    auto &AllocArr = getAllocationArray<AK>();
-    auto &A = AllocArr.Arr[AP.AllocationId];
+    auto &A = getAllocation<AK>(AP);
     uint64_t Offset = AP.Offset;
-    void *Ptr = utils::advancePtr(A.Start, Offset);
+    _AS_PTR(void, AK) Ptr = utils::advancePtr(A.Start, Offset);
     return Ptr;
   }
 
   [[clang::disable_sanitizer_instrumentation]] static void
-  lifetimeStart(void *P, uint64_t Length) {
+  lifetimeStart(_AS_PTR(void, AK) P, uint64_t Length) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    auto &AllocArr = getAllocationArray<AK>();
-    auto &A = AllocArr.Arr[AP.AllocationId];
+    auto &A = getAllocation<AK>(AP);
     // TODO: Check length
     A.Length = Length;
   }
 
   [[clang::disable_sanitizer_instrumentation]] static void
-  lifetimeEnd(void *P, uint64_t Length) {
+  lifetimeEnd(_AS_PTR(void, AK) P, uint64_t Length) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
-    auto &AllocArr = getAllocationArray<AK>();
-    auto &A = AllocArr.Arr[AP.AllocationId];
+    auto &A = getAllocation<AK>(AP);
     // TODO: Check length
     A.Length = 0;
   }
@@ -184,45 +196,40 @@ AllocationArrayTy<AK>
 
 extern "C" {
 
-#define PTR_CHECK(FUNCTION, PTR, ...)                                          \
-  if (isThreadLocalMemPtr(PTR))                                                \
-    return AllocationTracker<AllocationKind::LOCAL>::FUNCTION(                 \
-        PTR __VA_OPT__(, ) __VA_ARGS__);                                       \
-  return AllocationTracker<AllocationKind::GLOBAL>::FUNCTION(                  \
-      PTR __VA_OPT__(, ) __VA_ARGS__);
-#define FAKE_PTR_CHECK(FUNCTION, PTR, ...)                                     \
-  if (AllocationPtrTy<AllocationKind::GLOBAL>::get(PTR).Kind ==                \
-      (uint32_t)AllocationKind::LOCAL)                                         \
-    return AllocationTracker<AllocationKind::LOCAL>::FUNCTION(                 \
-        PTR __VA_OPT__(, ) __VA_ARGS__);                                       \
-  return AllocationTracker<AllocationKind::GLOBAL>::FUNCTION(                  \
-      PTR __VA_OPT__(, ) __VA_ARGS__);
+#define REAL_PTR_IS_LOCAL(PTR) (isThreadLocalMemPtr(PTR))
+#define IS_LOCAL(PTR) ((intptr_t)PTR & 1)
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_new(void *Start, uint64_t Length, int64_t AllocationId, uint32_t Slot,
-         uint64_t PC) {
-  PTR_CHECK(create, Start, Length, AllocationId, Slot, PC);
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
+    ompx_new_local(_AS_PTR(void, AllocationKind::LOCAL) Start, uint64_t Length,
+                   int64_t AllocationId, uint32_t Slot, uint64_t PC) {
+  return AllocationTracker<AllocationKind::LOCAL>::create(
+      Start, Length, AllocationId, Slot, PC);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_new_local(void *Start, uint64_t Length, int64_t AllocationId,
-               uint32_t Slot, uint64_t PC) {
-  return AllocationTracker<AllocationKind::LOCAL>::create(
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
+    ompx_new_global(_AS_PTR(void, AllocationKind::GLOBAL) Start,
+                    uint64_t Length, int64_t AllocationId, uint32_t Slot,
+                    uint64_t PC) {
+  return AllocationTracker<AllocationKind::GLOBAL>::create(
       Start, Length, AllocationId, Slot, PC);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
-__sanitizer_register_host(void *Start, uint64_t Length, uint64_t Slot,
-                          uint64_t PC) {
+__sanitizer_register_host(_AS_PTR(void, AllocationKind::GLOBAL) Start,
+                          uint64_t Length, uint64_t Slot, uint64_t PC) {
   AllocationTracker<AllocationKind::GLOBAL>::create(Start, Length, Slot, Slot,
                                                     PC);
 }
-
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void
-ompx_free(void *P, uint64_t PC) {
-  FAKE_PTR_CHECK(remove, P, PC);
+  gnu::used, gnu::retain]] void *
+ompx_new(void *Start, uint64_t Length, int64_t AllocationId, uint32_t Slot,
+         uint64_t PC) {
+  if (REAL_PTR_IS_LOCAL(Start))
+    return (void *)ompx_new_local((_AS_PTR(void, AllocationKind::LOCAL))Start,
+                                  Length, AllocationId, Slot, PC);
+  return (void *)ompx_new_global((_AS_PTR(void, AllocationKind::GLOBAL))Start,
+                                 Length, AllocationId, Slot, PC);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
@@ -231,108 +238,144 @@ ompx_free_local_n(int32_t N) {
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
-__sanitizer_unregister_host(void *P) {
+__sanitizer_unregister_host(_AS_PTR(void, AllocationKind::GLOBAL) P) {
   AllocationTracker<AllocationKind::GLOBAL>::remove(P, /*PC=*/0);
 }
-
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_gep(void *P, uint64_t Offset, uint64_t PC) {
-  FAKE_PTR_CHECK(advance, P, Offset, PC);
+  gnu::used, gnu::retain]] void
+ompx_free_local(_AS_PTR(void, AllocationKind::LOCAL) P) {
+  return AllocationTracker<AllocationKind::LOCAL>::remove(P, /*PC=*/0);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_gep_local(void *P, uint64_t Offset, uint64_t PC) {
+  gnu::used, gnu::retain]] void
+ompx_free_global(_AS_PTR(void, AllocationKind::GLOBAL) P) {
+  return AllocationTracker<AllocationKind::GLOBAL>::remove(P, /*PC=*/0);
+}
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] void
+ompx_free(void *P, uint64_t PC) {
+  if (IS_LOCAL(P))
+    ompx_free_local((_AS_PTR(void, AllocationKind::LOCAL))P);
+  else
+    ompx_free_global((_AS_PTR(void, AllocationKind::GLOBAL))P);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
+    ompx_gep_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Offset,
+                   uint64_t PC) {
   return AllocationTracker<AllocationKind::LOCAL>::advance(P, Offset, PC);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_gep_global(void *P, uint64_t Offset, uint64_t PC) {
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
+    ompx_gep_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Offset,
+                    uint64_t PC) {
   return AllocationTracker<AllocationKind::GLOBAL>::advance(P, Offset, PC);
 }
-
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
-ompx_check(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
-           const char *FunctionName, const char *FileName, uint64_t LineNo) {
-  FAKE_PTR_CHECK(check, P, Size, AccessId, PC, FunctionName, FileName, LineNo);
+ompx_gep(void *P, uint64_t Offset, uint64_t PC) {
+  if (IS_LOCAL(P))
+    return (void *)ompx_gep_local((_AS_PTR(void, AllocationKind::LOCAL))P,
+                                  Offset, PC);
+  return (void *)ompx_gep_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
+                                 Offset, PC);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_check_local(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
-                 const char *FunctionName, const char *FileName,
-                 uint64_t LineNo) {
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
+    ompx_check_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Size,
+                     uint64_t AccessId, uint64_t PC, const char *FunctionName,
+                     const char *FileName, uint64_t LineNo) {
   return AllocationTracker<AllocationKind::LOCAL>::check(
       P, Size, AccessId, PC, FunctionName, FileName, LineNo);
 }
-
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_check_global(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
-                  const char *FunctionName, const char *FileName,
-                  uint64_t LineNo) {
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
+    ompx_check_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Size,
+                      uint64_t AccessId, uint64_t PC, const char *FunctionName,
+                      const char *FileName, uint64_t LineNo) {
   return AllocationTracker<AllocationKind::GLOBAL>::check(
       P, Size, AccessId, PC, FunctionName, FileName, LineNo);
 }
-
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
-ompx_check_with_base_local(void *P, void *Start, uint64_t Length, uint32_t Tag,
-                           uint64_t Size, uint64_t AccessId, uint64_t PC,
-                           const char *FunctionName, const char *FileName,
-                           uint64_t LineNo) {
+ompx_check(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
+           const char *FunctionName, const char *FileName, uint64_t LineNo) {
+  if (IS_LOCAL(P))
+    return (void *)ompx_check_local((_AS_PTR(void, AllocationKind::LOCAL))P,
+                                    Size, AccessId, PC, FunctionName, FileName,
+                                    LineNo);
+  return (void *)ompx_check_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
+                                   Size, AccessId, PC, FunctionName, FileName,
+                                   LineNo);
+}
+
+[[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
+    ompx_check_with_base_local(_AS_PTR(void, AllocationKind::LOCAL) P,
+                               _AS_PTR(void, AllocationKind::LOCAL) Start,
+                               uint64_t Length, uint32_t Tag, uint64_t Size,
+                               uint64_t AccessId, uint64_t PC,
+                               const char *FunctionName, const char *FileName,
+                               uint64_t LineNo) {
   return AllocationTracker<AllocationKind::LOCAL>::checkWithBase(
       P, Start, Length, Tag, Size, AccessId, PC, FunctionName, FileName,
       LineNo);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_check_with_base_global(void *P, void *Start, uint64_t Length, uint32_t Tag,
-                            uint64_t Size, uint64_t AccessId, uint64_t PC,
-                            const char *FunctionName, const char *FileName,
-                            uint64_t LineNo) {
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
+    ompx_check_with_base_global(_AS_PTR(void, AllocationKind::GLOBAL) P,
+                                _AS_PTR(void, AllocationKind::GLOBAL) Start,
+                                uint64_t Length, uint32_t Tag, uint64_t Size,
+                                uint64_t AccessId, uint64_t PC,
+                                const char *FunctionName, const char *FileName,
+                                uint64_t LineNo) {
   return AllocationTracker<AllocationKind::GLOBAL>::checkWithBase(
       P, Start, Length, Tag, Size, AccessId, PC, FunctionName, FileName,
       LineNo);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_unpack(void *P, uint64_t PC) {
-  FAKE_PTR_CHECK(unpack, P, PC);
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
+    ompx_unpack_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t PC) {
+  return AllocationTracker<AllocationKind::LOCAL>::unpack(P, PC);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] void *
-ompx_unpack_local(void *P, uint64_t PC) {
-  return AllocationTracker<AllocationKind::LOCAL>::unpack(P, PC);
+  gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
+    ompx_unpack_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t PC) {
+  return AllocationTracker<AllocationKind::GLOBAL>::unpack(P, PC);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
-ompx_unpack_global(void *P, uint64_t PC) {
-  return AllocationTracker<AllocationKind::GLOBAL>::unpack(P, PC);
+ompx_unpack(void *P, uint64_t PC) {
+  if (IS_LOCAL(P))
+    return (void *)ompx_unpack_local((_AS_PTR(void, AllocationKind::LOCAL))P,
+                                     PC);
+  return (void *)ompx_unpack_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
+                                    PC);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
-ompx_lifetime_start(void *P, uint64_t Length) {
+ompx_lifetime_start(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Length) {
   AllocationTracker<AllocationKind::LOCAL>::lifetimeStart(P, Length);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
-ompx_lifetime_end(void *P, uint64_t Length) {
+ompx_lifetime_end(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Length) {
   AllocationTracker<AllocationKind::LOCAL>::lifetimeEnd(P, Length);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] struct AllocationInfoTy
-ompx_get_allocation_info_local(void *P) {
+  gnu::used, gnu::retain]] struct AllocationInfoLocalTy
+ompx_get_allocation_info_local(_AS_PTR(void, AllocationKind::LOCAL) P) {
   return AllocationTracker<AllocationKind::LOCAL>::getAllocationInfo(P);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
-  gnu::used, gnu::retain]] struct AllocationInfoTy
-ompx_get_allocation_info_global(void *P) {
+  gnu::used, gnu::retain]] struct AllocationInfoGlobalTy
+ompx_get_allocation_info_global(_AS_PTR(void, AllocationKind::GLOBAL) P) {
   return AllocationTracker<AllocationKind::GLOBAL>::getAllocationInfo(P);
 }
 
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index 40650013b537c..fab9f338540be 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -20,86 +20,91 @@ extern "C" int ompx_thread_id(int Dim);
 
 enum class AllocationKind { GLOBAL, LOCAL, LAST = LOCAL };
 
-#define _OBJECT_TY uint16_t
+template <AllocationKind AK> struct ASTypes {};
+template <> struct ASTypes<AllocationKind::GLOBAL> {
+  using INT_TY = uint64_t;
+};
+template <> struct ASTypes<AllocationKind::LOCAL> {
+  using INT_TY = uint32_t;
+};
 
 template <AllocationKind AK> struct SanitizerConfig {
-  static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 3;
+  static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 5;
+  static constexpr uint32_t ADDR_SPACE_PTR_SIZE =
+      sizeof(typename ASTypes<AK>::INT_TY) * 8;
+
   static constexpr uint32_t NUM_ALLOCATION_ARRAYS =
-      AK == AllocationKind::GLOBAL ? 1 : (256 * 8 * 4);
+      AK == AllocationKind::GLOBAL ? 1 : (1024 * 1024 * 2);
   static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8;
 
-  static constexpr uint32_t OBJECT_BITS =
-      AK == AllocationKind::GLOBAL ? 10 : (sizeof(_OBJECT_TY) * 8);
-  static constexpr uint32_t SLOTS =
-      (1 << (OBJECT_BITS)) / NUM_ALLOCATION_ARRAYS;
+  static constexpr uint32_t OBJECT_BITS = AK == AllocationKind::GLOBAL ? 10 : 7;
+  static constexpr uint32_t SLOTS = (1 << (OBJECT_BITS));
   static constexpr uint32_t KIND_BITS = 1;
-  static constexpr uint32_t ID_BITS = 16 - KIND_BITS;
+  static constexpr uint32_t ID_BITS = 9 - KIND_BITS;
 
-  static constexpr uint32_t LENGTH_BITS = 64 - TAG_BITS - ID_BITS - KIND_BITS;
-  static constexpr uint32_t OFFSET_BITS = LENGTH_BITS - OBJECT_BITS;
+  static constexpr uint32_t LENGTH_BITS =
+      ADDR_SPACE_PTR_SIZE - TAG_BITS - ID_BITS;
+  static constexpr uint32_t OFFSET_BITS =
+      ADDR_SPACE_PTR_SIZE - TAG_BITS - OBJECT_BITS - KIND_BITS;
 
   static constexpr bool useTags() { return TAG_BITS > 1; }
 
-  static_assert(LENGTH_BITS + TAG_BITS + KIND_BITS + ID_BITS == 64,
-                "Length and tag bits should cover 64 bits");
-  static_assert(OFFSET_BITS + TAG_BITS + KIND_BITS + ID_BITS + OBJECT_BITS ==
-                    64,
-                "Length, tag, and object bits should cover 64 bits");
+  static_assert(LENGTH_BITS + TAG_BITS + ID_BITS == ADDR_SPACE_PTR_SIZE,
+                "Length, tag, and ID bits should cover one pointer");
+  static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS + KIND_BITS ==
+                    ADDR_SPACE_PTR_SIZE,
+                "Offset, tag, object, and kind bits should cover one pointer");
   static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1),
                 "Kind bits should match allocation kinds");
 };
 
+#define _AS_PTR(TY, AK)                                                        \
+  TY [[clang::address_space(SanitizerConfig<AK>::ADDR_SPACE)]] *
+
 template <AllocationKind AK> struct AllocationTy {
-  void *Start;
-  uint64_t Length : SanitizerConfig<AK>::LENGTH_BITS;
-  uint64_t Tag : SanitizerConfig<AK>::TAG_BITS;
-  uint64_t Id : SanitizerConfig<AK>::ID_BITS;
+  _AS_PTR(void, AK) Start;
+  typename ASTypes<AK>::INT_TY Length : SanitizerConfig<AK>::LENGTH_BITS;
+  typename ASTypes<AK>::INT_TY Tag : SanitizerConfig<AK>::TAG_BITS;
+  typename ASTypes<AK>::INT_TY Id : SanitizerConfig<AK>::ID_BITS;
 };
 
 template <AllocationKind AK> struct AllocationArrayTy {
   AllocationTy<AK> Arr[SanitizerConfig<AK>::SLOTS];
-  uint32_t Cnt;
+  uint64_t Cnt;
 };
 
 template <AllocationKind AK> struct AllocationPtrTy {
-  static AllocationPtrTy<AK> get(void *P) {
+  static AllocationPtrTy<AK> get(_AS_PTR(void, AK) P) {
     return utils::convertViaPun<AllocationPtrTy<AK>>(P);
   }
 
-  operator void *() const { return utils::convertViaPun<void *>(*this); }
-  operator intptr_t() const { return utils::convertViaPun<intptr_t>(*this); }
-  uint64_t Offset : SanitizerConfig<AK>::OFFSET_BITS;
-  uint64_t AllocationTag : SanitizerConfig<AK>::TAG_BITS;
-  uint64_t AllocationId : SanitizerConfig<AK>::OBJECT_BITS;
+  operator _AS_PTR(void, AK)() const {
+    return utils::convertViaPun<_AS_PTR(void, AK)>(*this);
+  }
+  operator typename ASTypes<AK>::INT_TY() const {
+    return utils::convertViaPun<typename ASTypes<AK>::INT_TY>(*this);
+  }
+  typename ASTypes<AK>::INT_TY Offset : SanitizerConfig<AK>::OFFSET_BITS;
+  typename ASTypes<AK>::INT_TY AllocationTag : SanitizerConfig<AK>::TAG_BITS;
+  typename ASTypes<AK>::INT_TY AllocationId : SanitizerConfig<AK>::OBJECT_BITS;
   // Must be last, TODO: merge into TAG
-  uint64_t Kind : SanitizerConfig<AK>::KIND_BITS;
+  typename ASTypes<AK>::INT_TY Kind : SanitizerConfig<AK>::KIND_BITS;
 };
 
+static_assert(sizeof(AllocationPtrTy<AllocationKind::LOCAL>) * 8 == 32);
+
 static inline void *__offload_get_new_sanitizer_ptr(int32_t Slot) {
   AllocationPtrTy<AllocationKind::GLOBAL> AP;
   AP.Offset = 0;
   AP.AllocationId = Slot;
   AP.Kind = (uint32_t)AllocationKind::GLOBAL;
-  return AP;
+  return (void *)(_AS_PTR(void, AllocationKind::GLOBAL))AP;
 }
 
 template <AllocationKind AK> struct Allocations {
   static AllocationArrayTy<AK> Arr[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
 };
 
-template <AllocationKind AK>
-[[clang::disable_sanitizer_instrumentation,
-  gnu::always_inline]] AllocationArrayTy<AK> &
-getAllocationArray() {
-  uint32_t ThreadId = 0, BlockId = 0;
-  if constexpr (AK == AllocationKind::LOCAL) {
-    ThreadId = ompx_thread_id(0);
-    BlockId = ompx_block_id(0);
-  }
-  auto &AllocArr = Allocations<AK>::Arr[ThreadId + BlockId * ompx_block_dim(0)];
-  return AllocArr;
-}
-
 struct SanitizerTrapInfoTy {
   /// AllocationTy
   /// {
@@ -114,6 +119,7 @@ struct SanitizerTrapInfoTy {
     None = 0,
     ExceedsLength,
     ExceedsSlots,
+    PointerOutsideAllocation,
     OutOfBounds,
     UseAfterScope,
     UseAfterFree,
@@ -169,14 +175,14 @@ struct SanitizerTrapInfoTy {
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void
-  allocationError(ErrorCodeTy EC, void *Start, uint64_t Length, int64_t Id,
-                  int64_t Tag, uint64_t Slot, uint64_t PC) {
-    AllocationStart = Start;
+  allocationError(ErrorCodeTy EC, _AS_PTR(void, AK) Start, uint64_t Length,
+                  int64_t Id, int64_t Tag, uint64_t Slot, uint64_t PC) {
+    AllocationStart = (void *)Start;
     AllocationLength = Length;
     AllocationId = Id;
     AllocationTag = Tag;
-    PtrSlot = Slot;
     AllocationKind = (decltype(AllocationKind))AK;
+    PtrSlot = Slot;
 
     ErrorCode = EC;
     setCoordinates(PC, nullptr, nullptr, 0);
@@ -188,10 +194,11 @@ struct SanitizerTrapInfoTy {
                        const AllocationPtrTy<AK> &AP, uint64_t Size, int64_t Id,
                        uint64_t PC, const char *FunctionName,
                        const char *FileName, uint64_t LineNo) {
-    AllocationStart = A.Start;
+    AllocationStart = (void *)A.Start;
     AllocationLength = A.Length;
     AllocationId = A.Id;
     AllocationTag = A.Tag;
+    AllocationKind = (decltype(AllocationKind))AK;
 
     ErrorCode = EC;
 
@@ -208,8 +215,8 @@ struct SanitizerTrapInfoTy {
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
-  exceedsAllocationLength(void *Start, uint64_t Length, int64_t AllocationId,
-                          uint64_t Slot, uint64_t PC) {
+  exceedsAllocationLength(_AS_PTR(void, AK) Start, uint64_t Length,
+                          int64_t AllocationId, uint64_t Slot, uint64_t PC) {
     allocationError<AK>(ExceedsLength, Start, Length, AllocationId, /*Tag=*/0,
                         Slot, PC);
     __builtin_trap();
@@ -217,13 +224,22 @@ struct SanitizerTrapInfoTy {
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
-  exceedsAllocationSlots(void *Start, uint64_t Length, int64_t AllocationId,
-                         uint64_t Slot, uint64_t PC) {
+  exceedsAllocationSlots(_AS_PTR(void, AK) Start, uint64_t Length,
+                         int64_t AllocationId, uint64_t Slot, uint64_t PC) {
     allocationError<AK>(ExceedsSlots, Start, Length, AllocationId, /*Tag=*/0,
                         Slot, PC);
     __builtin_trap();
   }
 
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  pointerOutsideAllocation(_AS_PTR(void, AK) Start, uint64_t Length,
+                           int64_t AllocationId, uint64_t Slot, uint64_t PC) {
+    allocationError<AK>(PointerOutsideAllocation, Start, Length, AllocationId,
+                        /*Tag=*/0, Slot, PC);
+    __builtin_trap();
+  }
+
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   outOfBoundAccess(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
@@ -261,20 +277,7 @@ struct SanitizerTrapInfoTy {
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   accessError(const AllocationPtrTy<AK> AP, int64_t Size, int64_t AccessId,
               uint64_t PC, const char *FunctionName, const char *FileName,
-              uint64_t LineNo) {
-    auto &A = getAllocationArray<AK>().Arr[AP.AllocationId];
-    int64_t Offset = AP.Offset;
-    int64_t Length = A.Length;
-    if (AK == AllocationKind::LOCAL && Length == 0)
-      useAfterScope<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
-                        LineNo);
-    else if (Offset > Length - Size)
-      outOfBoundAccess<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
-                           LineNo);
-    else
-      useAfterFree<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
-                       LineNo);
-  }
+              uint64_t LineNo);
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
@@ -285,4 +288,51 @@ struct SanitizerTrapInfoTy {
   }
 };
 
+[[gnu::used, gnu::retain, gnu::weak,
+  gnu::visibility("protected")]] SanitizerTrapInfoTy *__sanitizer_trap_info_ptr;
+
+template <AllocationKind AK>
+[[clang::disable_sanitizer_instrumentation,
+  gnu::always_inline]] AllocationArrayTy<AK> &
+getAllocationArray() {
+  uint32_t ThreadId = 0, BlockId = 0;
+  if constexpr (AK == AllocationKind::LOCAL) {
+    ThreadId = ompx_thread_id(0);
+    BlockId = ompx_block_id(0);
+  }
+  return Allocations<AK>::Arr[ThreadId + BlockId * ompx_block_dim(0)];
+}
+
+template <AllocationKind AK>
+[[clang::disable_sanitizer_instrumentation,
+  gnu::always_inline]] AllocationTy<AK> &
+getAllocation(const AllocationPtrTy<AK> AP, int64_t AccessId = 0) {
+  auto &AllocArr = getAllocationArray<AK>();
+  uint64_t NumSlots = SanitizerConfig<AK>::SLOTS;
+  uint64_t Slot = AP.AllocationId;
+  if (Slot >= NumSlots)
+    __sanitizer_trap_info_ptr->pointerOutsideAllocation<AK>(AP, AP.Offset,
+                                                            AccessId, Slot, 0);
+  return AllocArr.Arr[Slot];
+}
+
+template <enum AllocationKind AK>
+[[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+SanitizerTrapInfoTy::accessError(const AllocationPtrTy<AK> AP, int64_t Size,
+                                 int64_t AccessId, uint64_t PC,
+                                 const char *FunctionName, const char *FileName,
+                                 uint64_t LineNo) {
+  auto &A = getAllocationArray<AK>().Arr[AP.AllocationId];
+  int64_t Offset = AP.Offset;
+  int64_t Length = A.Length;
+  if (AK == AllocationKind::LOCAL && Length == 0)
+    useAfterScope<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
+                      LineNo);
+  else if (Offset > Length - Size)
+    outOfBoundAccess<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
+                         LineNo);
+  else
+    useAfterFree<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
+}
+
 #endif
diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h
index 9b2b2df2f3aff..d786358d4a180 100644
--- a/offload/include/Shared/Utils.h
+++ b/offload/include/Shared/Utils.h
@@ -27,8 +27,7 @@ auto getPtrDiff(const void *End, const void *Begin) {
 
 /// Return \p Ptr advanced by \p Offset bytes.
 template <typename Ty1, typename Ty2> Ty1 *advancePtr(Ty1 *Ptr, Ty2 Offset) {
-  return reinterpret_cast<Ty1 *>(
-      const_cast<char *>(reinterpret_cast<const char *>(Ptr)) + Offset);
+  return (Ty1 *)(const_cast<char *>((const char *)(Ptr)) + Offset);
 }
 
 /// Return \p V aligned "upwards" according to \p Align.
@@ -70,7 +69,7 @@ inline uint32_t popc(uint64_t V) {
 }
 
 template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
-  static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion");
+  //  static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion");
   return *((DstTy *)(&V));
 }
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index e5ce6d2ab6e9e..ba81207fcac71 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2236,30 +2236,21 @@ void GPUSanTy::checkAndReportError() {
 
   fprintf(stderr, "============================================================"
                   "====================\n");
-  switch (STI.ErrorCode) {
-  case SanitizerTrapInfoTy::None:
-    llvm_unreachable("Unexpected exception");
-  case SanitizerTrapInfoTy::ExceedsLength:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds length",
-            Default());
-    break;
-  case SanitizerTrapInfoTy::ExceedsSlots:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds slots",
-            Default());
-    break;
-  case SanitizerTrapInfoTy::OutOfBounds: {
+
+  auto DiagnoseAccess = [&](StringRef Name) {
     void *PC = reinterpret_cast<void *>(STI.PC);
     void *Addr = utils::advancePtr(STI.AllocationStart, STI.PtrOffset);
     fprintf(stderr,
-            "%sERROR: OffloadSanitizer %s on address " DPxMOD " at pc " DPxMOD
-            "\n%s",
-            Red(), "out-of-bounds access", DPxPTR(Addr), DPxPTR(PC), Default());
+            "%sERROR: OffloadSanitizer %s access on address " DPxMOD
+            " at pc " DPxMOD "\n%s",
+            Red(), Name.data(), DPxPTR(Addr), DPxPTR(PC), Default());
     fprintf(stderr,
             "%s%s of size %u at " DPxMOD
-            " thread <%u, %u, %u> block <%lu, %lu, %lu>\n%s",
+            " thread <%u, %u, %u> block <%lu, %lu, %lu> (acc %li, %s)\n%s",
             Blue(), STI.AccessId > 0 ? "WRITE" : "READ", STI.AccessSize,
             DPxPTR(Addr), STI.ThreadId[0], STI.ThreadId[1], STI.ThreadId[2],
-            STI.BlockId[0], STI.BlockId[1], STI.BlockId[2], Default());
+            STI.BlockId[0], STI.BlockId[1], STI.BlockId[2], STI.AccessId,
+            (STI.AllocationKind ? "stack" : "heap"), Default());
     fprintf(stderr, "    #0 %p %s in %s:%lu\n\n", PC,
             FunctionName.str().c_str(), FileName.data(), STI.LineNo);
     fprintf(
@@ -2270,15 +2261,33 @@ void GPUSanTy::checkAndReportError() {
         DPxPTR(STI.AllocationStart),
         DPxPTR(utils::advancePtr(STI.AllocationStart, STI.AllocationLength)),
         Default());
+  };
+
+  switch (STI.ErrorCode) {
+  case SanitizerTrapInfoTy::None:
+    llvm_unreachable("Unexpected exception");
+  case SanitizerTrapInfoTy::ExceedsLength:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds length",
+            Default());
+    break;
+  case SanitizerTrapInfoTy::ExceedsSlots:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "exceeds slots",
+            Default());
+    break;
+  case SanitizerTrapInfoTy::PointerOutsideAllocation:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s : %p : %i %lu (%s)\n%s",
+            Red(), "outside allocation", STI.AllocationStart, STI.AllocationId,
+            STI.PtrSlot, (STI.AllocationKind ? "stack" : "heap"), Default());
+    break;
+  case SanitizerTrapInfoTy::OutOfBounds: {
+    DiagnoseAccess("out-of-bounds");
     break;
   }
   case SanitizerTrapInfoTy::UseAfterScope:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(),
-            "use-after-scope", Default());
+    DiagnoseAccess("use-after-scope");
     break;
   case SanitizerTrapInfoTy::UseAfterFree:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "use-after-free",
-            Default());
+    DiagnoseAccess("use-after-free");
     break;
   case SanitizerTrapInfoTy::MemoryLeak:
     fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "memory leak",
diff --git a/offload/test/sanitizer/volatile_stack_null.c b/offload/test/sanitizer/volatile_stack_null.c
index 2de4024daaa24..58eaad1cd1d85 100644
--- a/offload/test/sanitizer/volatile_stack_null.c
+++ b/offload/test/sanitizer/volatile_stack_null.c
@@ -1,5 +1,5 @@
 // clang-format off
-// RUN: %libomptarget-compileopt-generic -fsanitize=offload
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O0
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
 // clang-format on
 
@@ -12,11 +12,14 @@
 
 // Align lines.
 
+__attribute__((optnone)) void *foo(void *P) { return P; }
+
 int main(void) {
 
-#pragma omp target
+  int *Ptr = 0;
+#pragma omp target is_device_ptr(Ptr)
   {
-    volatile int *Null = 0;
+    int *Null = foo(Ptr);
     // clang-format off
     // CHECK:      ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]]
     // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0>

>From 514bbb9c6f3437aae9c61c425dea1f48a23a701c Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 28 Jun 2024 14:47:54 -0700
Subject: [PATCH 13/31] Fix cuda and fake ptr handling

---
 offload/plugins-nextgen/common/src/PluginInterface.cpp | 1 +
 offload/src/omptarget.cpp                              | 4 ++--
 offload/test/offloading/CUDA/launch_tu.cu              | 2 ++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index ba81207fcac71..8354ffe269df1 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2136,6 +2136,7 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
 
 Error GPUSanTy::notifyDataMapped(void *DevicePtr, uint64_t Size,
                                  void *&FakeHstPtr) {
+  FakeHstPtr = nullptr;
   if (NewFns.empty())
     return Plugin::success();
   uint64_t Slot = SlotCnt--;
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 6701e1676bda3..f95a8021141b0 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -443,10 +443,10 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
 
   Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
   DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
-  void *FakeHstPtr;
+  void *FakeHstPtr = nullptr;
   if (DeviceOrErr->notifyDataMapped(nullptr, Rc, Size, FakeHstPtr))
     return nullptr;
-  return FakeHstPtr;
+  return FakeHstPtr ? FakeHstPtr : Rc;
 }
 
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
diff --git a/offload/test/offloading/CUDA/launch_tu.cu b/offload/test/offloading/CUDA/launch_tu.cu
index c45f40b115b56..fa6b1d7692de5 100644
--- a/offload/test/offloading/CUDA/launch_tu.cu
+++ b/offload/test/offloading/CUDA/launch_tu.cu
@@ -10,6 +10,7 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 
+#include <cuda_runtime.h>
 #include <stdio.h>
 
 extern "C" {
@@ -26,6 +27,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
   square<<<1, 1>>>(Ptr);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   llvm_omp_target_free_shared(Ptr, DevNo);

>From 6c053f3cbfdf97bd10d6063efbb4b80c853a9174 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 28 Jun 2024 18:34:05 -0700
Subject: [PATCH 14/31] Fixes and more tests

---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  4 +-
 offload/DeviceRTL/src/Sanitizer.cpp           | 92 +++++++++++++------
 offload/include/Shared/Sanitizer.h            | 41 ++++++++-
 offload/include/Shared/Utils.h                |  2 +-
 .../common/src/PluginInterface.cpp            | 22 ++++-
 offload/src/omptarget.cpp                     | 10 +-
 offload/test/sanitizer/heap_null.c            | 21 ++++-
 offload/test/sanitizer/heap_out_of_bounds.c   | 27 ++++++
 offload/test/sanitizer/heap_random.c          | 22 +++++
 offload/test/sanitizer/null_forced_stack.c    | 41 +++++++++
 offload/test/sanitizer/volatile_stack_null.c  | 34 ++++---
 11 files changed, 258 insertions(+), 58 deletions(-)
 create mode 100644 offload/test/sanitizer/heap_out_of_bounds.c
 create mode 100644 offload/test/sanitizer/heap_random.c
 create mode 100644 offload/test/sanitizer/null_forced_stack.c

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 6e7d34f5adaa3..a310038543532 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -2043,7 +2043,9 @@ static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
   //
   // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
   // address spaces have non-zero null values.
-  auto SrcPtrKB = computeKnownBits(V, DL).trunc(DL.getPointerSizeInBits(AS));
+  auto SrcPtrKB = computeKnownBits(V, DL);
+  if (SrcPtrKB.getBitWidth() > DL.getPointerSizeInBits(AS))
+    SrcPtrKB = SrcPtrKB.trunc(DL.getPointerSizeInBits(AS));
   const auto NullVal = TM.getNullPointerValue(AS);
   assert((NullVal == 0 || NullVal == -1) &&
          "don't know how to check for this null value!");
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index 7c9bb1a085b24..11101e603d4ab 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -41,6 +41,23 @@ template <> struct AllocationInfoTy<AllocationKind::LOCAL> {
   using ASVoidPtrTy = AllocationInfoLocalTy;
 };
 
+template <>
+AllocationPtrTy<AllocationKind::LOCAL>
+AllocationPtrTy<AllocationKind::LOCAL>::get(_AS_PTR(void, AllocationKind::LOCAL)
+                                                P) {
+  TypePunUnion TPU;
+  TPU.P = (void *)P;
+  return TPU.AP;
+}
+
+template <>
+AllocationPtrTy<AllocationKind::LOCAL>::operator _AS_PTR(
+    void, AllocationKind::LOCAL)() const {
+  TypePunUnion TPU;
+  TPU.AP = *this;
+  return TPU.AddrP;
+}
+
 template <AllocationKind AK> struct AllocationTracker {
   static_assert(sizeof(AllocationTy<AK>) == sizeof(_AS_PTR(void, AK)) * 2,
                 "AllocationTy should not exceed two pointers");
@@ -86,11 +103,12 @@ template <AllocationKind AK> struct AllocationTracker {
 
     AllocationPtrTy<AK> AP;
     AP.Offset = 0;
-    AP.AllocationId = Slot;
-    AP.Kind = (uint64_t)AK;
     if constexpr (SanitizerConfig<AK>::useTags()) {
       AP.AllocationTag = ++A.Tag;
     }
+    AP.AllocationId = Slot;
+    AP.Magic = SanitizerConfig<AK>::MAGIC;
+    AP.Kind = (uint64_t)AK;
     return AP;
   }
 
@@ -135,12 +153,15 @@ template <AllocationKind AK> struct AllocationTracker {
     if constexpr (AK == AllocationKind::LOCAL)
       if (Length == 0)
         Length = getAllocation<AK>(AP, AccessId).Length;
+    if constexpr (AK == AllocationKind::GLOBAL)
+      if (AP.Magic != SanitizerConfig<AllocationKind::GLOBAL>::MAGIC)
+        __sanitizer_trap_info_ptr->garbagePointer<AK>(AP, (void *)P, PC);
     int64_t Offset = AP.Offset;
     if (OMP_UNLIKELY(
             Offset > Length - Size ||
             (SanitizerConfig<AK>::useTags() && Tag != AP.AllocationTag))) {
-      __sanitizer_trap_info_ptr->accessError(AP, Size, AccessId, PC,
-                                             FunctionName, FileName, LineNo);
+      __sanitizer_trap_info_ptr->accessError<AK>(
+          AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
     }
     return utils::advancePtr(Start, Offset);
   }
@@ -194,10 +215,24 @@ template <AllocationKind AK>
 AllocationArrayTy<AK>
     Allocations<AK>::Arr[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
 
+static void checkForMagic(bool IsGlobal, void *P, uint64_t PC) {
+  if (IsGlobal) {
+    auto AP = AllocationPtrTy<AllocationKind::GLOBAL>::get(P);
+    if (AP.Magic != SanitizerConfig<AllocationKind::GLOBAL>::MAGIC)
+      __sanitizer_trap_info_ptr->garbagePointer<AllocationKind::GLOBAL>(AP, P,
+                                                                        PC);
+  } else {
+    auto AP = AllocationPtrTy<AllocationKind::LOCAL>::get(P);
+    if (AP.Magic != SanitizerConfig<AllocationKind::LOCAL>::MAGIC)
+      __sanitizer_trap_info_ptr->garbagePointer<AllocationKind::LOCAL>(AP, P,
+                                                                       PC);
+  }
+}
+
 extern "C" {
 
 #define REAL_PTR_IS_LOCAL(PTR) (isThreadLocalMemPtr(PTR))
-#define IS_LOCAL(PTR) ((intptr_t)PTR & 1)
+#define IS_GLOBAL(PTR) ((uintptr_t)PTR & (1UL << 63))
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
@@ -254,10 +289,11 @@ ompx_free_global(_AS_PTR(void, AllocationKind::GLOBAL) P) {
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
 ompx_free(void *P, uint64_t PC) {
-  if (IS_LOCAL(P))
-    ompx_free_local((_AS_PTR(void, AllocationKind::LOCAL))P);
-  else
-    ompx_free_global((_AS_PTR(void, AllocationKind::GLOBAL))P);
+  bool IsGlobal = IS_GLOBAL(P);
+  checkForMagic(IsGlobal, P, PC);
+  if (IsGlobal)
+    return ompx_free_global((_AS_PTR(void, AllocationKind::GLOBAL))P);
+  return ompx_free_local((_AS_PTR(void, AllocationKind::LOCAL))P);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
@@ -275,11 +311,13 @@ ompx_free(void *P, uint64_t PC) {
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
 ompx_gep(void *P, uint64_t Offset, uint64_t PC) {
-  if (IS_LOCAL(P))
-    return (void *)ompx_gep_local((_AS_PTR(void, AllocationKind::LOCAL))P,
-                                  Offset, PC);
-  return (void *)ompx_gep_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
-                                 Offset, PC);
+  bool IsGlobal = IS_GLOBAL(P);
+  checkForMagic(IsGlobal, P, PC);
+  if (IsGlobal)
+    return (void *)ompx_gep_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
+                                   Offset, PC);
+  return (void *)ompx_gep_local((_AS_PTR(void, AllocationKind::LOCAL))P, Offset,
+                                PC);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
@@ -302,13 +340,14 @@ ompx_gep(void *P, uint64_t Offset, uint64_t PC) {
   gnu::used, gnu::retain]] void *
 ompx_check(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
            const char *FunctionName, const char *FileName, uint64_t LineNo) {
-  if (IS_LOCAL(P))
-    return (void *)ompx_check_local((_AS_PTR(void, AllocationKind::LOCAL))P,
-                                    Size, AccessId, PC, FunctionName, FileName,
-                                    LineNo);
-  return (void *)ompx_check_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
-                                   Size, AccessId, PC, FunctionName, FileName,
-                                   LineNo);
+  bool IsGlobal = IS_GLOBAL(P);
+  checkForMagic(IsGlobal, P, PC);
+  if (IsGlobal)
+    return (void *)ompx_check_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
+                                     Size, AccessId, PC, FunctionName, FileName,
+                                     LineNo);
+  return (void *)ompx_check_local((_AS_PTR(void, AllocationKind::LOCAL))P, Size,
+                                  AccessId, PC, FunctionName, FileName, LineNo);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
@@ -350,11 +389,12 @@ ompx_check(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
 ompx_unpack(void *P, uint64_t PC) {
-  if (IS_LOCAL(P))
-    return (void *)ompx_unpack_local((_AS_PTR(void, AllocationKind::LOCAL))P,
-                                     PC);
-  return (void *)ompx_unpack_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
-                                    PC);
+  bool IsGlobal = IS_GLOBAL(P);
+  checkForMagic(IsGlobal, P, PC);
+  if (IsGlobal)
+    return (void *)ompx_unpack_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
+                                      PC);
+  return (void *)ompx_unpack_local((_AS_PTR(void, AllocationKind::LOCAL))P, PC);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index fab9f338540be..fa275b27832e0 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -18,7 +18,7 @@ extern "C" int ompx_block_id(int Dim);
 extern "C" int ompx_block_dim(int Dim);
 extern "C" int ompx_thread_id(int Dim);
 
-enum class AllocationKind { GLOBAL, LOCAL, LAST = LOCAL };
+enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL };
 
 template <AllocationKind AK> struct ASTypes {};
 template <> struct ASTypes<AllocationKind::GLOBAL> {
@@ -36,6 +36,8 @@ template <AllocationKind AK> struct SanitizerConfig {
   static constexpr uint32_t NUM_ALLOCATION_ARRAYS =
       AK == AllocationKind::GLOBAL ? 1 : (1024 * 1024 * 2);
   static constexpr uint32_t TAG_BITS = AK == AllocationKind::GLOBAL ? 1 : 8;
+  static constexpr uint32_t MAGIC_BITS = 3;
+  static constexpr uint32_t MAGIC = 0b101;
 
   static constexpr uint32_t OBJECT_BITS = AK == AllocationKind::GLOBAL ? 10 : 7;
   static constexpr uint32_t SLOTS = (1 << (OBJECT_BITS));
@@ -45,13 +47,13 @@ template <AllocationKind AK> struct SanitizerConfig {
   static constexpr uint32_t LENGTH_BITS =
       ADDR_SPACE_PTR_SIZE - TAG_BITS - ID_BITS;
   static constexpr uint32_t OFFSET_BITS =
-      ADDR_SPACE_PTR_SIZE - TAG_BITS - OBJECT_BITS - KIND_BITS;
+      ADDR_SPACE_PTR_SIZE - TAG_BITS - OBJECT_BITS - KIND_BITS - MAGIC_BITS;
 
   static constexpr bool useTags() { return TAG_BITS > 1; }
 
   static_assert(LENGTH_BITS + TAG_BITS + ID_BITS == ADDR_SPACE_PTR_SIZE,
                 "Length, tag, and ID bits should cover one pointer");
-  static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS + KIND_BITS ==
+  static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS + MAGIC_BITS + KIND_BITS ==
                     ADDR_SPACE_PTR_SIZE,
                 "Offset, tag, object, and kind bits should cover one pointer");
   static_assert((1 << KIND_BITS) >= ((uint64_t)AllocationKind::LAST + 1),
@@ -77,7 +79,9 @@ template <AllocationKind AK> struct AllocationPtrTy {
   static AllocationPtrTy<AK> get(_AS_PTR(void, AK) P) {
     return utils::convertViaPun<AllocationPtrTy<AK>>(P);
   }
-
+  static AllocationPtrTy<AK> get(void *P) {
+    return get((_AS_PTR(void, AK))(P));
+  }
   operator _AS_PTR(void, AK)() const {
     return utils::convertViaPun<_AS_PTR(void, AK)>(*this);
   }
@@ -87,16 +91,28 @@ template <AllocationKind AK> struct AllocationPtrTy {
   typename ASTypes<AK>::INT_TY Offset : SanitizerConfig<AK>::OFFSET_BITS;
   typename ASTypes<AK>::INT_TY AllocationTag : SanitizerConfig<AK>::TAG_BITS;
   typename ASTypes<AK>::INT_TY AllocationId : SanitizerConfig<AK>::OBJECT_BITS;
+  typename ASTypes<AK>::INT_TY Magic : SanitizerConfig<AK>::MAGIC_BITS;
   // Must be last, TODO: merge into TAG
   typename ASTypes<AK>::INT_TY Kind : SanitizerConfig<AK>::KIND_BITS;
 };
-
 static_assert(sizeof(AllocationPtrTy<AllocationKind::LOCAL>) * 8 == 32);
 
+union TypePunUnion {
+  uint64_t I;
+  void *P;
+  _AS_PTR(void, AllocationKind::LOCAL) AddrP;
+  struct {
+    AllocationPtrTy<AllocationKind::LOCAL> AP;
+    uint32_t U;
+  };
+};
+static_assert(sizeof(TypePunUnion) * 8 == 64);
+
 static inline void *__offload_get_new_sanitizer_ptr(int32_t Slot) {
   AllocationPtrTy<AllocationKind::GLOBAL> AP;
   AP.Offset = 0;
   AP.AllocationId = Slot;
+  AP.Magic = SanitizerConfig<AllocationKind::GLOBAL>::MAGIC;
   AP.Kind = (uint32_t)AllocationKind::GLOBAL;
   return (void *)(_AS_PTR(void, AllocationKind::GLOBAL))AP;
 }
@@ -124,6 +140,7 @@ struct SanitizerTrapInfoTy {
     UseAfterScope,
     UseAfterFree,
     MemoryLeak,
+    GarbagePointer,
   } ErrorCode;
 
   /// AllocationTy
@@ -279,6 +296,20 @@ struct SanitizerTrapInfoTy {
               uint64_t PC, const char *FunctionName, const char *FileName,
               uint64_t LineNo);
 
+  template <enum AllocationKind AK>
+  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  garbagePointer(const AllocationPtrTy<AK> AP, void *P, uint64_t PC) {
+    ErrorCode = GarbagePointer;
+    AllocationStart = P;
+    AllocationKind = (decltype(AllocationKind))AK;
+    PtrOffset = AP.Offset;
+    PtrSlot = AP.AllocationId;
+    PtrTag = AP.AllocationTag;
+    PtrKind = AP.Kind;
+    setCoordinates(PC, nullptr, nullptr, 0);
+    __builtin_trap();
+  }
+
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   memoryLeak(const AllocationTy<AK> A, uint64_t Slot) {
diff --git a/offload/include/Shared/Utils.h b/offload/include/Shared/Utils.h
index d786358d4a180..22ea2ba262d2a 100644
--- a/offload/include/Shared/Utils.h
+++ b/offload/include/Shared/Utils.h
@@ -69,7 +69,7 @@ inline uint32_t popc(uint64_t V) {
 }
 
 template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
-  //  static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion");
+  static_assert(sizeof(DstTy) == sizeof(SrcTy), "Bad conversion");
   return *((DstTy *)(&V));
 }
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 8354ffe269df1..e44575c840d66 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2215,6 +2215,8 @@ void GPUSanTy::checkAndReportError() {
       FunctionName = FunctionName.drop_front(It + 1);
   }
 
+  if (FunctionName.ends_with("_debug__"))
+    FunctionName = FunctionName.drop_back(sizeof("debug__"));
   if (FunctionName.ends_with("_debug___omp_outlined_debug__"))
     FunctionName =
         FunctionName.drop_back(sizeof("debug___omp_outlined_debug__"));
@@ -2251,8 +2253,8 @@ void GPUSanTy::checkAndReportError() {
             Blue(), STI.AccessId > 0 ? "WRITE" : "READ", STI.AccessSize,
             DPxPTR(Addr), STI.ThreadId[0], STI.ThreadId[1], STI.ThreadId[2],
             STI.BlockId[0], STI.BlockId[1], STI.BlockId[2], STI.AccessId,
-            (STI.AllocationKind ? "stack" : "heap"), Default());
-    fprintf(stderr, "    #0 %p %s in %s:%lu\n\n", PC,
+            (STI.AllocationKind ? "heap" : "stack"), Default());
+    fprintf(stderr, "    #0 " DPxMOD " %s in %s:%lu\n\n", DPxPTR(PC),
             FunctionName.str().c_str(), FileName.data(), STI.LineNo);
     fprintf(
         stderr,
@@ -2262,6 +2264,11 @@ void GPUSanTy::checkAndReportError() {
         DPxPTR(STI.AllocationStart),
         DPxPTR(utils::advancePtr(STI.AllocationStart, STI.AllocationLength)),
         Default());
+    fprintf(stderr,
+            "%s Pointer[slot:%lu,tag:%u,kind:%i] "
+            "Allocation[slot:%d,tag:%u,kind:%i]\n%s",
+            Green(), STI.PtrSlot, STI.PtrTag, STI.PtrKind, STI.AllocationId,
+            STI.AllocationTag, STI.AllocationKind, Default());
   };
 
   switch (STI.ErrorCode) {
@@ -2278,7 +2285,7 @@ void GPUSanTy::checkAndReportError() {
   case SanitizerTrapInfoTy::PointerOutsideAllocation:
     fprintf(stderr, "%sERROR: OffloadSanitizer %s : %p : %i %lu (%s)\n%s",
             Red(), "outside allocation", STI.AllocationStart, STI.AllocationId,
-            STI.PtrSlot, (STI.AllocationKind ? "stack" : "heap"), Default());
+            STI.PtrSlot, (STI.AllocationKind ? "heap" : "stack"), Default());
     break;
   case SanitizerTrapInfoTy::OutOfBounds: {
     DiagnoseAccess("out-of-bounds");
@@ -2294,6 +2301,15 @@ void GPUSanTy::checkAndReportError() {
     fprintf(stderr, "%sERROR: OffloadSanitizer %s\n%s", Red(), "memory leak",
             Default());
     break;
+  case SanitizerTrapInfoTy::GarbagePointer:
+    fprintf(stderr, "%sERROR: OffloadSanitizer %s : %p\n%s", Red(),
+            "garbage pointer", STI.AllocationStart, Default());
+    fprintf(stderr,
+            "%s Pointer[slot:%lu,tag:%u,kind:%i] "
+            "Allocation[kind:%i]\n%s",
+            Green(), STI.PtrSlot, STI.PtrTag, STI.PtrKind, STI.AllocationKind,
+            Default());
+    break;
   }
   fflush(stderr);
 }
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index f95a8021141b0..450b047753417 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -680,8 +680,9 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
         HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
     void *TgtPtrBegin = TPR.TargetPointer;
-    if (auto *FakeTgtPtrBegin = TPR.getEntry()->FakeTgtPtrBegin)
-      TgtPtrBegin = FakeTgtPtrBegin;
+    if (auto *Entry = TPR.getEntry())
+      if (auto *FakeTgtPtrBegin = Entry->FakeTgtPtrBegin)
+        TgtPtrBegin = FakeTgtPtrBegin;
     IsHostPtr = TPR.Flags.IsHostPointer;
     // If data_size==0, then the argument could be a zero-length pointer to
     // NULL, so getOrAlloc() returning NULL is not an error.
@@ -1535,8 +1536,9 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
           /*UpdateRefCount=*/false,
           /*UseHoldRefCount=*/false);
       TgtPtrBegin = TPR.TargetPointer;
-      if (auto *FakeTgtPtrBegin = TPR.getEntry()->FakeTgtPtrBegin)
-        TgtPtrBegin = FakeTgtPtrBegin;
+      if (auto *Entry = TPR.getEntry())
+        if (auto *FakeTgtPtrBegin = Entry->FakeTgtPtrBegin)
+          TgtPtrBegin = FakeTgtPtrBegin;
       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
 #ifdef OMPTARGET_DEBUG
       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
diff --git a/offload/test/sanitizer/heap_null.c b/offload/test/sanitizer/heap_null.c
index 8f1ac0bdf530f..568f423227c52 100644
--- a/offload/test/sanitizer/heap_null.c
+++ b/offload/test/sanitizer/heap_null.c
@@ -1,6 +1,7 @@
 // clang-format off
-// RUN: %libomptarget-compileopt-generic -loffload.kernels -mllvm -enable-gpu-san
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload
+// RUN: not %libomptarget-run-generic > %t.out 2>&1
+// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out
 // clang-format on
 
 // UNSUPPORTED: aarch64-unknown-linux-gnu
@@ -12,17 +13,27 @@
 
 // Align lines.
 
+#include <stdint.h>
+#include <stdio.h>
+
 int main(void) {
 
-  int *Null = 0;
-#pragma omp target
+  void *Null = 0;
+  void *Heap, *Stack;
+#pragma omp target map(from : Heap, Stack)
   {
+    int Q[512];
     // clang-format off
     // CHECK:      ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]]
     // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0>
     // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
     // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000)
     // clang-format on
-    *Null = 42;
+    //    *Null = 42;
+    Stack = &Q[0];
+    Heap = Null;
   }
+  printf("Heap %p Stack %p\n", Heap, Stack);
+  printf("Heap %lu Stack %lu\n", ((uintptr_t)Heap & (1UL << 63)),
+         ((uintptr_t)Stack & (1UL << 63)));
 }
diff --git a/offload/test/sanitizer/heap_out_of_bounds.c b/offload/test/sanitizer/heap_out_of_bounds.c
new file mode 100644
index 0000000000000..37a8d99b5241f
--- /dev/null
+++ b/offload/test/sanitizer/heap_out_of_bounds.c
@@ -0,0 +1,27 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+// Align lines.
+
+#include <stdint.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  int N = argc > 42 ? 1000 : 100;
+  double A[N];
+#pragma omp target map(from : A[ : N])
+  {
+    // CHECK: is located 7992 bytes inside of a 800-byte region
+    A[999] = 3.14;
+  }
+}
diff --git a/offload/test/sanitizer/heap_random.c b/offload/test/sanitizer/heap_random.c
new file mode 100644
index 0000000000000..265495f910f69
--- /dev/null
+++ b/offload/test/sanitizer/heap_random.c
@@ -0,0 +1,22 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload
+// RUN: not %libomptarget-run-generic > %t.out 2>&1
+// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+// Align lines.
+
+int main(void) {
+
+  int X = 0;
+  int *Random = &X;
+#pragma omp target
+  { *Random = 99; }
+}
diff --git a/offload/test/sanitizer/null_forced_stack.c b/offload/test/sanitizer/null_forced_stack.c
new file mode 100644
index 0000000000000..02c08643fdc12
--- /dev/null
+++ b/offload/test/sanitizer/null_forced_stack.c
@@ -0,0 +1,41 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O1
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+[[clang::optnone]] int *deref(int **P) { return *P; }
+
+int main(void) {
+
+#pragma omp target
+  {
+    int *NullPtr = 0;
+    // clang-format off
+    // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
+    // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)
+    // CHECK:     #0 [[PC]] omp target (main:[[@LINE-6]]) in <unknown>:0
+    // 
+    // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
+    //
+    // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
+    // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)
+    // DEBUG:     #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}null_forced_stack.c:[[@LINE+4]]
+    // 
+    // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
+    // clang-format on
+    deref(&NullPtr)[10] = 42;
+  }
+}
diff --git a/offload/test/sanitizer/volatile_stack_null.c b/offload/test/sanitizer/volatile_stack_null.c
index 58eaad1cd1d85..08542c3516f28 100644
--- a/offload/test/sanitizer/volatile_stack_null.c
+++ b/offload/test/sanitizer/volatile_stack_null.c
@@ -1,6 +1,12 @@
 // clang-format off
-// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O0
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O1
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out
 // clang-format on
 
 // UNSUPPORTED: aarch64-unknown-linux-gnu
@@ -10,21 +16,23 @@
 // UNSUPPORTED: s390x-ibm-linux-gnu
 // UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
-// Align lines.
-
-__attribute__((optnone)) void *foo(void *P) { return P; }
-
 int main(void) {
 
-  int *Ptr = 0;
-#pragma omp target is_device_ptr(Ptr)
+#pragma omp target
   {
-    int *Null = foo(Ptr);
+    volatile int *Null = 0;
     // clang-format off
-    // CHECK:      ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]]
-    // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0>
-    // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
-    // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000)
+    // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
+    // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)
+    // CHECK:     #0 [[PC]] omp target (main:[[@LINE-6]]) in <unknown>:0
+    // 
+    // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
+    //
+    // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
+    // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)
+    // DEBUG:     #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]]
+    // 
+    // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
     // clang-format on
     *Null = 42;
   }

>From 5c49fb65158b0dab63aed6caffb7bd7bed87ff0d Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 28 Jun 2024 19:57:11 -0700
Subject: [PATCH 15/31] Fixes

---
 offload/src/OpenMP/API.cpp                    |  2 +-
 offload/src/omptarget.cpp                     |  9 ++++--
 .../sanitizer/heap_partial_out_of_bounds.c    | 31 +++++++++++++++++++
 3 files changed, 39 insertions(+), 3 deletions(-)
 create mode 100644 offload/test/sanitizer/heap_partial_out_of_bounds.c

diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp
index 374c54163d6a4..c07cea550c39c 100644
--- a/offload/src/OpenMP/API.cpp
+++ b/offload/src/OpenMP/API.cpp
@@ -39,7 +39,7 @@ EXTERN void ompx_dump_mapping_tables() {
 using namespace llvm::omp::target::ompt;
 #endif
 
-void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
+void *targetAllocExplicit(size_t Size, int64_t DeviceNum, int Kind,
                           const char *Name);
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                         const char *Name);
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 450b047753417..10f9b9a7d9358 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -420,9 +420,9 @@ static int32_t getParentIndex(int64_t Type) {
   return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
 }
 
-void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
+void *targetAllocExplicit(size_t Size, int64_t DeviceNum, int Kind,
                           const char *Name) {
-  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
+  DP("Call to %s for device %ld requesting %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
     DP("Call to %s with non-positive length\n", Name);
@@ -437,6 +437,11 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
     return Rc;
   }
 
+  if (checkDeviceAndCtors(DeviceNum, nullptr)) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceNum);
+    return Rc;
+  }
+
   auto DeviceOrErr = PM->getDevice(DeviceNum);
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
diff --git a/offload/test/sanitizer/heap_partial_out_of_bounds.c b/offload/test/sanitizer/heap_partial_out_of_bounds.c
new file mode 100644
index 0000000000000..981cba3a8eebd
--- /dev/null
+++ b/offload/test/sanitizer/heap_partial_out_of_bounds.c
@@ -0,0 +1,31 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+// Align lines.
+
+#include "omp.h"
+#include <stdint.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  int N = argc > 42 ? 1000 : 100;
+  double *A =
+      (double *)omp_target_alloc(N * sizeof(*A), omp_get_default_device());
+  char *C = ((char *)&A[N - 1] + 1);
+#pragma omp target is_device_ptr(A, C)
+  {
+    // CHECK: is located 793 bytes inside of a 800-byte region
+    double *D = (double *)C;
+    *D = 3.14;
+  }
+}

>From 0ce24406ab09d9ae916a86f166a67833124f1603 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 1 Jul 2024 17:01:35 -0700
Subject: [PATCH 16/31] Initial support for source locations

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 275 ++++++++++++++++--
 offload/DeviceRTL/src/Mapping.cpp             |   5 +
 offload/DeviceRTL/src/Sanitizer.cpp           | 146 +++++-----
 offload/include/Shared/Sanitizer.h            | 102 +++----
 .../common/src/PluginInterface.cpp            |  10 +-
 offload/test/sanitizer/global_null.c          |   5 +-
 offload/test/sanitizer/heap_null.c            |  13 +-
 7 files changed, 382 insertions(+), 174 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index d88a9b3de8b07..1ee4671760f89 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -11,7 +11,9 @@
 #include "llvm/Transforms/Instrumentation/GPUSan.h"
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
@@ -21,6 +23,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalObject.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -31,10 +34,15 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/StringSaver.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <cstdint>
 
 using namespace llvm;
 
@@ -46,6 +54,47 @@ cl::opt<bool> UseTags(
         "Use tags to detect use after if the number of allocations is large"),
     cl::init(false));
 
+namespace llvm {
+
+struct LocationInfoTy {
+  uint64_t LineNo = 0;
+  uint32_t ColumnNo = 0;
+  uint32_t ParentIdx = -1;
+  StringRef FileName;
+  StringRef FunctionName;
+  bool operator==(const LocationInfoTy &RHS) const {
+    return LineNo == RHS.LineNo && ColumnNo == RHS.ColumnNo &&
+           FileName == RHS.FileName && FunctionName == RHS.FunctionName;
+  }
+};
+template <> struct DenseMapInfo<LocationInfoTy *> {
+  static LocationInfoTy EmptyKey;
+  static LocationInfoTy TombstoneKey;
+  static inline LocationInfoTy *getEmptyKey() { return &EmptyKey; }
+
+  static inline LocationInfoTy *getTombstoneKey() { return &TombstoneKey; }
+
+  static unsigned getHashValue(const LocationInfoTy *LI) {
+    unsigned Hash = DenseMapInfo<uint64_t>::getHashValue(LI->LineNo);
+    Hash = detail::combineHashValue(
+        Hash, DenseMapInfo<uint64_t>::getHashValue(LI->ColumnNo));
+    Hash = detail::combineHashValue(
+        Hash, DenseMapInfo<StringRef>::getHashValue(LI->FileName));
+    Hash = detail::combineHashValue(
+        Hash, DenseMapInfo<StringRef>::getHashValue(LI->FunctionName));
+    return Hash;
+  }
+
+  static bool isEqual(const LocationInfoTy *LHS, const LocationInfoTy *RHS) {
+    return *LHS == *RHS;
+  }
+};
+LocationInfoTy DenseMapInfo<LocationInfoTy *>::EmptyKey =
+    LocationInfoTy{(uint64_t)-1};
+LocationInfoTy DenseMapInfo<LocationInfoTy *>::TombstoneKey =
+    LocationInfoTy{(uint64_t)-2};
+} // namespace llvm
+
 namespace {
 
 enum PtrOrigin {
@@ -129,20 +178,20 @@ class GPUSanImpl final {
                          {getPtrTy(PO), Int64Ty});
   }
   FunctionCallee getFreeNLocalFn() {
-    return getOrCreateFn(FreeNLocal, "ompx_free_local_n", VoidTy, {Int32Ty});
+    return getOrCreateFn(FreeNLocalFn, "ompx_free_local_n", VoidTy, {Int32Ty});
   }
   FunctionCallee getCheckFn(PtrOrigin PO) {
     assert(PO <= GLOBAL && "Origin does not need handling.");
-    return getOrCreateFn(
-        CheckFn[PO], "ompx_check" + getSuffix(PO), getPtrTy(PO),
-        {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty, PtrTy, PtrTy, Int64Ty});
+    return getOrCreateFn(CheckFn[PO], "ompx_check" + getSuffix(PO),
+                         getPtrTy(PO),
+                         {getPtrTy(PO), Int64Ty, Int64Ty, Int64Ty});
   }
   FunctionCallee getCheckWithBaseFn(PtrOrigin PO) {
     assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling.");
     return getOrCreateFn(CheckWithBaseFn[PO],
                          "ompx_check_with_base" + getSuffix(PO), getPtrTy(PO),
                          {getPtrTy(PO), getPtrTy(PO), Int64Ty, Int32Ty, Int64Ty,
-                          Int64Ty, Int64Ty, PtrTy, PtrTy, Int64Ty});
+                          Int64Ty, Int64Ty});
   }
   FunctionCallee getAllocationInfoFn(PtrOrigin PO) {
     assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling.");
@@ -172,18 +221,23 @@ class GPUSanImpl final {
     FunctionCallee LeakCheckFn;
     return getOrCreateFn(LeakCheckFn, "ompx_leak_check", VoidTy, {});
   }
+  FunctionCallee getThreadIdFn() {
+    return getOrCreateFn(ThreadIDFn, "ompx_global_thread_id", Int32Ty, {});
+  }
 
   Module &M;
   FunctionAnalysisManager &FAM;
   LLVMContext &Ctx;
   bool HasAllocas;
+  GlobalVariable *LocationsArray;
+  SmallSetVector<CallBase *, 16> AmbiguousCalls;
 
   Type *VoidTy = Type::getVoidTy(Ctx);
   Type *IntptrTy = M.getDataLayout().getIntPtrType(Ctx);
   PointerType *PtrTy = PointerType::getUnqual(Ctx);
-  Type *Int8Ty = Type::getInt8Ty(Ctx);
-  Type *Int32Ty = Type::getInt32Ty(Ctx);
-  Type *Int64Ty = Type::getInt64Ty(Ctx);
+  IntegerType *Int8Ty = Type::getInt8Ty(Ctx);
+  IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
+  IntegerType *Int64Ty = Type::getInt64Ty(Ctx);
 
   const DataLayout &DL = M.getDataLayout();
 
@@ -196,7 +250,8 @@ class GPUSanImpl final {
   FunctionCallee UnpackFn[3];
   FunctionCallee LifetimeEndFn;
   FunctionCallee LifetimeStartFn;
-  FunctionCallee FreeNLocal;
+  FunctionCallee FreeNLocalFn;
+  FunctionCallee ThreadIDFn;
 
   StringMap<Value *> GlobalStringMap;
   struct AllocationInfoTy {
@@ -205,10 +260,139 @@ class GPUSanImpl final {
     Value *Tag;
   };
   DenseMap<std::pair<Function *, Value *>, AllocationInfoTy> AllocationInfoMap;
+
+  DenseMap<LocationInfoTy *, uint64_t, DenseMapInfo<LocationInfoTy *>>
+      LocationMap;
+
+  const std::pair<LocationInfoTy *, uint64_t>
+  addLocationInfo(LocationInfoTy *LI, bool &IsNew) {
+    auto It = LocationMap.insert({LI, LocationMap.size()});
+    IsNew = It.second;
+    if (!IsNew)
+      delete LI;
+    return {It.first->first, It.first->second};
+  }
+
+  void addParentLocationInfo(LocationInfoTy &LI, uint64_t ParentIdx) {
+    LI.ParentIdx = ParentIdx;
+  }
+
+  void buildCallTreeInfo(Function &Fn, LocationInfoTy &LI);
+  ConstantInt *getSourceIndex(Instruction &I, LocationInfoTy *LastLI = nullptr);
+
+  BumpPtrAllocator BPA;
+  StringSaver SS = StringSaver(BPA);
 };
 
 } // end anonymous namespace
 
+/// Return the LocationMap index for the source location of \p I as an i64
+/// constant. A new location also registers its inlined-at chain and calls
+/// buildCallTreeInfo. If \p LastLI is set, its ParentIdx becomes that index.
+ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I,
+                                        LocationInfoTy *LastLI) {
+  LocationInfoTy *LI = new LocationInfoTy();
+  auto *DILoc = I.getDebugLoc().get();
+
+  auto PrettifyFunctionName = [&](StringRef Name) {
+    if (Name.consume_back(".internalized"))
+      return SS.save(Name + " (internalized)");
+    if (!Name.consume_front("__omp_offloading_"))
+      return Name;
+    // Drop the next two '_'-terminated fields.
+    auto It = Name.find_first_of("_");
+    if (It != StringRef::npos && It + 1 < Name.size())
+      Name = Name.drop_front(It + 1);
+    It = Name.find_first_of("_");
+    if (It != StringRef::npos && It + 1 < Name.size())
+      Name = Name.drop_front(It + 1);
+    // Try the longer suffix first; it also ends with the shorter one.
+    if (!Name.consume_back("_debug___omp_outlined_debug__"))
+      (void)Name.consume_back("_debug__");
+    It = Name.find_last_of("_");
+    if (It == StringRef::npos || It + 1 >= Name.size())
+      return Name;
+    if (Name[It + 1] != 'l')
+      return Name;
+    int64_t KernelLineNo = 0;
+    // Parse the digits that follow the trailing "_l" marker.
+    Name.drop_front(It + 2).getAsInteger(10, KernelLineNo);
+    if (KernelLineNo)
+      Name = SS.save("omp target (" + Name.take_front(It).str() + ":" +
+                     std::to_string(KernelLineNo) + ")");
+    return Name;
+  };
+
+  auto FillLI = [&](LocationInfoTy &LI, DILocation &DIL) {
+    LI.FileName = DIL.getFilename();
+    LI.FunctionName = DIL.getSubprogramLinkageName();
+    if (LI.FunctionName.empty())
+      LI.FunctionName = I.getFunction()->getName();
+    LI.FunctionName = PrettifyFunctionName(LI.FunctionName);
+    LI.LineNo = DIL.getLine();
+    LI.ColumnNo = DIL.getColumn();
+  };
+
+  DILocation *ParentDILoc = nullptr;
+  if (DILoc) {
+    FillLI(*LI, *DILoc);
+    ParentDILoc = DILoc->getInlinedAt();
+  } else {
+    LI->FunctionName = I.getFunction()->getName();
+  }
+  errs() << __FUNCTION__ << " : " << I << " : " << LastLI << "\n";
+
+  bool IsNew;
+  uint64_t Idx;
+  std::tie(LI, Idx) = addLocationInfo(LI, IsNew);
+  errs() << "Idx: " << Idx << " : " << IsNew << "\n";
+  if (LastLI)
+    addParentLocationInfo(*LastLI, Idx);
+  if (!IsNew)
+    return ConstantInt::get(Int64Ty, Idx);
+  // Link each inlined-at location as the parent of the one before it.
+  LocationInfoTy *CurLI = LI;
+  while (ParentDILoc) {
+    auto *ParentLI = new LocationInfoTy();
+    FillLI(*ParentLI, *ParentDILoc);
+    uint64_t ParentIdx;
+    std::tie(ParentLI, ParentIdx) = addLocationInfo(ParentLI, IsNew);
+    addParentLocationInfo(*CurLI, ParentIdx);
+    CurLI = ParentLI;
+    if (!IsNew)
+      break;
+    ParentDILoc = ParentDILoc->getInlinedAt();
+  }
+
+  Function &Fn = *I.getFunction();
+  buildCallTreeInfo(Fn, *CurLI);
+
+  return ConstantInt::get(Int64Ty, Idx);
+}
+
+/// Record how \p Fn is reached: a sole direct call site becomes the parent
+/// of \p LI; otherwise all direct call sites are added to AmbiguousCalls.
+void GPUSanImpl::buildCallTreeInfo(Function &Fn, LocationInfoTy &LI) {
+  const bool IsKernel = Fn.hasFnAttribute("kernel");
+  errs() << __FUNCTION__ << " : " << Fn.getName() << " : " << IsKernel << "\n";
+  if (IsKernel)
+    return;
+  SmallVector<CallBase *> DirectCallSites;
+  for (auto &FnUse : Fn.uses()) {
+    User *Usr = FnUse.getUser();
+    errs() << *Usr << "\n";
+    // Only uses where Fn is the callee operand count as call sites.
+    if (auto *Call = dyn_cast<CallBase>(Usr); Call && Call->isCallee(&FnUse))
+      DirectCallSites.push_back(Call);
+  }
+  errs() << "Calls " << DirectCallSites.size() << "\n";
+  if (DirectCallSites.size() != 1) {
+    AmbiguousCalls.insert(DirectCallSites.begin(), DirectCallSites.end());
+    return;
+  }
+  getSourceIndex(*DirectCallSites.front(), &LI);
+}
+
 Value *GPUSanImpl::getPC(IRBuilder<> &IRB) {
   return IRB.CreateIntrinsic(Int64Ty, Intrinsic::amdgcn_s_getpc, {}, nullptr,
                              "PC");
@@ -312,12 +496,13 @@ PtrOrigin GPUSanImpl::getPtrOrigin(LoopInfo &LI, Value *Ptr,
 bool GPUSanImpl::instrumentGlobals() {
   Function *DtorFn =
       Function::Create(FunctionType::get(VoidTy, false),
-                       GlobalValue::PrivateLinkage, "san.dtor", &M);
+                       GlobalValue::PrivateLinkage, "__san.dtor", &M);
   BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", DtorFn);
   IRBuilder<> IRB(Entry);
   IRB.CreateCall(getLeakCheckFn());
   IRB.CreateRetVoid();
   appendToGlobalDtors(M, DtorFn, 0, nullptr);
+
   return true;
 
   Function *DTorFn;
@@ -338,10 +523,11 @@ Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size,
   IRBuilder<> IRB(I.getNextNode());
   Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, getPtrTy(PO));
   static int AllocationId = 1;
-  auto *CB = IRB.CreateCall(
-      Fn,
-      {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++), getPC(IRB)},
-      I.getName() + ".san");
+  auto *CB =
+      IRB.CreateCall(Fn,
+                     {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++),
+                      getSourceIndex(I)},
+                     I.getName() + ".san");
   SmallVector<LifetimeIntrinsic *> Lifetimes;
   I.replaceUsesWithIf(
       IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()), [&](Use &U) {
@@ -405,14 +591,12 @@ void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
     CB =
         IRB.CreateCall(getCheckWithBaseFn(PO),
                        {PlainPtrOp, Start, Length, Tag, Size,
-                        ConstantInt::get(Int64Ty, AccessId), getPC(IRB),
-                        getFunctionName(IRB), getFileName(IRB), getLineNo(IRB)},
+                        ConstantInt::get(Int64Ty, AccessId), getSourceIndex(I)},
                        I.getName() + ".san");
   } else {
     CB = IRB.CreateCall(getCheckFn(PO),
                         {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId),
-                         getPC(IRB), getFunctionName(IRB), getFileName(IRB),
-                         getLineNo(IRB)},
+                         getSourceIndex(I)},
                         I.getName() + ".san");
   }
   I.setOperand(PtrIdx,
@@ -515,9 +699,13 @@ bool GPUSanImpl::instrumentFunction(Function &Fn) {
       GEPs.push_back(&cast<GetElementPtrInst>(I));
       Changed = true;
       break;
-    case Instruction::Call:
-      Calls.push_back(&cast<CallInst>(I));
+    case Instruction::Call: {
+      auto &CI = cast<CallInst>(I);
+      Calls.push_back(&CI);
+      if (CI.isIndirectCall())
+        AmbiguousCalls.insert(&CI);
       break;
+    }
     case Instruction::Ret:
       Returns.push_back(&cast<ReturnInst>(I));
       break;
@@ -570,6 +758,53 @@ bool GPUSanImpl::instrument() {
       if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
         Changed |= instrumentFunction(Fn);
 
+  SmallVector<std::pair<CallBase *, ConstantInt *>> AmbiguousCallsNumbered;
+  for (size_t I = 0; I < AmbiguousCalls.size(); ++I) {
+    CallBase &CB = *AmbiguousCalls[I];
+    AmbiguousCallsNumbered.push_back({&CB, getSourceIndex(CB)});
+  }
+  IntegerType *ITy = nullptr;
+  if (size_t NumAmbiguousCalls = AmbiguousCalls.size()) {
+    ITy = IntegerType::get(Ctx, llvm::PowerOf2Ceil(NumAmbiguousCalls));
+    auto *ArrayTy = ArrayType::get(ITy, 1024);
+    LocationsArray = new GlobalVariable(
+        ArrayTy, /*isConstant=*/false, GlobalValue::PrivateLinkage,
+        UndefValue::get(ArrayTy), "__san.locations",
+        GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
+    M.insertGlobalVariable(LocationsArray);
+
+    Function *LocationGetter = Function::Create(
+        FunctionType::get(Int64Ty, false), llvm::GlobalValue::ExternalLinkage,
+        "__san_get_location_value", M);
+    auto *EntryBB = BasicBlock::Create(Ctx, "entry", LocationGetter);
+    IRBuilder<> IRB(EntryBB);
+    Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid");
+    Value *Ptr = IRB.CreateGEP(ITy, LocationsArray, {Idx});
+    auto *LocationValue = IRB.CreateLoad(ITy, Ptr);
+    IRB.CreateRet(IRB.CreateZExt(LocationValue, Int64Ty));
+  }
+
+  for (auto &It : AmbiguousCallsNumbered) {
+    IRBuilder<> IRB(It.first);
+    Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid");
+    Value *Ptr = IRB.CreateGEP(ITy, LocationsArray, {Idx});
+    IRB.CreateStore(It.second, Ptr);
+  }
+
+  SmallVector<LocationInfoTy *> Locations;
+  Locations.resize(LocationMap.size());
+  for (auto &It : LocationMap)
+    Locations[It.second] = It.first;
+  for (size_t I = 0; I < Locations.size(); ++I) {
+    LocationInfoTy &LI = *Locations[I];
+    errs() << "[" << I << "]";
+    errs() << " - File: " << LI.FileName << "\n";
+    errs() << " - Func: " << LI.FunctionName << "\n";
+    errs() << " - Line: " << LI.LineNo << "\n";
+    errs() << " - Coln: " << LI.ColumnNo << "\n";
+    errs() << " - ParI: " << LI.ParentIdx << "\n";
+  }
+  M.dump();
   return Changed;
 }
 
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 8287312c74e4e..cbfa4e78cd9f4 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -364,6 +364,11 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
 _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
 _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
 
+extern "C" int ompx_global_thread_id() {
+  return ompx_thread_id(0) + ompx_thread_id(1) * ompx_block_dim(0) +
+         ompx_thread_id(2) * ompx_block_dim(0) * ompx_block_dim(1);
+}
+
 extern "C" {
 uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
   return utils::ballotSync(mask, pred);
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index 11101e603d4ab..d953e813194bb 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -78,11 +78,11 @@ template <AllocationKind AK> struct AllocationTracker {
 
   [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
       create(_AS_PTR(void, AK) Start, uint64_t Length, int64_t AllocationId,
-             uint64_t Slot, uint64_t PC) {
+             uint64_t Slot, int64_t SourceId) {
     if constexpr (SanitizerConfig<AK>::OFFSET_BITS < 64)
       if (OMP_UNLIKELY(Length >= (1UL << (SanitizerConfig<AK>::OFFSET_BITS))))
         __sanitizer_trap_info_ptr->exceedsAllocationLength<AK>(
-            Start, Length, AllocationId, Slot, PC);
+            Start, Length, AllocationId, Slot, SourceId);
 
     // Reserve the 0 element for the null pointer in global space.
     auto &AllocArr = getAllocationArray<AK>();
@@ -93,7 +93,7 @@ template <AllocationKind AK> struct AllocationTracker {
     uint64_t NumSlots = SanitizerConfig<AK>::SLOTS;
     if (OMP_UNLIKELY(Slot >= NumSlots))
       __sanitizer_trap_info_ptr->exceedsAllocationSlots<AK>(
-          Start, Length, AllocationId, Slot, PC);
+          Start, Length, AllocationId, Slot, SourceId);
 
     auto &A = AllocArr.Arr[Slot];
 
@@ -113,7 +113,7 @@ template <AllocationKind AK> struct AllocationTracker {
   }
 
   [[clang::disable_sanitizer_instrumentation]] static void
-  remove(_AS_PTR(void, AK) P, uint64_t PC) {
+  remove(_AS_PTR(void, AK) P, int64_t SourceId) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
     uint64_t AllocationId = AP.AllocationId;
     auto &AllocArr = getAllocationArray<AK>();
@@ -138,7 +138,7 @@ template <AllocationKind AK> struct AllocationTracker {
   }
 
   [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
-      advance(_AS_PTR(void, AK) P, uint64_t Offset, uint64_t PC) {
+      advance(_AS_PTR(void, AK) P, uint64_t Offset, int64_t SourceId) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
     AP.Offset += Offset;
     return AP;
@@ -147,36 +147,34 @@ template <AllocationKind AK> struct AllocationTracker {
   [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
       checkWithBase(_AS_PTR(void, AK) P, _AS_PTR(void, AK) Start,
                     int64_t Length, uint32_t Tag, int64_t Size,
-                    int64_t AccessId, uint64_t PC, const char *FunctionName,
-                    const char *FileName, uint64_t LineNo) {
+                    int64_t AccessId, int64_t SourceId) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
     if constexpr (AK == AllocationKind::LOCAL)
       if (Length == 0)
         Length = getAllocation<AK>(AP, AccessId).Length;
     if constexpr (AK == AllocationKind::GLOBAL)
       if (AP.Magic != SanitizerConfig<AllocationKind::GLOBAL>::MAGIC)
-        __sanitizer_trap_info_ptr->garbagePointer<AK>(AP, (void *)P, PC);
+        __sanitizer_trap_info_ptr->garbagePointer<AK>(AP, (void *)P, SourceId);
     int64_t Offset = AP.Offset;
     if (OMP_UNLIKELY(
             Offset > Length - Size ||
             (SanitizerConfig<AK>::useTags() && Tag != AP.AllocationTag))) {
-      __sanitizer_trap_info_ptr->accessError<AK>(
-          AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
+      __sanitizer_trap_info_ptr->accessError<AK>(AP, Size, AccessId, SourceId);
     }
     return utils::advancePtr(Start, Offset);
   }
 
   [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
-      check(_AS_PTR(void, AK) P, int64_t Size, int64_t AccessId, uint64_t PC,
-            const char *FunctionName, const char *FileName, uint64_t LineNo) {
+      check(_AS_PTR(void, AK) P, int64_t Size, int64_t AccessId,
+            int64_t SourceId) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
     auto &Alloc = getAllocation<AK>(AP, AccessId);
     return checkWithBase(P, Alloc.Start, Alloc.Length, Alloc.Tag, Size,
-                         AccessId, PC, FunctionName, FileName, LineNo);
+                         AccessId, SourceId);
   }
 
   [[clang::disable_sanitizer_instrumentation]] static _AS_PTR(void, AK)
-      unpack(_AS_PTR(void, AK) P, uint64_t PC = 0) {
+      unpack(_AS_PTR(void, AK) P, int64_t SourceId = 0) {
     AllocationPtrTy<AK> AP = AllocationPtrTy<AK>::get(P);
     auto &A = getAllocation<AK>(AP);
     uint64_t Offset = AP.Offset;
@@ -215,17 +213,17 @@ template <AllocationKind AK>
 AllocationArrayTy<AK>
     Allocations<AK>::Arr[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
 
-static void checkForMagic(bool IsGlobal, void *P, uint64_t PC) {
+static void checkForMagic(bool IsGlobal, void *P, int64_t SourceId) {
   if (IsGlobal) {
     auto AP = AllocationPtrTy<AllocationKind::GLOBAL>::get(P);
     if (AP.Magic != SanitizerConfig<AllocationKind::GLOBAL>::MAGIC)
-      __sanitizer_trap_info_ptr->garbagePointer<AllocationKind::GLOBAL>(AP, P,
-                                                                        PC);
+      __sanitizer_trap_info_ptr->garbagePointer<AllocationKind::GLOBAL>(
+          AP, P, SourceId);
   } else {
     auto AP = AllocationPtrTy<AllocationKind::LOCAL>::get(P);
     if (AP.Magic != SanitizerConfig<AllocationKind::LOCAL>::MAGIC)
-      __sanitizer_trap_info_ptr->garbagePointer<AllocationKind::LOCAL>(AP, P,
-                                                                       PC);
+      __sanitizer_trap_info_ptr->garbagePointer<AllocationKind::LOCAL>(
+          AP, P, SourceId);
   }
 }
 
@@ -237,34 +235,34 @@ extern "C" {
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
     ompx_new_local(_AS_PTR(void, AllocationKind::LOCAL) Start, uint64_t Length,
-                   int64_t AllocationId, uint32_t Slot, uint64_t PC) {
+                   int64_t AllocationId, uint32_t Slot, int64_t SourceId) {
   return AllocationTracker<AllocationKind::LOCAL>::create(
-      Start, Length, AllocationId, Slot, PC);
+      Start, Length, AllocationId, Slot, SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
     ompx_new_global(_AS_PTR(void, AllocationKind::GLOBAL) Start,
                     uint64_t Length, int64_t AllocationId, uint32_t Slot,
-                    uint64_t PC) {
+                    int64_t SourceId) {
   return AllocationTracker<AllocationKind::GLOBAL>::create(
-      Start, Length, AllocationId, Slot, PC);
+      Start, Length, AllocationId, Slot, SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
 __sanitizer_register_host(_AS_PTR(void, AllocationKind::GLOBAL) Start,
-                          uint64_t Length, uint64_t Slot, uint64_t PC) {
+                          uint64_t Length, uint64_t Slot, int64_t SourceId) {
   AllocationTracker<AllocationKind::GLOBAL>::create(Start, Length, Slot, Slot,
-                                                    PC);
+                                                    SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
 ompx_new(void *Start, uint64_t Length, int64_t AllocationId, uint32_t Slot,
-         uint64_t PC) {
+         int64_t SourceId) {
   if (REAL_PTR_IS_LOCAL(Start))
     return (void *)ompx_new_local((_AS_PTR(void, AllocationKind::LOCAL))Start,
-                                  Length, AllocationId, Slot, PC);
+                                  Length, AllocationId, Slot, SourceId);
   return (void *)ompx_new_global((_AS_PTR(void, AllocationKind::GLOBAL))Start,
-                                 Length, AllocationId, Slot, PC);
+                                 Length, AllocationId, Slot, SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
@@ -274,80 +272,77 @@ ompx_free_local_n(int32_t N) {
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
 __sanitizer_unregister_host(_AS_PTR(void, AllocationKind::GLOBAL) P) {
-  AllocationTracker<AllocationKind::GLOBAL>::remove(P, /*PC=*/0);
+  AllocationTracker<AllocationKind::GLOBAL>::remove(P, /*SourceId=*/0);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
-ompx_free_local(_AS_PTR(void, AllocationKind::LOCAL) P) {
-  return AllocationTracker<AllocationKind::LOCAL>::remove(P, /*PC=*/0);
+ompx_free_local(_AS_PTR(void, AllocationKind::LOCAL) P, int64_t SourceId) {
+  return AllocationTracker<AllocationKind::LOCAL>::remove(P, SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
-ompx_free_global(_AS_PTR(void, AllocationKind::GLOBAL) P) {
-  return AllocationTracker<AllocationKind::GLOBAL>::remove(P, /*PC=*/0);
+ompx_free_global(_AS_PTR(void, AllocationKind::GLOBAL) P, int64_t SourceId) {
+  return AllocationTracker<AllocationKind::GLOBAL>::remove(P, SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void
-ompx_free(void *P, uint64_t PC) {
+ompx_free(void *P, int64_t SourceId) {
   bool IsGlobal = IS_GLOBAL(P);
-  checkForMagic(IsGlobal, P, PC);
+  checkForMagic(IsGlobal, P, SourceId);
   if (IsGlobal)
-    return ompx_free_global((_AS_PTR(void, AllocationKind::GLOBAL))P);
-  return ompx_free_local((_AS_PTR(void, AllocationKind::LOCAL))P);
+    return ompx_free_global((_AS_PTR(void, AllocationKind::GLOBAL))P, SourceId);
+  return ompx_free_local((_AS_PTR(void, AllocationKind::LOCAL))P, SourceId);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
     ompx_gep_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Offset,
-                   uint64_t PC) {
-  return AllocationTracker<AllocationKind::LOCAL>::advance(P, Offset, PC);
+                   int64_t SourceId) {
+  return AllocationTracker<AllocationKind::LOCAL>::advance(P, Offset, SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
     ompx_gep_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Offset,
-                    uint64_t PC) {
-  return AllocationTracker<AllocationKind::GLOBAL>::advance(P, Offset, PC);
+                    int64_t SourceId) {
+  return AllocationTracker<AllocationKind::GLOBAL>::advance(P, Offset,
+                                                            SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
-ompx_gep(void *P, uint64_t Offset, uint64_t PC) {
+ompx_gep(void *P, uint64_t Offset, int64_t SourceId) {
   bool IsGlobal = IS_GLOBAL(P);
-  checkForMagic(IsGlobal, P, PC);
+  checkForMagic(IsGlobal, P, SourceId);
   if (IsGlobal)
     return (void *)ompx_gep_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
-                                   Offset, PC);
+                                   Offset, SourceId);
   return (void *)ompx_gep_local((_AS_PTR(void, AllocationKind::LOCAL))P, Offset,
-                                PC);
+                                SourceId);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
     ompx_check_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t Size,
-                     uint64_t AccessId, uint64_t PC, const char *FunctionName,
-                     const char *FileName, uint64_t LineNo) {
-  return AllocationTracker<AllocationKind::LOCAL>::check(
-      P, Size, AccessId, PC, FunctionName, FileName, LineNo);
+                     uint64_t AccessId, int64_t SourceId) {
+  return AllocationTracker<AllocationKind::LOCAL>::check(P, Size, AccessId,
+                                                         SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
     ompx_check_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t Size,
-                      uint64_t AccessId, uint64_t PC, const char *FunctionName,
-                      const char *FileName, uint64_t LineNo) {
-  return AllocationTracker<AllocationKind::GLOBAL>::check(
-      P, Size, AccessId, PC, FunctionName, FileName, LineNo);
+                      uint64_t AccessId, int64_t SourceId) {
+  return AllocationTracker<AllocationKind::GLOBAL>::check(P, Size, AccessId,
+                                                          SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
-ompx_check(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
-           const char *FunctionName, const char *FileName, uint64_t LineNo) {
+ompx_check(void *P, uint64_t Size, uint64_t AccessId, int64_t SourceId) {
   bool IsGlobal = IS_GLOBAL(P);
-  checkForMagic(IsGlobal, P, PC);
+  checkForMagic(IsGlobal, P, SourceId);
   if (IsGlobal)
     return (void *)ompx_check_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
-                                     Size, AccessId, PC, FunctionName, FileName,
-                                     LineNo);
+                                     Size, AccessId, SourceId);
   return (void *)ompx_check_local((_AS_PTR(void, AllocationKind::LOCAL))P, Size,
-                                  AccessId, PC, FunctionName, FileName, LineNo);
+                                  AccessId, SourceId);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
@@ -355,12 +350,9 @@ ompx_check(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
     ompx_check_with_base_local(_AS_PTR(void, AllocationKind::LOCAL) P,
                                _AS_PTR(void, AllocationKind::LOCAL) Start,
                                uint64_t Length, uint32_t Tag, uint64_t Size,
-                               uint64_t AccessId, uint64_t PC,
-                               const char *FunctionName, const char *FileName,
-                               uint64_t LineNo) {
+                               uint64_t AccessId, int64_t SourceId) {
   return AllocationTracker<AllocationKind::LOCAL>::checkWithBase(
-      P, Start, Length, Tag, Size, AccessId, PC, FunctionName, FileName,
-      LineNo);
+      P, Start, Length, Tag, Size, AccessId, SourceId);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
@@ -368,33 +360,33 @@ ompx_check(void *P, uint64_t Size, uint64_t AccessId, uint64_t PC,
     ompx_check_with_base_global(_AS_PTR(void, AllocationKind::GLOBAL) P,
                                 _AS_PTR(void, AllocationKind::GLOBAL) Start,
                                 uint64_t Length, uint32_t Tag, uint64_t Size,
-                                uint64_t AccessId, uint64_t PC,
-                                const char *FunctionName, const char *FileName,
-                                uint64_t LineNo) {
+                                uint64_t AccessId, int64_t SourceId) {
   return AllocationTracker<AllocationKind::GLOBAL>::checkWithBase(
-      P, Start, Length, Tag, Size, AccessId, PC, FunctionName, FileName,
-      LineNo);
+      P, Start, Length, Tag, Size, AccessId, SourceId);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::LOCAL)
-    ompx_unpack_local(_AS_PTR(void, AllocationKind::LOCAL) P, uint64_t PC) {
-  return AllocationTracker<AllocationKind::LOCAL>::unpack(P, PC);
+    ompx_unpack_local(_AS_PTR(void, AllocationKind::LOCAL) P,
+                      int64_t SourceId) {
+  return AllocationTracker<AllocationKind::LOCAL>::unpack(P, SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] _AS_PTR(void, AllocationKind::GLOBAL)
-    ompx_unpack_global(_AS_PTR(void, AllocationKind::GLOBAL) P, uint64_t PC) {
-  return AllocationTracker<AllocationKind::GLOBAL>::unpack(P, PC);
+    ompx_unpack_global(_AS_PTR(void, AllocationKind::GLOBAL) P,
+                       int64_t SourceId) {
+  return AllocationTracker<AllocationKind::GLOBAL>::unpack(P, SourceId);
 }
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
   gnu::used, gnu::retain]] void *
-ompx_unpack(void *P, uint64_t PC) {
+ompx_unpack(void *P, int64_t SourceId) {
   bool IsGlobal = IS_GLOBAL(P);
-  checkForMagic(IsGlobal, P, PC);
+  checkForMagic(IsGlobal, P, SourceId);
   if (IsGlobal)
     return (void *)ompx_unpack_global((_AS_PTR(void, AllocationKind::GLOBAL))P,
-                                      PC);
-  return (void *)ompx_unpack_local((_AS_PTR(void, AllocationKind::LOCAL))P, PC);
+                                      SourceId);
+  return (void *)ompx_unpack_local((_AS_PTR(void, AllocationKind::LOCAL))P,
+                                   SourceId);
 }
 
 [[clang::disable_sanitizer_instrumentation, gnu::flatten, gnu::always_inline,
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index fa275b27832e0..328216cb2a7fe 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -20,13 +20,14 @@ extern "C" int ompx_thread_id(int Dim);
 
 enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL };
 
-template <AllocationKind AK> struct ASTypes {};
-template <> struct ASTypes<AllocationKind::GLOBAL> {
+template <AllocationKind AK> struct ASTypes {
   using INT_TY = uint64_t;
 };
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
 template <> struct ASTypes<AllocationKind::LOCAL> {
   using INT_TY = uint32_t;
 };
+#pragma omp end declare variant
 
 template <AllocationKind AK> struct SanitizerConfig {
   static constexpr uint32_t ADDR_SPACE = AK == AllocationKind::GLOBAL ? 0 : 5;
@@ -42,17 +43,17 @@ template <AllocationKind AK> struct SanitizerConfig {
   static constexpr uint32_t OBJECT_BITS = AK == AllocationKind::GLOBAL ? 10 : 7;
   static constexpr uint32_t SLOTS = (1 << (OBJECT_BITS));
   static constexpr uint32_t KIND_BITS = 1;
-  static constexpr uint32_t ID_BITS = 9 - KIND_BITS;
+  static constexpr uint32_t Id_BITS = 9 - KIND_BITS;
 
   static constexpr uint32_t LENGTH_BITS =
-      ADDR_SPACE_PTR_SIZE - TAG_BITS - ID_BITS;
+      ADDR_SPACE_PTR_SIZE - TAG_BITS - Id_BITS;
   static constexpr uint32_t OFFSET_BITS =
       ADDR_SPACE_PTR_SIZE - TAG_BITS - OBJECT_BITS - KIND_BITS - MAGIC_BITS;
 
   static constexpr bool useTags() { return TAG_BITS > 1; }
 
-  static_assert(LENGTH_BITS + TAG_BITS + ID_BITS == ADDR_SPACE_PTR_SIZE,
-                "Length, tag, and ID bits should cover one pointer");
+  static_assert(LENGTH_BITS + TAG_BITS + Id_BITS == ADDR_SPACE_PTR_SIZE,
+                "Length, tag, and Id bits should cover one pointer");
   static_assert(OFFSET_BITS + TAG_BITS + OBJECT_BITS + MAGIC_BITS + KIND_BITS ==
                     ADDR_SPACE_PTR_SIZE,
                 "Offset, tag, object, and kind bits should cover one pointer");
@@ -67,7 +68,7 @@ template <AllocationKind AK> struct AllocationTy {
   _AS_PTR(void, AK) Start;
   typename ASTypes<AK>::INT_TY Length : SanitizerConfig<AK>::LENGTH_BITS;
   typename ASTypes<AK>::INT_TY Tag : SanitizerConfig<AK>::TAG_BITS;
-  typename ASTypes<AK>::INT_TY Id : SanitizerConfig<AK>::ID_BITS;
+  typename ASTypes<AK>::INT_TY Id : SanitizerConfig<AK>::Id_BITS;
 };
 
 template <AllocationKind AK> struct AllocationArrayTy {
@@ -95,7 +96,9 @@ template <AllocationKind AK> struct AllocationPtrTy {
   // Must be last, TODO: merge into TAG
   typename ASTypes<AK>::INT_TY Kind : SanitizerConfig<AK>::KIND_BITS;
 };
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
 static_assert(sizeof(AllocationPtrTy<AllocationKind::LOCAL>) * 8 == 32);
+#pragma omp end declare variant
 
 union TypePunUnion {
   uint64_t I;
@@ -106,7 +109,9 @@ union TypePunUnion {
     uint32_t U;
   };
 };
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
 static_assert(sizeof(TypePunUnion) * 8 == 64);
+#pragma omp end declare variant
 
 static inline void *__offload_get_new_sanitizer_ptr(int32_t Slot) {
   AllocationPtrTy<AllocationKind::GLOBAL> AP;
@@ -162,38 +167,22 @@ struct SanitizerTrapInfoTy {
   uint64_t BlockId[3];
   uint32_t ThreadId[3];
   uint64_t PC;
-  uint64_t LineNo;
-  char FunctionName[256];
-  char FileName[256];
+  uint64_t SrcId;
   /// }
 
   [[clang::disable_sanitizer_instrumentation]] void
-  setCoordinates(uint64_t PC, const char *FnName, const char *FlName,
-                 uint64_t LineNo) {
+  setCoordinates(int64_t SourceId) {
     for (int32_t Dim = 0; Dim < 3; ++Dim) {
       BlockId[Dim] = ompx_block_id(Dim);
       ThreadId[Dim] = ompx_thread_id(Dim);
     }
-    this->PC = PC;
-    this->LineNo = LineNo;
-
-    auto CopyName = [](char *Dst, const char *Src, int32_t Length) {
-      if (!Src)
-        return;
-      for (int32_t I = 0; I < Length; ++I) {
-        Dst[I] = Src[I];
-        if (!Src[I])
-          break;
-      }
-    };
-    CopyName(FunctionName, FnName, sizeof(FunctionName));
-    CopyName(FileName, FlName, sizeof(FileName));
+    SrcId = SourceId;
   }
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void
   allocationError(ErrorCodeTy EC, _AS_PTR(void, AK) Start, uint64_t Length,
-                  int64_t Id, int64_t Tag, uint64_t Slot, uint64_t PC) {
+                  int64_t Id, int64_t Tag, uint64_t Slot, int64_t SourceId) {
     AllocationStart = (void *)Start;
     AllocationLength = Length;
     AllocationId = Id;
@@ -202,15 +191,14 @@ struct SanitizerTrapInfoTy {
     PtrSlot = Slot;
 
     ErrorCode = EC;
-    setCoordinates(PC, nullptr, nullptr, 0);
+    setCoordinates(SourceId);
   }
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void
   propagateAccessError(ErrorCodeTy EC, const AllocationTy<AK> &A,
                        const AllocationPtrTy<AK> &AP, uint64_t Size, int64_t Id,
-                       uint64_t PC, const char *FunctionName,
-                       const char *FileName, uint64_t LineNo) {
+                       int64_t SourceId) {
     AllocationStart = (void *)A.Start;
     AllocationLength = A.Length;
     AllocationId = A.Id;
@@ -227,24 +215,26 @@ struct SanitizerTrapInfoTy {
     AccessSize = Size;
     AccessId = Id;
 
-    setCoordinates(PC, FunctionName, FileName, LineNo);
+    setCoordinates(SourceId);
   }
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   exceedsAllocationLength(_AS_PTR(void, AK) Start, uint64_t Length,
-                          int64_t AllocationId, uint64_t Slot, uint64_t PC) {
+                          int64_t AllocationId, uint64_t Slot,
+                          int64_t SourceId) {
     allocationError<AK>(ExceedsLength, Start, Length, AllocationId, /*Tag=*/0,
-                        Slot, PC);
+                        Slot, SourceId);
     __builtin_trap();
   }
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   exceedsAllocationSlots(_AS_PTR(void, AK) Start, uint64_t Length,
-                         int64_t AllocationId, uint64_t Slot, uint64_t PC) {
+                         int64_t AllocationId, uint64_t Slot,
+                         int64_t SourceId) {
     allocationError<AK>(ExceedsSlots, Start, Length, AllocationId, /*Tag=*/0,
-                        Slot, PC);
+                        Slot, SourceId);
     __builtin_trap();
   }
 
@@ -260,45 +250,35 @@ struct SanitizerTrapInfoTy {
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   outOfBoundAccess(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
-                   uint64_t Size, int64_t AccessId, uint64_t PC,
-                   const char *FunctionName, const char *FileName,
-                   uint64_t LineNo) {
-    propagateAccessError(OutOfBounds, A, AP, Size, AccessId, PC, FunctionName,
-                         FileName, LineNo);
+                   uint64_t Size, int64_t AccessId, int64_t SourceId) {
+    propagateAccessError(OutOfBounds, A, AP, Size, AccessId, SourceId);
     __builtin_trap();
   }
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   useAfterScope(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
-                uint64_t Size, int64_t AccessId, uint64_t PC,
-                const char *FunctionName, const char *FileName,
-                uint64_t LineNo) {
-    propagateAccessError(UseAfterScope, A, AP, Size, AccessId, PC, FunctionName,
-                         FileName, LineNo);
+                uint64_t Size, int64_t AccessId, int64_t SourceId) {
+    propagateAccessError(UseAfterScope, A, AP, Size, AccessId, SourceId);
     __builtin_trap();
   }
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   useAfterFree(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
-               uint64_t Size, int64_t AccessId, uint64_t PC,
-               const char *FunctionName, const char *FileName,
-               uint64_t LineNo) {
-    propagateAccessError(UseAfterFree, A, AP, Size, AccessId, PC, FunctionName,
-                         FileName, LineNo);
+               uint64_t Size, int64_t AccessId, int64_t SourceId) {
+    propagateAccessError(UseAfterFree, A, AP, Size, AccessId, SourceId);
     __builtin_trap();
   }
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   accessError(const AllocationPtrTy<AK> AP, int64_t Size, int64_t AccessId,
-              uint64_t PC, const char *FunctionName, const char *FileName,
-              uint64_t LineNo);
+              int64_t SourceId);
 
   template <enum AllocationKind AK>
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
-  garbagePointer(const AllocationPtrTy<AK> AP, void *P, uint64_t PC) {
+  garbagePointer(const AllocationPtrTy<AK> AP, void *P, int64_t SourceId) {
     ErrorCode = GarbagePointer;
     AllocationStart = P;
     AllocationKind = (decltype(AllocationKind))AK;
@@ -306,7 +286,7 @@ struct SanitizerTrapInfoTy {
     PtrSlot = AP.AllocationId;
     PtrTag = AP.AllocationTag;
     PtrKind = AP.Kind;
-    setCoordinates(PC, nullptr, nullptr, 0);
+    setCoordinates(SourceId);
     __builtin_trap();
   }
 
@@ -314,7 +294,7 @@ struct SanitizerTrapInfoTy {
   [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
   memoryLeak(const AllocationTy<AK> A, uint64_t Slot) {
     allocationError<AK>(MemoryLeak, A.Start, A.Length, A.Id, A.Tag, Slot,
-                        /*PC=*/0);
+                        /*SourceId=*/-1);
     __builtin_trap();
   }
 };
@@ -350,20 +330,16 @@ getAllocation(const AllocationPtrTy<AK> AP, int64_t AccessId = 0) {
 template <enum AllocationKind AK>
 [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
 SanitizerTrapInfoTy::accessError(const AllocationPtrTy<AK> AP, int64_t Size,
-                                 int64_t AccessId, uint64_t PC,
-                                 const char *FunctionName, const char *FileName,
-                                 uint64_t LineNo) {
+                                 int64_t AccessId, int64_t SourceId) {
   auto &A = getAllocationArray<AK>().Arr[AP.AllocationId];
   int64_t Offset = AP.Offset;
   int64_t Length = A.Length;
   if (AK == AllocationKind::LOCAL && Length == 0)
-    useAfterScope<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
-                      LineNo);
+    useAfterScope<AK>(A, AP, Size, AccessId, SourceId);
   else if (Offset > Length - Size)
-    outOfBoundAccess<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName,
-                         LineNo);
+    outOfBoundAccess<AK>(A, AP, Size, AccessId, SourceId);
   else
-    useAfterFree<AK>(A, AP, Size, AccessId, PC, FunctionName, FileName, LineNo);
+    useAfterFree<AK>(A, AP, Size, AccessId, SourceId);
 }
 
 #endif
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index e44575c840d66..5c800cb6434f1 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2200,6 +2200,7 @@ void GPUSanTy::checkAndReportError() {
   auto Red = []() { return "\033[1m\033[31m"; };
   auto Default = []() { return "\033[1m\033[0m"; };
 
+#if 0
   std::string KernelName;
   StringRef FunctionName =
       STI.FunctionName[0] ? StringRef(STI.FunctionName) : "<unknown>";
@@ -2236,10 +2237,16 @@ void GPUSanTy::checkAndReportError() {
       }
     }
   }
+#endif
 
   fprintf(stderr, "============================================================"
                   "====================\n");
 
+  auto PrintStackTrace = [&](int64_t SourceId) {
+    fprintf(stderr, "    #0 " DPxMOD " %s in %s:%lu\n\n", DPxPTR(0), "unknown",
+            "unknown", 0UL);
+  };
+
   auto DiagnoseAccess = [&](StringRef Name) {
     void *PC = reinterpret_cast<void *>(STI.PC);
     void *Addr = utils::advancePtr(STI.AllocationStart, STI.PtrOffset);
@@ -2254,8 +2261,7 @@ void GPUSanTy::checkAndReportError() {
             DPxPTR(Addr), STI.ThreadId[0], STI.ThreadId[1], STI.ThreadId[2],
             STI.BlockId[0], STI.BlockId[1], STI.BlockId[2], STI.AccessId,
             (STI.AllocationKind ? "heap" : "stack"), Default());
-    fprintf(stderr, "    #0 " DPxMOD " %s in %s:%lu\n\n", DPxPTR(PC),
-            FunctionName.str().c_str(), FileName.data(), STI.LineNo);
+    PrintStackTrace(STI.SrcId);
     fprintf(
         stderr,
         "%s" DPxMOD " is located %lu bytes inside of a %lu-byte region [" DPxMOD
diff --git a/offload/test/sanitizer/global_null.c b/offload/test/sanitizer/global_null.c
index 3442deea81e97..91be2cb499c45 100644
--- a/offload/test/sanitizer/global_null.c
+++ b/offload/test/sanitizer/global_null.c
@@ -1,6 +1,7 @@
 // clang-format off
-// RUN: %libomptarget-compileopt-generic -loffload.kernels -mllvm -enable-gpu-san
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload
+// RUN: not %libomptarget-run-generic 2>&1 > %t.out
+// RUN: %fcheck-generic --check-prefixes=CHECK < %t.out
 // clang-format on
 
 // UNSUPPORTED: aarch64-unknown-linux-gnu
diff --git a/offload/test/sanitizer/heap_null.c b/offload/test/sanitizer/heap_null.c
index 568f423227c52..e13ad234d21c0 100644
--- a/offload/test/sanitizer/heap_null.c
+++ b/offload/test/sanitizer/heap_null.c
@@ -18,22 +18,15 @@
 
 int main(void) {
 
-  void *Null = 0;
-  void *Heap, *Stack;
-#pragma omp target map(from : Heap, Stack)
+  int *Null = 0;
+#pragma omp target
   {
-    int Q[512];
     // clang-format off
     // CHECK:      ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:0x.*]]
     // CHECK-NEXT: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0>
     // CHECK-NEXT: #0 [[PC]] main null.c:[[@LINE+3]]
     // CHECK-NEXT: 0x0000000000000000 is located 0 bytes inside of 0-byte region [0x0000000000000000,0x0000000000000000)
     // clang-format on
-    //    *Null = 42;
-    Stack = &Q[0];
-    Heap = Null;
+    *Null = 42;
   }
-  printf("Heap %p Stack %p\n", Heap, Stack);
-  printf("Heap %lu Stack %lu\n", ((uintptr_t)Heap & (1UL << 63)),
-         ((uintptr_t)Stack & (1UL << 63)));
 }

>From d003f7e514a6e6b920d4a1fd1cc5214bda320ccf Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 1 Jul 2024 17:27:53 -0700
Subject: [PATCH 17/31] Fixes

---
 llvm/lib/Transforms/Instrumentation/GPUSan.cpp | 18 +++++++++++++-----
 offload/test/sanitizer/null_forced_stack.c     |  4 +++-
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 1ee4671760f89..9313e262e5b9c 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -293,7 +293,7 @@ ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I,
 
   auto PrettifyFunctionName = [&](StringRef Name) {
     if (Name.ends_with(".internalized"))
-      return SS.save(Name.drop_back(sizeof(".internalized")) +
+      return SS.save(Name.drop_back(sizeof("internalized")) +
                      " (internalized)");
     if (!Name.starts_with("__omp_offloading_"))
       return Name;
@@ -325,6 +325,8 @@ ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I,
 
   auto FillLI = [&](LocationInfoTy &LI, DILocation &DIL) {
     LI.FileName = DIL.getFilename();
+    if (LI.FileName.empty())
+      LI.FileName = I.getFunction()->getSubprogram()->getFilename();
     LI.FunctionName = DIL.getSubprogramLinkageName();
     if (LI.FunctionName.empty())
       LI.FunctionName = I.getFunction()->getName();
@@ -338,14 +340,16 @@ ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I,
     FillLI(*LI, *DILoc);
     ParentDILoc = DILoc->getInlinedAt();
   } else {
-    LI->FunctionName = I.getFunction()->getName();
+    LI->FunctionName = PrettifyFunctionName(I.getFunction()->getName());
   }
   errs() << __FUNCTION__ << " : " << I << " : " << LastLI << "\n";
 
   bool IsNew;
   uint64_t Idx;
+  errs() << "Line: " << LI->LineNo << "\n";
   std::tie(LI, Idx) = addLocationInfo(LI, IsNew);
-  errs() << "Idx: " << Idx << " : " << IsNew << "\n";
+  errs() << "Idx: " << Idx << " : IsNew " << IsNew << "\n";
+  errs() << "Line: " << LI->LineNo << "\n";
   if (LastLI)
     addParentLocationInfo(*LastLI, Idx);
   if (!IsNew)
@@ -353,15 +357,19 @@ ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I,
 
   LocationInfoTy *CurLI = LI;
   while (ParentDILoc) {
+    //    if (!ParentDILoc->getScope()->getSubprogram()->isArtificial()) {
     auto *ParentLI = new LocationInfoTy();
-    FillLI(*ParentLI, *DILoc);
+    FillLI(*ParentLI, *ParentDILoc);
     uint64_t ParentIdx;
+    errs() << "Parent " << ParentLI->LineNo << "\n";
     std::tie(ParentLI, ParentIdx) = addLocationInfo(ParentLI, IsNew);
+    errs() << "Parent " << ParentIdx << " :: " << ParentLI->LineNo << "\n";
     addParentLocationInfo(*CurLI, ParentIdx);
     CurLI = ParentLI;
     if (!IsNew)
       break;
-    ParentDILoc = DILoc->getInlinedAt();
+    //   }
+    ParentDILoc = ParentDILoc->getInlinedAt();
   }
 
   Function &Fn = *I.getFunction();
diff --git a/offload/test/sanitizer/null_forced_stack.c b/offload/test/sanitizer/null_forced_stack.c
index 02c08643fdc12..e59e34b3a0cd2 100644
--- a/offload/test/sanitizer/null_forced_stack.c
+++ b/offload/test/sanitizer/null_forced_stack.c
@@ -18,6 +18,8 @@
 
 [[clang::optnone]] int *deref(int **P) { return *P; }
 
+int *bar(int **P) { return deref(P); }
+
 int main(void) {
 
 #pragma omp target
@@ -36,6 +38,6 @@ int main(void) {
     // 
     // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
     // clang-format on
-    deref(&NullPtr)[10] = 42;
+    bar(&NullPtr)[10] = 42;
   }
 }

>From cbdd509abf29f4c3a163ff9d8bf90e4f2620d7dd Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 2 Jul 2024 14:57:17 -0700
Subject: [PATCH 18/31] Backtrace support

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 161 ++++++++++--------
 offload/include/Shared/Sanitizer.h            |  12 +-
 .../common/include/GlobalHandler.h            |   3 +
 .../common/include/PluginInterface.h          |   2 +-
 .../common/src/PluginInterface.cpp            | 128 +++++++-------
 5 files changed, 166 insertions(+), 140 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 9313e262e5b9c..cfc2fa0732d35 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -58,8 +58,8 @@ namespace llvm {
 
 struct LocationInfoTy {
   uint64_t LineNo = 0;
-  uint32_t ColumnNo = 0;
-  uint32_t ParentIdx = -1;
+  uint64_t ColumnNo = 0;
+  uint64_t ParentIdx = -1;
   StringRef FileName;
   StringRef FunctionName;
   bool operator==(const LocationInfoTy &RHS) const {
@@ -119,6 +119,37 @@ static std::string getSuffix(PtrOrigin PO) {
   llvm_unreachable("Bad pointer origin!");
 }
 
+static StringRef prettifyFunctionName(StringSaver &SS, StringRef Name) {
+  if (Name.ends_with(".internalized"))
+    return SS.save(Name.drop_back(sizeof("internalized")) + " (internalized)");
+  if (!Name.starts_with("__omp_offloading_"))
+    return Name;
+  Name = Name.drop_front(sizeof("__omp_offloading_"));
+  auto It = Name.find_first_of("_");
+  if (It != StringRef::npos && It + 1 < Name.size())
+    Name = Name.drop_front(It + 1);
+  It = Name.find_first_of("_");
+  if (It != StringRef::npos && It + 1 < Name.size())
+    Name = Name.drop_front(It + 1);
+  if (Name.ends_with("_debug__"))
+    Name = Name.drop_back(sizeof("debug__"));
+  if (Name.ends_with("_debug___omp_outlined_debug__"))
+    Name = Name.drop_back(sizeof("debug___omp_outlined_debug__"));
+  It = Name.find_last_of("_");
+  if (It == StringRef::npos || It + 1 >= Name.size())
+    return Name;
+  if (Name[It + 1] != 'l')
+    return Name;
+  int64_t KernelLineNo = 0;
+  Name.take_back(Name.size() - It -
+                 /* '_' and 'l' */ 2)
+      .getAsInteger(10, KernelLineNo);
+  if (KernelLineNo)
+    Name = SS.save("omp target (" + Name.take_front(It).str() + ":" +
+                   std::to_string(KernelLineNo) + ")");
+  return Name;
+}
+
 class GPUSanImpl final {
 public:
   GPUSanImpl(Module &M, FunctionAnalysisManager &FAM)
@@ -273,13 +304,39 @@ class GPUSanImpl final {
     return {It.first->first, It.first->second};
   }
 
-  void addParentLocationInfo(LocationInfoTy &LI, uint64_t ParentIdx) {
-    LI.ParentIdx = ParentIdx;
-  }
-
   void buildCallTreeInfo(Function &Fn, LocationInfoTy &LI);
   ConstantInt *getSourceIndex(Instruction &I, LocationInfoTy *LastLI = nullptr);
 
+  uint64_t addString(StringRef S) {
+    const auto &It = UniqueStrings.insert({S, ConcatenatedString.size()});
+    if (It.second) {
+      ConcatenatedString += S;
+      ConcatenatedString.push_back('\0');
+    }
+    return It.first->second;
+  };
+
+  void encodeLocationInfo(LocationInfoTy &LI, uint64_t Idx) {
+    StringRef FunctionName = LI.FunctionName;
+    if (LI.ParentIdx == (decltype(LI.ParentIdx))-1)
+      FunctionName = prettifyFunctionName(SS, FunctionName);
+
+    auto FuncIdx = addString(FunctionName);
+    auto FileIdx = addString(LI.FileName);
+    if (LocationEncoding.size() < (Idx + 1) * 5)
+      LocationEncoding.resize((Idx + 1) * 5);
+    LocationEncoding[Idx * 5 + 0] = ConstantInt::get(Int64Ty, FuncIdx);
+    LocationEncoding[Idx * 5 + 1] = ConstantInt::get(Int64Ty, FileIdx);
+    LocationEncoding[Idx * 5 + 2] = ConstantInt::get(Int64Ty, LI.LineNo);
+    LocationEncoding[Idx * 5 + 3] = ConstantInt::get(Int64Ty, LI.ColumnNo);
+    LocationEncoding[Idx * 5 + 4] = ConstantInt::get(Int64Ty, LI.ParentIdx);
+  }
+
+  SmallVector<Constant *> LocationEncoding;
+  std::string ConcatenatedString;
+  DenseMap<uint64_t, uint64_t> StringIndexMap;
+  DenseMap<StringRef, uint64_t> UniqueStrings;
+
   BumpPtrAllocator BPA;
   StringSaver SS = StringSaver(BPA);
 };
@@ -291,38 +348,6 @@ ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I,
   LocationInfoTy *LI = new LocationInfoTy();
   auto *DILoc = I.getDebugLoc().get();
 
-  auto PrettifyFunctionName = [&](StringRef Name) {
-    if (Name.ends_with(".internalized"))
-      return SS.save(Name.drop_back(sizeof("internalized")) +
-                     " (internalized)");
-    if (!Name.starts_with("__omp_offloading_"))
-      return Name;
-    Name = Name.drop_front(sizeof("__omp_offloading_"));
-    auto It = Name.find_first_of("_");
-    if (It != StringRef::npos && It + 1 < Name.size())
-      Name = Name.drop_front(It + 1);
-    It = Name.find_first_of("_");
-    if (It != StringRef::npos && It + 1 < Name.size())
-      Name = Name.drop_front(It + 1);
-    if (Name.ends_with("_debug__"))
-      Name = Name.drop_back(sizeof("debug__"));
-    if (Name.ends_with("_debug___omp_outlined_debug__"))
-      Name = Name.drop_back(sizeof("debug___omp_outlined_debug__"));
-    It = Name.find_last_of("_");
-    if (It == StringRef::npos || It + 1 >= Name.size())
-      return Name;
-    if (Name[It + 1] != 'l')
-      return Name;
-    int64_t KernelLineNo = 0;
-    Name.take_back(Name.size() - It -
-                   /* '_' and 'l' */ 2)
-        .getAsInteger(10, KernelLineNo);
-    if (KernelLineNo)
-      Name = SS.save("omp target (" + Name.take_front(It).str() + ":" +
-                     std::to_string(KernelLineNo) + ")");
-    return Name;
-  };
-
   auto FillLI = [&](LocationInfoTy &LI, DILocation &DIL) {
     LI.FileName = DIL.getFilename();
     if (LI.FileName.empty())
@@ -330,7 +355,6 @@ ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I,
     LI.FunctionName = DIL.getSubprogramLinkageName();
     if (LI.FunctionName.empty())
       LI.FunctionName = I.getFunction()->getName();
-    LI.FunctionName = PrettifyFunctionName(LI.FunctionName);
     LI.LineNo = DIL.getLine();
     LI.ColumnNo = DIL.getColumn();
   };
@@ -340,52 +364,46 @@ ConstantInt *GPUSanImpl::getSourceIndex(Instruction &I,
     FillLI(*LI, *DILoc);
     ParentDILoc = DILoc->getInlinedAt();
   } else {
-    LI->FunctionName = PrettifyFunctionName(I.getFunction()->getName());
+    LI->FunctionName = I.getFunction()->getName();
   }
-  errs() << __FUNCTION__ << " : " << I << " : " << LastLI << "\n";
 
   bool IsNew;
   uint64_t Idx;
-  errs() << "Line: " << LI->LineNo << "\n";
   std::tie(LI, Idx) = addLocationInfo(LI, IsNew);
-  errs() << "Idx: " << Idx << " : IsNew " << IsNew << "\n";
-  errs() << "Line: " << LI->LineNo << "\n";
   if (LastLI)
-    addParentLocationInfo(*LastLI, Idx);
+    LastLI->ParentIdx = Idx;
   if (!IsNew)
     return ConstantInt::get(Int64Ty, Idx);
 
+  uint64_t CurIdx = Idx;
   LocationInfoTy *CurLI = LI;
   while (ParentDILoc) {
-    //    if (!ParentDILoc->getScope()->getSubprogram()->isArtificial()) {
     auto *ParentLI = new LocationInfoTy();
     FillLI(*ParentLI, *ParentDILoc);
     uint64_t ParentIdx;
-    errs() << "Parent " << ParentLI->LineNo << "\n";
     std::tie(ParentLI, ParentIdx) = addLocationInfo(ParentLI, IsNew);
-    errs() << "Parent " << ParentIdx << " :: " << ParentLI->LineNo << "\n";
-    addParentLocationInfo(*CurLI, ParentIdx);
-    CurLI = ParentLI;
+    CurLI->ParentIdx = ParentIdx;
     if (!IsNew)
       break;
-    //   }
+    encodeLocationInfo(*CurLI, CurIdx);
+    CurLI = ParentLI;
+    CurIdx = ParentIdx;
     ParentDILoc = ParentDILoc->getInlinedAt();
   }
 
   Function &Fn = *I.getFunction();
   buildCallTreeInfo(Fn, *CurLI);
 
+  encodeLocationInfo(*CurLI, CurIdx);
+
   return ConstantInt::get(Int64Ty, Idx);
 }
 
 void GPUSanImpl::buildCallTreeInfo(Function &Fn, LocationInfoTy &LI) {
-  errs() << __FUNCTION__ << " : " << Fn.getName() << " : "
-         << Fn.hasFnAttribute("kernel") << "\n";
   if (Fn.hasFnAttribute("kernel"))
     return;
   SmallVector<CallBase *> Calls;
   for (auto &U : Fn.uses()) {
-    errs() << *U.getUser() << "\n";
     auto *CB = dyn_cast<CallBase>(U.getUser());
     if (!CB)
       continue;
@@ -393,11 +411,11 @@ void GPUSanImpl::buildCallTreeInfo(Function &Fn, LocationInfoTy &LI) {
       continue;
     Calls.push_back(CB);
   }
-  errs() << "Calls " << Calls.size() << "\n";
   if (Calls.size() == 1) {
     getSourceIndex(*Calls.back(), &LI);
     return;
   }
+  LI.ParentIdx = -2;
   AmbiguousCalls.insert(Calls.begin(), Calls.end());
 }
 
@@ -776,10 +794,9 @@ bool GPUSanImpl::instrument() {
     ITy = IntegerType::get(Ctx, llvm::PowerOf2Ceil(NumAmbiguousCalls));
     auto *ArrayTy = ArrayType::get(ITy, 1024);
     LocationsArray = new GlobalVariable(
-        ArrayTy, /*isConstant=*/false, GlobalValue::PrivateLinkage,
-        UndefValue::get(ArrayTy), "__san.locations",
+        M, ArrayTy, /*isConstant=*/false, GlobalValue::PrivateLinkage,
+        UndefValue::get(ArrayTy), "__san.calls", nullptr,
         GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
-    M.insertGlobalVariable(LocationsArray);
 
     Function *LocationGetter = Function::Create(
         FunctionType::get(Int64Ty, false), llvm::GlobalValue::ExternalLinkage,
@@ -799,19 +816,21 @@ bool GPUSanImpl::instrument() {
     IRB.CreateStore(It.second, Ptr);
   }
 
-  SmallVector<LocationInfoTy *> Locations;
-  Locations.resize(LocationMap.size());
-  for (auto &It : LocationMap)
-    Locations[It.second] = It.first;
-  for (size_t I = 0; I < Locations.size(); ++I) {
-    LocationInfoTy &LI = *Locations[I];
-    errs() << "[" << I << "]";
-    errs() << " - File: " << LI.FileName << "\n";
-    errs() << " - Func: " << LI.FunctionName << "\n";
-    errs() << " - Line: " << LI.LineNo << "\n";
-    errs() << " - Coln: " << LI.ColumnNo << "\n";
-    errs() << " - ParI: " << LI.ParentIdx << "\n";
-  }
+  auto *NamesTy = ArrayType::get(Int8Ty, ConcatenatedString.size() + 1);
+  auto *Names = new GlobalVariable(
+      M, NamesTy, /*isConstant=*/true, GlobalValue::ExternalLinkage,
+      ConstantDataArray::getString(Ctx, ConcatenatedString),
+      "__san.location_names", nullptr,
+      GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
+  Names->setVisibility(GlobalValue::ProtectedVisibility);
+
+  auto *ArrayTy = ArrayType::get(Int64Ty, LocationEncoding.size());
+  auto *GV = new GlobalVariable(
+      M, ArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage,
+      ConstantArray::get(ArrayTy, LocationEncoding), "__san.locations", nullptr,
+      GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
+  GV->setVisibility(GlobalValue::ProtectedVisibility);
+
   M.dump();
   return Changed;
 }
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index 328216cb2a7fe..cc2ab4b118bce 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -126,6 +126,14 @@ template <AllocationKind AK> struct Allocations {
   static AllocationArrayTy<AK> Arr[SanitizerConfig<AK>::NUM_ALLOCATION_ARRAYS];
 };
 
+struct LocationEncodingTy {
+  uint64_t FunctionNameIdx;
+  uint64_t FileNameIdx;
+  uint64_t LineNo;
+  uint64_t ColumnNo;
+  uint64_t ParentIdx;
+};
+
 struct SanitizerTrapInfoTy {
   /// AllocationTy
   /// {
@@ -167,7 +175,7 @@ struct SanitizerTrapInfoTy {
   uint64_t BlockId[3];
   uint32_t ThreadId[3];
   uint64_t PC;
-  uint64_t SrcId;
+  uint64_t LocationId;
   /// }
 
   [[clang::disable_sanitizer_instrumentation]] void
@@ -176,7 +184,7 @@ struct SanitizerTrapInfoTy {
       BlockId[Dim] = ompx_block_id(Dim);
       ThreadId[Dim] = ompx_thread_id(Dim);
     }
-    SrcId = SourceId;
+    LocationId = SourceId;
   }
 
   template <enum AllocationKind AK>
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index 829b4b7291193..b88d3e52bbf43 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -50,6 +50,9 @@ class GlobalTy {
   const std::string &getName() const { return Name; }
   uint32_t getSize() const { return Size; }
   void *getPtr() const { return Ptr; }
+  template <typename T> T *getPtrAs() const {
+    return reinterpret_cast<T *>(Ptr);
+  }
 
   void setSize(int32_t S) { Size = S; }
   void setPtr(void *P) { Ptr = P; }
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d1898021c063d..e1e5b5e05acdc 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -887,7 +887,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Allocate and construct a kernel object.
   virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
 
-  SanitizerTrapInfoTy *SanitizerTrapInfo = nullptr;
+  DenseMap<DeviceImageTy *, SanitizerTrapInfoTy *> SanitizerTrapInfos;
 
   /// Reference to the underlying plugin that created this device.
   GenericPluginTy &Plugin;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 5c800cb6434f1..2d15a4b9189c9 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1026,6 +1026,7 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
   if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
     return Err;
 
+  auto *&SanitizerTrapInfo = SanitizerTrapInfos[&Image];
   SanitizerTrapInfo = reinterpret_cast<SanitizerTrapInfoTy *>(allocate(
       sizeof(*SanitizerTrapInfo), &SanitizerTrapInfo, TARGET_ALLOC_HOST));
   memset(SanitizerTrapInfo, '\0', sizeof(SanitizerTrapInfoTy));
@@ -2189,10 +2190,18 @@ Error GPUSanTy::notifyDataUnmapped(void *FakeHstPtr) {
 }
 
 void GPUSanTy::checkAndReportError() {
-  if (!Device.SanitizerTrapInfo)
-    return;
-  SanitizerTrapInfoTy &STI = *Device.SanitizerTrapInfo;
-  if (STI.ErrorCode == SanitizerTrapInfoTy::None)
+  SanitizerTrapInfoTy *STI;
+  DeviceImageTy *Image = nullptr;
+  for (auto &It : Device.SanitizerTrapInfos) {
+    STI = It.second;
+    errs() << "STI " << STI << "\n";
+    if (!STI || STI->ErrorCode == SanitizerTrapInfoTy::None)
+      continue;
+    Image = It.first;
+    break;
+  }
+  errs() << "Img " << Image << "\n";
+  if (!Image)
     return;
 
   auto Green = []() { return "\033[1m\033[32m"; };
@@ -2200,56 +2209,48 @@ void GPUSanTy::checkAndReportError() {
   auto Red = []() { return "\033[1m\033[31m"; };
   auto Default = []() { return "\033[1m\033[0m"; };
 
-#if 0
-  std::string KernelName;
-  StringRef FunctionName =
-      STI.FunctionName[0] ? StringRef(STI.FunctionName) : "<unknown>";
-  StringRef FileName = STI.FileName[0] ? StringRef(STI.FileName) : "<unknown>";
-
-  if (FunctionName.starts_with("__omp_offloading_")) {
-    FunctionName = FunctionName.drop_front(sizeof("__omp_offloading_"));
-    auto It = FunctionName.find_first_of("_");
-    if (It != StringRef::npos && It + 1 < FunctionName.size())
-      FunctionName = FunctionName.drop_front(It + 1);
-    It = FunctionName.find_first_of("_");
-    if (It != StringRef::npos && It + 1 < FunctionName.size())
-      FunctionName = FunctionName.drop_front(It + 1);
-  }
-
-  if (FunctionName.ends_with("_debug__"))
-    FunctionName = FunctionName.drop_back(sizeof("debug__"));
-  if (FunctionName.ends_with("_debug___omp_outlined_debug__"))
-    FunctionName =
-        FunctionName.drop_back(sizeof("debug___omp_outlined_debug__"));
-
-  auto It = FunctionName.find_last_of("_");
-  if (It != StringRef::npos && It + 1 < FunctionName.size()) {
-    if (FunctionName[It + 1] == 'l') {
-      int64_t KernelLineNo = 0;
-      FunctionName
-          .take_back(FunctionName.size() - It -
-                     /* '_' and 'l' */ 2)
-          .getAsInteger(10, KernelLineNo);
-      if (KernelLineNo) {
-        KernelName = "omp target (" + FunctionName.take_front(It).str() + ":" +
-                     std::to_string(KernelLineNo) + ")";
-        FunctionName = KernelName;
-      }
+  GenericGlobalHandlerTy &GHandler = Device.Plugin.getGlobalHandler();
+  auto GetImagePtr = [&](GlobalTy &GV) {
+    if (auto Err = GHandler.getGlobalMetadataFromImage(Device, *Image, GV)) {
+      REPORT("WARNING: Failed to read backtrace "
+             "(%s)\n",
+             toString(std::move(Err)).data());
+      return false;
     }
-  }
-#endif
+    return true;
+  };
+  GlobalTy LocationsGV("__san.locations", -1);
+  GlobalTy LocationNamesGV("__san.location_names", -1);
+  if (GetImagePtr(LocationsGV))
+    GetImagePtr(LocationNamesGV);
 
   fprintf(stderr, "============================================================"
                   "====================\n");
 
-  auto PrintStackTrace = [&](int64_t SourceId) {
-    fprintf(stderr, "    #0 " DPxMOD " %s in %s:%lu\n\n", DPxPTR(0), "unknown",
-            "unknown", 0UL);
+  auto PrintStackTrace = [&](int64_t LocationId) {
+    if (!LocationsGV.getPtr() || !LocationNamesGV.getPtr()) {
+      fprintf(stderr, "    no backtrace available\n");
+      return;
+    }
+    printf("Loc %p : %u\n", LocationsGV.getPtr(), LocationsGV.getSize());
+    printf("Nam %p : %u\n", LocationNamesGV.getPtr(),
+           LocationNamesGV.getSize());
+    char *LocationNames = LocationNamesGV.getPtrAs<char>();
+    LocationEncodingTy *Locations = LocationsGV.getPtrAs<LocationEncodingTy>();
+    int32_t FrameIdx = 0;
+    do {
+      LocationEncodingTy &LE = Locations[LocationId];
+      fprintf(stderr, "    #%i %s in %s:%lu:%lu\n", FrameIdx,
+              &LocationNames[LE.FunctionNameIdx],
+              &LocationNames[LE.FileNameIdx], LE.LineNo, LE.ColumnNo);
+      LocationId = LE.ParentIdx;
+      FrameIdx++;
+    } while (LocationId >= 0);
   };
 
   auto DiagnoseAccess = [&](StringRef Name) {
-    void *PC = reinterpret_cast<void *>(STI.PC);
-    void *Addr = utils::advancePtr(STI.AllocationStart, STI.PtrOffset);
+    void *PC = reinterpret_cast<void *>(STI->PC);
+    void *Addr = utils::advancePtr(STI->AllocationStart, STI->PtrOffset);
     fprintf(stderr,
             "%sERROR: OffloadSanitizer %s access on address " DPxMOD
             " at pc " DPxMOD "\n%s",
@@ -2257,27 +2258,27 @@ void GPUSanTy::checkAndReportError() {
     fprintf(stderr,
             "%s%s of size %u at " DPxMOD
             " thread <%u, %u, %u> block <%lu, %lu, %lu> (acc %li, %s)\n%s",
-            Blue(), STI.AccessId > 0 ? "WRITE" : "READ", STI.AccessSize,
-            DPxPTR(Addr), STI.ThreadId[0], STI.ThreadId[1], STI.ThreadId[2],
-            STI.BlockId[0], STI.BlockId[1], STI.BlockId[2], STI.AccessId,
-            (STI.AllocationKind ? "heap" : "stack"), Default());
-    PrintStackTrace(STI.SrcId);
+            Blue(), STI->AccessId > 0 ? "WRITE" : "READ", STI->AccessSize,
+            DPxPTR(Addr), STI->ThreadId[0], STI->ThreadId[1], STI->ThreadId[2],
+            STI->BlockId[0], STI->BlockId[1], STI->BlockId[2], STI->AccessId,
+            (STI->AllocationKind ? "heap" : "stack"), Default());
+    PrintStackTrace(STI->LocationId);
     fprintf(
         stderr,
         "%s" DPxMOD " is located %lu bytes inside of a %lu-byte region [" DPxMOD
         "," DPxMOD ")\n%s",
-        Green(), DPxPTR(Addr), STI.PtrOffset, STI.AllocationLength,
-        DPxPTR(STI.AllocationStart),
-        DPxPTR(utils::advancePtr(STI.AllocationStart, STI.AllocationLength)),
+        Green(), DPxPTR(Addr), STI->PtrOffset, STI->AllocationLength,
+        DPxPTR(STI->AllocationStart),
+        DPxPTR(utils::advancePtr(STI->AllocationStart, STI->AllocationLength)),
         Default());
     fprintf(stderr,
             "%s Pointer[slot:%lu,tag:%u,kind:%i] "
             "Allocation[slot:%d,tag:%u,kind:%i]\n%s",
-            Green(), STI.PtrSlot, STI.PtrTag, STI.PtrKind, STI.AllocationId,
-            STI.AllocationTag, STI.AllocationKind, Default());
+            Green(), STI->PtrSlot, STI->PtrTag, STI->PtrKind, STI->AllocationId,
+            STI->AllocationTag, STI->AllocationKind, Default());
   };
 
-  switch (STI.ErrorCode) {
+  switch (STI->ErrorCode) {
   case SanitizerTrapInfoTy::None:
     llvm_unreachable("Unexpected exception");
   case SanitizerTrapInfoTy::ExceedsLength:
@@ -2290,8 +2291,9 @@ void GPUSanTy::checkAndReportError() {
     break;
   case SanitizerTrapInfoTy::PointerOutsideAllocation:
     fprintf(stderr, "%sERROR: OffloadSanitizer %s : %p : %i %lu (%s)\n%s",
-            Red(), "outside allocation", STI.AllocationStart, STI.AllocationId,
-            STI.PtrSlot, (STI.AllocationKind ? "heap" : "stack"), Default());
+            Red(), "outside allocation", STI->AllocationStart,
+            STI->AllocationId, STI->PtrSlot,
+            (STI->AllocationKind ? "heap" : "stack"), Default());
     break;
   case SanitizerTrapInfoTy::OutOfBounds: {
     DiagnoseAccess("out-of-bounds");
@@ -2308,13 +2310,7 @@ void GPUSanTy::checkAndReportError() {
             Default());
     break;
   case SanitizerTrapInfoTy::GarbagePointer:
-    fprintf(stderr, "%sERROR: OffloadSanitizer %s : %p\n%s", Red(),
-            "garbage pointer", STI.AllocationStart, Default());
-    fprintf(stderr,
-            "%s Pointer[slot:%lu,tag:%u,kind:%i] "
-            "Allocation[kind:%i]\n%s",
-            Green(), STI.PtrSlot, STI.PtrTag, STI.PtrKind, STI.AllocationKind,
-            Default());
+    DiagnoseAccess("garbage-pointer");
     break;
   }
   fflush(stderr);

>From 612afcc944f651aa9bd8d76398e5762e0f3634a5 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 2 Jul 2024 15:21:42 -0700
Subject: [PATCH 19/31] More tests

---
 offload/include/Shared/Sanitizer.h            | 14 ++++--
 .../common/src/PluginInterface.cpp            |  6 +--
 offload/test/sanitizer/stack_trace_1.c        | 43 +++++++++++++++++
 .../test/sanitizer/stack_trace_multi_path_1.c | 47 +++++++++++++++++++
 4 files changed, 104 insertions(+), 6 deletions(-)
 create mode 100644 offload/test/sanitizer/stack_trace_1.c
 create mode 100644 offload/test/sanitizer/stack_trace_multi_path_1.c

diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index cc2ab4b118bce..cbf38d5fb96e7 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -14,9 +14,15 @@
 #include "Types.h"
 #include "Utils.h"
 
-extern "C" int ompx_block_id(int Dim);
-extern "C" int ompx_block_dim(int Dim);
-extern "C" int ompx_thread_id(int Dim);
+extern "C" {
+int ompx_block_id(int Dim);
+int ompx_block_dim(int Dim);
+int ompx_thread_id(int Dim);
+[[clang::disable_sanitizer_instrumentation, gnu::noinline]] inline int64_t
+__san_get_location_value() {
+  return -1;
+}
+}
 
 enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL };
 
@@ -176,6 +182,7 @@ struct SanitizerTrapInfoTy {
   uint32_t ThreadId[3];
   uint64_t PC;
   uint64_t LocationId;
+  uint64_t CallId;
   /// }
 
   [[clang::disable_sanitizer_instrumentation]] void
@@ -185,6 +192,7 @@ struct SanitizerTrapInfoTy {
       ThreadId[Dim] = ompx_thread_id(Dim);
     }
     LocationId = SourceId;
+    CallId = __san_get_location_value();
   }
 
   template <enum AllocationKind AK>
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 2d15a4b9189c9..a17c2741f79d2 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -21,6 +21,7 @@
 #include "Utils/ELF.h"
 #include "omptarget.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <cstdio>
 #include <string>
 
 #ifdef OMPT_SUPPORT
@@ -2232,9 +2233,7 @@ void GPUSanTy::checkAndReportError() {
       fprintf(stderr, "    no backtrace available\n");
       return;
     }
-    printf("Loc %p : %u\n", LocationsGV.getPtr(), LocationsGV.getSize());
-    printf("Nam %p : %u\n", LocationNamesGV.getPtr(),
-           LocationNamesGV.getSize());
+    fprintf(stderr, "%lu\n", STI->CallId);
     char *LocationNames = LocationNamesGV.getPtrAs<char>();
     LocationEncodingTy *Locations = LocationsGV.getPtrAs<LocationEncodingTy>();
     int32_t FrameIdx = 0;
@@ -2246,6 +2245,7 @@ void GPUSanTy::checkAndReportError() {
       LocationId = LE.ParentIdx;
       FrameIdx++;
     } while (LocationId >= 0);
+    fputc('\n', stderr);
   };
 
   auto DiagnoseAccess = [&](StringRef Name) {
diff --git a/offload/test/sanitizer/stack_trace_1.c b/offload/test/sanitizer/stack_trace_1.c
new file mode 100644
index 0000000000000..7129f67c70900
--- /dev/null
+++ b/offload/test/sanitizer/stack_trace_1.c
@@ -0,0 +1,43 @@
+// clang-format off
+// : %libomptarget-compileopt-generic -fsanitize=offload -O1
+// : not %libomptarget-run-generic 2> %t.out
+// : %fcheck-generic --check-prefixes=CHECK < %t.out
+// : %libomptarget-compileopt-generic -fsanitize=offload -O3
+// : not %libomptarget-run-generic 2> %t.out
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+[[clang::optnone]] int deref(int *P) { return *P; }
+
+[[gnu::always_inline]] int bar(int *P) { return deref(P); }
+
+int main(void) {
+
+#pragma omp target
+  {
+    int *NullPtr = 0;
+    // clang-format off
+    // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
+    // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)
+    // CHECK:     #0 [[PC]] omp target (main:[[@LINE-6]]) in <unknown>:0
+    // 
+    // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
+    //
+    // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
+    // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)
+    // DEBUG:     #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]]
+    // 
+    // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
+    // clang-format on
+    bar(NullPtr);
+  }
+}
diff --git a/offload/test/sanitizer/stack_trace_multi_path_1.c b/offload/test/sanitizer/stack_trace_multi_path_1.c
new file mode 100644
index 0000000000000..10de205a6eefc
--- /dev/null
+++ b/offload/test/sanitizer/stack_trace_multi_path_1.c
@@ -0,0 +1,47 @@
+// clang-format off
+// : %libomptarget-compileopt-generic -fsanitize=offload -O1
+// : not %libomptarget-run-generic 2> %t.out
+// : %fcheck-generic --check-prefixes=CHECK < %t.out
+// : %libomptarget-compileopt-generic -fsanitize=offload -O3
+// : not %libomptarget-run-generic 2> %t.out
+// RUN: %libomptarget-compileopt-generic -fsanitize=offload -O3 -g
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+[[clang::optnone]] int deref(int *P) { return *P; }
+
+[[gnu::noinline]] int bar(int *P) { return deref(P); }
+[[gnu::noinline]] int baz(int *P) { return deref(P); }
+
+int main(void) {
+
+#pragma omp target
+  {
+    int *NullPtr = 0;
+    int X;
+    int *Valid = &X;
+    // clang-format off
+    // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
+    // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)
+    // CHECK:     #0 [[PC]] omp target (main:[[@LINE-6]]) in <unknown>:0
+    // 
+    // CHECK: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
+    //
+    // DEBUG: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
+    // DEBUG: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)
+    // DEBUG:     #0 [[PC]] omp target (main:[[@LINE-12]]) in {{.*}}volatile_stack_null.c:[[@LINE+4]]
+    // 
+    // DEBUG: 0x0000000000000000 is located 0 bytes inside of a 0-byte region [0x0000000000000000,0x0000000000000000)
+    // clang-format on
+    bar(Valid);
+    baz(NullPtr);
+  }
+}

>From 4bf46ce0fc758f5c15f32e06673070ea4a4b8ec4 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 2 Jul 2024 18:36:54 -0700
Subject: [PATCH 20/31] Ambiguous call trace support

---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  4 +-
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 85 ++++++++++++++++---
 offload/DeviceRTL/src/Mapping.cpp             |  4 +-
 offload/DeviceRTL/src/Sanitizer.cpp           |  5 ++
 offload/include/Shared/Sanitizer.h            |  7 +-
 .../common/src/PluginInterface.cpp            | 29 +++++--
 .../test/sanitizer/stack_trace_multi_path_1.c |  7 +-
 7 files changed, 109 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 608bfa81b579b..410aedd57dc9a 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -2051,11 +2051,11 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   if (PTO.CallGraphProfile)
     MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true));
 
-  invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
-
   if (EnableOffloadSanitizer)
     MPM.addPass(GPUSanPass());
 
+  invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
+
   // Emit annotation remarks.
   addAnnotationRemarksPass(MPM);
 
diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index cfc2fa0732d35..75d257acf3a8a 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -778,42 +778,99 @@ bool GPUSanImpl::instrument() {
     return false;
   }();
 
-  for (Function &Fn : M)
+  SmallVector<Function *> Kernels;
+  for (Function &Fn : M) {
+    if (Fn.hasFnAttribute("kernel"))
+      Kernels.push_back(&Fn);
     if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") &&
         !Fn.getName().starts_with("rpc_"))
       if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
         Changed |= instrumentFunction(Fn);
+  }
 
-  SmallVector<std::pair<CallBase *, ConstantInt *>> AmbiguousCallsNumbered;
+  SmallVector<CallBase *> AmbiguousCallsOrdered;
+  SmallVector<Constant *> AmbiguousCallsMapping;
+  if (LocationMap.empty())
+    AmbiguousCalls.clear();
   for (size_t I = 0; I < AmbiguousCalls.size(); ++I) {
     CallBase &CB = *AmbiguousCalls[I];
-    AmbiguousCallsNumbered.push_back({&CB, getSourceIndex(CB)});
+    AmbiguousCallsOrdered.push_back(&CB);
+    AmbiguousCallsMapping.push_back(getSourceIndex(CB));
   }
-  IntegerType *ITy = nullptr;
+
+  uint64_t AmbiguousCallsBitWidth =
+      llvm::PowerOf2Ceil(AmbiguousCalls.size() + 1);
+
+  new GlobalVariable(M, Int64Ty, /*isConstant=*/true,
+                     GlobalValue::ExternalLinkage,
+                     ConstantInt::get(Int64Ty, AmbiguousCallsBitWidth),
+                     "__san.num_ambiguous_calls", nullptr,
+                     GlobalValue::ThreadLocalMode::NotThreadLocal, 1);
+
   if (size_t NumAmbiguousCalls = AmbiguousCalls.size()) {
-    ITy = IntegerType::get(Ctx, llvm::PowerOf2Ceil(NumAmbiguousCalls));
-    auto *ArrayTy = ArrayType::get(ITy, 1024);
+    {
+      auto *ArrayTy = ArrayType::get(Int64Ty, NumAmbiguousCalls);
+      auto *GV = new GlobalVariable(
+          M, ArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage,
+          ConstantArray::get(ArrayTy, AmbiguousCallsMapping),
+          "__san.ambiguous_calls_mapping", nullptr,
+          GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
+      GV->setVisibility(GlobalValue::ProtectedVisibility);
+    }
+
+    auto *ArrayTy = ArrayType::get(Int64Ty, 1024);
     LocationsArray = new GlobalVariable(
         M, ArrayTy, /*isConstant=*/false, GlobalValue::PrivateLinkage,
         UndefValue::get(ArrayTy), "__san.calls", nullptr,
         GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
 
+    auto *OldFn = M.getFunction("__san_get_location_value");
+    if (OldFn)
+      OldFn->setName("");
     Function *LocationGetter = Function::Create(
-        FunctionType::get(Int64Ty, false), llvm::GlobalValue::ExternalLinkage,
+        FunctionType::get(Int64Ty, false), GlobalValue::ExternalLinkage,
         "__san_get_location_value", M);
+    if (OldFn) {
+      OldFn->replaceAllUsesWith(LocationGetter);
+      OldFn->eraseFromParent();
+    }
     auto *EntryBB = BasicBlock::Create(Ctx, "entry", LocationGetter);
     IRBuilder<> IRB(EntryBB);
     Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid");
-    Value *Ptr = IRB.CreateGEP(ITy, LocationsArray, {Idx});
-    auto *LocationValue = IRB.CreateLoad(ITy, Ptr);
-    IRB.CreateRet(IRB.CreateZExt(LocationValue, Int64Ty));
+    Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx});
+    auto *LocationValue = IRB.CreateLoad(Int64Ty, Ptr);
+    IRB.CreateRet(LocationValue);
+  }
+
+  Function *InitSharedFn =
+      Function::Create(FunctionType::get(VoidTy, false),
+                       GlobalValue::PrivateLinkage, "__san.init_shared", &M);
+  auto *EntryBB = BasicBlock::Create(Ctx, "entry", InitSharedFn);
+  IRBuilder<> IRB(EntryBB);
+  if (!AmbiguousCalls.empty()) {
+    Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid");
+    Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx});
+    IRB.CreateStore(ConstantInt::get(Int64Ty, 0), Ptr);
+  }
+  IRB.CreateRetVoid();
+
+  for (auto *KernelFn : Kernels) {
+    IRBuilder<> IRB(&*KernelFn->getEntryBlock().getFirstNonPHIOrDbgOrAlloca());
+    IRB.CreateCall(InitSharedFn, {});
   }
 
-  for (auto &It : AmbiguousCallsNumbered) {
-    IRBuilder<> IRB(It.first);
+  for (const auto &It : llvm::enumerate(AmbiguousCallsOrdered)) {
+    IRBuilder<> IRB(It.value());
     Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid");
-    Value *Ptr = IRB.CreateGEP(ITy, LocationsArray, {Idx});
-    IRB.CreateStore(It.second, Ptr);
+    Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx});
+    Value *OldVal = IRB.CreateLoad(Int64Ty, Ptr);
+    Value *OldValShifted = IRB.CreateShl(
+        OldVal, ConstantInt::get(Int64Ty, AmbiguousCallsBitWidth));
+    Value *NewVal = IRB.CreateBinOp(Instruction::Or, OldValShifted,
+                                    ConstantInt::get(Int64Ty, It.index() + 1));
+    IRB.CreateStore(NewVal, Ptr);
+    IRB.SetInsertPoint(It.value()->getNextNode());
+    IRB.CreateStore(OldVal, Ptr);
   }
 
   auto *NamesTy = ArrayType::get(Int8Ty, ConcatenatedString.size() + 1);
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index cbfa4e78cd9f4..4d89c11ee7e0f 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -364,7 +364,9 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
 _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
 _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
 
-extern "C" int ompx_global_thread_id() {
+extern "C" [[clang::disable_sanitizer_instrumentation, gnu::flatten,
+             gnu::always_inline, gnu::used, gnu::retain]] int
+ompx_global_thread_id() {
   return ompx_thread_id(0) + ompx_thread_id(1) * ompx_block_dim(0) +
          ompx_thread_id(2) * ompx_block_dim(0) * ompx_block_dim(1);
 }
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index d953e813194bb..2acf15c22d992 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -416,6 +416,11 @@ ompx_get_allocation_info_global(_AS_PTR(void, AllocationKind::GLOBAL) P) {
 ompx_leak_check() {
   AllocationTracker<AllocationKind::GLOBAL>::leakCheck();
 }
+
+[[gnu::weak, gnu::noinline, gnu::used, gnu::retain]] int64_t
+__san_get_location_value() {
+  return -1;
+}
 }
 
 #pragma omp end declare target
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index cbf38d5fb96e7..79e3af2d33bbd 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -18,10 +18,7 @@ extern "C" {
 int ompx_block_id(int Dim);
 int ompx_block_dim(int Dim);
 int ompx_thread_id(int Dim);
-[[clang::disable_sanitizer_instrumentation, gnu::noinline]] inline int64_t
-__san_get_location_value() {
-  return -1;
-}
+int64_t __san_get_location_value();
 }
 
 enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL };
@@ -182,7 +179,7 @@ struct SanitizerTrapInfoTy {
   uint32_t ThreadId[3];
   uint64_t PC;
   uint64_t LocationId;
-  uint64_t CallId;
+  int64_t CallId;
   /// }
 
   [[clang::disable_sanitizer_instrumentation]] void
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index a17c2741f79d2..fe194a42acaab 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2195,13 +2195,11 @@ void GPUSanTy::checkAndReportError() {
   DeviceImageTy *Image = nullptr;
   for (auto &It : Device.SanitizerTrapInfos) {
     STI = It.second;
-    errs() << "STI " << STI << "\n";
     if (!STI || STI->ErrorCode == SanitizerTrapInfoTy::None)
       continue;
     Image = It.first;
     break;
   }
-  errs() << "Img " << Image << "\n";
   if (!Image)
     return;
 
@@ -2211,19 +2209,26 @@ void GPUSanTy::checkAndReportError() {
   auto Default = []() { return "\033[1m\033[0m"; };
 
   GenericGlobalHandlerTy &GHandler = Device.Plugin.getGlobalHandler();
-  auto GetImagePtr = [&](GlobalTy &GV) {
+  auto GetImagePtr = [&](GlobalTy &GV, bool Quiet = false) {
     if (auto Err = GHandler.getGlobalMetadataFromImage(Device, *Image, GV)) {
-      REPORT("WARNING: Failed to read backtrace "
-             "(%s)\n",
-             toString(std::move(Err)).data());
+      if (Quiet)
+        consumeError(std::move(Err));
+      else
+        REPORT("WARNING: Failed to read backtrace "
+               "(%s)\n",
+               toString(std::move(Err)).data());
       return false;
     }
     return true;
   };
   GlobalTy LocationsGV("__san.locations", -1);
   GlobalTy LocationNamesGV("__san.location_names", -1);
+  GlobalTy AmbiguousCallsBitWidthGV("__san.num_ambiguous_calls", -1);
+  GlobalTy AmbiguousCallsLocationsGV("__san.ambiguous_calls_mapping", -1);
   if (GetImagePtr(LocationsGV))
     GetImagePtr(LocationNamesGV);
+  GetImagePtr(AmbiguousCallsBitWidthGV, /*Quiet=*/true);
+  GetImagePtr(AmbiguousCallsLocationsGV, /*Quiet=*/true);
 
   fprintf(stderr, "============================================================"
                   "====================\n");
@@ -2233,9 +2238,12 @@ void GPUSanTy::checkAndReportError() {
       fprintf(stderr, "    no backtrace available\n");
       return;
     }
-    fprintf(stderr, "%lu\n", STI->CallId);
     char *LocationNames = LocationNamesGV.getPtrAs<char>();
     LocationEncodingTy *Locations = LocationsGV.getPtrAs<LocationEncodingTy>();
+    uint64_t *AmbiguousCallsBitWidth =
+        AmbiguousCallsBitWidthGV.getPtrAs<uint64_t>();
+    uint64_t *AmbiguousCallsLocations =
+        AmbiguousCallsLocationsGV.getPtrAs<uint64_t>();
     int32_t FrameIdx = 0;
     do {
       LocationEncodingTy &LE = Locations[LocationId];
@@ -2244,6 +2252,13 @@ void GPUSanTy::checkAndReportError() {
               &LocationNames[LE.FileNameIdx], LE.LineNo, LE.ColumnNo);
       LocationId = LE.ParentIdx;
       FrameIdx++;
+      if (LocationId < 0 && STI->CallId != 0 && AmbiguousCallsBitWidth &&
+          AmbiguousCallsLocations) {
+        // Build the mask in 64 bits; a plain `1 << N` is an int-width shift.
+        uint64_t Mask = (uint64_t(1) << *AmbiguousCallsBitWidth) - 1;
+        LocationId = AmbiguousCallsLocations[(STI->CallId & Mask) - 1];
+        STI->CallId >>= (*AmbiguousCallsBitWidth); // pop the id just consumed
+      }
     } while (LocationId >= 0);
     fputc('\n', stderr);
   };
diff --git a/offload/test/sanitizer/stack_trace_multi_path_1.c b/offload/test/sanitizer/stack_trace_multi_path_1.c
index 10de205a6eefc..58717afe64df5 100644
--- a/offload/test/sanitizer/stack_trace_multi_path_1.c
+++ b/offload/test/sanitizer/stack_trace_multi_path_1.c
@@ -16,6 +16,8 @@
 // UNSUPPORTED: s390x-ibm-linux-gnu
 // UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
+#include <omp.h>
+
 [[clang::optnone]] int deref(int *P) { return *P; }
 
 [[gnu::noinline]] int bar(int *P) { return deref(P); }
@@ -23,11 +25,10 @@
 
 int main(void) {
 
-#pragma omp target
+  int *Valid = (int *)omp_target_alloc(4, omp_get_default_device());
+#pragma omp target is_device_ptr(Valid)
   {
     int *NullPtr = 0;
-    int X;
-    int *Valid = &X;
     // clang-format off
     // CHECK: ERROR: OffloadSanitizer out-of-bounds access on address 0x0000000000000000 at pc [[PC:.*]]
     // CHECK: WRITE of size 4 at 0x0000000000000000 thread <0, 0, 0> block <0, 0, 0> (acc 1, heap)

>From f216b05675ca936dd6243de4383144aac486a824 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Tue, 2 Jul 2024 19:18:55 -0700
Subject: [PATCH 21/31] Multi call

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp |  2 +-
 .../sanitizer/stack_trace_multi_path_many.cpp | 42 +++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)
 create mode 100644 offload/test/sanitizer/stack_trace_multi_path_many.cpp

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 75d257acf3a8a..611ec9c888782 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -799,7 +799,7 @@ bool GPUSanImpl::instrument() {
   }
 
   uint64_t AmbiguousCallsBitWidth =
-      llvm::PowerOf2Ceil(AmbiguousCalls.size() + 1);
+      llvm::Log2_64_Ceil(AmbiguousCalls.size() + 1);
 
   new GlobalVariable(M, Int64Ty, /*isConstant=*/true,
                      GlobalValue::ExternalLinkage,
diff --git a/offload/test/sanitizer/stack_trace_multi_path_many.cpp b/offload/test/sanitizer/stack_trace_multi_path_many.cpp
new file mode 100644
index 0000000000000..5f6871e825e54
--- /dev/null
+++ b/offload/test/sanitizer/stack_trace_multi_path_many.cpp
@@ -0,0 +1,42 @@
+// clang-format off
+// : %libomptarget-compileoptxx-generic -fsanitize=offload -O1
+// : not %libomptarget-run-generic 2> %t.out
+// : %fcheck-generic --check-prefixes=CHECK < %t.out
+// : %libomptarget-compileoptxx-generic -fsanitize=offload -O3
+// : not %libomptarget-run-generic 2> %t.out
+// RUN: %libomptarget-compileoptxx-generic -fsanitize=offload -O3 -g
+// RUN: not %libomptarget-run-generic 2> %t.out
+// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+template <typename T> [[clang::optnone]] T deref(T *P) { return *P; }
+
+template <int LEVEL, typename T> [[gnu::noinline]] T level(T *P) {
+  if constexpr (LEVEL > 1)
+    return level<LEVEL - 1>(P) + level<LEVEL - 2>(P);
+  if constexpr (LEVEL > 0)
+    return level<LEVEL - 1>(P);
+  return deref(P);
+}
+
+int main(void) {
+
+  int *ValidInt = (int *)omp_target_alloc(4, omp_get_default_device());
+#pragma omp target is_device_ptr(ValidInt)
+  {
+    level<12>(ValidInt);
+    short *ValidShort = ((short *)ValidInt) + 2;
+    level<12>(ValidShort);
+    char *Invalid = ((char *)ValidInt) + 4;
+    level<12>(Invalid);
+  }
+}

>From b55babec1e27b22d34e47709ed2c1a8b4c3ba92d Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 3 Jul 2024 10:23:20 -0700
Subject: [PATCH 22/31] Improve test

---
 .../test/sanitizer/stack_trace_multi_path_many.cpp  | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/offload/test/sanitizer/stack_trace_multi_path_many.cpp b/offload/test/sanitizer/stack_trace_multi_path_many.cpp
index 5f6871e825e54..6d113985670ff 100644
--- a/offload/test/sanitizer/stack_trace_multi_path_many.cpp
+++ b/offload/test/sanitizer/stack_trace_multi_path_many.cpp
@@ -4,9 +4,12 @@
 // : %fcheck-generic --check-prefixes=CHECK < %t.out
 // : %libomptarget-compileoptxx-generic -fsanitize=offload -O3
 // : not %libomptarget-run-generic 2> %t.out
-// RUN: %libomptarget-compileoptxx-generic -fsanitize=offload -O3 -g
+// RUN: %libomptarget-compileoptxx-generic -fsanitize=offload -O3 -g -DLEVELS=11
 // RUN: not %libomptarget-run-generic 2> %t.out
-// RUN: %fcheck-generic --check-prefixes=DEBUG < %t.out
+// RUN: %fcheck-generic --check-prefixes=DBG11 < %t.out
+//  %libomptarget-compileoptxx-generic -fsanitize=offload -O3 -g -DLEVELS=12
+//  not %libomptarget-run-generic 2> %t.out
+//  %fcheck-generic --check-prefixes=DBG12 < %t.out
 // clang-format on
 
 // UNSUPPORTED: aarch64-unknown-linux-gnu
@@ -33,10 +36,10 @@ int main(void) {
   int *ValidInt = (int *)omp_target_alloc(4, omp_get_default_device());
 #pragma omp target is_device_ptr(ValidInt)
   {
-    level<12>(ValidInt);
+    level<LEVELS>(ValidInt);
     short *ValidShort = ((short *)ValidInt) + 2;
-    level<12>(ValidShort);
+    level<LEVELS>(ValidShort);
     char *Invalid = ((char *)ValidInt) + 4;
-    level<12>(Invalid);
+    level<LEVELS>(Invalid);
   }
 }

>From 6f4233bbf2722fcbfe10d62d0ca874bedb9fca23 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 8 Jul 2024 18:16:13 -0700
Subject: [PATCH 23/31] Inline calls

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 92 ++++++++++++-------
 offload/DeviceRTL/src/Sanitizer.cpp           |  1 +
 offload/include/Shared/Sanitizer.h            | 28 +++---
 offload/include/Shared/Types.h                |  6 +-
 4 files changed, 78 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 611ec9c888782..3b01337d28c9f 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -18,8 +18,10 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -41,8 +43,10 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/StringSaver.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <cstdint>
+#include <optional>
 
 using namespace llvm;
 
@@ -188,6 +192,7 @@ class GPUSanImpl final {
     if (!FC) {
       auto *NewAllocationFnTy = FunctionType::get(RetTy, ArgTys, false);
       FC = M.getOrInsertFunction(Name, NewAllocationFnTy);
+      Function *F = cast<Function>(FC.getCallee());
     }
     return FC;
   }
@@ -256,6 +261,14 @@ class GPUSanImpl final {
     return getOrCreateFn(ThreadIDFn, "ompx_global_thread_id", Int32Ty, {});
   }
 
+  CallInst *createCall(IRBuilder<> &IRB, FunctionCallee Callee,
+                       ArrayRef<Value *> Args = std::nullopt,
+                       const Twine &Name = "") {
+    Calls.push_back(IRB.CreateCall(Callee, Args, Name));
+    return Calls.back();
+  }
+  SmallVector<CallInst *> Calls;
+
   Module &M;
   FunctionAnalysisManager &FAM;
   LLVMContext &Ctx;
@@ -470,8 +483,8 @@ void GPUSanImpl::getAllocationInfo(Function &Fn, PtrOrigin PO, Value &Object,
     else
       IP = &*Fn.getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
     IRBuilder<> IRB(IP);
-    auto *CB = IRB.CreateCall(getAllocationInfoFn(PO),
-                              {IRB.CreateAddrSpaceCast(&Object, getPtrTy(PO))});
+    auto *CB = createCall(IRB, getAllocationInfoFn(PO),
+                          {IRB.CreateAddrSpaceCast(&Object, getPtrTy(PO))});
     It.Start = IRB.CreateExtractValue(CB, {0});
     It.Length = IRB.CreateExtractValue(CB, {1});
     It.Tag = IRB.CreateExtractValue(CB, {2});
@@ -525,7 +538,7 @@ bool GPUSanImpl::instrumentGlobals() {
                        GlobalValue::PrivateLinkage, "__san.dtor", &M);
   BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", DtorFn);
   IRBuilder<> IRB(Entry);
-  IRB.CreateCall(getLeakCheckFn());
+  createCall(IRB, getLeakCheckFn());
   IRB.CreateRetVoid();
   appendToGlobalDtors(M, DtorFn, 0, nullptr);
 
@@ -550,10 +563,10 @@ Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size,
   Value *PlainI = IRB.CreatePointerBitCastOrAddrSpaceCast(&I, getPtrTy(PO));
   static int AllocationId = 1;
   auto *CB =
-      IRB.CreateCall(Fn,
-                     {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++),
-                      getSourceIndex(I)},
-                     I.getName() + ".san");
+      createCall(IRB, Fn,
+                 {PlainI, &Size, ConstantInt::get(Int64Ty, AllocationId++),
+                  getSourceIndex(I)},
+                 I.getName() + ".san");
   SmallVector<LifetimeIntrinsic *> Lifetimes;
   I.replaceUsesWithIf(
       IRB.CreatePointerBitCastOrAddrSpaceCast(CB, I.getType()), [&](Use &U) {
@@ -570,10 +583,10 @@ Value *GPUSanImpl::instrumentAllocation(Instruction &I, Value &Size,
   for (auto *LT : Lifetimes) {
     if (LT->getIntrinsicID() == Intrinsic::lifetime_start) {
       IRB.SetInsertPoint(LT);
-      IRB.CreateCall(getLifetimeStart(), {CB, LT->getArgOperand(0)});
+      createCall(IRB, getLifetimeStart(), {CB, LT->getArgOperand(0)});
     } else {
       IRB.SetInsertPoint(LT);
-      IRB.CreateCall(getLifetimeEnd(), {CB, LT->getArgOperand(0)});
+      createCall(IRB, getLifetimeEnd(), {CB, LT->getArgOperand(0)});
     }
   }
   return CB;
@@ -602,6 +615,12 @@ void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
     getAllocationInfo(*I.getFunction(), PO, *const_cast<Value *>(Object), Start,
                       Length, Tag);
 
+  if (Loop *L = LI.getLoopFor(I.getParent())) {
+    auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(*I.getFunction());
+    const auto &LD = SE.getLoopDisposition(SE.getSCEVAtScope(PtrOp, L), L);
+    LD->
+  }
+
   static int32_t ReadAccessId = -1;
   static int32_t WriteAccessId = 1;
   const int32_t &AccessId = IsRead ? ReadAccessId-- : WriteAccessId++;
@@ -614,16 +633,15 @@ void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
       IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
   CallInst *CB;
   if (Start) {
-    CB =
-        IRB.CreateCall(getCheckWithBaseFn(PO),
-                       {PlainPtrOp, Start, Length, Tag, Size,
-                        ConstantInt::get(Int64Ty, AccessId), getSourceIndex(I)},
-                       I.getName() + ".san");
+    CB = createCall(IRB, getCheckWithBaseFn(PO),
+                    {PlainPtrOp, Start, Length, Tag, Size,
+                     ConstantInt::get(Int64Ty, AccessId), getSourceIndex(I)},
+                    I.getName() + ".san");
   } else {
-    CB = IRB.CreateCall(getCheckFn(PO),
-                        {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId),
-                         getSourceIndex(I)},
-                        I.getName() + ".san");
+    CB = createCall(IRB, getCheckFn(PO),
+                    {PlainPtrOp, Size, ConstantInt::get(Int64Ty, AccessId),
+                     getSourceIndex(I)},
+                    I.getName() + ".san");
   }
   I.setOperand(PtrIdx,
                IRB.CreatePointerBitCastOrAddrSpaceCast(CB, PtrOp->getType()));
@@ -651,9 +669,9 @@ void GPUSanImpl::instrumentGEPInst(LoopInfo &LI, GetElementPtrInst &GEP) {
   IRBuilder<> IRB(GEP.getNextNode());
   Value *PlainPtrOp =
       IRB.CreatePointerBitCastOrAddrSpaceCast(PtrOp, getPtrTy(PO));
-  auto *CB = IRB.CreateCall(getGEPFn(PO),
-                            {PlainPtrOp, UndefValue::get(Int64Ty), getPC(IRB)},
-                            GEP.getName() + ".san");
+  auto *CB = createCall(IRB, getGEPFn(PO),
+                        {PlainPtrOp, UndefValue::get(Int64Ty), getPC(IRB)},
+                        GEP.getName() + ".san");
   GEP.replaceAllUsesWith(
       IRB.CreatePointerBitCastOrAddrSpaceCast(CB, GEP.getType()));
   Value *Offset =
@@ -681,8 +699,8 @@ bool GPUSanImpl::instrumentCallInst(LoopInfo &LI, CallInst &CI) {
           continue;
         Value *PlainOp =
             IRB.CreatePointerBitCastOrAddrSpaceCast(Op, getPtrTy(PO));
-        auto *CB = IRB.CreateCall(getUnpackFn(PO), {PlainOp, getPC(IRB)},
-                                  Op->getName() + ".unpack");
+        auto *CB = createCall(IRB, getUnpackFn(PO), {PlainOp, getPC(IRB)},
+                              Op->getName() + ".unpack");
         CI.setArgOperand(
             I, IRB.CreatePointerBitCastOrAddrSpaceCast(CB, Op->getType()));
         Changed = true;
@@ -763,8 +781,8 @@ void GPUSanImpl::instrumentReturns(
     return;
   for (auto *RI : Returns) {
     IRBuilder<> IRB(RI);
-    IRB.CreateCall(getFreeNLocalFn(),
-                   {ConstantInt::get(Int32Ty, Allocas.size())});
+    createCall(IRB, getFreeNLocalFn(),
+               {ConstantInt::get(Int32Ty, Allocas.size())});
   }
 }
 
@@ -783,9 +801,13 @@ bool GPUSanImpl::instrument() {
     if (Fn.hasFnAttribute("kernel"))
       Kernels.push_back(&Fn);
     if (!Fn.getName().contains("ompx") && !Fn.getName().contains("__kmpc") &&
-        !Fn.getName().starts_with("rpc_"))
-      if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
+        !Fn.getName().starts_with("rpc_")) {
+      if (!Fn.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) {
         Changed |= instrumentFunction(Fn);
+      } else if (!Fn.isDeclaration() &&
+                 Fn.getName().contains("SanitizerTrapInfoTy")) {
+      }
+    }
   }
 
   SmallVector<CallBase *> AmbiguousCallsOrdered;
@@ -836,7 +858,7 @@ bool GPUSanImpl::instrument() {
     }
     auto *EntryBB = BasicBlock::Create(Ctx, "entry", LocationGetter);
     IRBuilder<> IRB(EntryBB);
-    Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid");
+    Value *Idx = createCall(IRB, getThreadIdFn(), {}, "san.gtid");
     Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx});
     auto *LocationValue = IRB.CreateLoad(Int64Ty, Ptr);
     IRB.CreateRet(LocationValue);
@@ -848,7 +870,7 @@ bool GPUSanImpl::instrument() {
   auto *EntryBB = BasicBlock::Create(Ctx, "entry", InitSharedFn);
   IRBuilder<> IRB(EntryBB);
   if (!AmbiguousCalls.empty()) {
-    Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid");
+    Value *Idx = createCall(IRB, getThreadIdFn(), {}, "san.gtid");
     Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx});
     IRB.CreateStore(ConstantInt::get(Int64Ty, 0), Ptr);
   }
@@ -856,12 +878,12 @@ bool GPUSanImpl::instrument() {
 
   for (auto *KernelFn : Kernels) {
     IRBuilder<> IRB(&*KernelFn->getEntryBlock().getFirstNonPHIOrDbgOrAlloca());
-    IRB.CreateCall(InitSharedFn, {});
+    createCall(IRB, InitSharedFn, {});
   }
 
   for (const auto &It : llvm::enumerate(AmbiguousCallsOrdered)) {
     IRBuilder<> IRB(It.value());
-    Value *Idx = IRB.CreateCall(getThreadIdFn(), {}, "san.gtid");
+    Value *Idx = createCall(IRB, getThreadIdFn(), {}, "san.gtid");
     Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx});
     Value *OldVal = IRB.CreateLoad(Int64Ty, Ptr);
     Value *OldValShifted = IRB.CreateShl(
@@ -888,7 +910,12 @@ bool GPUSanImpl::instrument() {
       GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
   GV->setVisibility(GlobalValue::ProtectedVisibility);
 
-  M.dump();
+  for (auto *CI : Calls) {
+    InlineFunctionInfo IFI;
+    if (InlineFunction(*CI, IFI).isSuccess())
+      Changed = true;
+  }
+
   return Changed;
 }
 
@@ -899,6 +926,5 @@ PreservedAnalyses GPUSanPass::run(Module &M, ModuleAnalysisManager &AM) {
   if (!Lowerer.instrument())
     return PreservedAnalyses::all();
   LLVM_DEBUG(M.dump());
-
   return PreservedAnalyses::none();
 }
diff --git a/offload/DeviceRTL/src/Sanitizer.cpp b/offload/DeviceRTL/src/Sanitizer.cpp
index 2acf15c22d992..3260b92d583f3 100644
--- a/offload/DeviceRTL/src/Sanitizer.cpp
+++ b/offload/DeviceRTL/src/Sanitizer.cpp
@@ -12,6 +12,7 @@
 #include "DeviceUtils.h"
 #include "Interface.h"
 #include "LibC.h"
+#include "Mapping.h"
 #include "Shared/Environment.h"
 #include "Synchronization.h"
 
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index 79e3af2d33bbd..d28f3f43922cd 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -21,6 +21,8 @@ int ompx_thread_id(int Dim);
 int64_t __san_get_location_value();
 }
 
+#define INLINE gnu::always_inline
+
 enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL };
 
 template <AllocationKind AK> struct ASTypes {
@@ -182,7 +184,7 @@ struct SanitizerTrapInfoTy {
   int64_t CallId;
   /// }
 
-  [[clang::disable_sanitizer_instrumentation]] void
+  [[clang::disable_sanitizer_instrumentation, INLINE]] void
   setCoordinates(int64_t SourceId) {
     for (int32_t Dim = 0; Dim < 3; ++Dim) {
       BlockId[Dim] = ompx_block_id(Dim);
@@ -193,7 +195,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void
+  [[clang::disable_sanitizer_instrumentation, INLINE]] void
   allocationError(ErrorCodeTy EC, _AS_PTR(void, AK) Start, uint64_t Length,
                   int64_t Id, int64_t Tag, uint64_t Slot, int64_t SourceId) {
     AllocationStart = (void *)Start;
@@ -208,7 +210,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, gnu::always_inline]] void
+  [[clang::disable_sanitizer_instrumentation, INLINE]] void
   propagateAccessError(ErrorCodeTy EC, const AllocationTy<AK> &A,
                        const AllocationPtrTy<AK> &AP, uint64_t Size, int64_t Id,
                        int64_t SourceId) {
@@ -232,7 +234,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   exceedsAllocationLength(_AS_PTR(void, AK) Start, uint64_t Length,
                           int64_t AllocationId, uint64_t Slot,
                           int64_t SourceId) {
@@ -242,7 +244,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   exceedsAllocationSlots(_AS_PTR(void, AK) Start, uint64_t Length,
                          int64_t AllocationId, uint64_t Slot,
                          int64_t SourceId) {
@@ -252,7 +254,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   pointerOutsideAllocation(_AS_PTR(void, AK) Start, uint64_t Length,
                            int64_t AllocationId, uint64_t Slot, uint64_t PC) {
     allocationError<AK>(PointerOutsideAllocation, Start, Length, AllocationId,
@@ -261,7 +263,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   outOfBoundAccess(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
                    uint64_t Size, int64_t AccessId, int64_t SourceId) {
     propagateAccessError(OutOfBounds, A, AP, Size, AccessId, SourceId);
@@ -269,7 +271,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   useAfterScope(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
                 uint64_t Size, int64_t AccessId, int64_t SourceId) {
     propagateAccessError(UseAfterScope, A, AP, Size, AccessId, SourceId);
@@ -277,7 +279,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   useAfterFree(const AllocationTy<AK> A, const AllocationPtrTy<AK> AP,
                uint64_t Size, int64_t AccessId, int64_t SourceId) {
     propagateAccessError(UseAfterFree, A, AP, Size, AccessId, SourceId);
@@ -285,12 +287,12 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   accessError(const AllocationPtrTy<AK> AP, int64_t Size, int64_t AccessId,
               int64_t SourceId);
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   garbagePointer(const AllocationPtrTy<AK> AP, void *P, int64_t SourceId) {
     ErrorCode = GarbagePointer;
     AllocationStart = P;
@@ -304,7 +306,7 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
   memoryLeak(const AllocationTy<AK> A, uint64_t Slot) {
     allocationError<AK>(MemoryLeak, A.Start, A.Length, A.Id, A.Tag, Slot,
                         /*SourceId=*/-1);
@@ -341,7 +343,7 @@ getAllocation(const AllocationPtrTy<AK> AP, int64_t AccessId = 0) {
 }
 
 template <enum AllocationKind AK>
-[[clang::disable_sanitizer_instrumentation, noreturn, gnu::noinline]] void
+[[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
 SanitizerTrapInfoTy::accessError(const AllocationPtrTy<AK> AP, int64_t Size,
                                  int64_t AccessId, int64_t SourceId) {
   auto &A = getAllocationArray<AK>().Arr[AP.AllocationId];
diff --git a/offload/include/Shared/Types.h b/offload/include/Shared/Types.h
index 1503a4b2a1437..15e3cfefa37ed 100644
--- a/offload/include/Shared/Types.h
+++ b/offload/include/Shared/Types.h
@@ -13,10 +13,10 @@
 #ifndef OMPTARGET_SHARED_TYPES_H
 #define OMPTARGET_SHARED_TYPES_H
 
-#ifdef OMPTARGET_DEVICE_RUNTIME
-#include "DeviceTypes.h"
-#else
+#ifndef OMPTARGET_DEVICE_RUNTIME
 #include <cstdint>
+#else
+#include "DeviceTypes.h"
 #endif
 
 #endif // OMPTARGET_SHARED_TYPES_H

>From 6a82721324c63740b5c865362adb07f8f01e5393 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 12 Jul 2024 17:33:33 -0700
Subject: [PATCH 24/31] Fix build, add shuffle

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp |  1 -
 offload/DeviceRTL/include/DeviceUtils.h       |  2 +-
 offload/DeviceRTL/src/Mapping.cpp             | 12 ++++++++
 offload/DeviceRTL/src/Utils.cpp               | 14 ++++-----
 offload/DeviceRTL/src/Workshare.cpp           |  2 +-
 openmp/runtime/src/include/ompx.h.var         | 30 +++++++++++++++++++
 6 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 3b01337d28c9f..26a6dc7ea33d0 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -618,7 +618,6 @@ void GPUSanImpl::instrumentAccess(LoopInfo &LI, Instruction &I, int PtrIdx,
   if (Loop *L = LI.getLoopFor(I.getParent())) {
     auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(*I.getFunction());
     const auto &LD = SE.getLoopDisposition(SE.getSCEVAtScope(PtrOp, L), L);
-    LD->
   }
 
   static int32_t ReadAccessId = -1;
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
index 7b8871a766161..9581ba83604db 100644
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -22,7 +22,7 @@ namespace utils {
 
 /// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
 /// is identified by \p Mask.
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
 
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
 
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 4d89c11ee7e0f..06318635e0170 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -376,6 +376,18 @@ uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
   return utils::ballotSync(mask, pred);
 }
 
+#define _TGT_KERNEL_LANGUAGE_SHFL_SYNC(TYPE, TY)                               \
+  TYPE ompx_shfl_sync_##TY(uint64_t mask, TYPE var, int src, int width) {      \
+    return utils::shuffle(mask, var, src, width);                              \
+  }
+
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(int, i)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(float, f)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(long, l)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(double, d)
+
+#undef _TGT_KERNEL_LANGUAGE_SHFL_SYNC
+
 int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
   return utils::shuffleDown(mask, var, delta, width);
 }
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp
index 956e6200ffd5c..baefce4c65b68 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/Utils.cpp
@@ -34,7 +34,7 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
   return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
 }
 
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                     int32_t Width);
 
@@ -45,8 +45,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
-  int Width = mapping::getWarpSize();
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
   int Self = mapping::getThreadIdInWarp();
   int Index = SrcLane + (Self & ~(Width - 1));
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
@@ -82,8 +81,8 @@ bool isThreadLocalMemPtr(const void *Ptr) {
         device = {arch(nvptx, nvptx64)},                                       \
             implementation = {extension(match_any)})
 
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
-  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width);
 }
 
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
@@ -111,8 +110,9 @@ void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
   impl::Unpack(Val, &LowBits, &HighBits);
 }
 
-int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
-  return impl::shuffle(Mask, Var, SrcLane);
+int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
+                       int32_t Width) {
+  return impl::shuffle(Mask, Var, SrcLane, Width);
 }
 
 int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index fd835f2af677e..26238e7c7bc82 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -348,7 +348,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
     if (rank == 0) {
       warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
     }
-    warp_res = utils::shuffle(active, warp_res, leader);
+    warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
     return warp_res + rank;
   }
 
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 623f0b9c315bd..2d3ad31188ced 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -139,6 +139,20 @@ _TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim)
 
 uint64_t ompx_ballot_sync(uint64_t mask, int pred);
 
+/// ompx_shfl_sync_{i,f,l,d}
+///{
+#define _TGT_KERNEL_LANGUAGE_SHFL_SYNC(TYPE, TY)                          \
+  TYPE ompx_shfl_sync_##TY(uint64_t mask, TYPE var, int src,                   \
+                            int width);
+
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(int, i)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(float, f)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(long, l)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(double, d)
+
+#undef _TGT_KERNEL_LANGUAGE_SHFL_SYNC
+///}
+
 /// ompx_shfl_down_sync_{i,f,l,d}
 ///{
 #define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY)                          \
@@ -208,6 +222,22 @@ static inline uint64_t ballot_sync(uint64_t mask, int pred) {
   return ompx_ballot_sync(mask, pred);
 }
 
+/// shfl_sync: C++ overloads that forward to ompx_shfl_sync_{i,f,l,d}
+///{
+#define _TGT_KERNEL_LANGUAGE_SHFL_SYNC(TYPE, TY)                          \
+  static inline TYPE shfl_sync(uint64_t mask, TYPE var, int src,               \
+                                    int width = __WARP_SIZE) {                 \
+    return ompx_shfl_sync_##TY(mask, var, src, width);                         \
+  }
+
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(int, i)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(float, f)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(long, l)
+_TGT_KERNEL_LANGUAGE_SHFL_SYNC(double, d)
+
+#undef _TGT_KERNEL_LANGUAGE_SHFL_SYNC
+///}
+
 /// shfl_down_sync
 ///{
 #define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY)                          \

>From 06b4d0c371ece0d0112b268e8c142947e4c7971c Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 17 Jul 2024 13:04:13 -0700
Subject: [PATCH 25/31] Tune inlining, improve perf for -g

---
 .../lib/Transforms/Instrumentation/GPUSan.cpp | 20 ++++++++++++++-----
 offload/include/Shared/Sanitizer.h            | 18 +++++++++++------
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
index 26a6dc7ea33d0..728396654e75b 100644
--- a/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GPUSan.cpp
@@ -231,6 +231,8 @@ class GPUSanImpl final {
   }
   FunctionCallee getAllocationInfoFn(PtrOrigin PO) {
     assert(PO >= LOCAL && PO <= GLOBAL && "Origin does not need handling.");
+    if (auto *F = M.getFunction("ompx_get_allocation_info" + getSuffix(PO)))
+      return FunctionCallee(F->getFunctionType(), F);
     return getOrCreateFn(
         AllocationInfoFn[PO], "ompx_get_allocation_info" + getSuffix(PO),
         StructType::create({getPtrTy(PO), Int64Ty, Int32Ty}), {getPtrTy(PO)});
@@ -872,13 +874,14 @@ bool GPUSanImpl::instrument() {
     Value *Idx = createCall(IRB, getThreadIdFn(), {}, "san.gtid");
     Value *Ptr = IRB.CreateGEP(Int64Ty, LocationsArray, {Idx});
     IRB.CreateStore(ConstantInt::get(Int64Ty, 0), Ptr);
-  }
-  IRB.CreateRetVoid();
 
-  for (auto *KernelFn : Kernels) {
-    IRBuilder<> IRB(&*KernelFn->getEntryBlock().getFirstNonPHIOrDbgOrAlloca());
-    createCall(IRB, InitSharedFn, {});
+    for (auto *KernelFn : Kernels) {
+      IRBuilder<> IRB(
+          &*KernelFn->getEntryBlock().getFirstNonPHIOrDbgOrAlloca());
+      createCall(IRB, InitSharedFn, {});
+    }
   }
+  IRB.CreateRetVoid();
 
   for (const auto &It : llvm::enumerate(AmbiguousCallsOrdered)) {
     IRBuilder<> IRB(It.value());
@@ -910,6 +913,13 @@ bool GPUSanImpl::instrument() {
   GV->setVisibility(GlobalValue::ProtectedVisibility);
 
   for (auto *CI : Calls) {
+    if (!CI->getCalledFunction()) { // No direct callee to inline; skip it.
+      LLVM_DEBUG(CI->dump());
+      continue;
+    }
+    //  if (!CI->getCalledFunction()->getName().contains("gep") &&
+    //      !CI->getCalledFunction()->getName().contains("info"))
+    //    continue;
     InlineFunctionInfo IFI;
     if (InlineFunction(*CI, IFI).isSuccess())
       Changed = true;
diff --git a/offload/include/Shared/Sanitizer.h b/offload/include/Shared/Sanitizer.h
index d28f3f43922cd..e24e687863279 100644
--- a/offload/include/Shared/Sanitizer.h
+++ b/offload/include/Shared/Sanitizer.h
@@ -22,6 +22,7 @@ int64_t __san_get_location_value();
 }
 
 #define INLINE gnu::always_inline
+#define NOINLINE gnu::noinline
 
 enum class AllocationKind { LOCAL, GLOBAL, LAST = GLOBAL };
 
@@ -234,7 +235,8 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE,
+    gnu::cold]] void
   exceedsAllocationLength(_AS_PTR(void, AK) Start, uint64_t Length,
                           int64_t AllocationId, uint64_t Slot,
                           int64_t SourceId) {
@@ -244,7 +246,8 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE,
+    gnu::cold]] void
   exceedsAllocationSlots(_AS_PTR(void, AK) Start, uint64_t Length,
                          int64_t AllocationId, uint64_t Slot,
                          int64_t SourceId) {
@@ -254,7 +257,8 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE,
+    gnu::cold]] void
   pointerOutsideAllocation(_AS_PTR(void, AK) Start, uint64_t Length,
                            int64_t AllocationId, uint64_t Slot, uint64_t PC) {
     allocationError<AK>(PointerOutsideAllocation, Start, Length, AllocationId,
@@ -287,12 +291,14 @@ struct SanitizerTrapInfoTy {
   }
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE,
+    gnu::cold]] void
   accessError(const AllocationPtrTy<AK> AP, int64_t Size, int64_t AccessId,
               int64_t SourceId);
 
   template <enum AllocationKind AK>
-  [[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
+  [[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE,
+    gnu::cold]] void
   garbagePointer(const AllocationPtrTy<AK> AP, void *P, int64_t SourceId) {
     ErrorCode = GarbagePointer;
     AllocationStart = P;
@@ -343,7 +349,7 @@ getAllocation(const AllocationPtrTy<AK> AP, int64_t AccessId = 0) {
 }
 
 template <enum AllocationKind AK>
-[[clang::disable_sanitizer_instrumentation, noreturn, INLINE, gnu::cold]] void
+[[clang::disable_sanitizer_instrumentation, noreturn, NOINLINE, gnu::cold]] void
 SanitizerTrapInfoTy::accessError(const AllocationPtrTy<AK> AP, int64_t Size,
                                  int64_t AccessId, int64_t SourceId) {
   auto &A = getAllocationArray<AK>().Arr[AP.AllocationId];

>From c004067f181c3cf27f2c0b4bca0024ce67578c81 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Thu, 18 Jul 2024 22:49:45 -0700
Subject: [PATCH 26/31] [OpenMP][NFC] Precommit test auto-update

---
 clang/test/OpenMP/parallel_codegen.cpp        | 206 ++--
 .../OpenMP/target_parallel_debug_codegen.cpp  | 652 ++++++-------
 .../target_parallel_for_debug_codegen.cpp     | 894 +++++++++---------
 ...target_parallel_generic_loop_codegen-3.cpp | 894 +++++++++---------
 4 files changed, 1323 insertions(+), 1323 deletions(-)

diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp
index 9082f1c3232af..867b250332fd5 100644
--- a/clang/test/OpenMP/parallel_codegen.cpp
+++ b/clang/test/OpenMP/parallel_codegen.cpp
@@ -115,7 +115,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 // CHECK1-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]])
-// CHECK1-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
+// CHECK1-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr @global, align 4
 // CHECK1-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
@@ -123,7 +123,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    ret void
 // CHECK1:       terminate.lpad:
 // CHECK1-NEXT:    [[TMP4:%.*]] = landingpad { ptr, i32 }
-// CHECK1-NEXT:    catch ptr null
+// CHECK1-NEXT:            catch ptr null
 // CHECK1-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0
 // CHECK1-NEXT:    call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR6:[0-9]+]]
 // CHECK1-NEXT:    unreachable
@@ -186,7 +186,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
 // CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 // CHECK1-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP3]])
-// CHECK1-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
+// CHECK1-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
 // CHECK1-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
@@ -194,7 +194,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    ret void
 // CHECK1:       terminate.lpad:
 // CHECK1-NEXT:    [[TMP5:%.*]] = landingpad { ptr, i32 }
-// CHECK1-NEXT:    catch ptr null
+// CHECK1-NEXT:            catch ptr null
 // CHECK1-NEXT:    [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0
 // CHECK1-NEXT:    call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR6]]
 // CHECK1-NEXT:    unreachable
@@ -233,7 +233,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
 // CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 // CHECK1-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]])
-// CHECK1-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
+// CHECK1-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr @global, align 4
 // CHECK1-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
@@ -241,7 +241,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    ret void
 // CHECK1:       terminate.lpad:
 // CHECK1-NEXT:    [[TMP4:%.*]] = landingpad { ptr, i32 }
-// CHECK1-NEXT:    catch ptr null
+// CHECK1-NEXT:            catch ptr null
 // CHECK1-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0
 // CHECK1-NEXT:    call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR6]]
 // CHECK1-NEXT:    unreachable
@@ -278,7 +278,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8
 // CHECK1-NEXT:    invoke void @_Z3fooIPPcEvT_(ptr noundef [[TMP2]])
-// CHECK1-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
+// CHECK1-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VAR]], align 8
 // CHECK1-NEXT:    [[TMP4:%.*]] = mul nsw i64 0, [[TMP1]]
@@ -287,7 +287,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    ret void
 // CHECK1:       terminate.lpad:
 // CHECK1-NEXT:    [[TMP5:%.*]] = landingpad { ptr, i32 }
-// CHECK1-NEXT:    catch ptr null
+// CHECK1-NEXT:            catch ptr null
 // CHECK1-NEXT:    [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0
 // CHECK1-NEXT:    call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR6]]
 // CHECK1-NEXT:    unreachable
@@ -311,17 +311,17 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[__VLA_EXPR0:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK2-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGV_ADDR]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGV_ADDR]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21:![0-9]+]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !dbg [[DBG22:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG23:![0-9]+]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = call ptr @llvm.stacksave.p0(), !dbg [[DBG23]]
 // CHECK2-NEXT:    store ptr [[TMP2]], ptr [[SAVED_STACK]], align 8, !dbg [[DBG23]]
 // CHECK2-NEXT:    [[VLA:%.*]] = alloca i32, i64 [[TMP1]], align 16, !dbg [[DBG23]]
 // CHECK2-NEXT:    store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8, !dbg [[DBG23]]
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]]
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG31:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG31:![0-9]+]]
 // CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @main.omp_outlined, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG32:![0-9]+]]
 // CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB5:[0-9]+]], i32 1, ptr @main.omp_outlined.2, i64 [[TMP1]]), !dbg [[DBG33:![0-9]+]]
 // CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB9:[0-9]+]], i32 2, ptr @main.omp_outlined.4, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG34:![0-9]+]]
@@ -342,19 +342,19 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META47:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META47:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META49:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META49:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG53:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG53]]
 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG54:![0-9]+]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG54]]
 // CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]])
-// CHECK2-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG53]]
+// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG53]]
 // CHECK2:       invoke.cont:
 // CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG55:![0-9]+]]
 // CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG56:![0-9]+]]
@@ -362,7 +362,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    ret void, !dbg [[DBG55]]
 // CHECK2:       terminate.lpad:
 // CHECK2-NEXT:    [[TMP4:%.*]] = landingpad { ptr, i32 }
-// CHECK2-NEXT:    catch ptr null, !dbg [[DBG53]]
+// CHECK2-NEXT:            catch ptr null, !dbg [[DBG53]]
 // CHECK2-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG53]]
 // CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR7:[0-9]+]], !dbg [[DBG53]]
 // CHECK2-NEXT:    unreachable, !dbg [[DBG53]]
@@ -373,7 +373,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG64:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG64:![0-9]+]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG65:![0-9]+]]
 //
 //
@@ -392,13 +392,13 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG72:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG72]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG72]]
@@ -418,19 +418,19 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[SAVED_STACK:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[__VLA_EXPR0:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG82:![0-9]+]]
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[GLOBAL]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = call ptr @llvm.stacksave.p0(), !dbg [[DBG82]]
 // CHECK2-NEXT:    store ptr [[TMP1]], ptr [[SAVED_STACK]], align 8, !dbg [[DBG82]]
 // CHECK2-NEXT:    [[VLA1:%.*]] = alloca i32, i64 [[TMP0]], align 16, !dbg [[DBG82]]
 // CHECK2-NEXT:    store i64 [[TMP0]], ptr [[__VLA_EXPR0]], align 8, !dbg [[DBG82]]
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA1]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA1]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
 // CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 3, ptr @main.omp_outlined_debug__.1.omp_outlined, i64 [[TMP0]], ptr [[VLA1]], ptr [[GLOBAL]]), !dbg [[DBG82]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8, !dbg [[DBG86:![0-9]+]]
 // CHECK2-NEXT:    call void @llvm.stackrestore.p0(ptr [[TMP2]]), !dbg [[DBG86]]
@@ -446,22 +446,22 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[GLOBAL_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META96:![0-9]+]], metadata !DIExpression()), !dbg [[DBG97:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META96:![0-9]+]], metadata !DIExpression()), !dbg [[DBG97:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[GLOBAL]], ptr [[GLOBAL_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99:![0-9]+]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG100:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG100]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG100]]
 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG101:![0-9]+]]
 // CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG101]]
 // CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP3]])
-// CHECK2-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG100]]
+// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG100]]
 // CHECK2:       invoke.cont:
 // CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG102:![0-9]+]]
 // CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG103:![0-9]+]]
@@ -469,7 +469,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    ret void, !dbg [[DBG102]]
 // CHECK2:       terminate.lpad:
 // CHECK2-NEXT:    [[TMP5:%.*]] = landingpad { ptr, i32 }
-// CHECK2-NEXT:    catch ptr null, !dbg [[DBG100]]
+// CHECK2-NEXT:            catch ptr null, !dbg [[DBG100]]
 // CHECK2-NEXT:    [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg [[DBG100]]
 // CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR7]], !dbg [[DBG100]]
 // CHECK2-NEXT:    unreachable, !dbg [[DBG100]]
@@ -484,15 +484,15 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[GLOBAL_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META106:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META106:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META109:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META109:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
 // CHECK2-NEXT:    store ptr [[GLOBAL]], ptr [[GLOBAL_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG112:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG112]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG112]]
@@ -511,11 +511,11 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG118:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG118]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG118]]
@@ -531,13 +531,13 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META122:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META122:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META123:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META123:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125:![0-9]+]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG126:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG126]]
 // CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB7:[0-9]+]], i32 2, ptr @main.omp_outlined_debug__.3.omp_outlined, i64 [[TMP0]], ptr [[TMP1]]), !dbg [[DBG126]]
@@ -552,19 +552,19 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG134:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG134:![0-9]+]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG135:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG135]]
 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG136:![0-9]+]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG136]]
 // CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]])
-// CHECK2-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG135]]
+// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG135]]
 // CHECK2:       invoke.cont:
 // CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG137:![0-9]+]]
 // CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG138:![0-9]+]]
@@ -572,7 +572,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    ret void, !dbg [[DBG137]]
 // CHECK2:       terminate.lpad:
 // CHECK2-NEXT:    [[TMP4:%.*]] = landingpad { ptr, i32 }
-// CHECK2-NEXT:    catch ptr null, !dbg [[DBG135]]
+// CHECK2-NEXT:            catch ptr null, !dbg [[DBG135]]
 // CHECK2-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG135]]
 // CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR7]], !dbg [[DBG135]]
 // CHECK2-NEXT:    unreachable, !dbg [[DBG135]]
@@ -586,13 +586,13 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG146:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG146]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG146]]
@@ -610,13 +610,13 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG153:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG153]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG153]]
@@ -631,7 +631,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG161:![0-9]+]]
 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 0, !dbg [[DBG161]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !dbg [[DBG161]]
@@ -651,20 +651,20 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[VAR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]]
 // CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173:![0-9]+]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG175:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG175]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG176:![0-9]+]]
 // CHECK2-NEXT:    invoke void @_Z3fooIPPcEvT_(ptr noundef [[TMP2]])
-// CHECK2-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG178:![0-9]+]]
+// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG178:![0-9]+]]
 // CHECK2:       invoke.cont:
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VAR]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VAR]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG186:![0-9]+]]
 // CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VAR]], align 8, !dbg [[DBG187:![0-9]+]]
 // CHECK2-NEXT:    [[TMP4:%.*]] = mul nsw i64 0, [[TMP1]], !dbg [[DBG187]]
 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i64 [[TMP4]], !dbg [[DBG187]]
@@ -672,7 +672,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    ret void, !dbg [[DBG188:![0-9]+]]
 // CHECK2:       terminate.lpad:
 // CHECK2-NEXT:    [[TMP5:%.*]] = landingpad { ptr, i32 }
-// CHECK2-NEXT:    catch ptr null, !dbg [[DBG178]]
+// CHECK2-NEXT:            catch ptr null, !dbg [[DBG178]]
 // CHECK2-NEXT:    [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg [[DBG178]]
 // CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR7]], !dbg [[DBG178]]
 // CHECK2-NEXT:    unreachable, !dbg [[DBG178]]
@@ -683,7 +683,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193:![0-9]+]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG194:![0-9]+]]
 //
 //
@@ -695,13 +695,13 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
 // CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG201:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG201]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG201]]
@@ -812,7 +812,7 @@ int main (int argc, char **argv) {
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_..omp_par
-// CHECK3-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]]
+// CHECK3-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] {
 // CHECK3-NEXT:  omp.par.entry:
 // CHECK3-NEXT:    [[GEP__RELOADED:%.*]] = getelementptr { ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
 // CHECK3-NEXT:    [[LOADGEP__RELOADED:%.*]] = load ptr, ptr [[GEP__RELOADED]], align 8
@@ -860,17 +860,17 @@ int main (int argc, char **argv) {
 // CHECK4-NEXT:    [[__VLA_EXPR0:%.*]] = alloca i64, align 8
 // CHECK4-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK4-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
-// CHECK4-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]]
+// CHECK4-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]]
 // CHECK4-NEXT:    store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8
-// CHECK4-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGV_ADDR]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19]]
+// CHECK4-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGV_ADDR]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG19]]
 // CHECK4-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4, !dbg [[DBG21:![0-9]+]]
 // CHECK4-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG21]]
 // CHECK4-NEXT:    [[TMP2:%.*]] = call ptr @llvm.stacksave.p0(), !dbg [[DBG21]]
 // CHECK4-NEXT:    store ptr [[TMP2]], ptr [[SAVED_STACK]], align 8, !dbg [[DBG21]]
 // CHECK4-NEXT:    [[VLA:%.*]] = alloca i32, i64 [[TMP1]], align 16, !dbg [[DBG21]]
 // CHECK4-NEXT:    store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8, !dbg [[DBG21]]
-// CHECK4-NEXT:    call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24:![0-9]+]]
-// CHECK4-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21]]
+// CHECK4-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24:![0-9]+]]
+// CHECK4-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21]]
 // CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]), !dbg [[DBG29:![0-9]+]]
 // CHECK4-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK4:       omp_parallel:
@@ -921,25 +921,25 @@ int main (int argc, char **argv) {
 // CHECK4-NEXT:  entry:
 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
 // CHECK4-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
-// CHECK4-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG42:![0-9]+]]
+// CHECK4-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG42:![0-9]+]]
 // CHECK4-NEXT:    ret void, !dbg [[DBG42]]
 //
 //
 // CHECK4-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_
-// CHECK4-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat !dbg [[DBG45:![0-9]+]] {
+// CHECK4-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat !dbg [[DBG43:![0-9]+]] {
 // CHECK4-NEXT:  entry:
 // CHECK4-NEXT:    [[STRUCTARG:%.*]] = alloca { ptr, ptr }, align 8
 // CHECK4-NEXT:    [[DOTRELOADED:%.*]] = alloca i64, align 8
 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
 // CHECK4-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK4-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]]
-// CHECK4-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG52:![0-9]+]]
-// CHECK4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 0, !dbg [[DBG52]]
-// CHECK4-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !dbg [[DBG52]]
-// CHECK4-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 0, !dbg [[DBG52]]
-// CHECK4-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1, !dbg [[DBG52]]
-// CHECK4-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64, !dbg [[DBG52]]
-// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]), !dbg [[DBG53:![0-9]+]]
+// CHECK4-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META48:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]]
+// CHECK4-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG50:![0-9]+]]
+// CHECK4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 0, !dbg [[DBG50]]
+// CHECK4-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !dbg [[DBG50]]
+// CHECK4-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 0, !dbg [[DBG50]]
+// CHECK4-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1, !dbg [[DBG50]]
+// CHECK4-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64, !dbg [[DBG50]]
+// CHECK4-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]), !dbg [[DBG51:![0-9]+]]
 // CHECK4-NEXT:    store i64 [[TMP3]], ptr [[DOTRELOADED]], align 8
 // CHECK4-NEXT:    br label [[OMP_PARALLEL:%.*]]
 // CHECK4:       omp_parallel:
@@ -947,16 +947,16 @@ int main (int argc, char **argv) {
 // CHECK4-NEXT:    store ptr [[DOTRELOADED]], ptr [[GEP__RELOADED]], align 8
 // CHECK4-NEXT:    [[GEP_ARGC_ADDR:%.*]] = getelementptr { ptr, ptr }, ptr [[STRUCTARG]], i32 0, i32 1
 // CHECK4-NEXT:    store ptr [[ARGC_ADDR]], ptr [[GEP_ARGC_ADDR]], align 8
-// CHECK4-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z5tmainIPPcEiT_..omp_par, ptr [[STRUCTARG]]), !dbg [[DBG54:![0-9]+]]
+// CHECK4-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z5tmainIPPcEiT_..omp_par, ptr [[STRUCTARG]]), !dbg [[DBG52:![0-9]+]]
 // CHECK4-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
 // CHECK4:       omp.par.outlined.exit:
 // CHECK4-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 // CHECK4:       omp.par.exit.split:
-// CHECK4-NEXT:    ret i32 0, !dbg [[DBG56:![0-9]+]]
+// CHECK4-NEXT:    ret i32 0, !dbg [[DBG54:![0-9]+]]
 //
 //
 // CHECK4-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_..omp_par
-// CHECK4-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG57:![0-9]+]] {
+// CHECK4-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR7:[0-9]+]] !dbg [[DBG55:![0-9]+]] {
 // CHECK4-NEXT:  omp.par.entry:
 // CHECK4-NEXT:    [[GEP__RELOADED:%.*]] = getelementptr { ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
 // CHECK4-NEXT:    [[LOADGEP__RELOADED:%.*]] = load ptr, ptr [[GEP__RELOADED]], align 8
@@ -970,18 +970,18 @@ int main (int argc, char **argv) {
 // CHECK4-NEXT:    [[TMP2:%.*]] = load i64, ptr [[LOADGEP__RELOADED]], align 8
 // CHECK4-NEXT:    br label [[OMP_PAR_REGION:%.*]]
 // CHECK4:       omp.par.region:
-// CHECK4-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[LOADGEP_ARGC_ADDR]], align 8, !dbg [[DBG58:![0-9]+]]
-// CHECK4-NEXT:    call void @_Z3fooIPPcEvT_(ptr noundef [[TMP3]]), !dbg [[DBG58]]
-// CHECK4-NEXT:    call void @llvm.dbg.declare(metadata ptr [[VAR]], metadata [[META60:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67:![0-9]+]]
-// CHECK4-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[VAR]], align 8, !dbg [[DBG67]]
-// CHECK4-NEXT:    [[TMP5:%.*]] = mul nsw i64 0, [[TMP2]], !dbg [[DBG67]]
-// CHECK4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[TMP4]], i64 [[TMP5]], !dbg [[DBG67]]
-// CHECK4-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[ARRAYIDX2]], i64 0, !dbg [[DBG67]]
-// CHECK4-NEXT:    br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]], !dbg [[DBG68:![0-9]+]]
+// CHECK4-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[LOADGEP_ARGC_ADDR]], align 8, !dbg [[DBG56:![0-9]+]]
+// CHECK4-NEXT:    call void @_Z3fooIPPcEvT_(ptr noundef [[TMP3]]), !dbg [[DBG56]]
+// CHECK4-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VAR]], metadata [[META58:![0-9]+]], metadata !DIExpression()), !dbg [[DBG65:![0-9]+]]
+// CHECK4-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[VAR]], align 8, !dbg [[DBG65]]
+// CHECK4-NEXT:    [[TMP5:%.*]] = mul nsw i64 0, [[TMP2]], !dbg [[DBG65]]
+// CHECK4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[TMP4]], i64 [[TMP5]], !dbg [[DBG65]]
+// CHECK4-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[ARRAYIDX2]], i64 0, !dbg [[DBG65]]
+// CHECK4-NEXT:    br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]], !dbg [[DBG66:![0-9]+]]
 // CHECK4:       omp.par.region.parallel.after:
 // CHECK4-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK4:       omp.par.pre_finalize:
-// CHECK4-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG68]]
+// CHECK4-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG66]]
 // CHECK4:       omp.par.outlined.exit.exitStub:
 // CHECK4-NEXT:    ret void
 //
@@ -991,6 +991,6 @@ int main (int argc, char **argv) {
 // CHECK4-NEXT:  entry:
 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
 // CHECK4-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK4-NEXT:    call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73:![0-9]+]]
+// CHECK4-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG73:![0-9]+]]
 // CHECK4-NEXT:    ret void, !dbg [[DBG73]]
 //
diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
index e99c393a4021c..74929bfa29740 100644
--- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
@@ -65,7 +65,7 @@ int main() {
   return 0;
 }
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG32:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG29:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -78,52 +78,52 @@ int main() {
 // CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META48:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META53:![0-9]+]], metadata !DIExpression()), !dbg [[DBG54:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META55:![0-9]+]], metadata !DIExpression()), !dbg [[DBG56:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META52:![0-9]+]], metadata !DIExpression()), !dbg [[DBG53:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META57:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META54:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG61:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG61]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG61]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG61]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG61]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG61]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG61]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG61]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG61]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG61]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG61]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG61]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG61]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG61]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META56:![0-9]+]], metadata !DIExpression()), !dbg [[DBG57:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG58:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG58]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG58]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG58]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG58]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG58]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG58]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG58]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG58]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG58]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG58]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG58]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG58]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG58]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG62:![0-9]+]]
-// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG62]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG62]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG62]]
-// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG62]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG62]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG62]]
-// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG62]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG62]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG62]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG62]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG62]]
-// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB3]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG62]]
-// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG63:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG65:![0-9]+]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]), !dbg [[DBG59:![0-9]+]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG61:![0-9]+]]
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG61]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG61]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG61]]
+// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG61]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG61]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG61]]
+// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG61]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG61]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG61]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG61]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG61]]
+// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB3]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG61]]
+// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG62:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG63:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG61]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG58]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG66:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG64:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -140,83 +140,83 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG75:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG81:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META82:![0-9]+]], metadata !DIExpression()), !dbg [[DBG83:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG84:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG84]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG84]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG84]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG84]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG84]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG84]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG84]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG84]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG84]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG84]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B3]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74]]
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B3]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG84]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG89:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG90:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG90]]
-// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG90]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[DBG89]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META91:![0-9]+]], metadata !DIExpression()), !dbg [[DBG92:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG92]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG94:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 1, !dbg [[DBG95:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG95]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[DBG94]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META96:![0-9]+]], metadata !DIExpression()), !dbg [[DBG97:![0-9]+]]
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG97]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG98:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG99:![0-9]+]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG100:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64, !dbg [[DBG99]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG99]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG101:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG102:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG102]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG103:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG102]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG102]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG104:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG105:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG105]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG106:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM16:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG105]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG105]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG105]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG107:![0-9]+]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG108:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG107]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG107]]
-// CHECK1-NEXT:    store i32 [[TMP11]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG109:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG110:![0-9]+]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG111:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG110]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG110]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG110]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG112:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP15]] to i1, !dbg [[DBG112]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG112]]
-// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP14]], !dbg [[DBG112]]
-// CHECK1-NEXT:    [[TOBOOL24:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG112]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL24]] to i8, !dbg [[DBG112]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG112]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG113:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG81:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG82:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG82]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG82]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B3]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]]
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B3]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG82]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG87:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG88:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG88]]
+// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG88]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[DBG87]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META89:![0-9]+]], metadata !DIExpression()), !dbg [[DBG90:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG90]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META91:![0-9]+]], metadata !DIExpression()), !dbg [[DBG92:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 1, !dbg [[DBG93:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG93]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[DBG92]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG95:![0-9]+]]
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG96:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG97:![0-9]+]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG98:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64, !dbg [[DBG97]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG97]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG99:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG100:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG100]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG101:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG100]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG100]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG102:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG103:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG103]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG104:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM16:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG103]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG103]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG103]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG105:![0-9]+]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG106:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP12]] to i64, !dbg [[DBG105]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG105]]
+// CHECK1-NEXT:    store i32 [[TMP11]], ptr [[ARRAYIDX20]], align 4, !dbg [[DBG107:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG108:![0-9]+]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG109:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG108]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG108]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX23]], align 4, !dbg [[DBG108]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG110:![0-9]+]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP15]] to i1, !dbg [[DBG110]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG110]]
+// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP14]], !dbg [[DBG110]]
+// CHECK1-NEXT:    [[TOBOOL24:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG110]]
+// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL24]] to i8, !dbg [[DBG110]]
+// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG110]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG111:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG114:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG112:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -225,34 +225,34 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META121:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG120:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META123:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META121:![0-9]+]], metadata !DIExpression()), !dbg [[DBG120]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META122:![0-9]+]], metadata !DIExpression()), !dbg [[DBG120]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META125:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META123:![0-9]+]], metadata !DIExpression()), !dbg [[DBG120]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META126:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG120]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META127:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG128:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG128]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG128]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG128]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG128]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META125:![0-9]+]], metadata !DIExpression()), !dbg [[DBG120]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG126:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG126]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG126]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG126]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG126]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG129:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG127:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
@@ -260,31 +260,31 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META130:![0-9]+]], metadata !DIExpression()), !dbg [[DBG131:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG131]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG131]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META136:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG131]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META137:![0-9]+]], metadata !DIExpression()), !dbg [[DBG133]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG138:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG138]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG138]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG138]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG138]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG138]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG138]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG138]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG138]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG138]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP9]]) #[[ATTR4]], !dbg [[DBG138]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG138]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG131]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG136]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG136]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP9]]) #[[ATTR4]], !dbg [[DBG136]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG136]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG139:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG137:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -297,32 +297,32 @@ int main() {
 // CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG145:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG143:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG145:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META148:![0-9]+]], metadata !DIExpression()), !dbg [[DBG149:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META152:![0-9]+]], metadata !DIExpression()), !dbg [[DBG153:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG154:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG154]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG154]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG154]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG154]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG154]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG154]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG154]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG154]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG154]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG154]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG154]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG154]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG154]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG154]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META150:![0-9]+]], metadata !DIExpression()), !dbg [[DBG151:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG152:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG152]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG152]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG152]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG152]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG152]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG152]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG152]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG152]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG152]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG152]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG152]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG152]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG152]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG152]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7:[0-9]+]])
+// CHECK1-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7:[0-9]+]]), !dbg [[DBG153:![0-9]+]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG155:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[TMP11]], ptr [[A_CASTED]], align 4, !dbg [[DBG155]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG155]]
@@ -337,13 +337,13 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP17]], align 8, !dbg [[DBG155]]
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB7]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG155]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG156:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG158:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG157:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG154]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG152]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG159:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG158:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -359,74 +359,74 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META162:![0-9]+]], metadata !DIExpression()), !dbg [[DBG163:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG163]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META163:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META165:![0-9]+]], metadata !DIExpression()), !dbg [[DBG166:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META164:![0-9]+]], metadata !DIExpression()), !dbg [[DBG165:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META167:![0-9]+]], metadata !DIExpression()), !dbg [[DBG168:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META166:![0-9]+]], metadata !DIExpression()), !dbg [[DBG167:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG170:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META168:![0-9]+]], metadata !DIExpression()), !dbg [[DBG169:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG173:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG173]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG173]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG173]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG173]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG173]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG173]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG173]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG173]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG173]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG173]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG173]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG176:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG177:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG177]]
-// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG177]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX4]], ptr [[F]], align 8, !dbg [[DBG176]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG179:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG179]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG182:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG182]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX6]], ptr [[H]], align 8, !dbg [[DBG181]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG184:![0-9]+]]
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG184]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG185:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG186:![0-9]+]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG187:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG186]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG186]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG188:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG189:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG189]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG190:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG189]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG189]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG191:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG192:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG192]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG193:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG192]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG192]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG192]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG194:![0-9]+]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG195:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG194]]
-// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG194]]
-// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[ARRAYIDX19]], align 4, !dbg [[DBG196:![0-9]+]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG197:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG197]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG197]]
-// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG198:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG199:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG172:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG172]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG172]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG172]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG172]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG172]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG172]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG172]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG172]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG172]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG172]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG172]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG176:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG176]]
+// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX3]], i64 0, i64 1, !dbg [[DBG176]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX4]], ptr [[F]], align 8, !dbg [[DBG175]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META177:![0-9]+]], metadata !DIExpression()), !dbg [[DBG178:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG178]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG180:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG181:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX5]], i64 0, i64 1, !dbg [[DBG181]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX6]], ptr [[H]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG183:![0-9]+]]
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG183]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG184:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG185:![0-9]+]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG186:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG185]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 [[IDXPROM]], !dbg [[DBG185]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG187:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG188:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX9]], i64 0, i64 0, !dbg [[DBG188]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG189:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG188]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM11]], !dbg [[DBG188]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG190:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG191:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG192:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP11]] to i64, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG191]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG191]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG193:![0-9]+]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG194:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP13]] to i64, !dbg [[DBG193]]
+// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG193]]
+// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[ARRAYIDX19]], align 4, !dbg [[DBG195:![0-9]+]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG196:![0-9]+]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1, !dbg [[DBG196]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG196]]
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG197:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG198:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG200:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG199:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -435,35 +435,35 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META203:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META202:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META204:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META203:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META205:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META204:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META206:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META205:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META207:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG208:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG208]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG208]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG208]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG208]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META206:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG207:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG207]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG207]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG207]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG207]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG209:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG208:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
@@ -471,32 +471,32 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META210:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META209:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META212:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META211:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META213:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META212:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META214:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META213:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META215:![0-9]+]], metadata !DIExpression()), !dbg [[DBG211]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG216:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG216]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG216]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG216]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG216]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META214:![0-9]+]], metadata !DIExpression()), !dbg [[DBG210]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG215:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG215]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG215]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG215]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG215]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG217:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG216:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -509,49 +509,49 @@ int main() {
 // CHECK1-NEXT:    [[_TMP3:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META222:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META221:![0-9]+]], metadata !DIExpression()), !dbg [[DBG222:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META224:![0-9]+]], metadata !DIExpression()), !dbg [[DBG225:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META223:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG227:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG226:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG229:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG228:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META230:![0-9]+]], metadata !DIExpression()), !dbg [[DBG231:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG232:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG232]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG232]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG232]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG232]]
-// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG232]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG232]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG232]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG232]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META229:![0-9]+]], metadata !DIExpression()), !dbg [[DBG230:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG231:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG231]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG231]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG231]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG231]]
+// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG231]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG231]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG231]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG231]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB11:[0-9]+]])
-// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG233:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG233]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG233]]
-// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG233]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG233]]
-// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG233]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG233]]
-// CHECK1-NEXT:    store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG233]]
-// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB11]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG233]]
-// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG234:![0-9]+]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB11:[0-9]+]]), !dbg [[DBG232:![0-9]+]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG234:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG234]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG234]]
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG234]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG234]]
+// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG234]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG234]]
+// CHECK1-NEXT:    store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG234]]
+// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB11]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG234]]
+// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG235:![0-9]+]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG236:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG232]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG231]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l51_debug___omp_outlined_debug__
@@ -572,17 +572,17 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META240:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META240:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META242:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META242:![0-9]+]], metadata !DIExpression()), !dbg [[DBG241]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META243:![0-9]+]], metadata !DIExpression()), !dbg [[DBG244:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META243:![0-9]+]], metadata !DIExpression()), !dbg [[DBG244:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META247:![0-9]+]], metadata !DIExpression()), !dbg [[DBG248:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META247:![0-9]+]], metadata !DIExpression()), !dbg [[DBG248:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG251:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG251]]
 // CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG251]]
@@ -599,18 +599,18 @@ int main() {
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG251]]
 // CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG251]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG251]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG255:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG255]]
 // CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG255]]
 // CHECK1-NEXT:    store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[DBG254]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META256:![0-9]+]], metadata !DIExpression()), !dbg [[DBG257:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META256:![0-9]+]], metadata !DIExpression()), !dbg [[DBG257:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META258:![0-9]+]], metadata !DIExpression()), !dbg [[DBG259:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META258:![0-9]+]], metadata !DIExpression()), !dbg [[DBG259:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG260:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG260]]
 // CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[DBG259]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META261:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META261:![0-9]+]], metadata !DIExpression()), !dbg [[DBG262:![0-9]+]]
 // CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG262]]
 // CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG263:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG264:![0-9]+]]
@@ -656,17 +656,17 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META284:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META284:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META286:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META286:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META287:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META287:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META288:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META288:![0-9]+]], metadata !DIExpression()), !dbg [[DBG283]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG289:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG289]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG289]]
@@ -694,15 +694,15 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META293:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META293:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META295:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META295:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]]
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META296:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META296:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META297:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META298:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META298:![0-9]+]], metadata !DIExpression()), !dbg [[DBG294]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG299:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG299]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG299]]
diff --git a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp
index 688456a647764..28577d1c585f7 100644
--- a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp
@@ -55,7 +55,7 @@ int main() {
   return 0;
 }
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG22:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG19:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -69,58 +69,58 @@ int main() {
 // CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META43:![0-9]+]], metadata !DIExpression()), !dbg [[DBG44:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META40:![0-9]+]], metadata !DIExpression()), !dbg [[DBG41:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META45:![0-9]+]], metadata !DIExpression()), !dbg [[DBG46:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META42:![0-9]+]], metadata !DIExpression()), !dbg [[DBG43:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META47:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META44:![0-9]+]], metadata !DIExpression()), !dbg [[DBG45:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META49:![0-9]+]], metadata !DIExpression()), !dbg [[DBG50:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META46:![0-9]+]], metadata !DIExpression()), !dbg [[DBG47:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META48:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]]
 // CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8
 // CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META53:![0-9]+]], metadata !DIExpression()), !dbg [[DBG54:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG55:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG55]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG55]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG55]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG55]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG55]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG52:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG52]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG52]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG52]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG52]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG52]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]])
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG56:![0-9]+]]
-// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG56]]
-// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG56]]
-// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG56]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG56]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG57:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG57]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG56]]
-// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG56]]
-// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG59:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG60:![0-9]+]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]), !dbg [[DBG53:![0-9]+]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG55:![0-9]+]]
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG55]]
+// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG55]]
+// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG55]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG55]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG56:![0-9]+]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG56]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG55]]
+// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG55]]
+// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG57:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG58:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG55]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG52]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG61:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG59:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -144,149 +144,149 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG70:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG79:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG79]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG79]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82:![0-9]+]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG79]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META87:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG79]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG88:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG79]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG77:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG77]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG77]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META79:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META82:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG77]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG86:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG77]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG82]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG80]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG80]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG82]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG80]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG80]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG82]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG79]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG79]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG80]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG77]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG77]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG79]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG77]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG79]]
-// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG79]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG77]]
+// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG77]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG89:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG89]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG89]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG94:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG94]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG94]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG93]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG96:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG96]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG98:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG99:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG99]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG98]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG101:![0-9]+]]
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG101]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG102:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG103:![0-9]+]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG104:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG103]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG103]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG105:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG106:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG106]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG107:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG106]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG106]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG108:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG109:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG109]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG109]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG109]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG109]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG111:![0-9]+]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG112:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG111]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG111]]
-// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG113:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG114:![0-9]+]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG114]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG116:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG116]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG116]]
-// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG116]]
-// CHECK1-NEXT:    [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG116]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG116]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG116]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG117:![0-9]+]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG87:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG87]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG87]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META88:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG92:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG92]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG91]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG94:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG94]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG96:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG97:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG97]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99:![0-9]+]]
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG99]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG100:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG101:![0-9]+]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG102:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG101]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG101]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG103:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG104:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG104]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG105:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG104]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG104]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG106:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG107:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG107]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG108:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG107]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG107]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG107]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG109:![0-9]+]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG109]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG109]]
+// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG111:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG112:![0-9]+]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG113:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG112]]
+// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG112]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG112]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG114:![0-9]+]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG114]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG114]]
+// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG114]]
+// CHECK1-NEXT:    [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG114]]
+// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG114]]
+// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG114]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG115:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG88]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG86]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG79]]
-// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG79]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG88]], !llvm.loop [[LOOP118:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG77]]
+// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG86]], !llvm.loop [[LOOP116:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG88]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG86]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG79]]
-// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG79]]
-// CHECK1-NEXT:    store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG79]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG88]], !llvm.loop [[LOOP120:![0-9]+]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG77]]
+// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG77]]
+// CHECK1-NEXT:    store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG86]], !llvm.loop [[LOOP118:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG119:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG121:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG117:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG119:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG122:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG120:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -295,34 +295,34 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META127:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META130:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG136]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG136]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG136]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG134:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG134]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG134]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG134]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG137:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG135:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
@@ -331,35 +331,35 @@ int main() {
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
 // CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG147:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG147]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR4]], !dbg [[DBG147]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG147]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG145]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR4]], !dbg [[DBG145]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG145]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG148:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG146:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -372,32 +372,32 @@ int main() {
 // CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG152:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG163:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG163]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG163]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG163]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG163]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG163]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG163]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG161:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG161]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG161]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG161]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG161]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG161]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG161]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB13:[0-9]+]])
+// CHECK1-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB13:[0-9]+]]), !dbg [[DBG162:![0-9]+]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG164:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[TMP11]], ptr [[A_CASTED]], align 4, !dbg [[DBG164]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG164]]
@@ -412,13 +412,13 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP17]], align 8, !dbg [[DBG164]]
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB13]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG164]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG165:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG167:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG166:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG163]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG161]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG168:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG167:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -441,140 +441,140 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG174:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META176:![0-9]+]], metadata !DIExpression()), !dbg [[DBG177:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META175:![0-9]+]], metadata !DIExpression()), !dbg [[DBG176:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG179:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META177:![0-9]+]], metadata !DIExpression()), !dbg [[DBG178:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG182:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG182]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG182]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG182]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META184:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185:![0-9]+]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META189:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG182]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG190:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG182]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG180:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG181:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG181]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG181]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG181]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META185:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG181]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG189:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG181]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG185]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG185]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG184]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG184]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG185]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG184]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG185]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG184]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG185]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG182]]
-// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG182]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG184]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG181]]
+// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG181]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG182]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG181]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG182]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG182]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG181]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG181]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG191:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG191]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG191]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG194:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG195:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG195]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG195]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG194]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG197]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG199:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG200:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG200]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG199]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202:![0-9]+]]
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG202]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG203:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG204:![0-9]+]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG205:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG204]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG204]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG206:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG207:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG208:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG207]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG209:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG210:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG210]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG211:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG210]]
-// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG210]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG210]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG212:![0-9]+]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG213:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG212]]
-// CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG212]]
-// CHECK1-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG214:![0-9]+]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG215:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG215]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG215]]
-// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG216:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG217:![0-9]+]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG190:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG190]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG190]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META191:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG194:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG194]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG193]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META195:![0-9]+]], metadata !DIExpression()), !dbg [[DBG196:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG196]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META197:![0-9]+]], metadata !DIExpression()), !dbg [[DBG198:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG199:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG199]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG198]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201:![0-9]+]]
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG201]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG202:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG203:![0-9]+]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG204:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG203]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG203]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG205:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG206:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG206]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG207:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG206]]
+// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG206]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG208:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG209:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG209]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG210:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG209]]
+// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG209]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG209]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG211:![0-9]+]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG212:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG211]]
+// CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG211]]
+// CHECK1-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG213:![0-9]+]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG214:![0-9]+]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG214]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG214]]
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG215:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG216:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG190]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG189]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG182]]
-// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG182]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG190]], !llvm.loop [[LOOP218:![0-9]+]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG181]]
+// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG181]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG189]], !llvm.loop [[LOOP217:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG190]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG189]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG182]]
-// CHECK1-NEXT:    store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG182]]
-// CHECK1-NEXT:    store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG182]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG190]], !llvm.loop [[LOOP220:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG181]]
+// CHECK1-NEXT:    store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG181]]
+// CHECK1-NEXT:    store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG181]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG189]], !llvm.loop [[LOOP219:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG219:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG221:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG218:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG220:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG222:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG221:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -583,35 +583,35 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META223:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META222:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META224:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META229:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG230:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG230]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG230]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG230]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG229:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG229]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG229]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG229]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG231:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG230:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
@@ -619,32 +619,32 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META233:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META235:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META239:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG240:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG240]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG240]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG240]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG239]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG239]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG241:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG240:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -657,49 +657,49 @@ int main() {
 // CHECK1-NEXT:    [[_TMP3:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META246:![0-9]+]], metadata !DIExpression()), !dbg [[DBG247:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META248:![0-9]+]], metadata !DIExpression()), !dbg [[DBG249:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META247:![0-9]+]], metadata !DIExpression()), !dbg [[DBG248:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG251:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG253:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META254:![0-9]+]], metadata !DIExpression()), !dbg [[DBG255:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG256:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG256]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG256]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG256]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG256]]
-// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG256]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG256]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG256]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META253:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG255:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG255]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG255]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG255]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB20:[0-9]+]])
-// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG257:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG257]]
-// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG257]]
-// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG257]]
-// CHECK1-NEXT:    store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB20]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG257]]
-// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG258:![0-9]+]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB20:[0-9]+]]), !dbg [[DBG256:![0-9]+]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG258:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG258]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG258]]
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG258]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG258]]
+// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG258]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG258]]
+// CHECK1-NEXT:    store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG258]]
+// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB20]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG258]]
+// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG259:![0-9]+]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG260:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG256]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG255]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__
@@ -727,17 +727,17 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG270:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG270:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG274:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG274:![0-9]+]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG275:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG275]]
 // CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG275]]
@@ -754,16 +754,16 @@ int main() {
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG275]]
 // CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG275]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278:![0-9]+]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META280:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META280:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG275]]
 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG275]]
 // CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG283:![0-9]+]]
@@ -798,18 +798,18 @@ int main() {
 // CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG284:![0-9]+]]
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG284]]
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG284]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG287:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG287:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG288:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG288]]
 // CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG288]]
 // CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG287]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META289:![0-9]+]], metadata !DIExpression()), !dbg [[DBG290:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META289:![0-9]+]], metadata !DIExpression()), !dbg [[DBG290:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG290]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META291:![0-9]+]], metadata !DIExpression()), !dbg [[DBG292:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META291:![0-9]+]], metadata !DIExpression()), !dbg [[DBG292:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG293:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG293]]
 // CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG292]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META294:![0-9]+]], metadata !DIExpression()), !dbg [[DBG295:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META294:![0-9]+]], metadata !DIExpression()), !dbg [[DBG295:![0-9]+]]
 // CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG295]]
 // CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG296:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG297:![0-9]+]]
@@ -877,17 +877,17 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META319:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META319:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META321:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META321:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META322:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META322:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META323:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META323:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META324:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META324:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META325:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META325:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]]
@@ -915,15 +915,15 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META330:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META330:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META334:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META334:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META335:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META335:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]]
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
index 5a34dab313419..b4287487827b9 100644
--- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
@@ -55,7 +55,7 @@ int main() {
   return 0;
 }
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG22:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG19:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -69,58 +69,58 @@ int main() {
 // CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META43:![0-9]+]], metadata !DIExpression()), !dbg [[DBG44:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META40:![0-9]+]], metadata !DIExpression()), !dbg [[DBG41:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META45:![0-9]+]], metadata !DIExpression()), !dbg [[DBG46:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META42:![0-9]+]], metadata !DIExpression()), !dbg [[DBG43:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META47:![0-9]+]], metadata !DIExpression()), !dbg [[DBG48:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META44:![0-9]+]], metadata !DIExpression()), !dbg [[DBG45:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META49:![0-9]+]], metadata !DIExpression()), !dbg [[DBG50:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META46:![0-9]+]], metadata !DIExpression()), !dbg [[DBG47:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META51:![0-9]+]], metadata !DIExpression()), !dbg [[DBG52:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META48:![0-9]+]], metadata !DIExpression()), !dbg [[DBG49:![0-9]+]]
 // CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8
 // CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META53:![0-9]+]], metadata !DIExpression()), !dbg [[DBG54:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG55:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG55]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG55]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG55]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG55]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG55]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG55]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META50:![0-9]+]], metadata !DIExpression()), !dbg [[DBG51:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG52:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG52]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG52]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG52]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG52]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG52]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG52]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]])
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG56:![0-9]+]]
-// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG56]]
-// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG56]]
-// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG56]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG56]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG56]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG57:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG57]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG56]]
-// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG56]]
-// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG59:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG60:![0-9]+]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]), !dbg [[DBG53:![0-9]+]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG55:![0-9]+]]
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG55]]
+// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG55]]
+// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG55]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG55]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG55]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG56:![0-9]+]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG56]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG55]]
+// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG55]]
+// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG57:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG58:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG55]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG52]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG61:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG59:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -144,149 +144,149 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG70:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG74:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG78:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG79:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG79]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG79]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82:![0-9]+]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG79]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META87:![0-9]+]], metadata !DIExpression()), !dbg [[DBG69]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG79]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG88:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG79]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG76:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG77:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG77]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG77]]
+// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META79:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META82:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG77]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG67]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG86:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG77]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG82]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG80]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG80]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG82]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG80]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG82]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG80]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG82]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG79]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG79]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG80]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG77]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG77]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG79]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG77]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG79]]
-// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG79]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG77]]
+// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG77]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG89:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG89]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG89]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG94:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG94]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG94]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG93]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG96:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG96]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG98:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG99:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG99]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG98]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG101:![0-9]+]]
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG101]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG102:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG103:![0-9]+]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG104:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG103]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG103]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG105:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG106:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG106]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG107:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG106]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG106]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG108:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG109:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG109]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG109]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG109]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG109]]
-// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG111:![0-9]+]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG112:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG111]]
-// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG111]]
-// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG113:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG114:![0-9]+]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG115:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG114]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG114]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG116:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG116]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG116]]
-// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG116]]
-// CHECK1-NEXT:    [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG116]]
-// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG116]]
-// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG116]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG117:![0-9]+]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG87:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG87]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG87]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META88:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG92:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG92]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG92]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG91]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG94:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG94]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG96:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG97:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG97]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG96]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99:![0-9]+]]
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG99]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG100:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG101:![0-9]+]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG102:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG101]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG101]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG103:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG104:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG104]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG105:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG104]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG104]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG106:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG107:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG107]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG108:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG107]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG107]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG107]]
+// CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG109:![0-9]+]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG110:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG109]]
+// CHECK1-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG109]]
+// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG111:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG112:![0-9]+]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG113:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG112]]
+// CHECK1-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG112]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG112]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG114:![0-9]+]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG114]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG114]]
+// CHECK1-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG114]]
+// CHECK1-NEXT:    [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG114]]
+// CHECK1-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG114]]
+// CHECK1-NEXT:    store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG114]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG115:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG88]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG86]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG79]]
-// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG79]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG88]], !llvm.loop [[LOOP118:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG77]]
+// CHECK1-NEXT:    store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG86]], !llvm.loop [[LOOP116:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG88]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG86]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG79]]
-// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG79]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG79]]
-// CHECK1-NEXT:    store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG79]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG88]], !llvm.loop [[LOOP120:![0-9]+]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG77]]
+// CHECK1-NEXT:    store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG80]]
+// CHECK1-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG77]]
+// CHECK1-NEXT:    store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG86]], !llvm.loop [[LOOP118:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG119:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG121:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG117:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG119:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG122:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG120:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -295,34 +295,34 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META127:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META130:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG136]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG136]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG136]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG136]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG134:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG134]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG134]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR4:[0-9]+]], !dbg [[DBG134]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG134]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG137:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR6:[0-9]+]] !dbg [[DBG135:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
@@ -331,35 +331,35 @@ int main() {
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
 // CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG147:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG147]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG147]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR4]], !dbg [[DBG147]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG147]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1, !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG145]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG145]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP9]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP10]], i1 [[TOBOOL]]) #[[ATTR4]], !dbg [[DBG145]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG145]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG148:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG146:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -372,32 +372,32 @@ int main() {
 // CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META151:![0-9]+]], metadata !DIExpression()), !dbg [[DBG152:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META153:![0-9]+]], metadata !DIExpression()), !dbg [[DBG154:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG156:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG163:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG163]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG163]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG163]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG163]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG163]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG163]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG163]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG161:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG161]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG161]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG161]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG161]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG161]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG161]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG161]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB13:[0-9]+]])
+// CHECK1-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB13:[0-9]+]]), !dbg [[DBG162:![0-9]+]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG164:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[TMP11]], ptr [[A_CASTED]], align 4, !dbg [[DBG164]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG164]]
@@ -412,13 +412,13 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP17]], align 8, !dbg [[DBG164]]
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB13]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG164]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG165:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG167:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG166:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG163]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG161]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG168:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG167:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -441,140 +441,140 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META174:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META173:![0-9]+]], metadata !DIExpression()), !dbg [[DBG174:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META176:![0-9]+]], metadata !DIExpression()), !dbg [[DBG177:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META175:![0-9]+]], metadata !DIExpression()), !dbg [[DBG176:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META178:![0-9]+]], metadata !DIExpression()), !dbg [[DBG179:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META177:![0-9]+]], metadata !DIExpression()), !dbg [[DBG178:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META180:![0-9]+]], metadata !DIExpression()), !dbg [[DBG181:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG182:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG182]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG182]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG182]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META184:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185:![0-9]+]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META189:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG182]]
-// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG190:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG182]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG180:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG181:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG181]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG181]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG181]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META182:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META183:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META185:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META186:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META187:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META188:![0-9]+]], metadata !DIExpression()), !dbg [[DBG171]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG181]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB10:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG189:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG181]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG185]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG185]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG184]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG184]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG185]]
+// CHECK1-NEXT:    br label [[COND_END:%.*]], !dbg [[DBG184]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG185]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    br label [[COND_END]], !dbg [[DBG184]]
 // CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG185]]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG182]]
-// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG182]]
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG184]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG181]]
+// CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG181]]
 // CHECK1:       omp.dispatch.body:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG182]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG181]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG182]]
-// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG182]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG181]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG181]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG191:![0-9]+]]
-// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG191]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG191]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG194:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG195:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG195]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG195]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG194]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG197]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG199:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG200:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG200]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG199]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META201:![0-9]+]], metadata !DIExpression()), !dbg [[DBG202:![0-9]+]]
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG202]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG203:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG204:![0-9]+]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG205:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG204]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG204]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG206:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG207:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG208:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG207]]
-// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG207]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG209:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG210:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG210]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG211:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG210]]
-// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG210]]
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG210]]
-// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG212:![0-9]+]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG213:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG212]]
-// CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG212]]
-// CHECK1-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG214:![0-9]+]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG215:![0-9]+]]
-// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG215]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG215]]
-// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG216:![0-9]+]]
-// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG217:![0-9]+]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG190:![0-9]+]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG190]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG190]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META191:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG194:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG194]]
+// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG194]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG193]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META195:![0-9]+]], metadata !DIExpression()), !dbg [[DBG196:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG196]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META197:![0-9]+]], metadata !DIExpression()), !dbg [[DBG198:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG199:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG199]]
+// CHECK1-NEXT:    store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG198]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG201:![0-9]+]]
+// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG201]]
+// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG202:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG203:![0-9]+]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG204:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG203]]
+// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG203]]
+// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG205:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG206:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG206]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG207:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG206]]
+// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG206]]
+// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG208:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG209:![0-9]+]]
+// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG209]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG210:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG209]]
+// CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG209]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG209]]
+// CHECK1-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG211:![0-9]+]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG212:![0-9]+]]
+// CHECK1-NEXT:    [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG211]]
+// CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG211]]
+// CHECK1-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG213:![0-9]+]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG214:![0-9]+]]
+// CHECK1-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG214]]
+// CHECK1-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG214]]
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG215:![0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG216:![0-9]+]]
 // CHECK1:       omp.body.continue:
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG190]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG189]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG182]]
-// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG182]]
-// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG190]], !llvm.loop [[LOOP218:![0-9]+]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG181]]
+// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG181]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !dbg [[DBG189]], !llvm.loop [[LOOP217:![0-9]+]]
 // CHECK1:       omp.inner.for.end:
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG190]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG189]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG182]]
-// CHECK1-NEXT:    store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG182]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG185]]
-// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG182]]
-// CHECK1-NEXT:    store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG182]]
-// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG190]], !llvm.loop [[LOOP220:![0-9]+]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG181]]
+// CHECK1-NEXT:    store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG181]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG184]]
+// CHECK1-NEXT:    [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG181]]
+// CHECK1-NEXT:    store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG181]]
+// CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]], !dbg [[DBG189]], !llvm.loop [[LOOP219:![0-9]+]]
 // CHECK1:       omp.dispatch.end:
-// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG219:![0-9]+]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG221:![0-9]+]]
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG218:![0-9]+]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG220:![0-9]+]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG222:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2]] !dbg [[DBG221:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -583,35 +583,35 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META223:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META222:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META224:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META225:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META226:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META227:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META229:![0-9]+]], metadata !DIExpression()), !dbg [[DBG224]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG230:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG230]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG230]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG230]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG230]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG223]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG229:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG229]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG229]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR4]], !dbg [[DBG229]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG229]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG231:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR6]] !dbg [[DBG230:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
@@ -619,32 +619,32 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META233:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META235:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234]]
 // CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META236:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META237:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META239:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG240:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG240]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG240]]
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG240]]
-// CHECK1-NEXT:    ret void, !dbg [[DBG240]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META238:![0-9]+]], metadata !DIExpression()), !dbg [[DBG234]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG239]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]]) #[[ATTR4]], !dbg [[DBG239]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG239]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG241:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG240:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -657,49 +657,49 @@ int main() {
 // CHECK1-NEXT:    [[_TMP3:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META246:![0-9]+]], metadata !DIExpression()), !dbg [[DBG247:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META245:![0-9]+]], metadata !DIExpression()), !dbg [[DBG246:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META248:![0-9]+]], metadata !DIExpression()), !dbg [[DBG249:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META247:![0-9]+]], metadata !DIExpression()), !dbg [[DBG248:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META250:![0-9]+]], metadata !DIExpression()), !dbg [[DBG251:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META252:![0-9]+]], metadata !DIExpression()), !dbg [[DBG253:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META254:![0-9]+]], metadata !DIExpression()), !dbg [[DBG255:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG256:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG256]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG256]]
-// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG256]]
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG256]]
-// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG256]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG256]]
-// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG256]]
-// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG256]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META253:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG255:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_kernel_environment, ptr [[DYN_PTR]]), !dbg [[DBG255]]
+// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG255]]
+// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG255]]
 // CHECK1:       user_code.entry:
-// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB20:[0-9]+]])
-// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG257:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG257]]
-// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG257]]
-// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG257]]
-// CHECK1-NEXT:    store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG257]]
-// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB20]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG257]]
-// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG258:![0-9]+]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB20:[0-9]+]]), !dbg [[DBG256:![0-9]+]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG258:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG258]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG258]]
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG258]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG258]]
+// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG258]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG258]]
+// CHECK1-NEXT:    store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG258]]
+// CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB20]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG258]]
+// CHECK1-NEXT:    call void @__kmpc_target_deinit(), !dbg [[DBG259:![0-9]+]]
 // CHECK1-NEXT:    ret void, !dbg [[DBG260:![0-9]+]]
 // CHECK1:       worker.exit:
-// CHECK1-NEXT:    ret void, !dbg [[DBG256]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG255]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__
@@ -727,17 +727,17 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META264:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META266:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META267:![0-9]+]], metadata !DIExpression()), !dbg [[DBG268:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG270:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG270:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG274:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META273:![0-9]+]], metadata !DIExpression()), !dbg [[DBG274:![0-9]+]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG275:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG275]]
 // CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG275]]
@@ -754,16 +754,16 @@ int main() {
 // CHECK1-NEXT:    [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG275]]
 // CHECK1-NEXT:    store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG275]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG275]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META276:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META277:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG278:![0-9]+]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META279:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META280:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META280:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META281:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG278]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META282:![0-9]+]], metadata !DIExpression()), !dbg [[DBG265]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG275]]
 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG275]]
 // CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB17:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG283:![0-9]+]]
@@ -798,18 +798,18 @@ int main() {
 // CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG284:![0-9]+]]
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG284]]
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG284]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG287:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META285:![0-9]+]], metadata !DIExpression()), !dbg [[DBG287:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG288:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG288]]
 // CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG288]]
 // CHECK1-NEXT:    store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG287]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META289:![0-9]+]], metadata !DIExpression()), !dbg [[DBG290:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META289:![0-9]+]], metadata !DIExpression()), !dbg [[DBG290:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG290]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META291:![0-9]+]], metadata !DIExpression()), !dbg [[DBG292:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META291:![0-9]+]], metadata !DIExpression()), !dbg [[DBG292:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG293:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG293]]
 // CHECK1-NEXT:    store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG292]]
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META294:![0-9]+]], metadata !DIExpression()), !dbg [[DBG295:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META294:![0-9]+]], metadata !DIExpression()), !dbg [[DBG295:![0-9]+]]
 // CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG295]]
 // CHECK1-NEXT:    store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG296:![0-9]+]]
 // CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG297:![0-9]+]]
@@ -877,17 +877,17 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META319:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META319:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META321:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META321:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META322:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META322:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META323:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META323:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META324:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META324:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META325:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META325:![0-9]+]], metadata !DIExpression()), !dbg [[DBG320]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG326:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG326]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG326]]
@@ -915,15 +915,15 @@ int main() {
 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META330:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META330:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META332:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
 // CHECK1-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META333:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META334:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META334:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
 // CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META335:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META335:![0-9]+]], metadata !DIExpression()), !dbg [[DBG331]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG336:![0-9]+]]
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG336]]
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG336]]

>From 7609f7658e5e59ec3d5d65acc799f2b2001a4053 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Thu, 18 Jul 2024 22:45:19 -0700
Subject: [PATCH 27/31] [OpenMP] Ensure the actual kernel is annotated with
 launch bounds

---
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |   46 +-
 clang/test/OpenMP/parallel_codegen.cpp        |  292 ++---
 .../OpenMP/target_parallel_debug_codegen.cpp  |  694 +++++------
 .../target_parallel_for_debug_codegen.cpp     | 1028 ++++++++---------
 ...target_parallel_generic_loop_codegen-3.cpp | 1028 ++++++++---------
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |   20 +-
 6 files changed, 1560 insertions(+), 1548 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index f73d32de7c484..450371aef12b9 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -639,27 +639,42 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
   // Build the argument list.
   bool NeedWrapperFunction =
       getDebugInfo() && CGM.getCodeGenOpts().hasReducedDebugInfo();
-  FunctionArgList Args;
-  llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs;
-  llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes;
+  FunctionArgList Args, WrapperArgs;
+  llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs,
+      WrapperLocalAddrs;
+  llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes,
+      WrapperVLASizes;
   SmallString<256> Buffer;
   llvm::raw_svector_ostream Out(Buffer);
   Out << CapturedStmtInfo->getHelperName();
-  if (NeedWrapperFunction)
+
+  CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
+  llvm::Function *WrapperF = nullptr;
+  if (NeedWrapperFunction) {
+    // Emit the final kernel early to allow attributes to be added by the
+    // OpenMP-IR-Builder.
+    FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
+                              /*RegisterCastedArgsOnly=*/true,
+                              CapturedStmtInfo->getHelperName(), Loc);
+    WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
+    WrapperF =
+        emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
+                                     WrapperCGF.CXXThisValue, WrapperFO);
     Out << "_debug__";
+  }
   FunctionOptions FO(&S, !NeedWrapperFunction, /*RegisterCastedArgsOnly=*/false,
                      Out.str(), Loc);
-  llvm::Function *F = emitOutlinedFunctionPrologue(*this, Args, LocalAddrs,
-                                                   VLASizes, CXXThisValue, FO);
+  llvm::Function *F = emitOutlinedFunctionPrologue(
+      *this, WrapperArgs, WrapperLocalAddrs, WrapperVLASizes, CXXThisValue, FO);
   CodeGenFunction::OMPPrivateScope LocalScope(*this);
-  for (const auto &LocalAddrPair : LocalAddrs) {
+  for (const auto &LocalAddrPair : WrapperLocalAddrs) {
     if (LocalAddrPair.second.first) {
       LocalScope.addPrivate(LocalAddrPair.second.first,
                             LocalAddrPair.second.second);
     }
   }
   (void)LocalScope.Privatize();
-  for (const auto &VLASizePair : VLASizes)
+  for (const auto &VLASizePair : WrapperVLASizes)
     VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second;
   PGO.assignRegionCounters(GlobalDecl(CD), F);
   CapturedStmtInfo->EmitBody(*this, CD->getBody());
@@ -668,17 +683,10 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
   if (!NeedWrapperFunction)
     return F;
 
-  FunctionOptions WrapperFO(&S, /*UIntPtrCastRequired=*/true,
-                            /*RegisterCastedArgsOnly=*/true,
-                            CapturedStmtInfo->getHelperName(), Loc);
-  CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
-  WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
-  Args.clear();
-  LocalAddrs.clear();
-  VLASizes.clear();
-  llvm::Function *WrapperF =
-      emitOutlinedFunctionPrologue(WrapperCGF, Args, LocalAddrs, VLASizes,
-                                   WrapperCGF.CXXThisValue, WrapperFO);
+  // Reverse the order: re-insert the wrapper right after F in the module.
+  WrapperF->removeFromParent();
+  F->getParent()->getFunctionList().insertAfter(F->getIterator(), WrapperF);
+
   llvm::SmallVector<llvm::Value *, 4> CallArgs;
   auto *PI = F->arg_begin();
   for (const auto *Arg : Args) {
diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp
index 867b250332fd5..9cdb1da996152 100644
--- a/clang/test/OpenMP/parallel_codegen.cpp
+++ b/clang/test/OpenMP/parallel_codegen.cpp
@@ -323,8 +323,8 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]]
 // CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA]], metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG31:![0-9]+]]
 // CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 2, ptr @main.omp_outlined, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG32:![0-9]+]]
-// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB5:[0-9]+]], i32 1, ptr @main.omp_outlined.2, i64 [[TMP1]]), !dbg [[DBG33:![0-9]+]]
-// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB9:[0-9]+]], i32 2, ptr @main.omp_outlined.4, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG34:![0-9]+]]
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB5:[0-9]+]], i32 1, ptr @main.omp_outlined.1, i64 [[TMP1]]), !dbg [[DBG33:![0-9]+]]
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB9:[0-9]+]], i32 2, ptr @main.omp_outlined.3, i64 [[TMP1]], ptr [[VLA]]), !dbg [[DBG34:![0-9]+]]
 // CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8, !dbg [[DBG35:![0-9]+]]
 // CHECK2-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z5tmainIPPcEiT_(ptr noundef [[TMP3]]), !dbg [[DBG36:![0-9]+]]
 // CHECK2-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4, !dbg [[DBG37:![0-9]+]]
@@ -368,47 +368,47 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    unreachable, !dbg [[DBG53]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIiEvT_
-// CHECK2-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR4:[0-9]+]] comdat !dbg [[DBG58:![0-9]+]] {
-// CHECK2-NEXT:  entry:
-// CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
-// CHECK2-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG64:![0-9]+]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG65:![0-9]+]]
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@__clang_call_terminate
-// CHECK2-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat {
-// CHECK2-NEXT:    [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR6:[0-9]+]]
-// CHECK2-NEXT:    call void @_ZSt9terminatev() #[[ATTR7]]
-// CHECK2-NEXT:    unreachable
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG66:![0-9]+]] {
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG58:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META61:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META62:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG68]]
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG72:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG72]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG72]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG72]]
-// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG72]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG72]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG72]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60]]
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG64:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG64]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG64]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG64]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG64]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6:[0-9]+]], !dbg [[DBG64]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG64]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1
+// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIiEvT_
+// CHECK2-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR4:[0-9]+]] comdat !dbg [[DBG65:![0-9]+]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG71:![0-9]+]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG72:![0-9]+]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@__clang_call_terminate
+// CHECK2-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat {
+// CHECK2-NEXT:    [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR6]]
+// CHECK2-NEXT:    call void @_ZSt9terminatev() #[[ATTR7]]
+// CHECK2-NEXT:    unreachable
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2
 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG75:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -431,14 +431,33 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    store i64 [[TMP0]], ptr [[__VLA_EXPR0]], align 8, !dbg [[DBG82]]
 // CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[__VLA_EXPR0]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
 // CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA1]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79]]
-// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 3, ptr @main.omp_outlined_debug__.1.omp_outlined, i64 [[TMP0]], ptr [[VLA1]], ptr [[GLOBAL]]), !dbg [[DBG82]]
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 3, ptr @main.omp_outlined_debug__.2.omp_outlined, i64 [[TMP0]], ptr [[VLA1]], ptr [[GLOBAL]]), !dbg [[DBG82]]
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8, !dbg [[DBG86:![0-9]+]]
 // CHECK2-NEXT:    call void @llvm.stackrestore.p0(ptr [[TMP2]]), !dbg [[DBG86]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG88:![0-9]+]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1.omp_outlined_debug__
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 !dbg [[DBG89:![0-9]+]] {
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.1
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG89:![0-9]+]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]]
+// CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91]]
+// CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META93:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91]]
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG94:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG94]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG94]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__.2(ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP0]]) #[[ATTR6]], !dbg [[DBG94]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG94]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2.omp_outlined_debug__
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 !dbg [[DBG95:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -446,37 +465,37 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[GLOBAL_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META95:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META101:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META96:![0-9]+]], metadata !DIExpression()), !dbg [[DBG97:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META102:![0-9]+]], metadata !DIExpression()), !dbg [[DBG103:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[GLOBAL]], ptr [[GLOBAL_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG99:![0-9]+]]
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG100:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG100]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG100]]
-// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG101:![0-9]+]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG101]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META104:![0-9]+]], metadata !DIExpression()), !dbg [[DBG105:![0-9]+]]
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG106:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG106]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG106]]
+// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG107:![0-9]+]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG107]]
 // CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP3]])
-// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG100]]
+// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG106]]
 // CHECK2:       invoke.cont:
-// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG102:![0-9]+]]
-// CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG103:![0-9]+]]
-// CHECK2-NEXT:    store i32 [[TMP4]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG104:![0-9]+]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG102]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG108:![0-9]+]]
+// CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG109:![0-9]+]]
+// CHECK2-NEXT:    store i32 [[TMP4]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG110:![0-9]+]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG108]]
 // CHECK2:       terminate.lpad:
 // CHECK2-NEXT:    [[TMP5:%.*]] = landingpad { ptr, i32 }
-// CHECK2-NEXT:            catch ptr null, !dbg [[DBG100]]
-// CHECK2-NEXT:    [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg [[DBG100]]
-// CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR7]], !dbg [[DBG100]]
-// CHECK2-NEXT:    unreachable, !dbg [[DBG100]]
+// CHECK2-NEXT:            catch ptr null, !dbg [[DBG106]]
+// CHECK2-NEXT:    [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0, !dbg [[DBG106]]
+// CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP6]]) #[[ATTR7]], !dbg [[DBG106]]
+// CHECK2-NEXT:    unreachable, !dbg [[DBG106]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.1.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR3]] !dbg [[DBG105:![0-9]+]] {
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.2.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GLOBAL:%.*]]) #[[ATTR3]] !dbg [[DBG111:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -484,46 +503,27 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[GLOBAL_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META106:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META112:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META108:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META109:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META115:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META110:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113]]
 // CHECK2-NEXT:    store ptr [[GLOBAL]], ptr [[GLOBAL_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META111:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107]]
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG112:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG112]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__.1.omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP0]], ptr [[TMP5]], ptr [[TMP6]]) #[[ATTR6]], !dbg [[DBG112]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG112]]
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.2
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG113:![0-9]+]] {
-// CHECK2-NEXT:  entry:
-// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115:![0-9]+]]
-// CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115]]
-// CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG115]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[GLOBAL_ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG113]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG118:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG118]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG118]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__.1(ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP0]]) #[[ATTR6]], !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[GLOBAL_ADDR]], align 8, !dbg [[DBG118]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__.2.omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP0]], ptr [[TMP5]], ptr [[TMP6]]) #[[ATTR6]], !dbg [[DBG118]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG118]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4
 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG119:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -540,12 +540,12 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META124:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125:![0-9]+]]
 // CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG126:![0-9]+]]
 // CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG126]]
-// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB7:[0-9]+]], i32 2, ptr @main.omp_outlined_debug__.3.omp_outlined, i64 [[TMP0]], ptr [[TMP1]]), !dbg [[DBG126]]
+// CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB7:[0-9]+]], i32 2, ptr @main.omp_outlined_debug__.4.omp_outlined, i64 [[TMP0]], ptr [[TMP1]]), !dbg [[DBG126]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG127:![0-9]+]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3.omp_outlined_debug__
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 !dbg [[DBG128:![0-9]+]] {
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.3
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG128:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -558,51 +558,51 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
 // CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META132:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG134:![0-9]+]]
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG135:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG135]]
-// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG136:![0-9]+]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG136]]
-// CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]])
-// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG135]]
-// CHECK2:       invoke.cont:
-// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG137:![0-9]+]]
-// CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG138:![0-9]+]]
-// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG139:![0-9]+]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG137]]
-// CHECK2:       terminate.lpad:
-// CHECK2-NEXT:    [[TMP4:%.*]] = landingpad { ptr, i32 }
-// CHECK2-NEXT:            catch ptr null, !dbg [[DBG135]]
-// CHECK2-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG135]]
-// CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR7]], !dbg [[DBG135]]
-// CHECK2-NEXT:    unreachable, !dbg [[DBG135]]
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.3.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG140:![0-9]+]] {
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG130]]
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG134:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG134]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG134]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG134]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__.4(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG134]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG134]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4.omp_outlined_debug__
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 !dbg [[DBG135:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META141:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META136:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META139:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137]]
 // CHECK2-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG142]]
-// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG146:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG146]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG146]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG146]]
-// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG146]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__.3.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG146]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG146]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141:![0-9]+]]
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG142:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG142]]
+// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG143:![0-9]+]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !dbg [[DBG143]]
+// CHECK2-NEXT:    invoke void @_Z3fooIiEvT_(i32 noundef [[TMP2]])
+// CHECK2-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !dbg [[DBG142]]
+// CHECK2:       invoke.cont:
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr @global, align 4, !dbg [[DBG144:![0-9]+]]
+// CHECK2-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1, !dbg [[DBG145:![0-9]+]]
+// CHECK2-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX1]], align 4, !dbg [[DBG146:![0-9]+]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG144]]
+// CHECK2:       terminate.lpad:
+// CHECK2-NEXT:    [[TMP4:%.*]] = landingpad { ptr, i32 }
+// CHECK2-NEXT:            catch ptr null, !dbg [[DBG142]]
+// CHECK2-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0, !dbg [[DBG142]]
+// CHECK2-NEXT:    call void @__clang_call_terminate(ptr [[TMP5]]) #[[ATTR7]], !dbg [[DBG142]]
+// CHECK2-NEXT:    unreachable, !dbg [[DBG142]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined.4
+// CHECK2-LABEL: define {{[^@]+}}@main.omp_outlined_debug__.4.omp_outlined
 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] !dbg [[DBG147:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -622,7 +622,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG153]]
 // CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG153]]
 // CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG153]]
-// CHECK2-NEXT:    call void @main.omp_outlined_debug__.3(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG153]]
+// CHECK2-NEXT:    call void @main.omp_outlined_debug__.4.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP0]], ptr [[TMP4]]) #[[ATTR6]], !dbg [[DBG153]]
 // CHECK2-NEXT:    ret void, !dbg [[DBG153]]
 //
 //
@@ -678,37 +678,37 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    unreachable, !dbg [[DBG178]]
 //
 //
-// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIPPcEvT_
-// CHECK2-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR4]] comdat !dbg [[DBG189:![0-9]+]] {
-// CHECK2-NEXT:  entry:
-// CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG193:![0-9]+]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG194:![0-9]+]]
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@_Z5tmainIPPcEiT_.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARGC:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG195:![0-9]+]] {
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[ARGC:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR3]] !dbg [[DBG189:![0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META196:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197:![0-9]+]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META190:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191:![0-9]+]]
 // CHECK2-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META198:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META192:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191]]
 // CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META193:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191]]
 // CHECK2-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
-// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META200:![0-9]+]], metadata !DIExpression()), !dbg [[DBG197]]
-// CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG201:![0-9]+]]
-// CHECK2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG201]]
-// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG201]]
-// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG201]]
-// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG201]]
-// CHECK2-NEXT:    call void @_Z5tmainIPPcEiT_.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP1]]) #[[ATTR6]], !dbg [[DBG201]]
-// CHECK2-NEXT:    ret void, !dbg [[DBG201]]
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[VLA_ADDR]], metadata [[META194:![0-9]+]], metadata !DIExpression()), !dbg [[DBG191]]
+// CHECK2-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG195:![0-9]+]]
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8, !dbg [[DBG195]]
+// CHECK2-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG195]]
+// CHECK2-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG195]]
+// CHECK2-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8, !dbg [[DBG195]]
+// CHECK2-NEXT:    call void @_Z5tmainIPPcEiT_.omp_outlined_debug__(ptr [[TMP2]], ptr [[TMP3]], ptr [[TMP4]], i64 [[TMP1]]) #[[ATTR6]], !dbg [[DBG195]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG195]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_Z3fooIPPcEvT_
+// CHECK2-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR4]] comdat !dbg [[DBG196:![0-9]+]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT:    store ptr [[ARGC]], ptr [[ARGC_ADDR]], align 8
+// CHECK2-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[ARGC_ADDR]], metadata [[META199:![0-9]+]], metadata !DIExpression()), !dbg [[DBG200:![0-9]+]]
+// CHECK2-NEXT:    ret void, !dbg [[DBG201:![0-9]+]]
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@main
diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
index 74929bfa29740..0852a0772b883 100644
--- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
@@ -65,7 +65,7 @@ int main() {
   return 0;
 }
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG29:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG29:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -122,8 +122,40 @@ int main() {
 // CHECK1-NEXT:    ret void, !dbg [[DBG58]]
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
+// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG64:![0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DYN_PTR_ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]]
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]]
+// CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]]
+// CHECK1-NEXT:    store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG77:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DYN_PTR_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG77]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG77]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug__(ptr [[TMP3]], ptr addrspace(1) [[TMP8]], i32 [[TMP5]], ptr [[TMP6]], ptr addrspace(1) [[TMP9]]) #[[ATTR4:[0-9]+]], !dbg [[DBG77]]
+// CHECK1-NEXT:    ret void, !dbg [[DBG77]]
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_debug___omp_outlined_debug__
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR2:[0-9]+]] !dbg [[DBG64:![0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR1]] !dbg [[DBG78:![0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -140,83 +172,83 @@ int main() {
 // CHECK1-NEXT:    [[H:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[D:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META85:![0-9]+]], metadata !DIExpression()), !dbg [[DBG86:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META87:![0-9]+]], metadata !DIExpression()), !dbg [[DBG86]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG75:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META88:![0-9]+]], metadata !DIExpression()), !dbg [[DBG89:![0-9]+]]
 // CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG77:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META90:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]]
 // CHECK1-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]]
+// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META92:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93:![0-9]+]]
 // CHECK1-NEXT:    store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META80:![0-9]+]], metadata !DIExpression()), !dbg [[DBG81:![0-9]+]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG82:![0-9]+]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG82]]
-// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG82]]
-// CHECK1-NEXT:    store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG82]]
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[B3]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG72]]
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B3]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG82]]
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META84:![0-9]+]], metadata !DIExpression()), !dbg [[DBG87:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG88:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG88]]
-// CHECK1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX4]], i64 0, i64 1, !dbg [[DBG88]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX5]], ptr [[F]], align 8, !dbg [[DBG87]]
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META89:![0-9]+]], metadata !DIExpression()), !dbg [[DBG90:![0-9]+]]
-// CHECK1-NEXT:    store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG90]]
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META91:![0-9]+]], metadata !DIExpression()), !dbg [[DBG92:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 1, !dbg [[DBG93:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG93]]
-// CHECK1-NEXT:    store ptr [[ARRAYIDX7]], ptr [[H]], align 8, !dbg [[DBG92]]
-// CHECK1-NEXT:    tail call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META94:![0-9]+]], metadata !DIExpression()), !dbg [[DBG95:![0-9]+]]
-// CHECK1-NEXT:    store i32 15, ptr [[D]], align 4, !dbg [[DBG95]]
-// CHECK1-NEXT:    store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG96:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B3]], i64 0, i64 0, !dbg [[DBG97:![0-9]+]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG98:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64, !dbg [[DBG97]]
-// CHECK1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 [[IDXPROM]], !dbg [[DBG97]]
-// CHECK1-NEXT:    store i32 10, ptr [[ARRAYIDX9]], align 4, !dbg [[DBG99:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG100:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX10]], i64 0, i64 0, !dbg [[DBG100]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG101:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[TMP9]] to i64, !dbg [[DBG100]]
-// CHECK1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM12]], !dbg [[DBG100]]
-// CHECK1-NEXT:    store i32 11, ptr [[ARRAYIDX13]], align 4, !dbg [[DBG102:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG103:![0-9]+]]
-// CHECK1-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX14]], i64 0, i64 0, !dbg [[DBG103]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG104:![0-9]+]]
-// CHECK1-NEXT:    [[IDXPROM16:%.*]] = sext i32 [[TMP10]] to i64, !dbg [[DBG103]]
-// CHECK1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX15]], i64 0, i64 [[IDXPROM16]], !dbg [[DBG103]]
-// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX17]], align 4, !dbg [[DBG103]]
-// CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i