[clang] 97f3be2 - [CUDA][HIP] Improve variable registration with the new driver (#73177)

via cfe-commits cfe-commits at lists.llvm.org
Thu Dec 7 13:44:28 PST 2023


Author: Joseph Huber
Date: 2023-12-07T15:44:23-06:00
New Revision: 97f3be2c5a0295632321141bdc001d4f81821958

URL: https://github.com/llvm/llvm-project/commit/97f3be2c5a0295632321141bdc001d4f81821958
DIFF: https://github.com/llvm/llvm-project/commit/97f3be2c5a0295632321141bdc001d4f81821958.diff

LOG: [CUDA][HIP] Improve variable registration with the new driver (#73177)

Summary:
This patch adds support for registering texture / surface variables from
CUDA / HIP. Additionally, we now properly track the `extern` and `const`
flags that are also used in these runtime functions.

This does not implement the `managed` variables yet as those seem to
require some extra handling I'm not familiar with. The issue is that the
current offload entry isn't large enough to carry size and alignment
information along with an extra global.

Added: 
    

Modified: 
    clang/lib/CodeGen/CGCUDANV.cpp
    clang/lib/CodeGen/CGCUDARuntime.h
    clang/test/CodeGenCUDA/offloading-entries.cu
    clang/test/Driver/linker-wrapper-image.c
    clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
    llvm/include/llvm/Frontend/Offloading/Utility.h
    llvm/lib/Frontend/Offloading/Utility.cpp
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 66147f656071f5..520b0c4f117673 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1132,26 +1132,39 @@ void CGNVCUDARuntime::createOffloadingEntries() {
   for (KernelInfo &I : EmittedKernels)
     llvm::offloading::emitOffloadingEntry(
         M, KernelHandles[I.Kernel->getName()],
-        getDeviceSideName(cast<NamedDecl>(I.D)), 0,
-        DeviceVarFlags::OffloadGlobalEntry, Section);
+        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
+        llvm::offloading::OffloadGlobalEntry, Section);
 
   for (VarInfo &I : DeviceVars) {
     uint64_t VarSize =
         CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
+    int32_t Flags =
+        (I.Flags.isExtern()
+             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
+             : 0) |
+        (I.Flags.isConstant()
+             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
+             : 0) |
+        (I.Flags.isNormalized()
+             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
+             : 0);
     if (I.Flags.getKind() == DeviceVarFlags::Variable) {
       llvm::offloading::emitOffloadingEntry(
           M, I.Var, getDeviceSideName(I.D), VarSize,
-          I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
-                              : DeviceVarFlags::OffloadGlobalEntry,
-          Section);
+          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
+                               : llvm::offloading::OffloadGlobalEntry) |
+              Flags,
+          /*Data=*/0, Section);
     } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
       llvm::offloading::emitOffloadingEntry(
           M, I.Var, getDeviceSideName(I.D), VarSize,
-          DeviceVarFlags::OffloadGlobalSurfaceEntry, Section);
+          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
+          I.Flags.getSurfTexType(), Section);
     } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
       llvm::offloading::emitOffloadingEntry(
           M, I.Var, getDeviceSideName(I.D), VarSize,
-          DeviceVarFlags::OffloadGlobalTextureEntry, Section);
+          llvm::offloading::OffloadGlobalTextureEntry | Flags,
+          I.Flags.getSurfTexType(), Section);
     }
   }
 }

diff  --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index 9a9c6d26cc63c4..c7af8f1cf0fe95 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -17,6 +17,7 @@
 
 #include "clang/AST/GlobalDecl.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Frontend/Offloading/Utility.h"
 #include "llvm/IR/GlobalValue.h"
 
 namespace llvm {
@@ -52,19 +53,6 @@ class CGCUDARuntime {
       Texture,  // Builtin texture
     };
 
-    /// The kind flag for an offloading entry.
-    enum OffloadEntryKindFlag : uint32_t {
-      /// Mark the entry as a global entry. This indicates the presense of a
-      /// kernel if the size field is zero and a variable otherwise.
-      OffloadGlobalEntry = 0x0,
-      /// Mark the entry as a managed global variable.
-      OffloadGlobalManagedEntry = 0x1,
-      /// Mark the entry as a surface variable.
-      OffloadGlobalSurfaceEntry = 0x2,
-      /// Mark the entry as a texture variable.
-      OffloadGlobalTextureEntry = 0x3,
-    };
-
   private:
     unsigned Kind : 2;
     unsigned Extern : 1;

diff  --git a/clang/test/CodeGenCUDA/offloading-entries.cu b/clang/test/CodeGenCUDA/offloading-entries.cu
index 46235051f1e4f1..4f5cf65ecd0bde 100644
--- a/clang/test/CodeGenCUDA/offloading-entries.cu
+++ b/clang/test/CodeGenCUDA/offloading-entries.cu
@@ -17,31 +17,47 @@
 //.
 // CUDA: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
 // CUDA: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
-// CUDA: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
-// CUDA: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
-// CUDA: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
-// CUDA: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
+// CUDA: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
+// CUDA: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
+// CUDA: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
+// CUDA: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
+// CUDA: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
+// CUDA: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "cuda_offloading_entries", align 1
+// CUDA: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
+// CUDA: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "cuda_offloading_entries", align 1
 //.
 // HIP: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
 // HIP: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
-// HIP: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
-// HIP: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
-// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
-// HIP: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1
+// HIP: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
+// HIP: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
+// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
+// HIP: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1
+// HIP: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
+// HIP: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "hip_offloading_entries", align 1
+// HIP: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
+// HIP: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "hip_offloading_entries", align 1
 //.
 // CUDA-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
 // CUDA-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
-// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
-// CUDA-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
-// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
-// CUDA-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
+// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
+// CUDA-COFF: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
+// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
+// CUDA-COFF: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
+// CUDA-COFF: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
+// CUDA-COFF: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "cuda_offloading_entries$OE", align 1
+// CUDA-COFF: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
+// CUDA-COFF: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "cuda_offloading_entries$OE", align 1
 //.
 // HIP-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
 // HIP-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
-// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
-// HIP-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
-// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
-// HIP-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
+// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
+// HIP-COFF: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
+// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
+// HIP-COFF: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
+// HIP-COFF: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
+// HIP-COFF: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "hip_offloading_entries$OE", align 1
+// HIP-COFF: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
+// HIP-COFF: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "hip_offloading_entries$OE", align 1
 //.
 // CUDA-LABEL: @_Z18__device_stub__foov(
 // CUDA-NEXT:  entry:
@@ -72,34 +88,52 @@
 // HIP-COFF-NEXT:    ret void
 //
 __global__ void foo() {}
+__device__ int var = 1;
+const __device__ int constant = 1;
+extern __device__ int external;
 
-// CUDA-LABEL: @_Z18__device_stub__barv(
+// CUDA-LABEL: @_Z21__device_stub__kernelv(
 // CUDA-NEXT:  entry:
-// CUDA-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
+// CUDA-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z21__device_stub__kernelv)
 // CUDA-NEXT:    br label [[SETUP_END:%.*]]
 // CUDA:       setup.end:
 // CUDA-NEXT:    ret void
 //
-// HIP-LABEL: @_Z18__device_stub__barv(
+// HIP-LABEL: @_Z21__device_stub__kernelv(
 // HIP-NEXT:  entry:
-// HIP-NEXT:    [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv)
+// HIP-NEXT:    [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z6kernelv)
 // HIP-NEXT:    br label [[SETUP_END:%.*]]
 // HIP:       setup.end:
 // HIP-NEXT:    ret void
 //
-// CUDA-COFF-LABEL: @_Z18__device_stub__barv(
+// CUDA-COFF-LABEL: @_Z21__device_stub__kernelv(
 // CUDA-COFF-NEXT:  entry:
-// CUDA-COFF-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
+// CUDA-COFF-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z21__device_stub__kernelv)
 // CUDA-COFF-NEXT:    br label [[SETUP_END:%.*]]
 // CUDA-COFF:       setup.end:
 // CUDA-COFF-NEXT:    ret void
 //
-// HIP-COFF-LABEL: @_Z18__device_stub__barv(
+// HIP-COFF-LABEL: @_Z21__device_stub__kernelv(
 // HIP-COFF-NEXT:  entry:
-// HIP-COFF-NEXT:    [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv)
+// HIP-COFF-NEXT:    [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z6kernelv)
 // HIP-COFF-NEXT:    br label [[SETUP_END:%.*]]
 // HIP-COFF:       setup.end:
 // HIP-COFF-NEXT:    ret void
 //
-__global__ void bar() {}
-__device__ int x = 1;
+__global__ void kernel() { external = 1; }
+
+struct surfaceReference { int desc; };
+
+template <typename T, int dim = 1>
+struct __attribute__((device_builtin_surface_type)) surface : public surfaceReference {};
+
+surface<void> surf;
+
+struct textureReference {
+  int desc;
+};
+
+template <typename T, int dim = 1, int mode = 0>
+struct __attribute__((device_builtin_texture_type)) texture : public textureReference {};
+
+texture<void> tex;

diff  --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c
index 73d3c40810c35a..4a17a8324b4627 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -80,24 +80,33 @@
 // CUDA-NEXT:   br i1 icmp ne (ptr @__start_cuda_offloading_entries, ptr @__stop_cuda_offloading_entries), label %while.entry, label %while.end
 
 //      CUDA: while.entry:
-// CUDA-NEXT:  %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %7, %if.end ]
-// CUDA-NEXT:  %1 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 0
-// CUDA-NEXT:  %addr = load ptr, ptr %1, align 8
-// CUDA-NEXT:  %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 1
-// CUDA-NEXT:  %name = load ptr, ptr %2, align 8
-// CUDA-NEXT:  %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 2
-// CUDA-NEXT:  %size = load i64, ptr %3, align 4
-// CUDA-NEXT:  %4 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 3
-// CUDA-NEXT:  %flag = load i32, ptr %4, align 4
-// CUDA-NEXT:  %5 = icmp eq i64 %size, 0
-// CUDA-NEXT:  br i1 %5, label %if.then, label %if.else
+// CUDA-NEXT:   %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %11, %if.end ]
+// CUDA-NEXT:   %1 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 0
+// CUDA-NEXT:   %addr = load ptr, ptr %1, align 8
+// CUDA-NEXT:   %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 1
+// CUDA-NEXT:   %name = load ptr, ptr %2, align 8
+// CUDA-NEXT:   %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 2
+// CUDA-NEXT:   %size = load i64, ptr %3, align 4
+// CUDA-NEXT:   %4 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 3
+// CUDA-NEXT:   %flags = load i32, ptr %4, align 4
+// CUDA-NEXT:   %5 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 4
+// CUDA-NEXT:   %textype = load i32, ptr %4, align 4
+// CUDA-NEXT:   %type = and i32 %flags, 7
+// CUDA-NEXT:   %6 = and i32 %flags, 8
+// CUDA-NEXT:   %extern = lshr i32 %6, 3
+// CUDA-NEXT:   %7 = and i32 %flags, 16
+// CUDA-NEXT:   %constant = lshr i32 %7, 4
+// CUDA-NEXT:   %8 = and i32 %flags, 32
+// CUDA-NEXT:   %normalized = lshr i32 %8, 5
+// CUDA-NEXT:   %9 = icmp eq i64 %size, 0
+// CUDA-NEXT:   br i1 %9, label %if.then, label %if.else
 
 //      CUDA: if.then:
-// CUDA-NEXT:   %6 = call i32 @__cudaRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
+// CUDA-NEXT:   %10 = call i32 @__cudaRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
 // CUDA-NEXT:   br label %if.end
 
 //      CUDA: if.else:
-// CUDA-NEXT:   switch i32 %flag, label %if.end [
+// CUDA-NEXT:   switch i32 %type, label %if.end [
 // CUDA-NEXT:     i32 0, label %sw.global
 // CUDA-NEXT:     i32 1, label %sw.managed
 // CUDA-NEXT:     i32 2, label %sw.surface
@@ -105,22 +114,24 @@
 // CUDA-NEXT:   ]
 
 //      CUDA: sw.global:
-// CUDA-NEXT:   call void @__cudaRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 0, i64 %size, i32 0, i32 0)
+// CUDA-NEXT:   call void @__cudaRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %extern, i64 %size, i32 %constant, i32 0)
 // CUDA-NEXT:   br label %if.end
 
 //      CUDA: sw.managed:
 // CUDA-NEXT:   br label %if.end
 
 //      CUDA: sw.surface:
+// CUDA-NEXT:   call void @__cudaRegisterSurface(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %extern)
 // CUDA-NEXT:   br label %if.end
 
 //      CUDA: sw.texture:
+// CUDA-NEXT:   call void @__cudaRegisterTexture(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %normalized, i32 %extern)
 // CUDA-NEXT:   br label %if.end
 
 //      CUDA: if.end:
-// CUDA-NEXT:   %7 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1
-// CUDA-NEXT:   %8 = icmp eq ptr %7, @__stop_cuda_offloading_entries
-// CUDA-NEXT:   br i1 %8, label %while.end, label %while.entry
+// CUDA-NEXT:   %11 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1
+// CUDA-NEXT:   %12 = icmp eq ptr %11, @__stop_cuda_offloading_entries
+// CUDA-NEXT:   br i1 %12, label %while.end, label %while.entry
 
 //      CUDA: while.end:
 // CUDA-NEXT:   ret void
@@ -168,7 +179,7 @@
 // HIP-NEXT:   br i1 icmp ne (ptr @__start_hip_offloading_entries, ptr @__stop_hip_offloading_entries), label %while.entry, label %while.end
 
 //      HIP: while.entry:
-// HIP-NEXT:   %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %7, %if.end ]
+// HIP-NEXT:   %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %11, %if.end ]
 // HIP-NEXT:   %1 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 0
 // HIP-NEXT:   %addr = load ptr, ptr %1, align 8
 // HIP-NEXT:   %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 1
@@ -176,16 +187,25 @@
 // HIP-NEXT:   %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 2
 // HIP-NEXT:   %size = load i64, ptr %3, align 4
 // HIP-NEXT:   %4 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 3
-// HIP-NEXT:   %flag = load i32, ptr %4, align 4
-// HIP-NEXT:   %5 = icmp eq i64 %size, 0
-// HIP-NEXT:   br i1 %5, label %if.then, label %if.else
+// HIP-NEXT:   %flags = load i32, ptr %4, align 4
+// HIP-NEXT:   %5 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 4
+// HIP-NEXT:   %textype = load i32, ptr %4, align 4
+// HIP-NEXT:   %type = and i32 %flags, 7
+// HIP-NEXT:   %6 = and i32 %flags, 8
+// HIP-NEXT:   %extern = lshr i32 %6, 3
+// HIP-NEXT:   %7 = and i32 %flags, 16
+// HIP-NEXT:   %constant = lshr i32 %7, 4
+// HIP-NEXT:   %8 = and i32 %flags, 32
+// HIP-NEXT:   %normalized = lshr i32 %8, 5
+// HIP-NEXT:   %9 = icmp eq i64 %size, 0
+// HIP-NEXT:   br i1 %9, label %if.then, label %if.else
 
 //      HIP: if.then:
-// HIP-NEXT:   %6 = call i32 @__hipRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
+// HIP-NEXT:   %10 = call i32 @__hipRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
 // HIP-NEXT:   br label %if.end
 
 //      HIP: if.else:
-// HIP-NEXT:   switch i32 %flag, label %if.end [
+// HIP-NEXT:   switch i32 %type, label %if.end [
 // HIP-NEXT:     i32 0, label %sw.global
 // HIP-NEXT:     i32 1, label %sw.managed
 // HIP-NEXT:     i32 2, label %sw.surface
@@ -193,22 +213,24 @@
 // HIP-NEXT:   ]
 
 //      HIP: sw.global:
-// HIP-NEXT:   call void @__hipRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 0, i64 %size, i32 0, i32 0)
+// HIP-NEXT:   call void @__hipRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %extern, i64 %size, i32 %constant, i32 0)
 // HIP-NEXT:   br label %if.end
 
 //      HIP: sw.managed:
 // HIP-NEXT:   br label %if.end
 
 //      HIP: sw.surface:
+// HIP-NEXT:   call void @__hipRegisterSurface(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %extern)
 // HIP-NEXT:   br label %if.end
 
 //      HIP: sw.texture:
+// HIP-NEXT:   call void @__hipRegisterTexture(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %normalized, i32 %extern)
 // HIP-NEXT:   br label %if.end
 
 //      HIP: if.end:
-// HIP-NEXT:   %7 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1
-// HIP-NEXT:   %8 = icmp eq ptr %7, @__stop_hip_offloading_entries
-// HIP-NEXT:   br i1 %8, label %while.end, label %while.entry
+// HIP-NEXT:   %11 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1
+// HIP-NEXT:   %12 = icmp eq ptr %11, @__stop_hip_offloading_entries
+// HIP-NEXT:   br i1 %12, label %while.end, label %while.entry
 
 //      HIP: while.end:
 // HIP-NEXT:   ret void

diff  --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
index 3e1dd874216ccf..58d9e1e85ceff9 100644
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -26,19 +26,6 @@ namespace {
 constexpr unsigned CudaFatMagic = 0x466243b1;
 constexpr unsigned HIPFatMagic = 0x48495046;
 
-/// Copied from clang/CGCudaRuntime.h.
-enum OffloadEntryKindFlag : uint32_t {
-  /// Mark the entry as a global entry. This indicates the presense of a
-  /// kernel if the size size field is zero and a variable otherwise.
-  OffloadGlobalEntry = 0x0,
-  /// Mark the entry as a managed global variable.
-  OffloadGlobalManagedEntry = 0x1,
-  /// Mark the entry as a surface variable.
-  OffloadGlobalSurfaceEntry = 0x2,
-  /// Mark the entry as a texture variable.
-  OffloadGlobalTextureEntry = 0x3,
-};
-
 IntegerType *getSizeTTy(Module &M) {
   return M.getDataLayout().getIntPtrType(M.getContext());
 }
@@ -333,6 +320,24 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
   FunctionCallee RegVar = M.getOrInsertFunction(
       IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy);
 
+  // Get the __cudaRegisterSurface function declaration.
+  auto *RegSurfaceTy =
+      FunctionType::get(Type::getVoidTy(C),
+                        {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy,
+                         Type::getInt32Ty(C), Type::getInt32Ty(C)},
+                        /*isVarArg=*/false);
+  FunctionCallee RegSurface = M.getOrInsertFunction(
+      IsHIP ? "__hipRegisterSurface" : "__cudaRegisterSurface", RegSurfaceTy);
+
+  // Get the __cudaRegisterTexture function declaration.
+  auto *RegTextureTy = FunctionType::get(
+      Type::getVoidTy(C),
+      {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Type::getInt32Ty(C),
+       Type::getInt32Ty(C), Type::getInt32Ty(C)},
+      /*isVarArg=*/false);
+  FunctionCallee RegTexture = M.getOrInsertFunction(
+      IsHIP ? "__hipRegisterTexture" : "__cudaRegisterTexture", RegTextureTy);
+
   auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C), Int8PtrPtrTy,
                                          /*isVarArg*/ false);
   auto *RegGlobalsFn =
@@ -375,7 +380,31 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
       Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry,
                                 {ConstantInt::get(getSizeTTy(M), 0),
                                  ConstantInt::get(Type::getInt32Ty(C), 3)});
-  auto *Flags = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "flag");
+  auto *Flags = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "flags");
+  auto *DataPtr =
+      Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry,
+                                {ConstantInt::get(getSizeTTy(M), 0),
+                                 ConstantInt::get(Type::getInt32Ty(C), 4)});
+  auto *Data = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "textype");
+  auto *Kind = Builder.CreateAnd(
+      Flags, ConstantInt::get(Type::getInt32Ty(C), 0x7), "type");
+
+  // Extract the flags stored in the bit-field and convert them to C booleans.
+  auto *ExternBit = Builder.CreateAnd(
+      Flags, ConstantInt::get(Type::getInt32Ty(C),
+                              llvm::offloading::OffloadGlobalExtern));
+  auto *Extern = Builder.CreateLShr(
+      ExternBit, ConstantInt::get(Type::getInt32Ty(C), 3), "extern");
+  auto *ConstantBit = Builder.CreateAnd(
+      Flags, ConstantInt::get(Type::getInt32Ty(C),
+                              llvm::offloading::OffloadGlobalConstant));
+  auto *Const = Builder.CreateLShr(
+      ConstantBit, ConstantInt::get(Type::getInt32Ty(C), 4), "constant");
+  auto *NormalizedBit = Builder.CreateAnd(
+      Flags, ConstantInt::get(Type::getInt32Ty(C),
+                              llvm::offloading::OffloadGlobalNormalized));
+  auto *Normalized = Builder.CreateLShr(
+      NormalizedBit, ConstantInt::get(Type::getInt32Ty(C), 5), "normalized");
   auto *FnCond =
       Builder.CreateICmpEQ(Size, ConstantInt::getNullValue(getSizeTTy(M)));
   Builder.CreateCondBr(FnCond, IfThenBB, IfElseBB);
@@ -392,30 +421,37 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
   Builder.CreateBr(IfEndBB);
   Builder.SetInsertPoint(IfElseBB);
 
-  auto *Switch = Builder.CreateSwitch(Flags, IfEndBB);
+  auto *Switch = Builder.CreateSwitch(Kind, IfEndBB);
   // Create global variable registration code.
   Builder.SetInsertPoint(SwGlobalBB);
-  Builder.CreateCall(RegVar, {RegGlobalsFn->arg_begin(), Addr, Name, Name,
-                              ConstantInt::get(Type::getInt32Ty(C), 0), Size,
-                              ConstantInt::get(Type::getInt32Ty(C), 0),
-                              ConstantInt::get(Type::getInt32Ty(C), 0)});
+  Builder.CreateCall(RegVar,
+                     {RegGlobalsFn->arg_begin(), Addr, Name, Name, Extern, Size,
+                      Const, ConstantInt::get(Type::getInt32Ty(C), 0)});
   Builder.CreateBr(IfEndBB);
-  Switch->addCase(Builder.getInt32(OffloadGlobalEntry), SwGlobalBB);
+  Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalEntry),
+                  SwGlobalBB);
 
   // Create managed variable registration code.
   Builder.SetInsertPoint(SwManagedBB);
   Builder.CreateBr(IfEndBB);
-  Switch->addCase(Builder.getInt32(OffloadGlobalManagedEntry), SwManagedBB);
+  Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalManagedEntry),
+                  SwManagedBB);
 
   // Create surface variable registration code.
   Builder.SetInsertPoint(SwSurfaceBB);
+  Builder.CreateCall(
+      RegSurface, {RegGlobalsFn->arg_begin(), Addr, Name, Name, Data, Extern});
   Builder.CreateBr(IfEndBB);
-  Switch->addCase(Builder.getInt32(OffloadGlobalSurfaceEntry), SwSurfaceBB);
+  Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalSurfaceEntry),
+                  SwSurfaceBB);
 
   // Create texture variable registration code.
   Builder.SetInsertPoint(SwTextureBB);
+  Builder.CreateCall(RegTexture, {RegGlobalsFn->arg_begin(), Addr, Name, Name,
+                                  Data, Normalized, Extern});
   Builder.CreateBr(IfEndBB);
-  Switch->addCase(Builder.getInt32(OffloadGlobalTextureEntry), SwTextureBB);
+  Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalTextureEntry),
+                  SwTextureBB);
 
   Builder.SetInsertPoint(IfEndBB);
   auto *NewEntry = Builder.CreateInBoundsGEP(

diff  --git a/llvm/include/llvm/Frontend/Offloading/Utility.h b/llvm/include/llvm/Frontend/Offloading/Utility.h
index 631d5a5a3db68e..520c192996a066 100644
--- a/llvm/include/llvm/Frontend/Offloading/Utility.h
+++ b/llvm/include/llvm/Frontend/Offloading/Utility.h
@@ -6,12 +6,35 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_FRONTEND_OFFLOADING_UTILITY_H
+#define LLVM_FRONTEND_OFFLOADING_UTILITY_H
+
 #include "llvm/IR/Module.h"
 #include "llvm/Object/OffloadBinary.h"
 
 namespace llvm {
 namespace offloading {
 
+/// Offloading entry flags for CUDA / HIP. The first three bits indicate the
+/// type of entry while the others are a bit field for additional information.
+enum OffloadEntryKindFlag : uint32_t {
+  /// Mark the entry as a global entry. This indicates the presense of a
+  /// kernel if the size size field is zero and a variable otherwise.
+  OffloadGlobalEntry = 0x0,
+  /// Mark the entry as a managed global variable.
+  OffloadGlobalManagedEntry = 0x1,
+  /// Mark the entry as a surface variable.
+  OffloadGlobalSurfaceEntry = 0x2,
+  /// Mark the entry as a texture variable.
+  OffloadGlobalTextureEntry = 0x3,
+  /// Mark the entry as being extern.
+  OffloadGlobalExtern = 0x1 << 3,
+  /// Mark the entry as being constant.
+  OffloadGlobalConstant = 0x1 << 4,
+  /// Mark the entry as being a normalized surface.
+  OffloadGlobalNormalized = 0x1 << 5,
+};
+
 /// Returns the type of the offloading entry we use to store kernels and
 /// globals that will be registered with the offloading runtime.
 StructType *getEntryTy(Module &M);
@@ -25,7 +48,7 @@ StructType *getEntryTy(Module &M);
 ///   char    *name;      // Name of the function or global.
 ///   size_t  size;       // Size of the entry info (0 if it a function).
 ///   int32_t flags;
-///   int32_t reserved;
+///   int32_t data;
 /// };
 ///
 /// \param M The module to be used
@@ -33,9 +56,11 @@ StructType *getEntryTy(Module &M);
 /// \param Name The symbol name associated with the global.
 /// \param Size The size in bytes of the global (0 for functions).
 /// \param Flags Flags associated with the entry.
+/// \param Data Extra data storage associated with the entry.
 /// \param SectionName The section this entry will be placed at.
 void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name,
-                         uint64_t Size, int32_t Flags, StringRef SectionName);
+                         uint64_t Size, int32_t Flags, int32_t Data,
+                         StringRef SectionName);
 
 /// Creates a pair of globals used to iterate the array of offloading entries by
 /// accessing the section variables provided by the linker.
@@ -44,3 +69,5 @@ getOffloadEntryArray(Module &M, StringRef SectionName);
 
 } // namespace offloading
 } // namespace llvm
+
+#endif // LLVM_FRONTEND_OFFLOADING_UTILITY_H

diff  --git a/llvm/lib/Frontend/Offloading/Utility.cpp b/llvm/lib/Frontend/Offloading/Utility.cpp
index 1c08f02c17f5fa..25f609517ebeb7 100644
--- a/llvm/lib/Frontend/Offloading/Utility.cpp
+++ b/llvm/lib/Frontend/Offloading/Utility.cpp
@@ -29,7 +29,7 @@ StructType *offloading::getEntryTy(Module &M) {
 
 // TODO: Rework this interface to be more generic.
 void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name,
-                                     uint64_t Size, int32_t Flags,
+                                     uint64_t Size, int32_t Flags, int32_t Data,
                                      StringRef SectionName) {
   llvm::Triple Triple(M.getTargetTriple());
 
@@ -51,7 +51,7 @@ void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name,
       ConstantExpr::getPointerBitCastOrAddrSpaceCast(Str, Int8PtrTy),
       ConstantInt::get(SizeTy, Size),
       ConstantInt::get(Int32Ty, Flags),
-      ConstantInt::get(Int32Ty, 0),
+      ConstantInt::get(Int32Ty, Data),
   };
   Constant *EntryInitializer = ConstantStruct::get(getEntryTy(M), EntryData);
 

diff  --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 4c1def507df46c..be9a44885f668e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6393,7 +6393,7 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
                                          StringRef Name) {
   if (!Config.isGPU()) {
     llvm::offloading::emitOffloadingEntry(
-        M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags,
+        M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
         "omp_offloading_entries");
     return;
   }


        


More information about the cfe-commits mailing list