[clang] ce091eb - [HIP] Add support for handling HIP in the linker wrapper

Mon Jul 11 12:49:34 PDT 2022

Author: Joseph Huber
Date: 2022-07-11T15:49:23-04:00
New Revision: ce091eb3b91fc683513f47a565d68cf2799804c9

URL: https://github.com/llvm/llvm-project/commit/ce091eb3b91fc683513f47a565d68cf2799804c9
DIFF: https://github.com/llvm/llvm-project/commit/ce091eb3b91fc683513f47a565d68cf2799804c9.diff

LOG: [HIP] Add support for handling HIP in the linker wrapper

This patch adds the changes required to bundle and wrap HIP
files. The bundling is currently done using `clang-offload-bundler` to
mimic `fatbinary`, and the wrapping is done using runtime calls very
similar to CUDA's. This still does not support managed / surface / texture
variables, which would require some additional information in the entry.

One difference in the code generation for AMD is that I don't check whether
the handle is null before destructing it; I'm not sure if that's
required.

With this we should be able to support HIP with the new driver.

Depends on D128850

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D128914

Added: 
    

Modified: 
    clang/test/Driver/linker-wrapper-image.c
    clang/test/Driver/linker-wrapper.c
    clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
    clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
    clang/tools/clang-linker-wrapper/OffloadWrapper.h

Removed: 
    


################################################################################
diff  --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c
index 51904e1839cdd..59f2014e466e8 100644

--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -77,7 +77,6 @@
 // CUDA-NEXT:  %5 = icmp eq i64 %size, 0
 // CUDA-NEXT:  br i1 %5, label %if.then, label %if.else
 
-
 //      CUDA: if.then:
 // CUDA-NEXT:   %6 = call i32 @__cudaRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
 // CUDA-NEXT:   br label %if.end
@@ -111,3 +110,84 @@
 //      CUDA: while.end:
 // CUDA-NEXT:   ret void
 // CUDA-NEXT: }
+
+// RUN: clang-offload-packager -o %t.out --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx908
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
+// RUN:   -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple x86_64-unknown-linux-gnu \
+// RUN:   -linker-path /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP
+
+//      HIP: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin"
+// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
+// HIP-NEXT: @__dummy.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries"
+// HIP-NEXT: @.hip.binary_handle = internal global ptr null
+// HIP-NEXT: @__start_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// HIP-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+
+//      HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
+// HIP-NEXT: entry:
+// HIP-NEXT:   %0 = call ptr @__hipRegisterFatBinary(ptr @.fatbin_wrapper)
+// HIP-NEXT:   store ptr %0, ptr @.hip.binary_handle, align 8
+// HIP-NEXT:   call void @.hip.globals_reg(ptr %0)
+// HIP-NEXT:   %1 = call i32 @atexit(ptr @.hip.fatbin_unreg)
+// HIP-NEXT:   ret void
+// HIP-NEXT: }
+
+//      HIP: define internal void @.hip.fatbin_unreg() section ".text.startup" {
+// HIP-NEXT: entry:
+// HIP-NEXT:   %0 = load ptr, ptr @.hip.binary_handle, align 8
+// HIP-NEXT:   call void @__hipUnregisterFatBinary(ptr %0)
+// HIP-NEXT:   ret void
+// HIP-NEXT: }
+
+//      HIP: define internal void @.hip.globals_reg(ptr %0) section ".text.startup" {
+// HIP-NEXT: entry:
+// HIP-NEXT:   br i1 icmp ne (ptr @__start_hip_offloading_entries, ptr @__stop_hip_offloading_entries), label %while.entry, label %while.end
+
+//      HIP: while.entry:
+// HIP-NEXT:   %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %7, %if.end ]
+// HIP-NEXT:   %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0
+// HIP-NEXT:   %addr = load ptr, ptr %1, align 8
+// HIP-NEXT:   %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1
+// HIP-NEXT:   %name = load ptr, ptr %2, align 8
+// HIP-NEXT:   %3 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 2
+// HIP-NEXT:   %size = load i64, ptr %3, align 4
+// HIP-NEXT:   %4 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 3
+// HIP-NEXT:   %flag = load i32, ptr %4, align 4
+// HIP-NEXT:   %5 = icmp eq i64 %size, 0
+// HIP-NEXT:   br i1 %5, label %if.then, label %if.else
+
+//      HIP: if.then:
+// HIP-NEXT:   %6 = call i32 @__hipRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
+// HIP-NEXT:   br label %if.end
+
+//      HIP: if.else:
+// HIP-NEXT:   switch i32 %flag, label %if.end [
+// HIP-NEXT:     i32 0, label %sw.global
+// HIP-NEXT:     i32 1, label %sw.managed
+// HIP-NEXT:     i32 2, label %sw.surface
+// HIP-NEXT:     i32 3, label %sw.texture
+// HIP-NEXT:   ]
+
+//      HIP: sw.global:
+// HIP-NEXT:   call void @__hipRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 0, i64 %size, i32 0, i32 0)
+// HIP-NEXT:   br label %if.end
+
+//      HIP: sw.managed:
+// HIP-NEXT:   br label %if.end
+
+//      HIP: sw.surface:
+// HIP-NEXT:   br label %if.end
+
+//      HIP: sw.texture:
+// HIP-NEXT:   br label %if.end
+
+//      HIP: if.end:
+// HIP-NEXT:   %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1
+// HIP-NEXT:   %8 = icmp eq ptr %7, @__stop_hip_offloading_entries
+// HIP-NEXT:   br i1 %8, label %while.end, label %while.entry
+
+//      HIP: while.end:
+// HIP-NEXT:   ret void
+// HIP-NEXT: }

diff  --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 201a69c44c999..b8ec9efc0b84e 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -91,6 +91,19 @@
 // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o {{.*}}.o
 // CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_52,file={{.*}}.out --image=profile=sm_70,file={{.*}}.out
 
+// RUN: clang-offload-packager -o %t.out \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx908
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
+// RUN:   -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --dry-run --host-triple x86_64-unknown-linux-gnu -linker-path \
+// RUN:   /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP
+
+// HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o
+// HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -o {{.*}}.out {{.*}}.o
+// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}}.out -input={{.*}}out -output={{.*}}.hipfb
+
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70
@@ -103,6 +116,7 @@
 // LINKER_ARGS: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o a
 // LINKER_ARGS: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o a b
 
+/// Ensure that temp files aren't left over from static libraries.
 // RUN: clang-offload-packager -o %t-lib.out \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52

diff  --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 21a8bfa966626..11a6a4da991da 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -587,6 +587,51 @@ Expected<StringRef> link(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
 
   return *TempFileOrErr;
 }
+
+Expected<StringRef>
+fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
+          const ArgList &Args) {
+  // AMDGPU uses the clang-offload-bundler to bundle the linked images.
+  Expected<std::string> OffloadBundlerPath = findProgram(
+      "clang-offload-bundler", {getMainExecutable("clang-offload-bundler")});
+  if (!OffloadBundlerPath)
+    return OffloadBundlerPath.takeError();
+
+  llvm::Triple Triple(
+      Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple()));
+
+  // Create a new file to write the linked device image to.
+  auto TempFileOrErr = createOutputFile(sys::path::filename(ExecutableName) +
+                                            "-device-" + Triple.getArchName(),
+                                        "hipfb");
+  if (!TempFileOrErr)
+    return TempFileOrErr.takeError();
+
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+
+  SmallVector<StringRef, 16> CmdArgs;
+  CmdArgs.push_back(*OffloadBundlerPath);
+  CmdArgs.push_back("-type=o");
+  CmdArgs.push_back("-bundle-align=4096");
+
+  SmallVector<StringRef> Targets = {"-targets=host-x86_64-unknown-linux"};
+  for (const auto &FileAndArch : InputFiles)
+    Targets.push_back(
+        Saver.save("hipv4-amdgcn-amd-amdhsa--" + std::get<1>(FileAndArch)));
+  CmdArgs.push_back(Saver.save(llvm::join(Targets, ",")));
+
+  CmdArgs.push_back("-input=/dev/null");
+  for (const auto &FileAndArch : InputFiles)
+    CmdArgs.push_back(Saver.save("-input=" + std::get<0>(FileAndArch)));
+
+  CmdArgs.push_back(Saver.save("-output=" + *TempFileOrErr));
+
+  if (Error Err = executeCommands(*OffloadBundlerPath, CmdArgs))
+    return std::move(Err);
+
+  return *TempFileOrErr;
+}
 } // namespace amdgcn
 
 namespace generic {
@@ -1093,6 +1138,10 @@ wrapDeviceImages(ArrayRef<std::unique_ptr<MemoryBuffer>> Buffers,
     if (Error Err = wrapCudaBinary(M, BuffersToWrap.front()))
       return std::move(Err);
     break;
+  case OFK_HIP:
+    if (Error Err = wrapHIPBinary(M, BuffersToWrap.front()))
+      return std::move(Err);
+    break;
   default:
     return createStringError(inconvertibleErrorCode(),
                              getOffloadKindName(Kind) +
@@ -1120,20 +1169,43 @@ bundleOpenMP(ArrayRef<OffloadingImage> Images) {
 
 Expected<SmallVector<std::unique_ptr<MemoryBuffer>>>
 bundleCuda(ArrayRef<OffloadingImage> Images, const ArgList &Args) {
+  SmallVector<std::pair<StringRef, StringRef>, 4> InputFiles;
+  for (const OffloadingImage &Image : Images)
+    InputFiles.emplace_back(std::make_pair(Image.Image->getBufferIdentifier(),
+                                           Image.StringData.lookup("arch")));
+
+  Triple TheTriple = Triple(Images.front().StringData.lookup("triple"));
+  auto FileOrErr = nvptx::fatbinary(InputFiles, Args);
+  if (!FileOrErr)
+    return FileOrErr.takeError();
+
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> ImageOrError =
+      llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr);
+
   SmallVector<std::unique_ptr<MemoryBuffer>> Buffers;
+  if (std::error_code EC = ImageOrError.getError())
+    return createFileError(*FileOrErr, EC);
+  Buffers.emplace_back(std::move(*ImageOrError));
+
+  return std::move(Buffers);
+}
 
+Expected<SmallVector<std::unique_ptr<MemoryBuffer>>>
+bundleHIP(ArrayRef<OffloadingImage> Images, const ArgList &Args) {
   SmallVector<std::pair<StringRef, StringRef>, 4> InputFiles;
   for (const OffloadingImage &Image : Images)
     InputFiles.emplace_back(std::make_pair(Image.Image->getBufferIdentifier(),
                                            Image.StringData.lookup("arch")));
 
   Triple TheTriple = Triple(Images.front().StringData.lookup("triple"));
-  auto FileOrErr = nvptx::fatbinary(InputFiles, Args);
+  auto FileOrErr = amdgcn::fatbinary(InputFiles, Args);
   if (!FileOrErr)
     return FileOrErr.takeError();
 
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> ImageOrError =
       llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr);
+
+  SmallVector<std::unique_ptr<MemoryBuffer>> Buffers;
   if (std::error_code EC = ImageOrError.getError())
     return createFileError(*FileOrErr, EC);
   Buffers.emplace_back(std::move(*ImageOrError));
@@ -1151,6 +1223,8 @@ bundleLinkedOutput(ArrayRef<OffloadingImage> Images, const ArgList &Args,
     return bundleOpenMP(Images);
   case OFK_Cuda:
     return bundleCuda(Images, Args);
+  case OFK_HIP:
+    return bundleHIP(Images, Args);
   default:
     return createStringError(inconvertibleErrorCode(),
                              getOffloadKindName(Kind) +

diff  --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
index 847113ef91ce9..a5c4fc981cbdd 100644
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -22,6 +22,7 @@ using namespace llvm;
 namespace {
 /// Magic number that begins the section containing the CUDA fatbinary.
 constexpr unsigned CudaFatMagic = 0x466243b1;
+constexpr unsigned HIPFatMagic = 0x48495046;
 
 /// Copied from clang/CGCudaRuntime.h.
 enum OffloadEntryKindFlag : uint32_t {
@@ -288,14 +289,15 @@ StructType *getFatbinWrapperTy(Module &M) {
 
 /// Embed the image \p Image into the module \p M so it can be found by the
 /// runtime.
-GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
+GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image, bool IsHIP) {
   LLVMContext &C = M.getContext();
   llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C);
   llvm::Triple Triple = llvm::Triple(M.getTargetTriple());
 
   // Create the global string containing the fatbinary.
   StringRef FatbinConstantSection =
-      Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+      IsHIP ? ".hip_fatbin"
+            : (Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin");
   auto *Data = ConstantDataArray::get(C, Image);
   auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true,
                                     GlobalVariable::InternalLinkage, Data,
@@ -303,10 +305,11 @@ GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
   Fatbin->setSection(FatbinConstantSection);
 
   // Create the fatbinary wrapper
-  StringRef FatbinWrapperSection =
-      Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+  StringRef FatbinWrapperSection = IsHIP               ? ".hipFatBinSegment"
+                                   : Triple.isMacOSX() ? "__NV_CUDA,__fatbin"
+                                                       : ".nvFatBinSegment";
   Constant *FatbinWrapper[] = {
-      ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic),
+      ConstantInt::get(Type::getInt32Ty(C), IsHIP ? HIPFatMagic : CudaFatMagic),
       ConstantInt::get(Type::getInt32Ty(C), 1),
       ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy),
       ConstantPointerNull::get(Type::getInt8PtrTy(C))};
@@ -328,9 +331,10 @@ GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
       ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
   auto *DummyEntry = new GlobalVariable(
       M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
-      "__dummy.cuda_offloading.entry");
-  DummyEntry->setSection("cuda_offloading_entries");
+      IsHIP ? "__dummy.hip_offloading.entry" : "__dummy.cuda_offloading.entry");
   DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+  DummyEntry->setSection(IsHIP ? "hip_offloading_entries"
+                               : "cuda_offloading_entries");
 
   return FatbinDesc;
 }
@@ -358,7 +362,7 @@ GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image) {
 ///                         0, entry->size, 0, 0);
 ///   }
 /// }
-Function *createRegisterGlobalsFunction(Module &M) {
+Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
   LLVMContext &C = M.getContext();
   // Get the __cudaRegisterFunction function declaration.
   auto *RegFuncTy = FunctionType::get(
@@ -368,8 +372,8 @@ Function *createRegisterGlobalsFunction(Module &M) {
        Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C),
        Type::getInt8PtrTy(C), Type::getInt32PtrTy(C)},
       /*isVarArg*/ false);
-  FunctionCallee RegFunc =
-      M.getOrInsertFunction("__cudaRegisterFunction", RegFuncTy);
+  FunctionCallee RegFunc = M.getOrInsertFunction(
+      IsHIP ? "__hipRegisterFunction" : "__cudaRegisterFunction", RegFuncTy);
 
   // Get the __cudaRegisterVar function declaration.
   auto *RegVarTy = FunctionType::get(
@@ -378,25 +382,31 @@ Function *createRegisterGlobalsFunction(Module &M) {
        Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt32Ty(C),
        getSizeTTy(M), Type::getInt32Ty(C), Type::getInt32Ty(C)},
       /*isVarArg*/ false);
-  FunctionCallee RegVar = M.getOrInsertFunction("__cudaRegisterVar", RegVarTy);
+  FunctionCallee RegVar = M.getOrInsertFunction(
+      IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy);
 
   // Create the references to the start / stop symbols defined by the linker.
-  auto *EntriesB = new GlobalVariable(
-      M, ArrayType::get(getEntryTy(M), 0), /*isConstant*/ true,
-      GlobalValue::ExternalLinkage,
-      /*Initializer*/ nullptr, "__start_cuda_offloading_entries");
+  auto *EntriesB =
+      new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0),
+                         /*isConstant*/ true, GlobalValue::ExternalLinkage,
+                         /*Initializer*/ nullptr,
+                         IsHIP ? "__start_hip_offloading_entries"
+                               : "__start_cuda_offloading_entries");
   EntriesB->setVisibility(GlobalValue::HiddenVisibility);
-  auto *EntriesE = new GlobalVariable(
-      M, ArrayType::get(getEntryTy(M), 0), /*isConstant*/ true,
-      GlobalValue::ExternalLinkage,
-      /*Initializer*/ nullptr, "__stop_cuda_offloading_entries");
+  auto *EntriesE =
+      new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0),
+                         /*isConstant*/ true, GlobalValue::ExternalLinkage,
+                         /*Initializer*/ nullptr,
+                         IsHIP ? "__stop_hip_offloading_entries"
+                               : "__stop_cuda_offloading_entries");
   EntriesE->setVisibility(GlobalValue::HiddenVisibility);
 
   auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C),
                                          Type::getInt8PtrTy(C)->getPointerTo(),
                                          /*isVarArg*/ false);
-  auto *RegGlobalsFn = Function::Create(
-      RegGlobalsTy, GlobalValue::InternalLinkage, ".cuda.globals_reg", &M);
+  auto *RegGlobalsFn =
+      Function::Create(RegGlobalsTy, GlobalValue::InternalLinkage,
+                       IsHIP ? ".hip.globals_reg" : ".cuda.globals_reg", &M);
   RegGlobalsFn->setSection(".text.startup");
 
   // Create the loop to register all the entries.
@@ -502,24 +512,27 @@ Function *createRegisterGlobalsFunction(Module &M) {
 
 // Create the constructor and destructor to register the fatbinary with the CUDA
 // runtime.
-void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc) {
+void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc,
+                                  bool IsHIP) {
   LLVMContext &C = M.getContext();
   auto *CtorFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *CtorFunc = Function::Create(CtorFuncTy, GlobalValue::InternalLinkage,
-                                    ".cuda.fatbin_reg", &M);
+  auto *CtorFunc =
+      Function::Create(CtorFuncTy, GlobalValue::InternalLinkage,
+                       IsHIP ? ".hip.fatbin_reg" : ".cuda.fatbin_reg", &M);
   CtorFunc->setSection(".text.startup");
 
   auto *DtorFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *DtorFunc = Function::Create(DtorFuncTy, GlobalValue::InternalLinkage,
-                                    ".cuda.fatbin_unreg", &M);
+  auto *DtorFunc =
+      Function::Create(DtorFuncTy, GlobalValue::InternalLinkage,
+                       IsHIP ? ".hip.fatbin_unreg" : ".cuda.fatbin_unreg", &M);
   DtorFunc->setSection(".text.startup");
 
   // Get the __cudaRegisterFatBinary function declaration.
   auto *RegFatTy = FunctionType::get(Type::getInt8PtrTy(C)->getPointerTo(),
                                      Type::getInt8PtrTy(C),
                                      /*isVarArg*/ false);
-  FunctionCallee RegFatbin =
-      M.getOrInsertFunction("__cudaRegisterFatBinary", RegFatTy);
+  FunctionCallee RegFatbin = M.getOrInsertFunction(
+      IsHIP ? "__hipRegisterFatBinary" : "__cudaRegisterFatBinary", RegFatTy);
   // Get the __cudaRegisterFatBinaryEnd function declaration.
   auto *RegFatEndTy = FunctionType::get(Type::getVoidTy(C),
                                         Type::getInt8PtrTy(C)->getPointerTo(),
@@ -530,8 +543,9 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc) {
   auto *UnregFatTy = FunctionType::get(Type::getVoidTy(C),
                                        Type::getInt8PtrTy(C)->getPointerTo(),
                                        /*isVarArg*/ false);
-  FunctionCallee UnregFatbin =
-      M.getOrInsertFunction("__cudaUnregisterFatBinary", UnregFatTy);
+  FunctionCallee UnregFatbin = M.getOrInsertFunction(
+      IsHIP ? "__hipUnregisterFatBinary" : "__cudaUnregisterFatBinary",
+      UnregFatTy);
 
   auto *AtExitTy =
       FunctionType::get(Type::getInt32Ty(C), DtorFuncTy->getPointerTo(),
@@ -542,7 +556,7 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc) {
       M, Type::getInt8PtrTy(C)->getPointerTo(), false,
       llvm::GlobalValue::InternalLinkage,
       llvm::ConstantPointerNull::get(Type::getInt8PtrTy(C)->getPointerTo()),
-      ".cuda.binary_handle");
+      IsHIP ? ".hip.binary_handle" : ".cuda.binary_handle");
 
   // Create the constructor to register this image with the runtime.
   IRBuilder<> CtorBuilder(BasicBlock::Create(C, "entry", CtorFunc));
@@ -552,8 +566,9 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc) {
   CtorBuilder.CreateAlignedStore(
       Handle, BinaryHandleGlobal,
       Align(M.getDataLayout().getPointerTypeSize(Type::getInt8PtrTy(C))));
-  CtorBuilder.CreateCall(createRegisterGlobalsFunction(M), Handle);
-  CtorBuilder.CreateCall(RegFatbinEnd, Handle);
+  CtorBuilder.CreateCall(createRegisterGlobalsFunction(M, IsHIP), Handle);
+  if (!IsHIP)
+    CtorBuilder.CreateCall(RegFatbinEnd, Handle);
   CtorBuilder.CreateCall(AtExit, DtorFunc);
   CtorBuilder.CreateRetVoid();
 
@@ -584,11 +599,21 @@ Error wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
 }
 
 Error wrapCudaBinary(Module &M, ArrayRef<char> Image) {
-  GlobalVariable *Desc = createFatbinDesc(M, Image);
+  GlobalVariable *Desc = createFatbinDesc(M, Image, /* IsHIP */ false);
+  if (!Desc)
+    return createStringError(inconvertibleErrorCode(),
+                             "No fatinbary section created.");
+
+  createRegisterFatbinFunction(M, Desc, /* IsHIP */ false);
+  return Error::success();
+}
+
+Error wrapHIPBinary(Module &M, ArrayRef<char> Image) {
+  GlobalVariable *Desc = createFatbinDesc(M, Image, /* IsHIP */ true);
   if (!Desc)
     return createStringError(inconvertibleErrorCode(),
                              "No fatinbary section created.");
 
-  createRegisterFatbinFunction(M, Desc);
+  createRegisterFatbinFunction(M, Desc, /* IsHIP */ true);
   return Error::success();
 }

diff  --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h
index bfdd7d4357041..679333975b212 100644
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h
@@ -21,4 +21,8 @@ llvm::Error wrapOpenMPBinaries(llvm::Module &M,
 /// registers the images with the CUDA runtime.
 llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images);
 
+/// Wraps the input bundled image into the module \p M as global symbols and
+/// registers the images with the HIP runtime.
+llvm::Error wrapHIPBinary(llvm::Module &M, llvm::ArrayRef<char> Images);
+
 #endif