[clang] dc6a0b0 - [HIP] Align device binary

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Fri Oct 2 15:12:09 PDT 2020


Author: Yaxun (Sam) Liu
Date: 2020-10-02T18:10:44-04:00
New Revision: dc6a0b0ec7e3d72a4cc849af4e4aa6c6a29a53d2

URL: https://github.com/llvm/llvm-project/commit/dc6a0b0ec7e3d72a4cc849af4e4aa6c6a29a53d2
DIFF: https://github.com/llvm/llvm-project/commit/dc6a0b0ec7e3d72a4cc849af4e4aa6c6a29a53d2.diff

LOG: [HIP] Align device binary

To facilitate faster loading of device binaries and sharing them among
processes, the HIP runtime prefers that they be aligned to 4096 bytes. The HIP
runtime can load unaligned device binaries; however, aligning them to 4096
bytes results in faster loading and lower shared memory usage.

This patch adds an option -bundle-align to clang-offload-bundler which allows
bundles to be aligned to a specified alignment. By default it is 1, which is NFC
compared to the existing format.

This patch then aligns embedded fat binary and device binary inside fat binary
at 4096 bytes.

It has been verified this change does not cause significant overall file size increase
for typical HIP applications (less than 1%).

Differential Revision: https://reviews.llvm.org/D88734

Added: 
    

Modified: 
    clang/lib/CodeGen/CGCUDANV.cpp
    clang/lib/Driver/ToolChains/HIP.cpp
    clang/test/CodeGenCUDA/device-stub.cu
    clang/test/Driver/clang-offload-bundler.c
    clang/test/Driver/hip-toolchain-no-rdc.hip
    clang/test/Driver/hip-toolchain-rdc.hip
    clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index baf2c79cc2b6..b0b76ffbebde 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -597,8 +597,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
     if (CudaGpuBinary) {
       // If fatbin is available from early finalization, create a string
       // literal containing the fat binary loaded from the given file.
-      FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()),
-                                     "", FatbinConstantName, 8);
+      const unsigned HIPCodeObjectAlign = 4096;
+      FatBinStr =
+          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
+                             FatbinConstantName, HIPCodeObjectAlign);
     } else {
       // If fatbin is not available, create an external symbol
       // __hip_fatbin in section .hip_fatbin. The external symbol is supposed

diff  --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp
index 78f53204bd8c..25b3ab88bc02 100644
--- a/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/clang/lib/Driver/ToolChains/HIP.cpp
@@ -16,6 +16,7 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Options.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TargetParser.h"
@@ -33,6 +34,7 @@ using namespace llvm::opt;
 #endif
 
 namespace {
+const unsigned HIPCodeObjectAlign = 4096;
 
 static void addBCLib(const Driver &D, const ArgList &Args,
                      ArgStringList &CmdArgs, ArgStringList LibraryPaths,
@@ -108,6 +110,8 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
   // for different GPU archs.
   ArgStringList BundlerArgs;
   BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+  BundlerArgs.push_back(
+      Args.MakeArgString("-bundle-align=" + Twine(HIPCodeObjectAlign)));
 
   // ToDo: Remove the dummy host binary entry which is required by
   // clang-offload-bundler.
@@ -175,7 +179,8 @@ void AMDGCN::Linker::constructGenerateObjFileFromHIPFatBinary(
   ObjStream << "  .section .hip_fatbin,\"aMS\",@progbits,1\n";
   ObjStream << "  .data\n";
   ObjStream << "  .globl __hip_fatbin\n";
-  ObjStream << "  .p2align 3\n";
+  ObjStream << "  .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign))
+            << "\n";
   ObjStream << "__hip_fatbin:\n";
   ObjStream << "  .incbin \"" << BundleFile << "\"\n";
   ObjStream.flush();

diff  --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu
index 0f4a5644fd48..ca21116fc989 100644
--- a/clang/test/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CodeGenCUDA/device-stub.cu
@@ -115,7 +115,7 @@ void use_pointers() {
 // ALL: @4 = private unnamed_addr constant [21 x i8] c"ext_constant_var_def\00"
 // * constant unnamed string with GPU binary
 // CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
-// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",{{.*}}align 4096
 // HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
 // CUDANORDC-SAME: section ".nv_fatbin", align 8
 // CUDARDC-SAME: section "__nv_relfatbin", align 8

diff  --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c
index a0724b3c60e8..21699e78dda6 100644
--- a/clang/test/Driver/clang-offload-bundler.c
+++ b/clang/test/Driver/clang-offload-bundler.c
@@ -278,6 +278,16 @@
 // RUN: diff %t.empty %t.res.tgt1
 // RUN: diff %t.empty %t.res.tgt2
 
+//
+// Check -bundle-align option
+//
+
+// RUN: clang-offload-bundler -bundle-align=4096 -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -inputs=%t.bc,%t.tgt1,%t.tgt2 -outputs=%t.bundle3.bc
+// RUN: clang-offload-bundler -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -outputs=%t.res.bc,%t.res.tgt1,%t.res.tgt2 -inputs=%t.bundle3.bc -unbundle
+// RUN: diff %t.bc %t.res.bc
+// RUN: diff %t.tgt1 %t.res.tgt1
+// RUN: diff %t.tgt2 %t.res.tgt2
+
 // Some code so that we can create a binary out of this file.
 int A = 0;
 void test_func(void) {

diff  --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index c03c78a97c94..471c3022ecef 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -81,6 +81,7 @@
 //
 
 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
 // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
 
@@ -143,6 +144,7 @@
 //
 
 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
 // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
 

diff  --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip
index 97d5e59c0c4b..8d8e67514035 100644
--- a/clang/test/Driver/hip-toolchain-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-rdc.hip
@@ -8,10 +8,14 @@
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
 // RUN:   -fuse-ld=lld -fgpu-rdc -nogpuinc \
+// RUN:   -fhip-dump-offload-linker-script \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s
 
+// check code object alignment in dumped llvm-mc input
+// CHECK: .p2align 12
+
 // emit objects for host side path
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
@@ -87,6 +91,7 @@
 
 // combine images generated into hip fat binary object
 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
 // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"
 

diff  --git a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
index 3f9925d1e099..e4a32d5e8744 100644
--- a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
+++ b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
@@ -94,6 +94,11 @@ static cl::opt<bool> PrintExternalCommands(
              "instead of actually executing them - for testing purposes.\n"),
     cl::init(false), cl::cat(ClangOffloadBundlerCategory));
 
+static cl::opt<unsigned>
+    BundleAlignment("bundle-align",
+                    cl::desc("Alignment of bundle for binary files"),
+                    cl::init(1), cl::cat(ClangOffloadBundlerCategory));
+
 /// Magic string that marks the existence of offloading data.
 #define OFFLOAD_BUNDLER_MAGIC_STR "__CLANG_OFFLOAD_BUNDLE__"
 
@@ -223,6 +228,9 @@ class BinaryFileHandler final : public FileHandler {
   StringMap<BundleInfo>::iterator CurBundleInfo;
   StringMap<BundleInfo>::iterator NextBundleInfo;
 
+  /// Current bundle target to be written.
+  std::string CurWriteBundleTarget;
+
 public:
   BinaryFileHandler() : FileHandler() {}
 
@@ -337,10 +345,12 @@ class BinaryFileHandler final : public FileHandler {
     unsigned Idx = 0;
     for (auto &T : TargetNames) {
       MemoryBuffer &MB = *Inputs[Idx++];
+      HeaderSize = alignTo(HeaderSize, BundleAlignment);
       // Bundle offset.
       Write8byteIntegerToBuffer(OS, HeaderSize);
       // Size of the bundle (adds to the next bundle's offset)
       Write8byteIntegerToBuffer(OS, MB.getBufferSize());
+      BundlesInfo[T] = BundleInfo(MB.getBufferSize(), HeaderSize);
       HeaderSize += MB.getBufferSize();
       // Size of the triple
       Write8byteIntegerToBuffer(OS, T.size());
@@ -351,6 +361,7 @@ class BinaryFileHandler final : public FileHandler {
   }
 
   Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final {
+    CurWriteBundleTarget = TargetTriple.str();
     return Error::success();
   }
 
@@ -359,6 +370,8 @@ class BinaryFileHandler final : public FileHandler {
   }
 
   Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final {
+    auto BI = BundlesInfo[CurWriteBundleTarget];
+    OS.seek(BI.Offset);
     OS.write(Input.getBufferStart(), Input.getBufferSize());
     return Error::success();
   }


        


More information about the cfe-commits mailing list