[llvm] 1bcec03 - [HIP][HIPSTDPAR][NFC] Re-order & adapt `hipstdpar` specific passes (#134753)

Mon Apr 14 14:47:14 PDT 2025

Author: Alex Voicu
Date: 2025-04-15T00:47:09+03:00
New Revision: 1bcec036e197f6ab7461722502e4393396b46ec3

URL: https://github.com/llvm/llvm-project/commit/1bcec036e197f6ab7461722502e4393396b46ec3
DIFF: https://github.com/llvm/llvm-project/commit/1bcec036e197f6ab7461722502e4393396b46ec3.diff

LOG: [HIP][HIPSTDPAR][NFC] Re-order & adapt `hipstdpar` specific passes (#134753)

The `hipstdpar` specific passes were not ordered ideally, especially for
`fgpu-rdc` compilations, which meant that we'd eagerly run accelerator
code selection and remove symbols that might end up used. This change
corrects that aspect by ensuring that accelerator code selection is only
done after linking (this will have to be revisited in the future once
the closed-world assumption no longer holds). Furthermore, we take the
opportunity to move allocation interposition so that it properly gets
printed when print-pipeline-passes is requested. NFC.

Added: 
    clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp
    clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp

Modified: 
    clang/lib/CodeGen/BackendUtil.cpp
    clang/lib/Driver/ToolChains/HIPAMD.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 7557cb8408921..f7eb853beb23c 100644

--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1115,6 +1115,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
   if (CodeGenOpts.LinkBitcodePostopt)
     MPM.addPass(LinkInModulesPass(BC));
 
+  if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
+      LangOpts.HIPStdParInterposeAlloc)
+    MPM.addPass(HipStdParAllocationInterpositionPass());
+
   // Add a verifier pass if requested. We don't have to do this if the action
   // requires code generation because there will already be a verifier pass in
   // the code-generation pipeline.
@@ -1178,10 +1182,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     return;
   }
 
-  if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
-      LangOpts.HIPStdParInterposeAlloc)
-    MPM.addPass(HipStdParAllocationInterpositionPass());
-
   // Now that we have all of the passes ready, run them.
   {
     PrettyStackTraceString CrashInfo("Optimizer");

diff  --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 4419339720b87..abda4eb453387 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -233,10 +233,11 @@ void HIPAMDToolChain::addClangTargetOptions(
   CC1Args.append({"-fcuda-is-device", "-fno-threadsafe-statics"});
 
   if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                          false))
+                          false)) {
     CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
-  if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
-    CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
+    if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
+      CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
+  }
 
   StringRef MaxThreadsPerBlock =
       DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);

diff  --git a/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp
new file mode 100644
index 0000000000000..f7438c374dd32
--- /dev/null
+++ b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp
@@ -0,0 +1,17 @@
+// Check that if we are compiling with fgpu-rdc amdgpu-enable-hipstdpar is not
+// passed to CC1, to avoid eager, per TU, removal of potentially accessible
+// functions.
+
+// RUN: %clang -### --hipstdpar --offload-arch=gfx906 -nogpulib -nogpuinc %s \
+// RUN:    --hipstdpar-path=%S/../Driver/Inputs/hipstdpar \
+// RUN:    --hipstdpar-thrust-path=%S/../Driver/Inputs/hipstdpar/thrust \
+// RUN:    --hipstdpar-prim-path=%S/../Driver/Inputs/hipstdpar/rocprim 2>&1 \
+// RUN:    | FileCheck %s -check-prefix=NORDC
+// NORDC: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
+
+// RUN: %clang -### --hipstdpar --offload-arch=gfx906 -nogpulib -nogpuinc %s \
+// RUN:    -fgpu-rdc --hipstdpar-path=%S/../Driver/Inputs/hipstdpar \
+// RUN:    --hipstdpar-thrust-path=%S/../Driver/Inputs/hipstdpar/thrust \
+// RUN:    --hipstdpar-prim-path=%S/../Driver/Inputs/hipstdpar/rocprim 2>&1 \
+// RUN:    | FileCheck %s -check-prefix=RDC
+// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"

diff  --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
new file mode 100644
index 0000000000000..c70b651397527
--- /dev/null
+++ b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
@@ -0,0 +1,24 @@
+// Test that the accelerator code selection pass only gets invoked after linking
+
+// Ensure Pass HipStdParAcceleratorCodeSelectionPass is not invoked in PreLink.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -flto -emit-llvm-bc -fcuda-is-device -fdebug-pass-manager \
+// RUN:  %s -o /dev/null 2>&1 | FileCheck --check-prefix=HIPSTDPAR-PRE %s
+// HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass
+// HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass
+// HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass
+// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass
+
+// Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \
+// RUN:  %s -o /dev/null 2>&1 | FileCheck --check-prefix=HIPSTDPAR-POST %s
+// HIPSTDPAR-POST: Running pass: HipStdParAcceleratorCodeSelection
+
+#define __device__ __attribute__((device))
+
+void foo(float *a, float b) {
+  *a = b;
+}
+
+__device__ void bar(float *a, float b) {
+  *a = b;
+}

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b59e940852724..5b2e0558d5664 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -810,17 +810,17 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
 #include "llvm/Passes/TargetPassRegistry.inc"
 
-  PB.registerPipelineStartEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
-        if (EnableHipStdPar)
-          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
-      });
-
   PB.registerPipelineEarlySimplificationEPCallback(
       [](ModulePassManager &PM, OptimizationLevel Level,
          ThinOrFullLTOPhase Phase) {
-        if (!isLTOPreLink(Phase))
+        if (!isLTOPreLink(Phase)) {
+          // When we are not using -fgpu-rdc, we can run accelerator code
+          // selection relatively early, but still after linking to prevent
+          // eager removal of potentially reachable symbols.
+          if (EnableHipStdPar)
+            PM.addPass(HipStdParAcceleratorCodeSelectionPass());
           PM.addPass(AMDGPUPrintfRuntimeBindingPass());
+        }
 
         if (Level == OptimizationLevel::O0)
           return;
@@ -891,6 +891,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
+        // When we are using -fgpu-rdc, we can only run accelerator code
+        // selection after linking to prevent, otherwise we end up removing
+        // potentially reachable symbols that were exported as external in other
+        // modules.
+        if (EnableHipStdPar)
+          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
         // We want to support the -lto-partitions=N option as "best effort".
         // For that, we need to lower LDS earlier in the pipeline before the
         // module is partitioned for codegen.