[clang] [llvm] [Offload][PGO] Fix PGO on NVPTX targets (PR #143568)

Ethan Luis McDonough via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 10 13:19:27 PDT 2025


https://github.com/EthanLuisMcDonough updated https://github.com/llvm/llvm-project/pull/143568

>From f73491b1816c86dfd38833257df48aeeb5a2549b Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 22 May 2025 22:20:04 -0500
Subject: [PATCH] [Offload][PGO] Fix PGO on NVPTX targets

---
 clang/lib/Driver/ToolChains/Clang.cpp         |  4 +-
 clang/test/Driver/cuda-no-pgo-or-coverage.cu  | 33 --------
 llvm/include/llvm/ProfileData/InstrProf.h     |  6 ++
 .../Instrumentation/InstrProfiling.cpp        | 80 +++++++++++++++++--
 .../offloading/gpupgo/pgo_device_and_host.c   |  2 +-
 .../test/offloading/gpupgo/pgo_device_only.c  |  2 +-
 6 files changed, 83 insertions(+), 44 deletions(-)
 delete mode 100644 clang/test/Driver/cuda-no-pgo-or-coverage.cu

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index d326a81feb762..d4111fe6e9b80 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6201,9 +6201,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   Args.AddLastArg(CmdArgs, options::OPT_fconvergent_functions,
                   options::OPT_fno_convergent_functions);
 
-  // NVPTX doesn't support PGO or coverage
-  if (!Triple.isNVPTX())
-    addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
+  addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
 
   Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ);
 
diff --git a/clang/test/Driver/cuda-no-pgo-or-coverage.cu b/clang/test/Driver/cuda-no-pgo-or-coverage.cu
deleted file mode 100644
index b84587e1e182b..0000000000000
--- a/clang/test/Driver/cuda-no-pgo-or-coverage.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-// Check that profiling/coverage arguments doen't get passed down to device-side
-// compilation.
-//
-//
-// XRUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// XRUN:   -fprofile-generate %s 2>&1 | \
-// XRUN:   FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN:   -fprofile-instr-generate %s 2>&1 | \
-// RUN:   FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN:   -coverage %s 2>&1 | \
-// RUN:   FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN:   -ftest-coverage %s 2>&1 | \
-// RUN:   FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20   \
-// RUN:   -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | \
-// RUN:   FileCheck --check-prefixes=CHECK,PROF %s
-//
-//
-// CHECK-NOT: error: unsupported option '-fprofile
-// CHECK-NOT: error: invalid argument
-// CHECK-DAG: "-fcuda-is-device"
-// CHECK-NOT: "-f{{[^"/]*coverage.*}}"
-// CHECK-NOT: "-fprofile{{[^"]*}}"
-// CHECK: "-triple" "x86_64-unknown-linux-gnu"
-// PROF:      "-fprofile{{.*}}"
-// GCOV:      "-coverage-notes-file=
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 85a9efe73855b..2e5779d521f7a 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -171,6 +171,12 @@ inline StringRef getInstrProfRegFuncsName() {
   return "__llvm_profile_register_functions";
 }
 
+/// Return the name of the function that initializes the self-referential
+/// data variable fields at run time on NVPTX targets.
+inline StringRef getInstrProfDelayedInitFuncName() {
+  return "__llvm_profile_delayed_data_var_init";
+}
+
 /// Return the name of the runtime interface that registers per-function control
 /// data for one instrumented function.
 inline StringRef getInstrProfRegFuncName() {
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 5e7548b0a2fd1..b3eff3765e144 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -391,6 +391,13 @@ class InstrLowerer final {
   /// Create INSTR_PROF_DATA variable for counters and bitmaps.
   void createDataVariable(InstrProfCntrInstBase *Inc);
 
+  /// Creates a delayed initialization function for data-relative offsets.
+  /// This is only relevant on NVPTX targets where circular constant structures
+  /// are not allowed.
+  bool
+  emitDataDelayedInit(SmallVector<Function *> &Kernels,
+                      SmallVector<const InstrProfCntrInstBase *> &ValueSites);
+
   /// Get the counters for virtual table values, creating them if necessary.
   void getOrCreateVTableProfData(GlobalVariable *GV);
 
@@ -947,11 +954,18 @@ bool InstrLowerer::lower() {
   if (!ContainsProfiling && !CoverageNamesVar)
     return MadeChange;
 
+  // Cached info for generating delayed offset calculations
+  // This is only relevant on NVPTX targets
+  SmallVector<Function *> Kernels;
+  SmallVector<const InstrProfCntrInstBase *> ValueSites;
+
   // We did not know how many value sites there would be inside
   // the instrumented function. This is counting the number of instrumented
   // target value sites to enter it as field in the profile data variable.
   for (Function &F : M) {
     InstrProfCntrInstBase *FirstProfInst = nullptr;
+    if (F.getCallingConv() == CallingConv::PTX_Kernel)
+      Kernels.push_back(&F);
     for (BasicBlock &BB : F) {
       for (auto I = BB.begin(), E = BB.end(); I != E; I++) {
         if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
@@ -971,9 +985,12 @@ bool InstrLowerer::lower() {
     // Also create the data variable based on the MCDCParams.
     if (FirstProfInst != nullptr) {
       static_cast<void>(getOrCreateRegionCounters(FirstProfInst));
+      ValueSites.push_back(FirstProfInst);
     }
   }
 
+  MadeChange |= emitDataDelayedInit(Kernels, ValueSites);
+
   if (EnableVTableValueProfiling)
     for (GlobalVariable &GV : M.globals())
       // Global variables with type metadata are virtual table variables.
@@ -1734,6 +1751,13 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
   return PD.RegionCounters;
 }
 
+// Builds the pointer-sized integer constant `ptrtoint(A) - ptrtoint(B)`.
+Constant *globalVarDiff(Module &M, GlobalVariable *A, GlobalVariable *B) {
+  auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+  return ConstantExpr::getSub(ConstantExpr::getPtrToInt(A, IntPtrTy),
+                              ConstantExpr::getPtrToInt(B, IntPtrTy));
+}
+
 void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // When debug information is correlated to profile data, a data variable
   // is not needed.
@@ -1854,13 +1878,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
     // Reference the counter variable with a label difference (link-time
     // constant).
     DataSectionKind = IPSK_data;
-    RelativeCounterPtr =
-        ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
-                             ConstantExpr::getPtrToInt(Data, IntPtrTy));
+    const Triple T(M.getTargetTriple());
+    RelativeCounterPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
+                                     : globalVarDiff(M, CounterPtr, Data);
     if (BitmapPtr != nullptr)
-      RelativeBitmapPtr =
-          ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy),
-                               ConstantExpr::getPtrToInt(Data, IntPtrTy));
+      RelativeBitmapPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
+                                      : globalVarDiff(M, BitmapPtr, Data);
   }
 
   Constant *DataVals[] = {
@@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   ReferencedNames.push_back(NamePtr);
 }
 
+// Emits an internal function that stores the data-relative counter (and, if
+// present, bitmap) offsets at run time, and calls it on entry to every kernel.
+bool InstrLowerer::emitDataDelayedInit(
+    SmallVector<Function *> &Kernels,
+    SmallVector<const InstrProfCntrInstBase *> &ValueSites) {
+  const Triple T(M.getTargetTriple());
+  if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY ||
+      Kernels.empty() || ValueSites.empty())
+    return false;
+
+  auto *DelayedInitFTy =
+      FunctionType::get(Type::getVoidTy(M.getContext()), false);
+  auto *DelayedInitF =
+      Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage,
+                       getInstrProfDelayedInitFuncName(), M);
+  IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF));
+
+  for (const auto *ValueSite : ValueSites) {
+    auto &PD = ProfileDataMap[ValueSite->getName()];
+    // No data variable exists when debug info correlation is used; skip it.
+    if (PD.DataVar == nullptr)
+      continue;
+    // Use struct field indices so the offsets do not assume pointer width.
+    auto *DataTy = PD.DataVar->getValueType();
+    IRB.CreateStore(globalVarDiff(M, PD.RegionCounters, PD.DataVar),
+                    IRB.CreateStructGEP(DataTy, PD.DataVar, 2));
+    if (PD.RegionBitmaps != nullptr)
+      IRB.CreateStore(globalVarDiff(M, PD.RegionBitmaps, PD.DataVar),
+                      IRB.CreateStructGEP(DataTy, PD.DataVar, 3));
+  }
+
+  IRB.CreateRetVoid();
+
+  for (auto *Kernel : Kernels) {
+    // A kernel that is only declared has no entry block to instrument.
+    if (Kernel->isDeclaration())
+      continue;
+    auto &KernelEntry = Kernel->getEntryBlock();
+    IRB.SetInsertPoint(&KernelEntry, KernelEntry.getFirstInsertionPt());
+    IRB.CreateCall(DelayedInitF);
+  }
+
+  return true;
+}
+
 void InstrLowerer::emitVNodes() {
   if (!ValueProfileStaticAlloc)
     return;
diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c
index 3e95791ce9a50..68200d297f0fc 100644
--- a/offload/test/offloading/gpupgo/pgo_device_and_host.c
+++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c
@@ -48,7 +48,7 @@
 // RUN:     %target_triple.%basename_t.hfdi.profraw \
 // RUN:     | %fcheck-generic --check-prefix="LLVM-DEVICE"
 
-// REQUIRES: amdgpu
+// REQUIRES: gpu
 // REQUIRES: pgo
 
 int main() {
diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c
index 2939af613b6dd..f1221f1927716 100644
--- a/offload/test/offloading/gpupgo/pgo_device_only.c
+++ b/offload/test/offloading/gpupgo/pgo_device_only.c
@@ -14,7 +14,7 @@
 // RUN:     %target_triple.%basename_t.clang.profraw | \
 // RUN:     %fcheck-generic --check-prefix="CLANG-PGO"
 
-// REQUIRES: amdgpu
+// REQUIRES: gpu
 // REQUIRES: pgo
 
 int test1(int a) { return a / 2; }



More information about the llvm-commits mailing list