[clang] [llvm] [Offload][PGO] Fix PGO on NVPTX targets (PR #143568)
Ethan Luis McDonough via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 10 13:19:27 PDT 2025
https://github.com/EthanLuisMcDonough updated https://github.com/llvm/llvm-project/pull/143568
>From f73491b1816c86dfd38833257df48aeeb5a2549b Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 22 May 2025 22:20:04 -0500
Subject: [PATCH] [Offload][PGO] Fix PGO on NVPTX targets
---
clang/lib/Driver/ToolChains/Clang.cpp | 4 +-
clang/test/Driver/cuda-no-pgo-or-coverage.cu | 33 --------
llvm/include/llvm/ProfileData/InstrProf.h | 6 ++
.../Instrumentation/InstrProfiling.cpp | 80 +++++++++++++++++--
.../offloading/gpupgo/pgo_device_and_host.c | 2 +-
.../test/offloading/gpupgo/pgo_device_only.c | 2 +-
6 files changed, 83 insertions(+), 44 deletions(-)
delete mode 100644 clang/test/Driver/cuda-no-pgo-or-coverage.cu
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index d326a81feb762..d4111fe6e9b80 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6201,9 +6201,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.AddLastArg(CmdArgs, options::OPT_fconvergent_functions,
options::OPT_fno_convergent_functions);
- // NVPTX doesn't support PGO or coverage
- if (!Triple.isNVPTX())
- addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
+ addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ);
diff --git a/clang/test/Driver/cuda-no-pgo-or-coverage.cu b/clang/test/Driver/cuda-no-pgo-or-coverage.cu
deleted file mode 100644
index b84587e1e182b..0000000000000
--- a/clang/test/Driver/cuda-no-pgo-or-coverage.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-// Check that profiling/coverage arguments doen't get passed down to device-side
-// compilation.
-//
-//
-// XRUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// XRUN: -fprofile-generate %s 2>&1 | \
-// XRUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -fprofile-instr-generate %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -coverage %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -ftest-coverage %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN: -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | \
-// RUN: FileCheck --check-prefixes=CHECK,PROF %s
-//
-//
-// CHECK-NOT: error: unsupported option '-fprofile
-// CHECK-NOT: error: invalid argument
-// CHECK-DAG: "-fcuda-is-device"
-// CHECK-NOT: "-f{{[^"/]*coverage.*}}"
-// CHECK-NOT: "-fprofile{{[^"]*}}"
-// CHECK: "-triple" "x86_64-unknown-linux-gnu"
-// PROF: "-fprofile{{.*}}"
-// GCOV: "-coverage-notes-file=
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 85a9efe73855b..2e5779d521f7a 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -171,6 +171,12 @@ inline StringRef getInstrProfRegFuncsName() {
return "__llvm_profile_register_functions";
}
+/// Return the name of the function that initializes self-referential datavar
+/// values on NVPTX targets.
+inline StringRef getInstrProfDelayedInitFuncName() {
+ return "__llvm_profile_delayed_data_var_init";
+}
+
/// Return the name of the runtime interface that registers per-function control
/// data for one instrumented function.
inline StringRef getInstrProfRegFuncName() {
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 5e7548b0a2fd1..b3eff3765e144 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -391,6 +391,13 @@ class InstrLowerer final {
/// Create INSTR_PROF_DATA variable for counters and bitmaps.
void createDataVariable(InstrProfCntrInstBase *Inc);
+ /// Creates a delayed initialization function for data-relative offsets.
+ /// This is only relevant on NVPTX targets, where circular constant structures
+ /// are not allowed.
+ bool
+ emitDataDelayedInit(SmallVector<Function *> &Kernels,
+ SmallVector<const InstrProfCntrInstBase *> &ValueSites);
+
/// Get the counters for virtual table values, creating them if necessary.
void getOrCreateVTableProfData(GlobalVariable *GV);
@@ -947,11 +954,18 @@ bool InstrLowerer::lower() {
if (!ContainsProfiling && !CoverageNamesVar)
return MadeChange;
+ // Cached info for generating delayed offset calculations.
+ // This is only relevant on NVPTX targets.
+ SmallVector<Function *> Kernels;
+ SmallVector<const InstrProfCntrInstBase *> ValueSites;
+
// We did not know how many value sites there would be inside
// the instrumented function. This is counting the number of instrumented
// target value sites to enter it as field in the profile data variable.
for (Function &F : M) {
InstrProfCntrInstBase *FirstProfInst = nullptr;
+ if (F.getCallingConv() == CallingConv::PTX_Kernel)
+ Kernels.push_back(&F);
for (BasicBlock &BB : F) {
for (auto I = BB.begin(), E = BB.end(); I != E; I++) {
if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
@@ -971,9 +985,12 @@ bool InstrLowerer::lower() {
// Also create the data variable based on the MCDCParams.
if (FirstProfInst != nullptr) {
static_cast<void>(getOrCreateRegionCounters(FirstProfInst));
+ ValueSites.push_back(FirstProfInst);
}
}
+ MadeChange |= emitDataDelayedInit(Kernels, ValueSites);
+
if (EnableVTableValueProfiling)
for (GlobalVariable &GV : M.globals())
// Global variables with type metadata are virtual table variables.
@@ -1734,6 +1751,13 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
return PD.RegionCounters;
}
+// Calculates difference between two global variable addresses as an integer
+Constant *globalVarDiff(Module &M, GlobalVariable *A, GlobalVariable *B) {
+ auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+ return ConstantExpr::getSub(ConstantExpr::getPtrToInt(A, IntPtrTy),
+ ConstantExpr::getPtrToInt(B, IntPtrTy));
+}
+
void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// When debug information is correlated to profile data, a data variable
// is not needed.
@@ -1854,13 +1878,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// Reference the counter variable with a label difference (link-time
// constant).
DataSectionKind = IPSK_data;
- RelativeCounterPtr =
- ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
- ConstantExpr::getPtrToInt(Data, IntPtrTy));
+ const Triple T(M.getTargetTriple());
+ RelativeCounterPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
+ : globalVarDiff(M, CounterPtr, Data);
if (BitmapPtr != nullptr)
- RelativeBitmapPtr =
- ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy),
- ConstantExpr::getPtrToInt(Data, IntPtrTy));
+ RelativeBitmapPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
+ : globalVarDiff(M, BitmapPtr, Data);
}
Constant *DataVals[] = {
@@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
ReferencedNames.push_back(NamePtr);
}
+bool InstrLowerer::emitDataDelayedInit(
+ SmallVector<Function *> &Kernels,
+ SmallVector<const InstrProfCntrInstBase *> &ValueSites) {
+ const Triple T(M.getTargetTriple());
+ if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY ||
+ Kernels.empty() || ValueSites.empty()) {
+ return false;
+ }
+
+ auto *VoidTy = Type::getVoidTy(M.getContext());
+ auto *Int32Ty = Type::getInt32Ty(M.getContext());
+ auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+ auto *DelayedInitFTy = FunctionType::get(VoidTy, false);
+ auto *DelayedInitF =
+ Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage,
+ getInstrProfDelayedInitFuncName(), M);
+
+ IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF));
+
+ for (const auto *ValueSite : ValueSites) {
+ GlobalVariable *NamePtr = ValueSite->getName();
+ auto &PD = ProfileDataMap[NamePtr];
+ auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar);
+ auto *RelativeCounterPtr =
+ IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 2)});
+ IRB.CreateStore(RelativeCounter, RelativeCounterPtr);
+ if (PD.RegionBitmaps != nullptr) {
+ auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar);
+ auto *RelativeBitmapPtr =
+ IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 3)});
+ IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr);
+ }
+ }
+
+ IRB.CreateRetVoid();
+
+ for (auto *Kernel : Kernels) {
+ auto &KernelEntry = Kernel->getEntryBlock();
+ IRB.SetInsertPoint(KernelEntry.getFirstNonPHI());
+ IRB.CreateCall(DelayedInitF);
+ }
+
+ return true;
+}
+
void InstrLowerer::emitVNodes() {
if (!ValueProfileStaticAlloc)
return;
diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c
index 3e95791ce9a50..68200d297f0fc 100644
--- a/offload/test/offloading/gpupgo/pgo_device_and_host.c
+++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c
@@ -48,7 +48,7 @@
// RUN: %target_triple.%basename_t.hfdi.profraw \
// RUN: | %fcheck-generic --check-prefix="LLVM-DEVICE"
-// REQUIRES: amdgpu
+// REQUIRES: gpu
// REQUIRES: pgo
int main() {
diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c
index 2939af613b6dd..f1221f1927716 100644
--- a/offload/test/offloading/gpupgo/pgo_device_only.c
+++ b/offload/test/offloading/gpupgo/pgo_device_only.c
@@ -14,7 +14,7 @@
// RUN: %target_triple.%basename_t.clang.profraw | \
// RUN: %fcheck-generic --check-prefix="CLANG-PGO"
-// REQUIRES: amdgpu
+// REQUIRES: gpu
// REQUIRES: pgo
int test1(int a) { return a / 2; }
More information about the llvm-commits
mailing list