[clang] [llvm] [Offload][PGO] Fix PGO on NVPTX targets (PR #143568)
Ethan Luis McDonough via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 16 13:14:27 PDT 2025
================
@@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
ReferencedNames.push_back(NamePtr);
}
+bool InstrLowerer::emitDataDelayedInit(
+ SmallVector<Function *> &Kernels,
+ SmallVector<const InstrProfCntrInstBase *> &ValueSites) {
+ const Triple T(M.getTargetTriple());
+ if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY ||
+ Kernels.empty() || ValueSites.empty()) {
+ return false;
+ }
+
+ auto *VoidTy = Type::getVoidTy(M.getContext());
+ auto *Int32Ty = Type::getInt32Ty(M.getContext());
+ auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+ auto *DelayedInitFTy = FunctionType::get(VoidTy, false);
+ auto *DelayedInitF =
+ Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage,
+ getInstrProfDelayedInitFuncName(), M);
+
+ IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF));
+
+ for (const auto *ValueSite : ValueSites) {
+ GlobalVariable *NamePtr = ValueSite->getName();
+ auto &PD = ProfileDataMap[NamePtr];
+ auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar);
+ auto *RelativeCounterPtr =
+ IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 2)});
+ IRB.CreateStore(RelativeCounter, RelativeCounterPtr);
+ if (PD.RegionBitmaps != nullptr) {
+ auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar);
+ auto *RelativeBitmapPtr =
+ IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 3)});
+ IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr);
+ }
+ }
+
+ IRB.CreateRetVoid();
+
+ for (auto *Kernel : Kernels) {
+ auto &KernelEntry = Kernel->getEntryBlock();
+ IRB.SetInsertPoint(KernelEntry.getFirstNonPHIIt());
+ IRB.CreateCall(DelayedInitF);
----------------
EthanLuisMcDonough wrote:
Some level of overhead is expected for PGO, but we definitely want to minimize it where possible. Do you think it would be better to call `__llvm_profile_delayed_data_var_init` once from the device plugin during initialization, rather than inserting a call at the entry of every kernel?
https://github.com/llvm/llvm-project/pull/143568
More information about the cfe-commits mailing list