[clang] [llvm] [Offload][PGO] Fix PGO on NVPTX targets (PR #143568)

Ethan Luis McDonough via cfe-commits cfe-commits at lists.llvm.org
Thu Oct 16 13:14:27 PDT 2025


================
@@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   ReferencedNames.push_back(NamePtr);
 }
 
+bool InstrLowerer::emitDataDelayedInit(
+    SmallVector<Function *> &Kernels,
+    SmallVector<const InstrProfCntrInstBase *> &ValueSites) {
+  const Triple T(M.getTargetTriple());
+  if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY ||
+      Kernels.empty() || ValueSites.empty()) {
+    return false;
+  }
+
+  auto *VoidTy = Type::getVoidTy(M.getContext());
+  auto *Int32Ty = Type::getInt32Ty(M.getContext());
+  auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+  auto *DelayedInitFTy = FunctionType::get(VoidTy, false);
+  auto *DelayedInitF =
+      Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage,
+                       getInstrProfDelayedInitFuncName(), M);
+
+  IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF));
+
+  for (const auto *ValueSite : ValueSites) {
+    GlobalVariable *NamePtr = ValueSite->getName();
+    auto &PD = ProfileDataMap[NamePtr];
+    auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar);
+    auto *RelativeCounterPtr =
+        IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 2)});
+    IRB.CreateStore(RelativeCounter, RelativeCounterPtr);
+    if (PD.RegionBitmaps != nullptr) {
+      auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar);
+      auto *RelativeBitmapPtr =
+          IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 3)});
+      IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr);
+    }
+  }
+
+  IRB.CreateRetVoid();
+
+  for (auto *Kernel : Kernels) {
+    auto &KernelEntry = Kernel->getEntryBlock();
+    IRB.SetInsertPoint(KernelEntry.getFirstNonPHIIt());
+    IRB.CreateCall(DelayedInitF);
----------------
EthanLuisMcDonough wrote:

Some level of overhead is expected for PGO, but we definitely want to minimize it where possible. Do you think it would be better to call `__llvm_profile_delayed_data_var_init` from the device plugin during initialization, instead of inserting a call to it at the entry of every kernel?

https://github.com/llvm/llvm-project/pull/143568


More information about the cfe-commits mailing list