[libc-commits] [clang] [compiler-rt] [libc] [llvm] [PGO][AMDGPU] Add offload profiling with uniformity-aware optimization (PR #177665)

Yaxun Liu via libc-commits libc-commits at lists.llvm.org
Thu Apr 2 16:56:47 PDT 2026


================
@@ -1207,6 +1317,153 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
   Inc->eraseFromParent();
 }
 
+// Determine the wavefront size for an AMDGPU function.
+//
+// Resolution order:
+//   1. An explicit "+wavefrontsize64" or "+wavefrontsize32" entry in the
+//      function's "target-features" attribute (wave64 is tested first, so it
+//      wins if both substrings are present).
+//   2. The FEATURE_WAVE32 bit of the arch attributes looked up for the
+//      function's "target-cpu" attribute: set means 32, clear means 64.
+//   3. 32 when neither attribute settles the question. This includes a
+//      "target-cpu" value that parseArchAMDGCN does not map to a known GPU
+//      kind; such a name carries no feature bits, so it must not be read as
+//      "wave64" merely because FEATURE_WAVE32 is absent.
+//
+// Returns 32 or 64.
+static unsigned getAMDGPUWavefrontSize(const Function &F) {
+  // Check target-features attribute for explicit wavefront size
+  StringRef Features = F.getFnAttribute("target-features").getValueAsString();
+  if (Features.contains("+wavefrontsize64"))
+    return 64;
+  if (Features.contains("+wavefrontsize32"))
+    return 32;
+
+  // Fall back to default wavefront size based on target-cpu
+  StringRef CPU = F.getFnAttribute("target-cpu").getValueAsString();
+  if (!CPU.empty()) {
+    AMDGPU::GPUKind Kind = AMDGPU::parseArchAMDGCN(CPU);
+    // Only trust the feature bits of a recognised processor.
+    if (Kind != AMDGPU::GK_NONE) {
+      // Named ArchAttrs (not Features) to avoid shadowing the StringRef above.
+      unsigned ArchAttrs = AMDGPU::getArchAttrAMDGCN(Kind);
+      if (ArchAttrs & AMDGPU::FEATURE_WAVE32)
+        return 32;
+      return 64; // gfx9 and older default to Wave64
+    }
+  }
+
+  return 32; // conservative default
+}
+
+// Return the per-function AMDGPU PGO invariants for F, creating and caching
+// them in AMDGPUInvariantsCache on first request.
+//
+// The only invariant populated here is `Matched`:
+//   - when OffloadPGOSampling is not positive it is the constant `true`;
+//   - otherwise a call to `__gpu_pgo_is_sampled(i32 OffloadPGOSampling)` is
+//     emitted at the first insertion point of F's entry block and `Matched`
+//     is the i1 result of comparing that call's return value against zero.
+InstrLowerer::AMDGPUPGOInvariants &
+InstrLowerer::getOrCreateAMDGPUInvariants(Function *F) {
+  // Fast path: invariants for this function were already materialized.
+  if (auto Cached = AMDGPUInvariantsCache.find(F);
+      Cached != AMDGPUInvariantsCache.end())
+    return Cached->second;
+
+  LLVMContext &Ctx = M.getContext();
+
+  // All new instructions go to the top of the entry block.
+  BasicBlock &Entry = F->getEntryBlock();
+  IRBuilder<> EntryBuilder(&*Entry.getFirstInsertionPt());
+
+  // Start from "always matched"; replaced below when sampling is enabled.
+  Value *SamplingPredicate = ConstantInt::getTrue(Ctx);
+  if (OffloadPGOSampling > 0) {
+    auto *I32Ty = Type::getInt32Ty(Ctx);
+    FunctionCallee IsSampledCallee =
+        M.getOrInsertFunction("__gpu_pgo_is_sampled", I32Ty, I32Ty);
+    Value *SamplingArg = ConstantInt::get(I32Ty, OffloadPGOSampling);
+    Value *RawSampled =
+        EntryBuilder.CreateCall(IsSampledCallee, {SamplingArg}, "pgo.sampled");
+    // A nonzero return value is treated as "matched".
+    Value *Zero = ConstantInt::get(I32Ty, 0);
+    SamplingPredicate =
+        EntryBuilder.CreateICmpNE(RawSampled, Zero, "pgo.matched");
+  }
+
+  // Insert the cache slot only after the IR has been built, then fill it in.
+  AMDGPUPGOInvariants &Slot = AMDGPUInvariantsCache[F];
+  Slot.Matched = SamplingPredicate;
+  return Slot;
+}
+
+void InstrLowerer::lowerIncrementAMDGPU(InstrProfIncrementInst *Inc) {
+  Function *F = Inc->getFunction();
+  auto &Inv = getOrCreateAMDGPUInvariants(F);
+
+  IRBuilder<> Builder(Inc);
+  LLVMContext &Context = M.getContext();
+  auto *Int64Ty = Type::getInt64Ty(Context);
+
+  Value *Matched = Inv.Matched;
+
+  auto *CounterIdx = Inc->getIndex();
+
+  // --- Counter address ---
+  GlobalVariable *Counters = nullptr;
+  GlobalVariable *UniformCounters = nullptr;
+  Value *Addr = nullptr;
+  Value *UniformAddr = nullptr;
+
+  if (ContiguousCnts) {
+    GlobalVariable *NamePtr = Inc->getName();
+    uint64_t FuncOffset = FunctionCounterOffsets.lookup(NamePtr);
+
+    Value *OffsetCounterIdx = Builder.CreateAdd(
+        CounterIdx, Builder.getInt32(FuncOffset), "OffsetCounterIdx");
+
+    Counters = ContiguousCnts;
+    Value *Indices[] = {Builder.getInt32(0), OffsetCounterIdx};
+    Addr = Builder.CreateInBoundsGEP(Counters->getValueType(), Counters,
+                                     Indices, "ctr.addr");
+
+    if (ContiguousUCnts) {
+      UniformCounters = ContiguousUCnts;
+      Value *UniformIndices[] = {Builder.getInt32(0), OffsetCounterIdx};
+      UniformAddr = Builder.CreateInBoundsGEP(UniformCounters->getValueType(),
+                                              UniformCounters, UniformIndices,
+                                              "unifctr.addr");
+    }
+  } else {
+    Counters = getOrCreateRegionCounters(Inc);
+    Value *Indices[] = {Builder.getInt32(0), CounterIdx};
+    Addr = Builder.CreateInBoundsGEP(Counters->getValueType(), Counters,
+                                     Indices, "ctr.addr");
+
+    UniformCounters = getOrCreateUniformCounters(Inc);
+    if (UniformCounters) {
+      Value *UniformIndices[] = {Builder.getInt32(0), CounterIdx};
+      UniformAddr = Builder.CreateInBoundsGEP(UniformCounters->getValueType(),
+                                              UniformCounters, UniformIndices,
+                                              "unifctr.addr");
+    }
+  }
+
+  // Use addrspace(1) pointers directly for the library call to generate
+  // global_load/global_store instead of slower flat_load/flat_store.
+  auto *GlobalPtrTy = PointerType::get(Context, 1);
+  Value *UniformAddrArg =
+      UniformAddr ? UniformAddr
+                  : ConstantPointerNull::get(cast<PointerType>(GlobalPtrTy));
+
+  Value *IncStep = Inc->getStep();
+  Value *StepI64 = Builder.CreateZExtOrTrunc(IncStep, Int64Ty, "step.i64");
+
+  // --- Increment via library call ---
+  if (OffloadPGOSampling > 0) {
+    // Sampled mode: guard the call behind the sampling decision.
+    // Non-sampled blocks skip entirely.
+    BasicBlock *CurBB = Builder.GetInsertBlock();
+    BasicBlock *ContBB =
+        CurBB->splitBasicBlock(BasicBlock::iterator(Inc), "po_cont");
+    BasicBlock *ThenBB = BasicBlock::Create(Context, "po_then", F);
+
+    CurBB->getTerminator()->eraseFromParent();
+    IRBuilder<> HeadBuilder(CurBB);
+    HeadBuilder.CreateCondBr(Matched, ThenBB, ContBB);
+
+    IRBuilder<> ThenBuilder(ThenBB);
+    FunctionCallee IncrFnPO =
+        M.getOrInsertFunction("__gpu_pgo_increment", Type::getVoidTy(Context),
----------------
yxsamliu wrote:

Added PROFILE_INSTRUMENT_GPU and PROFILE_SAMPLING_GPU to RuntimeLibcalls.td with implementations in the AMDGPU section.

https://github.com/llvm/llvm-project/pull/177665


More information about the libc-commits mailing list