[libc-commits] [clang] [compiler-rt] [libc] [llvm] [PGO][AMDGPU] Add offload profiling with uniformity-aware optimization (PR #177665)
Yaxun Liu via libc-commits
libc-commits at lists.llvm.org
Thu Apr 2 16:56:47 PDT 2026
================
@@ -1207,6 +1317,153 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
Inc->eraseFromParent();
}
+// Determine the wavefront size for an AMDGPU function.
+// Checks the target-features attribute first (an explicit +wavefrontsize32 /
+// +wavefrontsize64 wins), then falls back to the default wavefront size
+// implied by the target-cpu attribute. Returns 32 or 64.
+// NOTE(review): "defaults to 32 if undetermined" only holds when target-cpu
+// is absent; a non-empty but unrecognized target-cpu takes the Wave64 branch
+// below -- confirm that is intended.
+static unsigned getAMDGPUWavefrontSize(const Function &F) {
+ // Check target-features attribute for an explicit wavefront-size override.
+ StringRef Features = F.getFnAttribute("target-features").getValueAsString();
+ if (Features.contains("+wavefrontsize64"))
+ return 64;
+ if (Features.contains("+wavefrontsize32"))
+ return 32;
+
+ // No explicit feature: fall back to the per-CPU default wavefront size.
+ StringRef CPU = F.getFnAttribute("target-cpu").getValueAsString();
+ if (!CPU.empty()) {
+ AMDGPU::GPUKind Kind = AMDGPU::parseArchAMDGCN(CPU);
+ // NOTE(review): this 'Features' shadows the StringRef of the same name
+ // above; consider renaming (e.g. ArchAttrs) for clarity.
+ unsigned Features = AMDGPU::getArchAttrAMDGCN(Kind);
+ if (Features & AMDGPU::FEATURE_WAVE32)
+ return 32;
+ // NOTE(review): presumably an unknown CPU (GK_NONE) also lands here,
+ // since its arch attrs would lack FEATURE_WAVE32 -- verify.
+ return 64; // gfx9 and older default to Wave64
+ }
+
+ return 32; // No explicit feature and no target-cpu: conservative default.
+}
+
+// Lazily compute, cache, and return the per-function PGO invariants used by
+// the AMDGPU lowering. On a cache miss, IR is emitted once at the function's
+// first insertion point: when sampling is enabled (OffloadPGOSampling > 0) a
+// call to __gpu_pgo_is_sampled(rate) produces the 'Matched' predicate;
+// otherwise 'Matched' is constant true and no instructions are inserted.
+InstrLowerer::AMDGPUPGOInvariants &
+InstrLowerer::getOrCreateAMDGPUInvariants(Function *F) {
+ // Fast path: invariants already materialized for this function.
+ auto It = AMDGPUInvariantsCache.find(F);
+ if (It != AMDGPUInvariantsCache.end())
+ return It->second;
+
+ LLVMContext &Context = M.getContext();
+
+ // Emit the (at most one) sampling query at the top of the entry block so
+ // every lowered increment in the function can reuse the same predicate.
+ BasicBlock &EntryBB = F->getEntryBlock();
+ IRBuilder<> Builder(&*EntryBB.getFirstInsertionPt());
+
+ // Sampling disabled: everything is considered matched.
+ Value *Matched = ConstantInt::getTrue(Context);
+ if (OffloadPGOSampling > 0) {
+ auto *Int32Ty = Type::getInt32Ty(Context);
+ // int __gpu_pgo_is_sampled(int rate) -- nonzero means "profile this".
+ FunctionCallee IsSampledFn =
+ M.getOrInsertFunction("__gpu_pgo_is_sampled", Int32Ty, Int32Ty);
+ Value *SampledInt = Builder.CreateCall(
+ IsSampledFn, {ConstantInt::get(Int32Ty, OffloadPGOSampling)},
+ "pgo.sampled");
+ // Reduce the i32 runtime answer to an i1 predicate.
+ Matched = Builder.CreateICmpNE(SampledInt, ConstantInt::get(Int32Ty, 0),
+ "pgo.matched");
+ }
+
+ // NOTE(review): a reference into AMDGPUInvariantsCache is returned; if the
+ // cache container can rehash/reallocate on later insertions, previously
+ // returned references would dangle -- confirm the container's reference
+ // stability (or return by value).
+ auto &Inv = AMDGPUInvariantsCache[F];
+ Inv.Matched = Matched;
+ return Inv;
+}
+
+// Lower an InstrProfIncrementInst for AMDGPU: compute the address of the
+// regular counter (and, when present, the matching uniform counter) and
+// replace the intrinsic with a call into the GPU profiling runtime
+// (__gpu_pgo_increment), optionally guarded by the per-function sampling
+// predicate from getOrCreateAMDGPUInvariants.
+void InstrLowerer::lowerIncrementAMDGPU(InstrProfIncrementInst *Inc) {
+ Function *F = Inc->getFunction();
+ auto &Inv = getOrCreateAMDGPUInvariants(F);
+
+ IRBuilder<> Builder(Inc);
+ LLVMContext &Context = M.getContext();
+ auto *Int64Ty = Type::getInt64Ty(Context);
+
+ // i1 predicate: constant true, or the cached runtime sampling decision.
+ Value *Matched = Inv.Matched;
+
+ auto *CounterIdx = Inc->getIndex();
+
+ // --- Counter address ---
+ GlobalVariable *Counters = nullptr;
+ GlobalVariable *UniformCounters = nullptr;
+ Value *Addr = nullptr;
+ Value *UniformAddr = nullptr;
+
+ if (ContiguousCnts) {
+ // Contiguous mode: all functions share one counter array; this function's
+ // counters start at a precomputed offset keyed by its name variable.
+ // (Inc->getName() yields the profile-name GlobalVariable, not a string.)
+ GlobalVariable *NamePtr = Inc->getName();
+ uint64_t FuncOffset = FunctionCounterOffsets.lookup(NamePtr);
+
+ // NOTE(review): FuncOffset is uint64_t but is emitted as an i32 constant
+ // here -- confirm total counter counts can never exceed 32 bits.
+ Value *OffsetCounterIdx = Builder.CreateAdd(
+ CounterIdx, Builder.getInt32(FuncOffset), "OffsetCounterIdx");
+
+ Counters = ContiguousCnts;
+ Value *Indices[] = {Builder.getInt32(0), OffsetCounterIdx};
+ Addr = Builder.CreateInBoundsGEP(Counters->getValueType(), Counters,
+ Indices, "ctr.addr");
+
+ // Uniform (uniformity-aware) counters mirror the layout of the regular
+ // contiguous counters, so the same offset index is reused.
+ if (ContiguousUCnts) {
+ UniformCounters = ContiguousUCnts;
+ Value *UniformIndices[] = {Builder.getInt32(0), OffsetCounterIdx};
+ UniformAddr = Builder.CreateInBoundsGEP(UniformCounters->getValueType(),
+ UniformCounters, UniformIndices,
+ "unifctr.addr");
+ }
+ } else {
+ // Per-function mode: index directly into this function's own counter
+ // globals.
+ Counters = getOrCreateRegionCounters(Inc);
+ Value *Indices[] = {Builder.getInt32(0), CounterIdx};
+ Addr = Builder.CreateInBoundsGEP(Counters->getValueType(), Counters,
+ Indices, "ctr.addr");
+
+ UniformCounters = getOrCreateUniformCounters(Inc);
+ if (UniformCounters) {
+ Value *UniformIndices[] = {Builder.getInt32(0), CounterIdx};
+ UniformAddr = Builder.CreateInBoundsGEP(UniformCounters->getValueType(),
+ UniformCounters, UniformIndices,
+ "unifctr.addr");
+ }
+ }
+
+ // Use addrspace(1) pointers directly for the library call to generate
+ // global_load/global_store instead of slower flat_load/flat_store.
+ auto *GlobalPtrTy = PointerType::get(Context, 1);
+ // A null addrspace(1) pointer presumably signals "no uniform counter" to
+ // the runtime -- verify against the __gpu_pgo_increment implementation.
+ Value *UniformAddrArg =
+ UniformAddr ? UniformAddr
+ : ConstantPointerNull::get(cast<PointerType>(GlobalPtrTy));
+
+ // Normalize the increment step to i64 for the runtime call.
+ Value *IncStep = Inc->getStep();
+ Value *StepI64 = Builder.CreateZExtOrTrunc(IncStep, Int64Ty, "step.i64");
+
+ // --- Increment via library call ---
+ if (OffloadPGOSampling > 0) {
+ // Sampled mode: guard the call behind the sampling decision.
+ // Non-sampled blocks skip entirely.
+ // Split the current block at the intrinsic, then rewire the fallthrough
+ // branch into a conditional branch on 'Matched'.
+ BasicBlock *CurBB = Builder.GetInsertBlock();
+ BasicBlock *ContBB =
+ CurBB->splitBasicBlock(BasicBlock::iterator(Inc), "po_cont");
+ BasicBlock *ThenBB = BasicBlock::Create(Context, "po_then", F);
+
+ // Replace the unconditional branch created by splitBasicBlock.
+ CurBB->getTerminator()->eraseFromParent();
+ IRBuilder<> HeadBuilder(CurBB);
+ HeadBuilder.CreateCondBr(Matched, ThenBB, ContBB);
+
+ IRBuilder<> ThenBuilder(ThenBB);
+ FunctionCallee IncrFnPO =
+ M.getOrInsertFunction("__gpu_pgo_increment", Type::getVoidTy(Context),
----------------
yxsamliu wrote:
Added PROFILE_INSTRUMENT_GPU and PROFILE_SAMPLING_GPU to RuntimeLibcalls.td with implementations in the AMDGPU section.
https://github.com/llvm/llvm-project/pull/177665
More information about the libc-commits
mailing list