[llvm] Reland "AMDGPU/PromoteAlloca: Simplify how deferred loads work (#170510)" (PR #170955)
Nicolai Hähnle via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 5 16:39:25 PST 2025
https://github.com/nhaehnle created https://github.com/llvm/llvm-project/pull/170955
The second pass of promotion to vector can be quite simple. Reflect that
simplicity in the code for better maintainability.
v2:
- don't put placeholders into the SSAUpdater, and add a test that shows
the problem
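For readers less familiar with SSAUpdater, the pattern the pass builds on
is roughly the following (a minimal sketch, not code from this patch;
rewriteUse and the Def/Use arguments are illustrative names):

    #include "llvm/Transforms/Utils/SSAUpdater.h"
    using namespace llvm;

    // Register the promoted value at each point where it is (re)defined,
    // then ask for the value that reaches some other block; the updater
    // inserts PHI nodes as needed.
    static Value *rewriteUse(SSAUpdater &Updater, Type *VecTy,
                             BasicBlock *DefBB, Value *Def,
                             BasicBlock *UseBB) {
      Updater.Initialize(VecTy, "promotealloca");
      Updater.AddAvailableValue(DefBB, Def);
      return Updater.GetValueInMiddleOfBlock(UseBB);
    }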
From d4a0dc304fae65d690cc9b4ada9a2829de5b029a Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
Date: Fri, 5 Dec 2025 12:54:25 -0800
Subject: [PATCH] Reland "AMDGPU/PromoteAlloca: Simplify how deferred loads
work (#170510)"
The second pass of promotion to vector can be quite simple. Reflect that
simplicity in the code for better maintainability.
v2:
- don't put placeholders into the SSAUpdater, and add a test that shows
the problem
commit-id:d6d2255a
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 82 ++++++++-----------
.../test/CodeGen/AMDGPU/promote-alloca-cfg.ll | 33 ++++++++
2 files changed, 69 insertions(+), 46 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-cfg.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 77db14513254f..d8e8c8f024518 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -502,27 +502,14 @@ static Value *promoteAllocaUserToVector(
Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
unsigned VecStoreSize, unsigned ElementSize,
DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
- std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
- SmallVectorImpl<LoadInst *> &DeferredLoads) {
+ std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx,
+ function_ref<Value *()> GetCurVal) {
// Note: we use InstSimplifyFolder because it can leverage the DataLayout
// to do more folding, especially in the case of vector splats.
IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
InstSimplifyFolder(DL));
Builder.SetInsertPoint(Inst);
- const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
- if (CurVal)
- return CurVal;
-
- // If the current value is not known, insert a dummy load and lower it on
- // the second pass.
- LoadInst *Dummy =
- Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
- "promotealloca.dummyload");
- DeferredLoads.push_back(Dummy);
- return Dummy;
- };
-
const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
Type *PtrTy) -> Value * {
assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
@@ -542,12 +529,7 @@ static Value *promoteAllocaUserToVector(
switch (Inst->getOpcode()) {
case Instruction::Load: {
- // Loads can only be lowered if the value is known.
- if (!CurVal) {
- DeferredLoads.push_back(cast<LoadInst>(Inst));
- return nullptr;
- }
-
+ Value *CurVal = GetCurVal();
Value *Index = calculateVectorIndex(
cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
@@ -637,7 +619,7 @@ static Value *promoteAllocaUserToVector(
Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
- Value *CurVec = GetOrLoadCurrentVectorValue();
+ Value *CurVec = GetCurVal();
for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
K < NumElts; ++K) {
Value *CurIdx =
@@ -650,8 +632,7 @@ static Value *promoteAllocaUserToVector(
if (Val->getType() != VecEltTy)
Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
- return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
- Index);
+ return Builder.CreateInsertElement(GetCurVal(), Val, Index);
}
case Instruction::Call: {
if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
@@ -673,7 +654,7 @@ static Value *promoteAllocaUserToVector(
}
}
- return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
+ return Builder.CreateShuffleVector(GetCurVal(), Mask);
}
if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -1038,37 +1019,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
Updater.AddAvailableValue(EntryBB, AllocaInitValue);
- // First handle the initial worklist.
- SmallVector<LoadInst *, 4> DeferredLoads;
+ // First handle the initial worklist, in basic block order.
+ //
+ // Insert a placeholder whenever we need the vector value at the top of a
+ // basic block.
+ SmallVector<Instruction *> Placeholders;
forEachWorkListItem(WorkList, [&](Instruction *I) {
BasicBlock *BB = I->getParent();
- // On the first pass, we only take values that are trivially known, i.e.
- // where AddAvailableValue was already called in this block.
- Value *Result = promoteAllocaUserToVector(
- I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
- Updater.FindValueForBlock(BB), DeferredLoads);
+ auto GetCurVal = [&]() -> Value * {
+ if (Value *CurVal = Updater.FindValueForBlock(BB))
+ return CurVal;
+
+ if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
+ return Placeholders.back();
+
+ // If the current value in the basic block is not yet known, insert a
+ // placeholder that we will replace later.
+ IRBuilder<> Builder(I);
+ auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
+ PoisonValue::get(VectorTy), "promotealloca.placeholder"));
+ Placeholders.push_back(Placeholder);
+ return Placeholders.back();
+ };
+
+ Value *Result =
+ promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize,
+ TransferInfo, GEPVectorIdx, GetCurVal);
if (Result)
Updater.AddAvailableValue(BB, Result);
});
- // Then handle deferred loads.
- forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
- SmallVector<LoadInst *, 0> NewDLs;
- BasicBlock *BB = I->getParent();
- // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
- // get a value, inserting PHIs as needed.
- Value *Result = promoteAllocaUserToVector(
- I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
- Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
- if (Result)
- Updater.AddAvailableValue(BB, Result);
- assert(NewDLs.empty() && "No more deferred loads should be queued!");
- });
+ // Now fixup the placeholders.
+ for (Instruction *Placeholder : Placeholders) {
+ Placeholder->replaceAllUsesWith(
+ Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
+ Placeholder->eraseFromParent();
+ }
- // Delete all instructions. On the first pass, new dummy loads may have been
- // added so we need to collect them too.
+ // Delete all instructions.
DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
- InstsToDelete.insert_range(DeferredLoads);
for (Instruction *I : InstsToDelete) {
assert(I->use_empty());
I->eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-cfg.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-cfg.ll
new file mode 100644
index 0000000000000..10d98fd0a0610
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-cfg.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Checks that certain CFGs are handled correctly.
+
+define amdgpu_kernel void @chain(i32 %init, i32 %i.1, i32 %i.2) {
+; CHECK-LABEL: @chain(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[INIT:%.*]] to <4 x i8>
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[TMP0]], i32 [[I_1:%.*]]
+; CHECK-NEXT: br label [[BB2:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP0]], i32 [[I_2:%.*]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %a = alloca [4 x i8], align 4, addrspace(5)
+ store i32 %init, ptr addrspace(5) %a
+ br label %bb1
+
+bb1:
+ %p.1 = getelementptr i8, ptr addrspace(5) %a, i32 %i.1
+ %x.1 = load i8, ptr addrspace(5) %p.1
+ br label %bb2
+
+bb2:
+ %p.2 = getelementptr i8, ptr addrspace(5) %a, i32 %i.2
+ %x.2 = load i8, ptr addrspace(5) %p.2
+ ret void
+}
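As an aside, my reading of the v2 note (the exact failure mode here is my
reconstruction, not spelled out in the patch): if a placeholder were itself
registered via AddAvailableValue, the fixup query for its block could hand
the placeholder straight back, roughly:

    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/SSAUpdater.h"
    using namespace llvm;

    // Hypothetical buggy variant (NOT what the patch does): registering
    // the placeholder as the block's available value means the fixup
    // query can return the placeholder itself.
    static void fixupBuggy(SSAUpdater &Updater, Instruction *Placeholder) {
      BasicBlock *BB = Placeholder->getParent();
      Updater.AddAvailableValue(BB, Placeholder);     // the mistake
      Value *V = Updater.GetValueInMiddleOfBlock(BB); // may be Placeholder
      Placeholder->replaceAllUsesWith(V);             // RAUW to itself asserts
      Placeholder->eraseFromParent();
    }

The patch avoids this by only ever feeding real rewrite results into the
SSAUpdater and erasing the placeholders in a separate fixup loop; the
chain test above exercises the multi-block case.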