[llvm-branch-commits] [llvm] [AMDGPU][PromoteAlloca] Set !amdgpu.non.volatile if promotion fails (PR #179415)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Feb 3 00:27:39 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
I thought about doing this in a separate pass, but this pass already has all the necessary analysis for this to be a trivial addition.
We can simply set `!amdgpu.non.volatile` on the alloca's load/store users if all other attempts to promote the alloca failed.
---
Full diff: https://github.com/llvm/llvm-project/pull/179415.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+29-2)
- (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll (+2-2)
- (added) llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll (+45)
- (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll (+23-18)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index d18d3a13b29ea..a04944cc5bd2e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -120,6 +120,11 @@ struct AllocaAnalysis {
} LDS;
explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
+
+ void eraseAlloca() {
+ Alloca->eraseFromParent();
+ Alloca = nullptr;
+ }
};
// Shared implementation which can do both promotion to vector and to LDS.
@@ -152,6 +157,10 @@ class AMDGPUPromoteAllocaImpl {
bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
Instruction *UseInst, int OpIdx0,
int OpIdx1) const;
+ /// Set the amdgpu.non.volatile metadata on all load/store users of \p AA.
+ /// This assumes the pointer of the alloca never escapes, and thus the memory
+ /// is thread-local.
+ void setNonVolatileMetadata(AllocaAnalysis &AA);
/// Check whether we have enough local memory for promotion.
bool hasSufficientLocalMem(const Function &F);
@@ -443,6 +452,15 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
if (AA.LDS.Enable &&
tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
Changed = true;
+
+ // If we were unable to remove this alloca, mark all accesses to it as
+ // non-volatile instead. This pass rejects all allocas whose pointer escape,
+ // so the memory of the alloca is known to never be written to outside this
+ // thread.
+ if (AA.Alloca) {
+ setNonVolatileMetadata(AA);
+ Changed = true;
+ }
}
finishDeferredAllocaToLDSPromotion(DeferredIntrs);
@@ -1196,7 +1214,7 @@ void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
// Alloca should now be dead too.
assert(AA.Alloca->use_empty());
- AA.Alloca->eraseFromParent();
+ AA.eraseAlloca();
}
std::pair<Value *, Value *>
@@ -1468,6 +1486,15 @@ void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
AA.LDS.Enable = true;
}
+void AMDGPUPromoteAllocaImpl::setNonVolatileMetadata(AllocaAnalysis &AA) {
+ for (Use *U : AA.Uses) {
+ Instruction *I = dyn_cast<Instruction>(U->getUser());
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ I->setMetadata("amdgpu.non.volatile", MDNode::get(I->getContext(), {}));
+ }
+ }
+}
+
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
FunctionType *FTy = F.getFunctionType();
@@ -1665,7 +1692,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
AA.Alloca->mutateType(Offset->getType());
AA.Alloca->replaceAllUsesWith(Offset);
- AA.Alloca->eraseFromParent();
+ AA.eraseAlloca();
PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index 81d6dba494cfc..60526551b22fc 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -44,7 +44,7 @@ define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0:![0-9]+]]
; CHECK-NEXT: ret void
;
entry:
@@ -59,7 +59,7 @@ define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0]]
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll
new file mode 100644
index 0000000000000..b912e2199bac1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Verify that the !amdgpu.non.volatile metadata is set if promoting an alloca fails.
+
+define amdgpu_kernel void @test(i64 %val, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @test(
+; CHECK-SAME: i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [[STACK_1:%.*]] = getelementptr inbounds i64, ptr addrspace(5) [[STACK]], i32 1
+; CHECK-NEXT: store i64 43, ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP:.*]], label %[[END:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[PSTACK:%.*]] = phi ptr addrspace(5) [ [[STACK]], %[[ENTRY]] ], [ [[STACK_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[PSTACK]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT: store i64 32, ptr addrspace(5) [[STACK_1]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[LOAD]], 32
+; CHECK-NEXT: br i1 [[LOOP_CC]], label %[[LOOP]], label %[[END]]
+; CHECK: [[END]]:
+; CHECK-NEXT: [[RELOAD:%.*]] = load i64, ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT: [[RELOAD_1:%.*]] = load i64, ptr addrspace(5) [[STACK_1]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ %stack.1 = getelementptr inbounds i64, ptr addrspace(5) %stack, i32 1
+ store i64 43, ptr addrspace(5) %stack
+ br i1 %cond, label %loop, label %end
+
+loop:
+ %pstack = phi ptr addrspace(5) [%stack, %entry], [%stack.1, %loop]
+ %load = load i64, ptr addrspace(5) %pstack
+ store i64 32, ptr addrspace(5) %stack.1
+ %loop.cc = icmp ne i64 %load, 32
+ br i1 %loop.cc, label %loop, label %end
+
+end:
+ %reload = load i64, ptr addrspace(5) %stack
+ %reload.1 = load i64, ptr addrspace(5) %stack.1
+ ret void
+}
+;.
+; CHECK: [[META0]] = !{}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
index 4bcc46861d66b..74c4704d716b1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
@@ -16,10 +16,10 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0:![0-9]+]]
+; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
; DEFAULT-NEXT: ret void
;
@@ -48,10 +48,10 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
; RATIO8-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
; RATIO8-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
; RATIO8-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0:![0-9]+]]
+; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
; RATIO8-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
; RATIO8-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
; RATIO8-NEXT: ret void
;
@@ -112,10 +112,10 @@ define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
; RATIO8-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
; RATIO8-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
; RATIO8-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
; RATIO8-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
; RATIO8-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
; RATIO8-NEXT: ret void
;
@@ -176,10 +176,10 @@ define amdgpu_kernel void @i32_16_elements(ptr %out) #0 {
; RATIO8-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
; RATIO8-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
; RATIO8-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
; RATIO8-NEXT: [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
; RATIO8-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
; RATIO8-NEXT: ret void
;
@@ -214,10 +214,10 @@ define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 {
; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
; DEFAULT-NEXT: ret void
;
@@ -246,10 +246,10 @@ define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 {
; RATIO8-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
; RATIO8-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
; RATIO8-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
; RATIO8-NEXT: [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
; RATIO8-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
; RATIO8-NEXT: ret void
;
@@ -278,5 +278,10 @@ declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32,
attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" }
attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
+;.
+; DEFAULT: [[META0]] = !{}
+;.
+; RATIO8: [[META0]] = !{}
+;.
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; BASE: {{.*}}
``````````
</details>
https://github.com/llvm/llvm-project/pull/179415
More information about the llvm-branch-commits
mailing list